=head1 BOPHScraper

=head1 NAME

Webg::BOPHScraper - Scraper for the Boletín Oficial de la Provincia de Huesca
(Official Gazette of the Province of Huesca)

=head1 SYNOPSIS

  use Webg::BOPHScraper;

  my $boph = Webg::BOPHScraper->new();
  $boph->download("2008", "05");
  $boph->doit(
      "La Puebla de Castro",
      "contenidos/boletin-lapuebla.html",
      "contenidos/boletin-lapuebla.rss",
      "LAPUEBLA DE CASTRO", "LA PUEBLA",
  );

=head1 DESCRIPTION

Downloads announcements for different Major Councils from www.dphuesca.es

=cut

package Webg::BOPHScraper;

use strict;
use warnings;

# NOTE(review): download() asks pdftotext for Latin1 output, and the text is
# handled as raw bytes throughout. The accented literals below (day names,
# theme headings) therefore only match if this source file is saved in the
# same encoding as the extracted text -- confirm before re-encoding the file.

# Constructor.
#
# Parameters (named):
#   url_path - base URL of the gazette PDFs; defaults to the dphuesca.es path.
#
# Returns a blessed hash reference.
sub new {
    my ($class, %params) = @_;

    my $self = {
        url_path => defined $params{url_path}
            ? $params{url_path}
            : "http://www.dphuesca.es/boph/pdfs",

        # A line starting with one of these headings closes the announcement
        # currently being collected (see ayuntamiento()).
        theme_change => [
            'COMARCAS',
            'AYUNTAMIENTO',
            'ENTIDADES LOCALES',
            'ENTIDAD LOCAL',
            'ADMINISTRACIÓN DEL ESTADO',
            'Administración del Estado',
            'SOCIEDAD DE GESTIÓN',
        ],

        # Day names used to locate the issue date in the page header.
        days => [
            "Lunes", "Martes", "Miércoles", "Jueves",
            "Viernes", "Sábado", "Domingo",
        ],

        # Prefix that precedes the council name in the summary entries.
        init_an => "AYUNTAMIENTO DE",
    };

    return bless $self, $class;
}

# Fetch every daily issue of the given month into pdf/ and convert each one
# to plain text in txt/. Issues already present on disk are left untouched.
#
# Parameters:
#   $year  - four digit year as used in the file name, e.g. "2008"
#   $month - two digit month as used in the file name, e.g. "05"
#
# External commands are run in list form so that no shell is involved.
sub download {
    my ($self, $year, $month) = @_;

    for my $dir (qw(txt pdf)) {
        next if -d $dir;
        mkdir $dir or warn "Cannot create directory '$dir': $!\n";
    }

    for my $day (map { sprintf '%02d', $_ } 1 .. 31) {
        my $base = "BOPH${year}${month}${day}";
        my $pdf  = "pdf/$base.pdf";
        my $txt  = "txt/$base.txt";

        unless (-e $pdf) {
            print "# Getting $self->{url_path}/$base.pdf\n";
            system('wget', '--quiet', '--directory-prefix=pdf',
                "$self->{url_path}/$base.pdf");
        }

        next if -e $txt;

        # Days without an issue (weekends, day 31 of short months, failed
        # downloads) leave no PDF behind; there is nothing to convert.
        next unless -e $pdf;

        print "# Translating $base.pdf\n";
        system('pdftotext', '-q', '-eol', 'unix', '-enc', 'Latin1',
            '-nopgbrk', '-raw', $pdf, $txt);
    }

    return;
}

# Parse every extracted text file and publish the matching announcements.
#
# Parameters:
#   $title        - council name shown in the generated documents
#   $filename     - path of the HTML file to write
#   $rss_filename - path of the RSS file; only written when it ends in ".rss"
#   @pattern      - alternative spellings of the council name; joined with
#                   "|" and used as a regular expression
sub doit {
    my ($self, $title, $filename, $rss_filename, @pattern) = @_;

    my $pattern = join "|", @pattern;
    my @records;

    # NOTE(review): the original file list expression was lost in extraction;
    # txt/*.txt is reconstructed from download() and the "(BOPH.*)\.txt" match
    # used when building links -- confirm.
    for my $file (glob 'txt/*.txt') {
        push @records, $self->parse($pattern, $file);
    }

    # Newest issue first.
    my @sorted = reverse @records;

    $self->dump_as_html($filename, $title, @sorted);
    if (defined $rss_filename && $rss_filename =~ m/.*\.rss\Z/i) {
        $self->dump_as_rss($rss_filename, $title, @sorted);
    }

    return;
}

# Escape the characters that are special in XML/HTML text and attributes.
sub _escape_markup {
    my ($text) = @_;

    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

# Build the URL of the PDF issue a record was extracted from.
sub _pdf_url {
    my ($self, $post) = @_;

    my $file = defined $post->{file} ? $post->{file} : '';
    my ($name) = $file =~ m/(BOPH.*)\.txt/i;
    $name = '' unless defined $name;

    return "$self->{url_path}/$name.pdf";
}

# Write the records as an RSS feed.
#
# Parameters:
#   $filename - path of the RSS file to (over)write
#   $title    - council name
#   @records  - hash references as returned by ayuntamiento()
#
# Returns true on success; warns and returns false when the file cannot be
# written.
#
# NOTE(review): the XML markup of this feed was stripped from the original
# source; only the text content survived. The element names below are a
# reconstruction, and the ISO-8859-1 declaration is chosen to agree with the
# Latin1 text produced by download() -- confirm both.
sub dump_as_rss {
    my ($self, $filename, $title, @records) = @_;

    my $rss;
    unless (open $rss, '>', $filename) {
        warn "Cannot write '$filename': $!\n";
        return;
    }

    my $safe_title = _escape_markup($title);

    print {$rss} <<"EOF";
<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>Anuncios B.O.P.H. Ayuntamiento de $safe_title</title>
<link>http://www.lapuebladecastro.com/boletin/</link>
<description>Histórico de anuncios publicados en el Boletín Oficial de la Provincia de Huesca</description>
<language>es</language>
<image>
<title>La Puebla de Castro</title>
<url>http://www.lapuebladecastro.com/gif/ban/t_tescudo.gif</url>
<link>http://www.lapuebladecastro.com</link>
<width>30</width>
<height>30</height>
</image>
EOF

    for my $post (@records) {
        my $url  = _escape_markup($self->_pdf_url($post));
        my $date = _escape_markup($post->{date});
        my $text = _escape_markup($post->{text});

        print {$rss} <<"EOF";
<item>
<title>Anuncio $safe_title B.O.P.H. $date</title>
<link>$url</link>
<description>$text</description>
</item>
EOF
    }

    # The footer belongs to the RSS handle (the original wrote it to HTML).
    print {$rss} <<"EOF";
</channel>
</rss>
EOF

    unless (close $rss) {
        warn "Cannot close '$filename': $!\n";
        return;
    }

    return 1;
}

# Write the records as an HTML fragment.
#
# Parameters:
#   $filename - path of the HTML file to (over)write
#   $title    - council name
#   @records  - hash references as returned by ayuntamiento()
#
# Returns true on success; warns and returns false when the file cannot be
# written.
#
# NOTE(review): the HTML tags were stripped from the original source; the
# markup below is a reconstruction around the surviving text. The date is
# linked to the PDF because the introductory sentence promises such links.
# Any footer the original printed was lost and is not reproduced -- confirm.
sub dump_as_html {
    my ($self, $filename, $title, @records) = @_;

    my $html;
    unless (open $html, '>', $filename) {
        warn "Cannot write '$filename': $!\n";
        return;
    }

    my $safe_title = _escape_markup($title);

    print {$html} <<"EOF";
<h2>Anuncios del Ayto. de $safe_title en el Boletín Oficial de la Provincia de Huesca</h2>

<p>Haz click sobre los enlaces a las fechas para obtener la edición del Boletín Original en formato PDF</p>

EOF

    for my $post (@records) {
        my $text = _escape_markup($post->{text});

        # Highlight the announcement code that opens the text.
        $text =~ s{\A(\d+)}{<b>$1</b>};

        my $url  = _escape_markup($self->_pdf_url($post));
        my $date = _escape_markup($post->{date});

        print {$html} "<div>\n";
        print {$html} "<h3><a href=\"$url\">$date</a> ";
        print {$html} "(Extracto del B.O.P.H.)</h3>\n";
        print {$html} "<p>$text</p>\n";
        print {$html} "</div>\n\n\n";
    }

    unless (close $html) {
        warn "Cannot close '$filename': $!\n";
        return;
    }

    return 1;
}

# Extract the announcements matching $pattern from one text file.
#
# Parameters:
#   $pattern - regular expression alternation of council names
#   $file    - path of the text file
#
# Returns the list of records found by ayuntamiento().
sub parse {
    my ($self, $pattern, $file) = @_;

    my @lines = $self->get_lines_from_file($file);

    return $self->ayuntamiento($pattern, $file, @lines);
}

# Locate the announcements of the requested council inside the lines of one
# issue.
#
# Parameters:
#   $pattern - regular expression alternation of council names
#   $file    - path the lines were read from (stored in each record)
#   @lines   - the lines of the issue, newline included
#
# Returns a list of hash references with the keys:
#   iline, fline - first and last line index of the announcement
#   date         - the header line carrying the day name
#   file         - $file
#   text         - the concatenated lines iline..fline
sub ayuntamiento {
    my ($self, $pattern, $file, @lines) = @_;

    # The date is the last of the first 16 lines that mentions a day name;
    # line 6 is the fallback.
    my $day_re      = join "|", @{ $self->{days} };
    my $date        = $lines[6];
    my $last_header = $#lines < 15 ? $#lines : 15;
    for my $i (0 .. $last_header) {
        $date = $lines[$i] if $lines[$i] =~ m/(?:$day_re)/i;
    }
    $date = '' unless defined $date;
    chomp $date;

    my ($end_line, @codes) = $self->code_search($pattern, @lines);
    return () unless @codes;

    # Without an end-of-summary marker the whole file is scanned.
    $end_line = 0 unless defined $end_line;

    my $code_re  = join "|", @codes;
    my $theme_re = join "|", @{ $self->{theme_change} }, @codes;

    my @records;
    for my $i ($end_line .. $#lines) {
        my $line = $lines[$i];
        chomp $line;

        # A heading or any announcement code closes the open record.
        if (   @records
            && !defined $records[-1]{fline}
            && $line =~ m/\A(?:$theme_re)/)
        {
            $records[-1]{fline} = $i - 1;
        }

        # A line consisting solely of one of our codes opens a record. The
        # group is required: without it the anchors bind only to the first
        # and last alternative.
        if ($line =~ m/\A(?:$code_re)\s*\Z/) {
            push @records, { iline => $i };
        }
    }

    for my $record (@records) {
        # A record still open at the end of the file runs to the last line.
        $record->{fline} = $#lines unless defined $record->{fline};
        $record->{date}  = $date;
        $record->{file}  = $file;
        $record->{text}  = join '', @lines[ $record->{iline} .. $record->{fline} ];
    }

    return @records;
}

# Scan the summary at the top of an issue for the numeric codes of the
# announcements published by the requested council.
#
# Parameters:
#   $pattern - regular expression alternation of council names
#   @lines   - the lines of the issue
#
# Returns ($end_line, @codes): the index of the line carrying the
# "B.O.P. HU.- N" marker where scanning stopped (undef when the marker is
# absent) followed by the codes collected before it.
sub code_search {
    my ($self, $pattern, @lines) = @_;

    my @codes;
    my $end_line;

    for my $i (0 .. $#lines) {
        my $line = $lines[$i];

        if ($line =~ m/B\.\s*O\.\s*P\.\s*HU\.-\s* N/i) {
            $end_line = $i;
            last;
        }

        my ($code) = $line =~ m/\A(\d+)\s*\.*\s*$self->{init_an} ($pattern)\s*\.*/i;
        push @codes, $code if $code;
    }

    return ($end_line, @codes);
}

# Read a text file.
#
# Parameters:
#   $file - path of the file
#
# Returns the list of lines, newline included; warns and returns an empty
# list when the file cannot be opened.
sub get_lines_from_file {
    my ($self, $file) = @_;

    my $txt;
    unless (open $txt, '<', $file) {
        warn "Cannot read '$file': $!\n";
        return ();
    }
    my @lines = <$txt>;
    close $txt;

    return @lines;
}

1;

=head1 LICENSE AND COPYRIGHT

Copyright: Alfonso Egio 2008.

This library is free software; you can redistribute it and/or modify it under
the same terms as Perl itself, either Perl version 5.000 or, at your option,
any later version of Perl 5 you may have available.

The full text of the licenses can be found at the following url:
http://www.opensource.org/licenses/artistic-license-2.0.php

=head1 AUTHOR

Alfonso Egio

=cut