Site programming by Marcin Junczys-Dowmunt



 
 
 

Gazeta Crawler

From Zakład Logiki Stosowanej

use HTML::LinkExtor;
use LWP::Simple;
use Encode qw (decode);
use utf8;

# Catch undeclared variables and symbol typos at compile time.
use strict;

# Article headers may contain non-ASCII characters; emit them as UTF-8 on
# both output streams.
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Unbuffered STDOUT, so extracted headers show up as soon as they are printed.
$| = 1;

# Crawl frontier: URL => rank.  A rank > 0 means "still to be visited"
# (lower ranks are visited first); 0 means "already visited".
my %urls = ( "http://wiadomosci.gazeta.pl" => 1 );

# Main crawl loop: take the pending URL with the lowest rank, download it,
# print the article header (if the page is an article) and queue every new
# article link found on the page.  Ends when no pending URL is left.
while (my $url = next_url()) {
    print STDERR "Reading $url\n";

    my $site = get($url);

    # LWP::Simple::get() returns undef when the download fails; there is
    # nothing to extract or parse then, so log it and move on.
    if (not defined $site) {
        print STDERR "Failed to fetch $url\n";
        next;
    }

    to_txt($site);

    # $url is handed over as the base URL for the links found on the page.
    my $p = HTML::LinkExtor->new(\&callback, $url);
    $p->parse($site);
}

# Pick the next URL to crawl from the frontier (%urls).
#
# Pending URLs are those with a rank greater than 0; the one with the lowest
# rank is chosen, marked as visited (rank set to 0) and returned.
# Returns nothing (undef in scalar context) when no pending URL is left, and
# in that case leaves %urls untouched.
sub next_url {
    my $next;

    # Single pass for the minimum instead of sorting the whole pending list.
    for my $candidate (keys %urls) {
        my $rank = $urls{$candidate};
        next unless $rank > 0;

        if (not defined $next or $rank < $urls{$next}) {
            $next = $candidate;
        }
    }

    # Nothing pending: do not write $urls{undef}, which would add a bogus
    # empty-string key to the frontier.
    return unless defined $next;

    $urls{$next} = 0;

    return $next;
}

# Link handler given to HTML::LinkExtor: queues newly discovered article
# URLs in the crawl frontier (%urls).
#
# Arguments: the tag name, followed by attribute => link pairs of that tag.
# Only <a href="..."> links on the host wiadomosci.gazeta.pl that lead to a
# ".html" page are kept.  Everything after the ".html" path (query string,
# fragment) is dropped, so one article is not queued under several spellings.
# A new URL gets the current number of known URLs as its rank, i.e. URLs are
# ranked in order of discovery.
sub callback {
    my ($tag, %links) = @_;

    return unless $tag eq "a";

    my $href = $links{href};
    return unless defined $href;

    # The value may be an object that stringifies to the URL; work on a
    # plain string from here on.
    $href = "$href";

    # Dots are escaped and the host must end right after ".pl", so look-alike
    # hosts such as "wiadomosci.gazeta.pl.example.com" are rejected.
    return unless $href =~ m{\Ahttp://wiadomosci\.gazeta\.pl(?:[/?#:]|\z)};

    # Capture via list assignment: $url is undef when the pattern fails,
    # instead of depending on whatever $1 happens to hold.
    my ($url) = $href =~ m{\A([^?#]*\.html)};
    return unless $url;

    if (not exists $urls{$url}) {
        $urls{$url} = scalar keys %urls;
    }

    return;
}

# Extract the article fields from one downloaded page and print the article
# header.
#
# Argument: the raw page as returned by the download; it is decoded from
# ISO-8859-2 before any matching is done.
#
# If the page contains a title (<h1>), an author (<h5 class="author">), a
# date (<h6 class="date">) and an article body (<div id="artykul">), the
# lines "Author", "Title" and "Date" are printed to both STDERR and STDOUT
# and a hash reference with the keys title, author, date, source, summary
# and content is returned.  For any other page (or undefined input) nothing
# is printed and nothing is returned.
sub to_txt {
    my $text = shift;

    # A failed download yields undef; there is nothing to extract then.
    return unless defined $text;

    $text = decode("iso-8859-2", $text);
    $text =~ s/\n/ /g;

    # Captures are taken by list assignment, so a failed match leaves the
    # variables undef instead of picking up $1.. from an earlier match.
    # No /g here: in scalar context it would leave pos() set and make the
    # following match start in the middle of the page.
    my ($title, $author, $date) = $text =~
        /<h1>(.*?)<\/h1>.*?<h5\sclass="author">(.*?)<\/h5>.*?<h6\sclass="date">(.*?)<\/h6>/si;

    my $source = '';
    if ($text =~ m/<div\sid=\"source\">.*?<\/span>(.*?)<\/div>/si) {
        $source = $1;
    }

    # Scripts are removed only now, before the summary and body are matched.
    $text =~ s/<script.*?<\/script>//gsi;

    my ($summary, $content) = $text =~
        /<h4>(.*?)<\/h4>.*<div id=\"artykul\">(.*?)<\/div>/si;

    # Not an article page (or an incomplete one): stay silent.
    return unless $author and $content and $title;

    # Turn line breaks into newlines and drop the sub-heading markup.
    $summary =~ s/<br>/\n/gs;
    $content =~ s/<br>/\n/gs;
    $content =~ s/<span class="txt_srodtytul"><b>//gs;
    $content =~ s/<\/b><\/span>//gs;

    my $header = "\nAuthor:\t $author\n"
               . "Title:\t $title\n"
               . "Date:\t $date\n\n";

    print STDERR $header;
    print $header;

    return {
        title   => $title,
        author  => $author,
        date    => $date,
        source  => $source,
        summary => $summary,
        content => $content,
    };
}