Gazeta Crawler
From Zakład Logiki Stosowanej
(Różnice między wersjami)
Wersja z dnia 21:37, 22 maj 2008 (edytuj) Junczys (Dyskusja | wkład) d (New page: <pre> use HTML::LinkExtor; use LWP::Simple; use Encode qw (decode); use utf8; binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); $| = 1; my %urls = ( "http://wiadomosci....) ← Poprzednia edycja |
Aktualna wersja (21:37, 22 maj 2008) (edytuj) (undo) Junczys (Dyskusja | wkład) d (New page: <pre> use HTML::LinkExtor; use LWP::Simple; use Encode qw (decode); use utf8; binmode(STDOUT, ":utf8"); binmode(STDERR, ":utf8"); $| = 1; my %urls = ( "http://wiadomosci....) |
Aktualna wersja
use strict;
use warnings;
use utf8;

use Encode qw(decode);
use HTML::LinkExtor;
use LWP::Simple;

# Simple breadth-first crawler for http://wiadomosci.gazeta.pl.
#
# Starting from the seed URL it fetches each page, prints the author, title
# and date of anything that looks like an article (to both STDOUT and
# STDERR), and queues every not-yet-seen *.html link on the same host.

binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
$| = 1;    # unbuffered STDOUT so progress appears immediately

# Crawl frontier: URL => state.
#   0   - already visited
#   > 0 - discovery rank; the unvisited URL with the lowest rank is next
my %urls = ( "http://wiadomosci.gazeta.pl" => 1 );

while (my $url = next_url()) {
    print STDERR "Reading $url\n";

    my $site = get($url);
    if (!defined $site) {
        # LWP::Simple::get returns undef on any fetch failure; skip the page
        # instead of feeding undef to the decoder and the parser.
        print STDERR "Failed to fetch $url\n";
        next;
    }

    to_txt($site);

    my $p = HTML::LinkExtor->new(\&callback, $url);
    $p->parse($site);
    $p->eof;    # flush anything the parser is still buffering
}

# Pick the unvisited URL with the lowest discovery rank, mark it as visited
# and return it. Returns nothing (undef in scalar context) once every known
# URL has been visited, which ends the main loop.
sub next_url {
    my $next;
    for my $candidate (keys %urls) {
        next if $urls{$candidate} <= 0;    # already visited
        if (!defined $next or $urls{$candidate} < $urls{$next}) {
            $next = $candidate;
        }
    }

    # Nothing left: return before touching %urls so no bogus "" key is added.
    return if !defined $next;

    $urls{$next} = 0;
    return $next;
}

# HTML::LinkExtor callback, invoked as ($tag, attr => value, ...).
# Queues <a href> targets that point to a *.html page on the crawled host.
# Anything after ".html" (query string, fragment) is dropped so the same
# page is not queued under several URLs.
sub callback {
    my ($tag, %links) = @_;

    return if $tag ne 'a';

    my $href = $links{href};
    return if !defined $href;    # <a> without href, e.g. a named anchor

    # Dots are escaped and a "/" is required after the host so that
    # look-alike hosts (wiadomosci.gazeta.pl.example.com) are rejected.
    if ($href =~ m{\A(http://wiadomosci\.gazeta\.pl/.*\.html)}) {
        my $url = $1;
        if (!exists $urls{$url}) {
            # Rank is taken before the insert, so ranks grow in discovery
            # order and the crawl stays breadth-first.
            my $rank = scalar keys %urls;
            $urls{$url} = $rank;
        }
    }

    return;
}

# Pull the article fields out of an already decoded HTML page.
#
# Returns a hash reference with the keys title, author, date, source,
# summary and content, or nothing when the page lacks either the article
# header (<h1>, <h5 class="author">, <h6 class="date">) or the article body
# (<h4> followed by <div id="artykul">).
sub extract_article {
    my ($text) = @_;

    $text =~ s/\n/ /g;

    # No /g on the matches below: in scalar/void context /g would leave
    # pos() set and make the following match start in the middle of the page.
    my ($title, $author, $date) = $text =~
        m{<h1>(.*?)</h1>.*?<h5\sclass="author">(.*?)</h5>.*?<h6\sclass="date">(.*?)</h6>}si
        or return;

    my $source = '';
    if ($text =~ m{<div\sid="source">.*?</span>(.*?)</div>}si) {
        $source = $1;
    }

    $text =~ s{<script.*?</script>}{}gsi;

    # Captures are taken from the match result itself, so a failed match can
    # never leak the title/author captures into summary/content.
    my ($summary, $content) = $text =~
        m{<h4>(.*?)</h4>.*<div id="artykul">(.*?)</div>}si
        or return;

    $summary =~ s/<br>/\n/gs;
    $content =~ s/<br>/\n/gs;
    $content =~ s/<span class="txt_srodtytul"><b>//gs;
    $content =~ s/<\/b><\/span>//gs;

    return {
        title   => $title,
        author  => $author,
        date    => $date,
        source  => $source,
        summary => $summary,
        content => $content,
    };
}

# Decode a raw page (bytes, assumed ISO-8859-2) and, if it is an article
# with a non-empty author, title and content, print its author, title and
# date to STDERR and STDOUT. Pages that are not articles are ignored.
sub to_txt {
    my $text = shift;

    return if !defined $text;

    $text = decode("iso-8859-2", $text);

    my $article = extract_article($text)
        or return;

    if ($article->{author} and $article->{content} and $article->{title}) {
        my $header = "\nAuthor:\t $article->{author}\n"
                   . "Title:\t $article->{title}\n"
                   . "Date:\t $article->{date}\n\n";
        print STDERR $header;
        print $header;
    }

    return;
}