# From Zakład Logiki Stosowanej
# (różn) ← Poprzednia wersja | Aktualna wersja (różn) | Następna wersja → (różn)
use HTML::LinkExtor;
use LWP::Simple;
use Encode qw (decode);
use utf8;
# Article text may contain non-ASCII characters, so both output streams
# are switched to UTF-8.
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Autoflush the currently selected output handle.
$| = 1;

# Crawl frontier, keyed by URL.  A value greater than zero marks a page that
# still has to be fetched (smaller values are fetched first); zero marks a
# page that has already been handed out by next_url().
my %urls = ( "http://wiadomosci.gazeta.pl" => 1 );

# Main crawl loop: fetch the next pending page, print its article metadata
# and queue every article link found on it.
while (my $url = next_url()) {
    print STDERR "Reading $url\n";

    my $site = get($url);

    # get() returns undef when the download fails; such a page has no text
    # to extract and no links to follow, so move on to the next one.
    unless (defined $site) {
        print STDERR "Failed to fetch $url\n";
        next;
    }

    to_txt($site);

    # The page URL is passed as the base for HTML::LinkExtor.
    my $p = HTML::LinkExtor->new(\&callback, $url);
    $p->parse($site);
}
# Pick the next page to fetch from the crawl frontier %urls.
#
# Among the entries whose value is greater than zero, the one with the
# smallest value is chosen, marked as handed out (value set to 0) and
# returned.  Returns undef (empty list in list context) when nothing is left.
sub next_url {
    my $next;

    for my $candidate (keys %urls) {
        next unless $urls{$candidate} > 0;
        $next = $candidate
            if !defined $next or $urls{$candidate} < $urls{$next};
    }

    # Nothing pending: return before touching %urls, otherwise an undef key
    # would be stored as a bogus "" entry.
    return unless defined $next;

    $urls{$next} = 0;
    return $next;
}
# Link callback for HTML::LinkExtor: queue article pages found on a page.
#
# Receives the tag name followed by attribute => value pairs.  Only <a> tags
# whose href points at http://wiadomosci.gazeta.pl/ and contains ".html" are
# considered.  The URL is cut right after its last ".html" and, if it is not
# yet known, added to %urls with the current number of known URLs as its
# priority.
sub callback {
    my ($tag, %links) = @_;

    # Anchors without an href attribute carry nothing to follow.
    return unless $tag eq "a" and defined $links{href};

    # The dots of the host name are escaped and the host must be followed by
    # "/", so look-alike hosts such as "wiadomosci.gazeta.pl.example.com" are
    # rejected.  Matching in list context yields the captured URL only when
    # the pattern really matched.
    my ($url) = $links{href} =~ m{^(http://wiadomosci\.gazeta\.pl/.*\.html)}
        or return;

    $urls{$url} = scalar keys %urls unless exists $urls{$url};
    return;
}
# Extract the headline, author and date of an article page and print them.
#
# Takes the raw page content, which is decoded as ISO-8859-2.  The three
# fields are written to both STDERR and STDOUT, but only when the page also
# contains an <h4> element followed by a non-empty <div id="artykul"> body.
# Pages that lack any of these pieces produce no output.
sub to_txt {
    my $text = shift;

    # No page content at all (e.g. a failed download): nothing to extract.
    return unless defined $text;

    $text = decode("iso-8859-2", $text);
    $text =~ s/\n/ /g;

    # Matching in list context hands back the captures only when the pattern
    # matched, so values left over from an earlier match can never leak in.
    my ($title, $author, $date) = $text =~ m{<h1>(.*?)</h1>.*?<h5\sclass="author">(.*?)</h5>.*?<h6\sclass="date">(.*?)</h6>}si
        or return;

    # Drop scripts before looking for the article body.
    $text =~ s/<script.*?<\/script>//gsi;

    my ($content) = $text =~ m{<h4>.*?</h4>.*<div id="artykul">(.*?)</div>}si
        or return;

    # Reduce the body to plain text: line breaks become newlines and the
    # sub-heading markup is removed.
    $content =~ s/<br>/\n/g;
    $content =~ s/<span class="txt_srodtytul"><b>//g;
    $content =~ s/<\/b><\/span>//g;

    return unless $author and $content and $title;

    my $report = "\nAuthor:\t $author\nTitle:\t $title\nDate:\t $date\n\n";
    print STDERR $report;
    print $report;
    return;
}