X-Git-Url: https://git.harvie.cz/?a=blobdiff_plain;f=perl%2Fcrawler%2Fcrawl-q.pl;h=fc5b6b301d4c526a842db5c5d4b8c7a12479a357;hb=HEAD;hp=6b7027c092dff12849fae02af9fe5757addef25d;hpb=21c4e16782bf8fee46cf478cd933a9ba11cca1c7;p=mirrors%2FPrograms.git diff --git a/perl/crawler/crawl-q.pl b/perl/crawler/crawl-q.pl index 6b7027c..fc5b6b3 100755 --- a/perl/crawler/crawl-q.pl +++ b/perl/crawler/crawl-q.pl @@ -1,20 +1,17 @@ #!/usr/bin/env perl +package WebCrawler; + use strict; use warnings; use threads; use Thread::Queue; -use LWP::Simple qw($ua get); - $ua->timeout(3); - - -package WebCrawler; - -my $urldb = './urls.txt'; +use BerkeleyDB; +use LWP::Simple qw($ua get); $ua->timeout(3); print "This is libwww-perl-$LWP::VERSION\n"; + my $urlqueue = Thread::Queue->new(); -#open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp); $urlqueue->enqueue("http://root.cz/"); $urlqueue->enqueue("http://blog.harvie.cz/"); @@ -25,30 +22,42 @@ sub uniq(\@) { return(@$i); } -use Data::Dumper; +sub crawl_url { +#sub crawl_url($ \%) { + my ($url, $crawled) = @_; + #use Data::Dumper; print(Dumper(%$crawled)."\n"); + print "#\tGET $url\n"; + $_ = LWP::Simple::get($url) || return; + %$crawled->{$url}=1; + my @urls; + if($_) { + @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls + #@urls = /(http:\/\/[^\/'" ]*)/g; #domains + return uniq(@urls); + } +} + +#use Data::Dumper; #die(Dumper($urlqueue)); -while(my @urlqueue) { - print($_."\n"); -} die; +#while(my @urlqueue) { +# print($_."\n"); +#} die; -my $threads = 20; +my $threads = 3; for(my $i=0; $i<$threads; $i++) { my $thr = threads->create(sub { print("[*] Worker #$i running!\n"); + my $env = new BerkeleyDB::Env -Home => "/tmp/", -Flags => DB_CREATE| DB_INIT_CDB | DB_INIT_MPOOL || die "cannot open environment: $BerkeleyDB::Error\n"; + my $db = tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env || die "Cannot open DB!\n"; + while (my $url = $urlqueue->dequeue()) { - #$urlqueue->enqueue($url); - print "#$i:\tGET $url\n"; - $_ = LWP::Simple::get($url); - my @urls; - if($_) { - @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls - #@urls = /(http:\/\/[^\/'" ]*)/g; #domains - if($urlqueue->pending() < 1000) { - #print("#$i:\tENQ: @urls\n"); - $urlqueue->enqueue(uniq(@urls)); - #while(uniq(@urls)) { $urlqueue->enqueue($_); } - } + #print "#$i:\tGET $url\n"; + my @urls = crawl_url($url, \%crawled); + $db->sync(); + if($urlqueue->pending() < 1000) { + #print("#$i:\tENQ: @urls\n"); + $urlqueue->enqueue(@urls); } }