From: Harvie
Date: Thu, 5 May 2011 15:54:32 +0000 (+0200)
Subject: another crawler experiments with perl and db
X-Git-Url: https://git.harvie.cz/?a=commitdiff_plain;h=fe8cffc026ce97ad58b35b63e4ddb9536f59e3ca;p=mirrors%2FPrograms.git

another crawler experiments with perl and db
---

diff --git a/perl/crawler/crawl-b.pl b/perl/crawler/crawl-b.pl
new file mode 100755
index 0000000..486bf6d
--- /dev/null
+++ b/perl/crawler/crawl-b.pl
@@ -0,0 +1,81 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use threads;
+#use Thread::Queue;
+#use threads::shared;
+use LWP::Simple qw($ua get); $ua->timeout(3);
+use BerkeleyDB;
+
+print STDERR "This is libwww-perl-$LWP::VERSION\n";
+
+#my @queue :shared;
+#my %crawled :shared;
+# seed the queue through the same /tmp environment the workers use
+my $env = new BerkeleyDB::Env -Home => "/tmp/", -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL or die "cannot open environment: $BerkeleyDB::Error\n";
+tie my @queue, "BerkeleyDB::Recno", -Filename => 'queue.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB1!\n";
+tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB2!\n";
+
+push(@queue, 'http://root.cz/');
+push(@queue, 'http://blog.harvie.cz/');
+
+# untie before spawning threads so live DB handles are not cloned into them
+untie @queue;
+untie %crawled;
+undef $env;
+
+# remove duplicates from an array in place
+sub uniq(\@) {
+	my ($i) = @_;
+	my %h;
+	@$i = grep(!$h{$_}++, @$i);
+	return(@$i);
+}
+
+my $threads = 3;
+for(my $i=0; $i<$threads; $i++) {
+	my $thr = threads->create(sub {
+		print("[*] Worker #$i running!\n");
+
+		# each worker opens its own handles on the shared CDB environment
+		my $env = new BerkeleyDB::Env
+			-Home => "/tmp/",
+			-Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL
+			or die "cannot open environment: $BerkeleyDB::Error\n";
+
+		tie my @queue, "BerkeleyDB::Recno", -Filename => 'queue.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB1!\n";
+		tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB2!\n";
+
+		while (1) {
+			$_ = shift(@queue);
+			if(!defined($_) || length($_) < 3) { sleep(1); next; }
+
+			print "#$i:\tGET $_\n";
+			$_ = LWP::Simple::get($_);
+
+			my @urls;
+			if($_) {
+				@urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
+				#@urls = /(http:\/\/[^\/'" ]*)/g; #domains
+				if($i && @queue < 1000) { #only workers other than #0 enqueue
+					#print("#$i:\tENQ: @urls\n");
+					push(@queue, uniq(@urls));
+					#while(uniq(@urls)) { push(@queue, $_); }
+				}
+			}
+		}
+
+		print("[*] Worker #$i stopped!\n");
+	})->detach();
+}
+
+# re-tie the queue here; it was untied above, so it is a plain array by now
+$env = new BerkeleyDB::Env -Home => "/tmp/", -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL or die "cannot open environment: $BerkeleyDB::Error\n";
+tie @queue, "BerkeleyDB::Recno", -Filename => 'queue.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB1!\n";
+while(1) {
+	print '###: '.@queue."\n";
+	uniq(@queue);
+	print '###: '.@queue."\n";
+
+	sleep(5);
+}
diff --git a/perl/crawler/crawl-q.pl b/perl/crawler/crawl-q.pl
index 6b7027c..fc5b6b3 100755
--- a/perl/crawler/crawl-q.pl
+++ b/perl/crawler/crawl-q.pl
@@ -1,20 +1,17 @@
 #!/usr/bin/env perl
+package WebCrawler;
+
 use strict;
 use warnings;
 use threads;
 use Thread::Queue;
-use LWP::Simple qw($ua get);
-	$ua->timeout(3);
-
-
-package WebCrawler;
-
-my $urldb = './urls.txt';
+use BerkeleyDB;
+use LWP::Simple qw($ua get); $ua->timeout(3);
 
 print "This is libwww-perl-$LWP::VERSION\n";
+
 my $urlqueue = Thread::Queue->new();
-#open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp);
 $urlqueue->enqueue("http://root.cz/");
 $urlqueue->enqueue("http://blog.harvie.cz/");
 
@@ -25,30 +22,42 @@ sub uniq(\@) {
 	return(@$i);
 }
 
-use Data::Dumper;
+sub crawl_url {
+#sub crawl_url($ \%) {
+	my ($url, $crawled) = @_;
+	#use Data::Dumper; print(Dumper(%$crawled)."\n");
+	print "#\tGET $url\n";
+	$_ = LWP::Simple::get($url) || return;
+	$crawled->{$url} = 1;
+	my @urls;
+	if($_) {
+		@urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
+		#@urls = /(http:\/\/[^\/'" ]*)/g; #domains
+		return uniq(@urls);
+	}
+}
+
+#use Data::Dumper;
 #die(Dumper($urlqueue));
-while(my @urlqueue) {
-	print($_."\n");
-} die;
+#while(my @urlqueue) {
+#	print($_."\n");
+#} die;
 
-my $threads = 20;
+my $threads = 3;
 for(my $i=0; $i<$threads; $i++) {
 	my $thr = threads->create(sub {
 		print("[*] Worker #$i running!\n");
+		my $env = new BerkeleyDB::Env -Home => "/tmp/", -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL or die "cannot open environment: $BerkeleyDB::Error\n";
+		my $db = tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env or die "Cannot open DB!\n";
+
 		while (my $url = $urlqueue->dequeue()) {
-			#$urlqueue->enqueue($url);
-			print "#$i:\tGET $url\n";
-			$_ = LWP::Simple::get($url);
-			my @urls;
-			if($_) {
-				@urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
-				#@urls = /(http:\/\/[^\/'" ]*)/g; #domains
-				if($urlqueue->pending() < 1000) {
-					#print("#$i:\tENQ: @urls\n");
-					$urlqueue->enqueue(uniq(@urls));
-					#while(uniq(@urls)) { $urlqueue->enqueue($_); }
-				}
+			#print "#$i:\tGET $url\n";
+			my @urls = crawl_url($url, \%crawled);
+			$db->db_sync();
+			if($urlqueue->pending() < 1000) {
+				#print("#$i:\tENQ: @urls\n");
+				$urlqueue->enqueue(@urls);
 			}
 		}
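
Both scripts hinge on the same BerkeleyDB pattern: every thread opens its own handle on one shared environment created with DB_INIT_CDB (Concurrent Data Store), which provides multiple-reader/single-writer locking without explicit lock calls. Below is a minimal standalone sketch of that pattern; the /tmp home, the urls.db filename, and the example keys are assumptions carried over from the scripts above, not requirements of the API.

	#!/usr/bin/env perl
	use strict;
	use warnings;
	use threads;
	use BerkeleyDB;

	my $home = "/tmp/"; # assumed environment home, as in the scripts above

	my @workers = map {
		threads->create(sub {
			my $i = shift;

			# Each thread opens its own environment handle on the same home
			# directory; DB_INIT_CDB makes the concurrent access safe.
			my $env = BerkeleyDB::Env->new(
				-Home  => $home,
				-Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
			) or die "cannot open environment: $BerkeleyDB::Error\n";

			# Low-precedence 'or' tests the tie itself; with '||' the die
			# would bind to the flag constant and never fire.
			my $db = tie my %crawled, "BerkeleyDB::Hash",
				-Filename => "urls.db",
				-Flags    => DB_CREATE,
				-Env      => $env
				or die "cannot open urls.db: $BerkeleyDB::Error\n";

			$crawled{"http://example.com/$i"} = 1; # hypothetical key
			$db->db_sync(); # flush dirty pages to disk
			undef $db;      # drop the extra ref so untie does not warn
			untie %crawled;
		}, $_);
	} 1..3;

	$_->join() for @workers;
	print "done\n";

The two scripts differ only in where the work queue lives: crawl-b.pl keeps the queue itself in a tied BerkeleyDB::Recno array inside the same environment, while crawl-q.pl keeps the queue in an in-memory Thread::Queue and persists only the set of visited URLs; the sketch covers the part they share.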