# Import get() and the $ua user-agent object that LWP::Simple uses for its
# requests, then cap every request at 3 seconds so one slow host cannot
# stall a fetch indefinitely.
use LWP::Simple qw($ua get);
$ua->timeout(3);
print "This is libwww-perl-$LWP::VERSION\n";

# Queue of URLs waiting to be fetched, seeded with the start pages.
my $urlqueue = Thread::Queue->new();
$urlqueue->enqueue("http://root.cz/");
$urlqueue->enqueue("http://blog.harvie.cz/");
21 @$i = grep(!$h{$_}++, @$i);
26 #sub crawl_url($ \%) {
27 my ($url, $crawled) = @_;
28 #use Data::Dumper; print(Dumper(%$crawled)."\n");
29 print "#\tGET $url\n";
30 $_ = LWP::Simple::get($url) || return;
34 @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
35 #@urls = /(http:\/\/[^\/'" ]*)/g; #domains
41 #die(Dumper($urlqueue));
42 #while(my @urlqueue) {
47 for(my $i=0; $i<$threads; $i++) {
48 my $thr = threads->create(sub {
49 print("[*] Worker #$i running!\n");
# Open (creating if needed) the Berkeley DB environment under /tmp/.
# The failure check uses low-precedence `or` after the closing parenthesis:
# a `|| die` placed after the last argument binds to that argument only
# (the always-true flag mask), so a failed open would go unnoticed.
my $env = BerkeleyDB::Env->new(
    -Home  => "/tmp/",
    -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
) or die "cannot open environment: $BerkeleyDB::Error\n";

# Tie %crawled to the on-disk hash 'urls.db' inside that environment.
# tie() returns false on failure, so check its result rather than $env.
my $db = tie(my %crawled, "BerkeleyDB::Hash",
    -Filename => 'urls.db',
    -Flags    => DB_CREATE,
    -Env      => $env,
) or die "Cannot open DB!\n";
54 while (my $url = $urlqueue->dequeue()) {
55 #print "#$i:\tGET $url\n";
56 my @urls = crawl_url($url, \%crawled);
58 if($urlqueue->pending() < 1000) {
59 #print("#$i:\tENQ: @urls\n");
60 $urlqueue->enqueue(@urls);
64 print("[*] Worker #$i stopped!\n");
69 print '###: '.$urlqueue->pending()."\n";
This page took 0.289969 seconds and 4 git commands to generate.