Commit | Line | Data |
---|---|---|
fe8cffc0 H |
1 | #!/usr/bin/env perl |
2 | use strict; | |
3 | use warnings; | |
4 | use threads; | |
5 | #use Thread::Queue; | |
6 | #use threads::shared; | |
7 | use LWP::Simple qw($ua get); $ua->timeout(3); | |
8 | use BerkeleyDB; | |
9 | ||
print STDERR "This is libwww-perl-$LWP::VERSION\n";

# Earlier in-memory (threads::shared) variant, kept for reference:
#my @queue :shared;
#my %crawled :shared;

# Open (creating if necessary) the on-disk work queue and the URL hash.
# The check uses low-precedence "or" so that it applies to the result of
# tie(); with "||" it would bind to the DB_CREATE constant and a failed
# open would go unnoticed.
# NOTE(review): these two ties pass no -Env, so the relative file names are
# resolved against the current directory, while the workers open 'queue.db'
# and 'urls.db' through an environment whose -Home is /tmp/. Confirm that
# both end up on the same files (they will only if the script runs in /tmp).
tie my @queue, "BerkeleyDB::Recno",
    -Filename => 'queue.db',
    -Flags    => DB_CREATE
    or die "Cannot open DB1!\n";
tie my %crawled, "BerkeleyDB::Hash",
    -Filename => 'urls.db',
    -Flags    => DB_CREATE
    or die "Cannot open DB2!\n";

# Seed the queue with the start URLs.
push(@queue, 'http://root.cz/');
push(@queue, 'http://blog.harvie.cz/');

# Release both handles before the worker threads are started; every worker
# opens its own handles.
untie @queue;
untie %crawled;
22 | ||
23 | ||
# uniq(@array)
#
# Removes duplicate elements from an array IN PLACE, keeping the first
# occurrence of every value and preserving the original order.
#
# The (\@) prototype makes Perl pass the caller's array by reference, which
# is what allows the caller's own array to be rewritten; the sub therefore
# has to be declared before any call that relies on this.
#
# Returns the de-duplicated elements (in scalar context: their count).
sub uniq(\@) {
    my ($list_ref) = @_;

    my %seen;
    my @kept = grep { !$seen{$_}++ } @{$list_ref};

    @{$list_ref} = @kept;
    return @{$list_ref};
}
30 | ||
# Number of crawler worker threads to start.
my $threads = 3;

# Start the workers. Each one is detached and loops forever: take a URL off
# the shared on-disk queue, download it, extract links and enqueue them.
for my $i (0 .. $threads - 1) {
    threads->create(sub {
        print("[*] Worker #$i running!\n");

        # Each worker opens its own handle on the Berkeley DB environment
        # in /tmp/ (opened with DB_INIT_CDB | DB_INIT_MPOOL).
        my $env = BerkeleyDB::Env->new(
            -Home  => "/tmp/",
            -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
        ) or die "cannot open environment: $BerkeleyDB::Error\n";

        # Low-precedence "or" so the check applies to tie() itself; with
        # "||" it would bind to $env and a failed open would be ignored.
        tie my @queue, "BerkeleyDB::Recno",
            -Filename => 'queue.db',
            -Flags    => DB_CREATE,
            -Env      => $env
            or die "Cannot open DB1!\n";
        # NOTE(review): %crawled is opened but never read or written in
        # this loop, so already-visited URLs are not filtered out.
        tie my %crawled, "BerkeleyDB::Hash",
            -Filename => 'urls.db',
            -Flags    => DB_CREATE,
            -Env      => $env
            or die "Cannot open DB2!\n";

        while (1) {
            # Keep the dequeued URL in a variable: it must be both logged
            # and fetched (previously it was printed and then lost).
            my $url = shift(@queue);
            print "omg " . (defined $url ? $url : '') . "\n";

            # Empty queue (undef) or an implausibly short entry: back off.
            if (!defined $url || length($url) < 3) { sleep(1); next; }

            print "#$i:\tGET $url\n";
            my $content = LWP::Simple::get($url);

            # get() yields undef on failure; nothing to scan in that case.
            next unless $content;

            my @urls = $content =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
            #@urls = /(http:\/\/[^\/'" ]*)/g; #domains

            # Only enqueue while the queue is below 1000 entries.
            # NOTE(review): the "$i &&" guard means worker #0 never
            # enqueues anything - confirm that this is intended.
            if ($i && @queue < 1000) {
                #print("#$i:\tENQ: @urls\n");
                push(@queue, uniq(@urls));
            }
        }

        # Not reached: the loop above has no exit.
        print("[*] Worker #$i stopped!\n");
    })->detach();
}
67 | ||
# Main thread: every five seconds print the queue length, de-duplicate the
# queue in place, and print the length again. Runs until the process is
# killed, which also keeps the detached workers alive.
# NOTE(review): @queue here is the file-scope array that was untied right
# after seeding, so this presumably reports on an ordinary in-memory array
# rather than the on-disk queue the workers use - confirm.
for (;;) {
    print '###: ', scalar(@queue), "\n";
    uniq(@queue);
    print '###: ', scalar(@queue), "\n";

    sleep 5;
}