Commit | Line | Data |
---|---|---|
21c4e167 H |
1 | #!/usr/bin/env perl |
2 | use strict; | |
3 | use warnings; | |
4 | use threads; | |
5 | use Thread::Queue; | |
6 | use LWP::Simple qw($ua get); | |
7 | $ua->timeout(3); | |
8 | ||
9 | ||
10 | package WebCrawler; | |
11 | ||
12 | my $urldb = './urls.txt'; | |
13 | ||
14 | print "This is libwww-perl-$LWP::VERSION\n"; | |
15 | ||
16 | my $urlqueue = Thread::Queue->new(); | |
17 | #open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp); | |
18 | $urlqueue->enqueue("http://root.cz/"); | |
19 | $urlqueue->enqueue("http://blog.harvie.cz/"); | |
20 | ||
21 | sub uniq(\@) { | |
22 | my ($i) = @_; | |
23 | my %h; | |
24 | @$i = grep(!$h{$_}++, @$i); | |
25 | return(@$i); | |
26 | } | |
27 | ||
28 | use Data::Dumper; | |
29 | #die(Dumper($urlqueue)); | |
30 | while(my @urlqueue) { | |
31 | print($_."\n"); | |
32 | } die; | |
33 | ||
34 | my $threads = 20; | |
35 | for(my $i=0; $i<$threads; $i++) { | |
36 | my $thr = threads->create(sub { | |
37 | print("[*] Worker #$i running!\n"); | |
38 | ||
39 | while (my $url = $urlqueue->dequeue()) { | |
40 | #$urlqueue->enqueue($url); | |
41 | print "#$i:\tGET $url\n"; | |
42 | $_ = LWP::Simple::get($url); | |
43 | my @urls; | |
44 | if($_) { | |
45 | @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls | |
46 | #@urls = /(http:\/\/[^\/'" ]*)/g; #domains | |
47 | if($urlqueue->pending() < 1000) { | |
48 | #print("#$i:\tENQ: @urls\n"); | |
49 | $urlqueue->enqueue(uniq(@urls)); | |
50 | #while(uniq(@urls)) { $urlqueue->enqueue($_); } | |
51 | } | |
52 | } | |
53 | } | |
54 | ||
55 | print("[*] Worker #$i stopped!\n"); | |
56 | }) -> detach(); | |
57 | } | |
58 | ||
59 | while(1) { | |
60 | print '###: '.$urlqueue->pending()."\n"; | |
61 | sleep(3); | |
62 | } |