#!/usr/bin/env perl
#
# WebCrawler — a tiny multi-threaded web crawler. Worker threads pull
# URLs from a shared buffer, fetch each page, and push newly discovered
# http:// links back onto the buffer.
use strict;
use warnings;
use threads;
use Thread::Queue;
use threads::shared;
use LWP::Simple qw($ua get);

# Keep the user agent's timeout short so one dead host cannot stall a worker.
$ua->timeout(3);
package WebCrawler;

# File the crawl frontier is periodically checkpointed to
# (and could be restored from via the commented-out loader below).
my $urldb = './urls.txt';

print "This is libwww-perl-$LWP::VERSION\n";

# Crawl frontier, shared across all worker threads.
my @urlbuffer :shared;
#open(my $fp, "<$urldb"); @urlbuffer = <$fp>; close($fp); chop(@urlbuffer);
# Seed the frontier with two start pages.
push(@urlbuffer, 'http://root.cz/', 'http://blog.harvie.cz/');
# Remove duplicates from the caller's array, preserving first-seen order.
# The (\@) prototype makes uniq(@list) pass \@list, so the array is
# deduplicated IN PLACE; the pruned list is also returned for convenience.
sub uniq(\@) {
	my ($list) = @_;
	my %seen;
	@$list = grep { !$seen{$_}++ } @$list;
	return @$list;
}
27 | ||
28 | ||
29 | ||
30 | ||
# Spawn a pool of detached worker threads. Each worker repeatedly takes
# one URL from the shared frontier, fetches it, scrapes http:// links
# out of the page and enqueues them.
#
# Fixes over the previous revision:
#  * shift()'s result was discarded and the never-assigned $_ was tested
#    instead, so length(undef) < 3 was always true and every worker
#    slept forever without crawling anything;
#  * lock(@urlbuffer) was scoped to the whole while body, holding the
#    lock across the network fetch and serializing all workers. The lock
#    is now confined to a do-block around the shift.
my $threads = 10;
for my $i (0 .. $threads - 1) {
	threads->create(sub {
		print("[*] Worker #$i running!\n");

		while (1) {
			# Pop one URL; the do-block releases the lock immediately.
			my $url = do { lock(@urlbuffer); shift(@urlbuffer) };

			# Empty frontier (or junk entry): back off briefly and retry.
			if (!defined($url) || length($url) < 3) { sleep(1); next; }

			print "#$i:\tGET $url\n";
			my $page = LWP::Simple::get($url);

			if ($page) {
				my @urls = $page =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
				#my @urls = $page =~ /(http:\/\/[^\/'" ]*)/g; #domains

				# NOTE(review): worker #0 never enqueues ($i == 0 is
				# false) — presumably a deliberate growth throttle;
				# confirm intent before changing. 1000 caps the
				# frontier size.
				if ($i && @urlbuffer < 1000) {
					lock(@urlbuffer);
					push(@urlbuffer, uniq(@urls));
				}
			}
		}

		print("[*] Worker #$i stopped!\n");
	})->detach();
}
59 | ||
# Monitor loop (main thread): periodically deduplicate the shared
# frontier, report its size before/after, and checkpoint it to disk.
#
# Fixes over the previous revision:
#  * lock(@urlbuffer) was scoped to the whole while body, so the main
#    thread held the lock across sleep(5) and starved the workers; the
#    lock is now confined to a bare block around dedup + snapshot;
#  * the unchecked two-argument open() is now a checked 3-arg open.
while (1) {
	print '###: '.@urlbuffer."\n";        # frontier size before dedup

	{
		lock(@urlbuffer);
		uniq(@urlbuffer);                 # in-place via the \@ prototype
		print '###: '.@urlbuffer."\n";    # frontier size after dedup

		# Snapshot the frontier while it is stable under the lock.
		open(my $fp, '>', $urldb) or die "open $urldb: $!";
		print {$fp} join("\n", @urlbuffer);
		close($fp) or die "close $urldb: $!";
	}

	sleep(5);
}