Commit | Line | Data |
---|---|---|
21c4e167 | 1 | #!/usr/bin/env perl |
fe8cffc0 H |
2 | package WebCrawler; |
3 | ||
21c4e167 H |
4 | use strict; |
5 | use warnings; | |
6 | use threads; | |
7 | use Thread::Queue; | |
fe8cffc0 H |
8 | use BerkeleyDB; |
9 | use LWP::Simple qw($ua get); $ua->timeout(3); | |
21c4e167 H |
10 | |
11 | print "This is libwww-perl-$LWP::VERSION\n"; | |
12 | ||
fe8cffc0 | 13 | |
# Shared work queue of URLs to crawl; seeded with two starting pages.
my $urlqueue = Thread::Queue->new();
$urlqueue->enqueue("http://root.cz/", "http://blog.harvie.cz/");
# Deduplicate an array in place, keeping first occurrences in order.
# The \@ prototype makes `uniq(@arr)` pass a reference automatically,
# so the caller's array is modified; the deduped list is also returned.
sub uniq(\@) {
    my ($list_ref) = @_;
    my %seen;
    @$list_ref = grep { !$seen{$_}++ } @$list_ref;
    return @$list_ref;
}
24 | ||
fe8cffc0 H |
# Fetch $url, mark it in the shared %$crawled hash, and return the
# deduplicated list of http:// URLs found in the page body.
# Returns an empty list (bare return) when the fetch fails or the
# body is falsy — matching the original `get(...) || return` flow.
sub crawl_url {
    my ($url, $crawled) = @_;
    print "#\tGET $url\n";
    # Named lexical instead of global $_ (which sub calls could clobber).
    my $body = LWP::Simple::get($url) or return;
    # FIX: original wrote `%$crawled->{$url}=1;` — "using a hash as a
    # reference" is deprecated and a fatal error on Perl >= 5.22.
    $crawled->{$url} = 1;
    # Pattern kept byte-identical to the original URL matcher.
    my @urls = $body =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g;
    return uniq(@urls);
}
39 | ||
40 | #use Data::Dumper; | |
21c4e167 | 41 | #die(Dumper($urlqueue)); |
fe8cffc0 H |
42 | #while(my @urlqueue) { |
43 | # print($_."\n"); | |
44 | #} die; | |
21c4e167 | 45 | |
# Spawn $threads detached workers. Each worker opens the shared
# BerkeleyDB environment/hash (CDB locking coordinates writers) and
# loops forever: dequeue a URL, crawl it, enqueue discovered links
# while the queue stays under 1000 pending entries.
my $threads = 3;
for my $i (0 .. $threads - 1) {
    threads->create(sub {
        print("[*] Worker #$i running!\n");

        # FIX: the original used `... DB_INIT_MPOOL || die` and
        # `-Env => $env || die` — high-precedence `||` bound to the
        # preceding operand, so neither die could ever fire on failure.
        # Low-precedence `or` checks the whole constructor/tie result.
        # Also replaced indirect object syntax (`new BerkeleyDB::Env`).
        my $env = BerkeleyDB::Env->new(
            -Home  => "/tmp/",
            -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
        ) or die "cannot open environment: $BerkeleyDB::Error\n";
        my $db = tie my %crawled, "BerkeleyDB::Hash",
            -Filename => 'urls.db',
            -Flags    => DB_CREATE,
            -Env      => $env
            or die "Cannot open DB!\n";

        while (my $url = $urlqueue->dequeue()) {
            my @urls = crawl_url($url, \%crawled);
            $db->sync();    # flush after every page so other workers see it
            if ($urlqueue->pending() < 1000) {
                $urlqueue->enqueue(@urls);
            }
        }

        print("[*] Worker #$i stopped!\n");
    })->detach();
}
67 | ||
# Main thread: workers are detached, so just report queue depth forever.
while (1) {
    printf "###: %s\n", $urlqueue->pending();
    sleep 3;
}