[mirrors/Programs.git] / perl / crawler / crawl-q.pl
#!/usr/bin/env perl
package WebCrawler;

use strict;
use warnings;
use threads;
use Thread::Queue;
use BerkeleyDB;
use LWP ();                      # ensures $LWP::VERSION is defined
use LWP::Simple qw($ua get);
$ua->timeout(3);                 # give up on unresponsive servers after 3s

print "This is libwww-perl-$LWP::VERSION\n";
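# Note: LWP::Simple exports its internal LWP::UserAgent instance as $ua on
# request, which is what lets the timeout above be tuned without building
# a user agent by hand.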
# Seed the work queue with a couple of starting URLs
my $urlqueue = Thread::Queue->new();
$urlqueue->enqueue("http://root.cz/");
$urlqueue->enqueue("http://blog.harvie.cz/");
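# Thread::Queue is thread-safe, so the detached workers below can dequeue
# and enqueue concurrently without any extra locking.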
# Remove duplicates from an array in place; the \@ prototype makes
# uniq(@list) receive a reference to @list rather than a flattened copy.
sub uniq(\@) {
	my ($i) = @_;
	my %h;
	@$i = grep(!$h{$_}++, @$i);
	return(@$i);
}
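# Illustration only (made-up values, not part of the crawler):
#
#   my @seen = qw(a b a c b);
#   uniq(@seen);           # @seen is now qw(a b c)
#   my @u = uniq(@seen);   # the deduplicated list is also returned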
# Fetch one URL, record it in %$crawled and return the unique links found.
sub crawl_url {
	my ($url, $crawled) = @_;
	return if $crawled->{$url};  # skip URLs we have already fetched
	print "#\tGET $url\n";
	my $content = get($url) or return;
	$crawled->{$url} = 1;
	# Crude link extraction by regex; no real HTML parsing
	my @urls = $content =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g;
	#my @urls = $content =~ /(http:\/\/[^\/'" ]*)/g; # domains only
	return uniq(@urls);
}
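# Minimal single-threaded sketch of how crawl_url is used (a plain hash
# stands in for the tied BerkeleyDB hash the workers pass in):
#
#   my %seen;
#   my @found = crawl_url("http://root.cz/", \%seen);
#   print "$_\n" for @found;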
my $threads = 3;
for my $i (0 .. $threads - 1) {
	threads->create(sub {
		print("[*] Worker #$i running!\n");

		# Each worker opens the shared Berkeley DB environment; CDB
		# locking lets the threads share one database file safely.
		my $env = BerkeleyDB::Env->new(
			-Home  => "/tmp/",
			-Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
		) or die "cannot open environment: $BerkeleyDB::Error\n";
		my $db = tie my %crawled, "BerkeleyDB::Hash",
			-Filename => 'urls.db',
			-Flags    => DB_CREATE,
			-Env      => $env
			or die "cannot open DB: $BerkeleyDB::Error\n";

		while (my $url = $urlqueue->dequeue()) {
			my @urls = crawl_url($url, \%crawled);
			$db->sync();
			# Cap the queue so memory use stays bounded
			if ($urlqueue->pending() < 1000) {
				$urlqueue->enqueue(@urls);
			}
		}

		print("[*] Worker #$i stopped!\n");
	})->detach();
}
# The main thread just reports the queue size; keeping it alive is what
# keeps the detached workers running.
while (1) {
	print '###: ' . $urlqueue->pending() . "\n";
	sleep(3);
}
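# Run with:  perl crawl-q.pl
# The crawled-URL database should end up as /tmp/urls.db, since a relative
# -Filename is resolved against the environment's -Home.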