docs
[mirrors/Programs.git] / perl / crawler / crawl-q.pl
CommitLineData
21c4e167 1#!/usr/bin/env perl
fe8cffc0
H
2package WebCrawler;
3
21c4e167
H
4use strict;
5use warnings;
6use threads;
7use Thread::Queue;
fe8cffc0
H
8use BerkeleyDB;
9use LWP::Simple qw($ua get); $ua->timeout(3);
21c4e167
H
10
11print "This is libwww-perl-$LWP::VERSION\n";
12
fe8cffc0 13
21c4e167 14my $urlqueue = Thread::Queue->new();
21c4e167
H
15$urlqueue->enqueue("http://root.cz/");
16$urlqueue->enqueue("http://blog.harvie.cz/");
17
18sub uniq(\@) {
19 my ($i) = @_;
20 my %h;
21 @$i = grep(!$h{$_}++, @$i);
22 return(@$i);
23}
24
fe8cffc0
H
25sub crawl_url {
26#sub crawl_url($ \%) {
27 my ($url, $crawled) = @_;
28 #use Data::Dumper; print(Dumper(%$crawled)."\n");
29 print "#\tGET $url\n";
30 $_ = LWP::Simple::get($url) || return;
31 %$crawled->{$url}=1;
32 my @urls;
33 if($_) {
34 @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
35 #@urls = /(http:\/\/[^\/'" ]*)/g; #domains
36 return uniq(@urls);
37 }
38}
39
40#use Data::Dumper;
21c4e167 41#die(Dumper($urlqueue));
fe8cffc0
H
42#while(my @urlqueue) {
43# print($_."\n");
44#} die;
21c4e167 45
fe8cffc0 46my $threads = 3;
21c4e167
H
47for(my $i=0; $i<$threads; $i++) {
48 my $thr = threads->create(sub {
49 print("[*] Worker #$i running!\n");
50
fe8cffc0
H
51 my $env = new BerkeleyDB::Env -Home => "/tmp/", -Flags => DB_CREATE| DB_INIT_CDB | DB_INIT_MPOOL || die "cannot open environment: $BerkeleyDB::Error\n";
52 my $db = tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env || die "Cannot open DB!\n";
53
21c4e167 54 while (my $url = $urlqueue->dequeue()) {
fe8cffc0
H
55 #print "#$i:\tGET $url\n";
56 my @urls = crawl_url($url, \%crawled);
57 $db->sync();
58 if($urlqueue->pending() < 1000) {
59 #print("#$i:\tENQ: @urls\n");
60 $urlqueue->enqueue(@urls);
21c4e167
H
61 }
62 }
63
64 print("[*] Worker #$i stopped!\n");
65 }) -> detach();
66}
67
68while(1) {
69 print '###: '.$urlqueue->pending()."\n";
70 sleep(3);
71}
This page took 0.352907 seconds and 4 git commands to generate.