another crawler experiments with perl and db
[mirrors/Programs.git] / perl / crawler / crawl-b.pl
CommitLineData
fe8cffc0
H
1#!/usr/bin/env perl
2use strict;
3use warnings;
4use threads;
5#use Thread::Queue;
6#use threads::shared;
7use LWP::Simple qw($ua get); $ua->timeout(3);
8use BerkeleyDB;
9
# Announce which libwww-perl version is in use (diagnostic, goes to STDERR).
print STDERR "This is libwww-perl-$LWP::VERSION\n";

#my @queue :shared;
#my %crawled :shared;

# Open (creating if necessary) the on-disk work queue and the URL table so
# the queue can be seeded before any worker is started.
# The tie call is parenthesised and checked with low-precedence `or`: the
# previous `-Flags => DB_CREATE || die ...` attached the die to the flag
# constant, so a failed tie was never noticed.
# NOTE(review): these files are opened relative to the current directory,
# while the workers open 'queue.db'/'urls.db' through a BerkeleyDB::Env whose
# -Home is "/tmp/" -- confirm both really refer to the same files.
tie(my @queue, 'BerkeleyDB::Recno', -Filename => 'queue.db', -Flags => DB_CREATE)
    or die "Cannot open DB1!\n";
tie(my %crawled, 'BerkeleyDB::Hash', -Filename => 'urls.db', -Flags => DB_CREATE)
    or die "Cannot open DB2!\n";

# Seed URLs the crawl starts from.
push(@queue, 'http://root.cz/');
push(@queue, 'http://blog.harvie.cz/');

# Drop the ties again; from here on @queue and %crawled in this scope are
# ordinary, untied variables.
untie @queue;
untie %crawled;
22
23
# Remove duplicate values from an array in place, keeping the first
# occurrence of each value and preserving the original order.
#
# The (\@) prototype makes Perl pass the caller's array by reference, so a
# plain call such as `uniq(@list)` rewrites @list itself.
#
# Returns the de-duplicated elements (their count in scalar context).
sub uniq(\@) {
    my $list_ref = shift;
    my %seen;
    my @kept;
    for my $item (@$list_ref) {
        next if $seen{$item}++;    # already emitted once -> skip
        push @kept, $item;
    }
    @$list_ref = @kept;
    return @$list_ref;
}
30
# Number of crawler worker threads to start.
my $threads = 3;

# Start the workers. Each one opens its own BerkeleyDB environment and ties
# its own @queue / %crawled to the database files, then loops forever:
# take a URL off the queue, download it, extract http:// links and (subject
# to the limits below) append them to the queue. Threads are detached, so
# nothing ever joins them.
for my $i (0 .. $threads - 1) {
    threads->create(sub {
        print("[*] Worker #$i running!\n");

        # Concurrent Data Store environment rooted at /tmp/.
        my $env = BerkeleyDB::Env->new(
            -Home  => "/tmp/",
            -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
        ) or die "cannot open environment: $BerkeleyDB::Error\n";

        # Check the result of tie itself: the previous `-Env => $env || die`
        # attached the die to $env and could never fire on a failed open.
        tie(my @queue, 'BerkeleyDB::Recno',
            -Filename => 'queue.db', -Flags => DB_CREATE, -Env => $env)
            or die "Cannot open DB1!\n";
        # NOTE(review): %crawled is opened but never read or written below,
        # so already-visited URLs can be queued again.
        tie(my %crawled, 'BerkeleyDB::Hash',
            -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env)
            or die "Cannot open DB2!\n";

        while (1) {
            # Keep the dequeued URL: it used to be consumed by the debug
            # print only, leaving nothing to fetch.
            my $url = shift(@queue);
            print "omg " . (defined $url ? $url : '') . "\n";

            # Empty queue (or an implausibly short entry): back off briefly.
            if (!defined $url || length($url) < 3) { sleep(1); next; }

            print "#$i:\tGET $url\n";
            my $page = LWP::Simple::get($url);
            next unless $page;    # download failed or body was empty

            my @urls = $page =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
            #@urls = /(http:\/\/[^\/'" ]*)/g; #domains

            # Only enqueue while the queue holds fewer than 1000 entries.
            # NOTE(review): the `$i &&` test means worker #0 never enqueues
            # anything -- presumably deliberate, confirm.
            if ($i && @queue < 1000) {
                #print("#$i:\tENQ: @urls\n");
                push(@queue, uniq(@urls));
                #while(uniq(@urls)) { push(@queue, $_); }
            }
        }

        # Not reached: the loop above has no exit.
        print("[*] Worker #$i stopped!\n");
    })->detach();
}
67
# Monitor loop: every five seconds print the number of entries in @queue,
# de-duplicate @queue in place with uniq(), and print the count again.
# Runs until the process is killed.
# NOTE(review): @queue here is the file-level array that was untied after
# seeding, so it is no longer connected to queue.db -- confirm that the
# counts shown are the ones intended.
for (;;) {
    my $size_before = scalar(@queue);
    print '###: ' . $size_before . "\n";

    uniq(@queue);

    my $size_after = scalar(@queue);
    print '###: ' . $size_after . "\n";

    sleep(5);
}
This page took 0.124982 seconds and 4 git commands to generate.