6b7027c092dff12849fae02af9fe5757addef25d
[mirrors/Programs.git] / perl / crawler / crawl-q.pl
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use threads;
5 use Thread::Queue;
6 use LWP::Simple qw($ua get);
7 $ua->timeout(3);
8
9
package WebCrawler;

# Path of a persisted URL list (seeding from disk is currently disabled,
# see the commented-out open below).
my $urldb = './urls.txt';

print "This is libwww-perl-$LWP::VERSION\n";

# Shared, thread-safe work queue; worker threads both consume URLs from it
# and feed freshly scraped URLs back into it.
my $urlqueue = Thread::Queue->new();
#open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp);

# Seed the crawl with a couple of starting pages.
$urlqueue->enqueue($_) for 'http://root.cz/', 'http://blog.harvie.cz/';
20
# Remove duplicate elements from the caller's array *in place* and return
# the deduplicated list (or its length in scalar context).  The (\@)
# prototype makes `uniq(@foo)` pass @foo by reference, so the caller's
# array is mutated directly -- the enqueue sites rely on this shape.
sub uniq(\@) {
    my ($aref) = @_;
    my %seen;
    my @unique = grep { !$seen{$_}++ } @$aref;
    @$aref = @unique;
    return @unique;
}
27
use Data::Dumper;    # kept for ad-hoc inspection of the queue while debugging
#die(Dumper($urlqueue));
# NOTE(review): removed a leftover debug loop that read
#   while(my @urlqueue) { print($_."\n"); } die;
# `my @urlqueue` declares a brand-new empty array (always false in boolean
# context), so the loop body never ran, and the bare `die` then killed the
# script unconditionally -- no worker thread below was ever started.
33
my $threads = 20;

# Spawn a fixed pool of detached worker threads.  Each worker blocks on the
# shared queue, fetches the next URL, scrapes absolute http:// links out of
# the response body and feeds them back into the queue.  Refilling stops
# while the backlog is large, so the queue cannot grow without bound.
for my $i (0 .. $threads - 1) {
    threads->create(sub {
        print("[*] Worker #$i running!\n");

        # dequeue() blocks when the queue is empty, so idle workers simply
        # wait here until more URLs arrive.
        while (my $url = $urlqueue->dequeue()) {
            print "#$i:\tGET $url\n";
            # Named variable instead of global $_ -- $_ is easy to clobber
            # across calls.  get() returns undef on any fetch failure.
            my $body = LWP::Simple::get($url);
            if ($body) {
                # Extract absolute http:// URLs; the host/path character
                # classes are deliberately conservative.
                my @urls = $body =~ m{(http://[_a-zA-Z0-9.\-]+\.[a-zA-Z]{2,4}/[-_~&=\ ?.a-z0-9/]*)}g;
                # Back-pressure: only refill while fewer than 1000 URLs wait.
                if ($urlqueue->pending() < 1000) {
                    $urlqueue->enqueue(uniq(@urls));
                }
            }
        }

        print("[*] Worker #$i stopped!\n");
    })->detach();
}
58
# Main thread: periodically report how many URLs are waiting in the queue.
# Runs forever -- the workers are detached, so this keeps the process alive.
while (1) {
    print '###: ' . $urlqueue->pending() . "\n";
    sleep 3;
}
This page took 0.287365 seconds and 3 git commands to generate.