Bomba keypad
[mirrors/Programs.git] / perl / crawler / crawl.pl
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use threads;
5 use Thread::Queue;
6 use threads::shared;
7 use LWP::Simple qw($ua get);
8 $ua->timeout(3);
9
10 package WebCrawler;
11
12 my $urldb = './urls.txt';
13
14 print "This is libwww-perl-$LWP::VERSION\n";
15
16 my @urlbuffer :shared;
17 #open(my $fp, "<$urldb"); @urlbuffer = <$fp>; close($fp); chop(@urlbuffer);
18 push(@urlbuffer, 'http://root.cz/');
19 push(@urlbuffer, 'http://blog.harvie.cz/');
20
21 sub uniq(\@) {
22 my ($i) = @_;
23 my %h;
24 @$i = grep(!$h{$_}++, @$i);
25 return(@$i);
26 }
27
28
29
30
31 my $threads = 10;
32 for(my $i=0; $i<$threads; $i++) {
33 my $thr = threads->create(sub {
34 print("[*] Worker #$i running!\n");
35
36 while (1) {
37 lock(@urlbuffer); shift(@urlbuffer);
38 if(length($_) < 3) { sleep(1); next; }
39
40 print "#$i:\tGET $_\n";
41 $_ = LWP::Simple::get($_);
42
43 my @urls;
44 if($_) {
45 @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
46 #@urls = /(http:\/\/[^\/'" ]*)/g; #domains
47 if($i && @urlbuffer < 1000) {
48 #print("#$i:\tENQ: @urls\n");
49 lock(@urlbuffer);
50 push(@urlbuffer, uniq(@urls));
51 #while(uniq(@urls)) { push(@urlbuffer, $_); }
52 }
53 }
54 }
55
56 print("[*] Worker #$i stopped!\n");
57 }) -> detach();
58 }
59
60 while(1) {
61 print '###: '.@urlbuffer."\n";
62 lock(@urlbuffer); uniq(@urlbuffer);
63 print '###: '.@urlbuffer."\n";
64
65 open(my $fp, ">$urldb");
66 print $fp join("\n", @urlbuffer);
67 close($fp);
68
69 sleep(5);
70 }
This page took 0.307212 seconds and 4 git commands to generate.