Added OpenSSH client wrapper to support DNSSEC verifiing
[mirrors/Programs.git] / perl / crawler / crawl.pl
... / ...
CommitLineData
1#!/usr/bin/env perl
2use strict;
3use warnings;
4use threads;
5use Thread::Queue;
6use threads::shared;
7use LWP::Simple qw($ua get);
8 $ua->timeout(3);
9
10package WebCrawler;
11
12my $urldb = './urls.txt';
13
14print "This is libwww-perl-$LWP::VERSION\n";
15
16my @urlbuffer :shared;
17#open(my $fp, "<$urldb"); @urlbuffer = <$fp>; close($fp); chop(@urlbuffer);
18push(@urlbuffer, 'http://root.cz/');
19push(@urlbuffer, 'http://blog.harvie.cz/');
20
21sub uniq(\@) {
22 my ($i) = @_;
23 my %h;
24 @$i = grep(!$h{$_}++, @$i);
25 return(@$i);
26}
27
28
29
30
31my $threads = 10;
32for(my $i=0; $i<$threads; $i++) {
33 my $thr = threads->create(sub {
34 print("[*] Worker #$i running!\n");
35
36 while (1) {
37 lock(@urlbuffer); shift(@urlbuffer);
38 if(length($_) < 3) { sleep(1); next; }
39
40 print "#$i:\tGET $_\n";
41 $_ = LWP::Simple::get($_);
42
43 my @urls;
44 if($_) {
45 @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
46 #@urls = /(http:\/\/[^\/'" ]*)/g; #domains
47 if($i && @urlbuffer < 1000) {
48 #print("#$i:\tENQ: @urls\n");
49 lock(@urlbuffer);
50 push(@urlbuffer, uniq(@urls));
51 #while(uniq(@urls)) { push(@urlbuffer, $_); }
52 }
53 }
54 }
55
56 print("[*] Worker #$i stopped!\n");
57 }) -> detach();
58}
59
60while(1) {
61 print '###: '.@urlbuffer."\n";
62 lock(@urlbuffer); uniq(@urlbuffer);
63 print '###: '.@urlbuffer."\n";
64
65 open(my $fp, ">$urldb");
66 print $fp join("\n", @urlbuffer);
67 close($fp);
68
69 sleep(5);
70}
This page took 0.366738 seconds and 4 git commands to generate.