Added some small boring scripts and programs written in the last few years
[mirrors/Programs.git] / perl / crawler / crawl-q.pl
CommitLineData
21c4e167
H
#!/usr/bin/env perl
#
# Simple multi-threaded web crawler: worker threads pull URLs off a
# shared queue, fetch each page, scrape it for more URLs, and push
# those back onto the queue.
use strict;
use warnings;
use threads;
use Thread::Queue;
use LWP::Simple qw($ua get);
$ua->timeout(3);    # give up quickly on slow hosts


package WebCrawler;

# Optional on-disk seed list (loading is currently disabled below).
my $urldb = './urls.txt';

print "This is libwww-perl-$LWP::VERSION\n";

# Shared work queue, seeded with a couple of starting points.
my $urlqueue = Thread::Queue->new();
#open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp);
$urlqueue->enqueue("http://root.cz/");
$urlqueue->enqueue("http://blog.harvie.cz/");
20
# De-duplicate the passed array in place (keeping first occurrences,
# preserving order) and return the resulting list.  The (\@) prototype
# makes uniq(@foo) pass a reference implicitly.
sub uniq(\@) {
    my ($list_ref) = @_;
    my %seen;
    @$list_ref = grep { !$seen{$_}++ } @$list_ref;
    return @$list_ref;
}
27
use Data::Dumper;
# Debug helper (disabled): dump the queue object and abort.
#die(Dumper($urlqueue));
# NOTE(review): removed a leftover debug loop that read
# `while(my @urlqueue) { ... } die;` — `my @urlqueue` declares a new
# empty lexical array, so the condition was always false and the loop
# body unreachable, and the unconditional die() then aborted the
# script before any worker thread was ever started.
33
my $threads = 20;

# Spawn the worker pool.  A foreach-style loop is used deliberately:
# it creates a fresh lexical $i per iteration, so each closure captures
# its own worker id.  The original C-style for(my $i=0; ...) shared a
# single $i among all 20 closures, so every worker reported a stale or
# identical id.
for my $i (0 .. $threads - 1) {
    threads->create(sub {
        print("[*] Worker #$i running!\n");

        # dequeue() blocks until a URL is available.
        while (my $url = $urlqueue->dequeue()) {
            print "#$i:\tGET $url\n";
            # Fetch the page; get() returns undef on any failure.
            my $body = LWP::Simple::get($url);
            if ($body) {
                # Scrape absolute http:// URLs out of the page body.
                my @urls = $body =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g;
                #@urls = $body =~ /(http:\/\/[^\/'" ]*)/g; #domains
                # Cap the queue so it does not grow without bound.
                if ($urlqueue->pending() < 1000) {
                    $urlqueue->enqueue(uniq(@urls));
                }
            }
        }

        print("[*] Worker #$i stopped!\n");
    })->detach();
}
58
# Main thread: periodically report how many URLs are still waiting.
# The workers are detached, so this loop also keeps the process alive.
while (1) {
    my $pending = $urlqueue->pending();
    print '###: ' . $pending . "\n";
    sleep(3);
}
This page took 0.164586 seconds and 4 git commands to generate.