Added some small boring scripts and programs writen in few last years
[mirrors/Programs.git] / perl / crawler / crawl-old.pl
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use LWP::Simple qw($ua get);
5 $ua->timeout(3);
6
7
8 package WebCrawler;
9
10 my $urldb = './urls.txt';
11
12 print "This is libwww-perl-$LWP::VERSION\n";
13
14 our @urlbuffer;
15
16 open(my $fp, "<$urldb");
17 @urlbuffer = <$fp>;
18 close($fp);
19
20 push(@urlbuffer, 'http://root.cz/');
21 #push(@urlbuffer, 'http://blog.harvie.cz/');
22 crawl();
23
24 sub uniq(\@) {
25 my ($i) = @_;
26 my %h;
27 @$i = grep(!$h{$_}++, @$i);
28 return(@$i);
29 }
30
31
32 sub status {
33 print scalar(@urlbuffer)." buffered URLs\n";
34 uniq(@urlbuffer);
35 print scalar(@urlbuffer)." buffered URLs\n";
36
37 open(my $fp, ">$urldb");
38 print $fp join("\n", @urlbuffer);
39 close($fp);
40 }
41
42 sub crawl {
43 while(my $_ = shift(@urlbuffer)) {
44 status();
45 #my $_ = shift(@urlbuffer);
46 print "GET $_\n";
47 $_ = LWP::Simple::get($_);
48 my @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
49 #my @urls = /(http:\/\/[^\/'" ]*)/g; #domains
50 push(@urlbuffer, uniq(@urls));
51 #print "ARRAY: @urlbuffer\n\n";
52 #foreach (@urls) { print "$_\n"; }
53
54 }
55 }
56
This page took 0.289891 seconds and 4 git commands to generate.