Fix filename
[mirrors/Programs.git] / perl / crawler / crawl-old.pl
CommitLineData
21c4e167
H
1#!/usr/bin/env perl
2use strict;
3use warnings;
4use LWP::Simple qw($ua get);
5 $ua->timeout(3);
6
7
8package WebCrawler;
9
10my $urldb = './urls.txt';
11
12print "This is libwww-perl-$LWP::VERSION\n";
13
14our @urlbuffer;
15
16open(my $fp, "<$urldb");
17@urlbuffer = <$fp>;
18close($fp);
19
20push(@urlbuffer, 'http://root.cz/');
21#push(@urlbuffer, 'http://blog.harvie.cz/');
22crawl();
23
24sub uniq(\@) {
25 my ($i) = @_;
26 my %h;
27 @$i = grep(!$h{$_}++, @$i);
28 return(@$i);
29}
30
31
32sub status {
33 print scalar(@urlbuffer)." buffered URLs\n";
34 uniq(@urlbuffer);
35 print scalar(@urlbuffer)." buffered URLs\n";
36
37 open(my $fp, ">$urldb");
38 print $fp join("\n", @urlbuffer);
39 close($fp);
40}
41
42sub crawl {
43 while(my $_ = shift(@urlbuffer)) {
44 status();
45 #my $_ = shift(@urlbuffer);
46 print "GET $_\n";
47 $_ = LWP::Simple::get($_);
48 my @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
49 #my @urls = /(http:\/\/[^\/'" ]*)/g; #domains
50 push(@urlbuffer, uniq(@urls));
51 #print "ARRAY: @urlbuffer\n\n";
52 #foreach (@urls) { print "$_\n"; }
53
54 }
55}
56
This page took 0.4079 seconds and 4 git commands to generate.