Commit | Line | Data |
---|---|---|
21c4e167 H |
1 | #!/usr/bin/env perl |
2 | use strict; | |
3 | use warnings; | |
4 | use LWP::Simple qw($ua get); | |
5 | $ua->timeout(3); | |
6 | ||
7 | ||
8 | package WebCrawler; | |
9 | ||
10 | my $urldb = './urls.txt'; | |
11 | ||
12 | print "This is libwww-perl-$LWP::VERSION\n"; | |
13 | ||
14 | our @urlbuffer; | |
15 | ||
16 | open(my $fp, "<$urldb"); | |
17 | @urlbuffer = <$fp>; | |
18 | close($fp); | |
19 | ||
20 | push(@urlbuffer, 'http://root.cz/'); | |
21 | #push(@urlbuffer, 'http://blog.harvie.cz/'); | |
22 | crawl(); | |
23 | ||
24 | sub uniq(\@) { | |
25 | my ($i) = @_; | |
26 | my %h; | |
27 | @$i = grep(!$h{$_}++, @$i); | |
28 | return(@$i); | |
29 | } | |
30 | ||
31 | ||
32 | sub status { | |
33 | print scalar(@urlbuffer)." buffered URLs\n"; | |
34 | uniq(@urlbuffer); | |
35 | print scalar(@urlbuffer)." buffered URLs\n"; | |
36 | ||
37 | open(my $fp, ">$urldb"); | |
38 | print $fp join("\n", @urlbuffer); | |
39 | close($fp); | |
40 | } | |
41 | ||
42 | sub crawl { | |
43 | while(my $_ = shift(@urlbuffer)) { | |
44 | status(); | |
45 | #my $_ = shift(@urlbuffer); | |
46 | print "GET $_\n"; | |
47 | $_ = LWP::Simple::get($_); | |
48 | my @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls | |
49 | #my @urls = /(http:\/\/[^\/'" ]*)/g; #domains | |
50 | push(@urlbuffer, uniq(@urls)); | |
51 | #print "ARRAY: @urlbuffer\n\n"; | |
52 | #foreach (@urls) { print "$_\n"; } | |
53 | ||
54 | } | |
55 | } | |
56 |