| 1 | #!/usr/bin/env perl |
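# Minimal breadth-first web crawler: keeps a queue of URLs in memory,
# persists it to ./urls.txt between runs, and extracts new http:// links
# from every page it fetches.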
| 2 | use strict; |
| 3 | use warnings; |
use LWP::Simple qw($ua get);
use LWP ();        # make $LWP::VERSION available for the banner printed below
$ua->timeout(3);   # give up on any request that takes longer than 3 seconds
| 6 | |
| 7 | |
| 8 | package WebCrawler; |
| 9 | |
| 10 | my $urldb = './urls.txt'; |
| 11 | |
| 12 | print "This is libwww-perl-$LWP::VERSION\n"; |
| 13 | |
| 14 | our @urlbuffer; |
| 15 | |
# Load any URLs saved by a previous run (one per line); a missing file is fine.
if (open(my $fp, '<', $urldb)) {
	chomp(@urlbuffer = <$fp>);
	close($fp);
}
| 19 | |
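# Seed the queue with at least one starting URL, then start crawling.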
| 20 | push(@urlbuffer, 'http://root.cz/'); |
| 21 | #push(@urlbuffer, 'http://blog.harvie.cz/'); |
| 22 | crawl(); |
| 23 | |
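# uniq(@array): remove duplicate entries from the array in place and return it.
# The \@ prototype makes the caller's array arrive as a reference in $_[0].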
| 24 | sub uniq(\@) { |
| 25 | my ($i) = @_; |
| 26 | my %h; |
| 27 | @$i = grep(!$h{$_}++, @$i); |
| 28 | return(@$i); |
| 29 | } |
| 30 | |
| 31 | |
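# status(): report the queue size, deduplicate it, and persist it to $urldb.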
sub status {
	print scalar(@urlbuffer)." buffered URLs (before dedup)\n";
	uniq(@urlbuffer);
	print scalar(@urlbuffer)." buffered URLs (after dedup)\n";

	# Save the remaining queue so an interrupted crawl can be resumed.
	open(my $fp, '>', $urldb) or die "Cannot write $urldb: $!";
	print $fp map { "$_\n" } @urlbuffer;
	close($fp);
}
| 41 | |
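# crawl(): repeatedly take the next URL off the queue, fetch it, and enqueue
# every new http:// link found in the response body.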
sub crawl {
	while (defined(my $url = shift(@urlbuffer))) {
		status();
		print "GET $url\n";
		my $page = LWP::Simple::get($url);
		next unless defined $page;   # skip URLs that failed or timed out

		my @urls = $page =~ /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
		#my @urls = $page =~ /(http:\/\/[^\/'" ]*)/g; #domains
		push(@urlbuffer, uniq(@urls));
		#print "ARRAY: @urlbuffer\n\n";
		#foreach (@urls) { print "$_\n"; }
	}
}
| 56 | |