--- /dev/null
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use threads;
+#use Thread::Queue;
+#use threads::shared;
+use LWP::Simple qw($ua get); $ua->timeout(3);
+use BerkeleyDB;
+
+print STDERR "This is libwww-perl-$LWP::VERSION\n";
+
+# Seed the on-disk crawl queue with the start URLs.
+# FIX: the original wrote "-Flags => DB_CREATE || die ...": || binds tighter
+# than the list comma, so it evaluated "DB_CREATE || die" (always DB_CREATE)
+# and a failed tie() was silently ignored.  Low-precedence "or" after the
+# whole tie() call checks the tie's own return value.
+# NOTE(review): the workers below reopen these DBs under -Home => "/tmp/",
+# so unless the script is started from /tmp these seeding ties hit a
+# *different* queue.db/urls.db in the cwd -- TODO confirm intended paths.
+#my @queue :shared;
+#my %crawled :shared;
+tie my @queue, "BerkeleyDB::Recno", -Filename => 'queue.db', -Flags => DB_CREATE
+    or die "Cannot open DB1!\n";
+tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE
+    or die "Cannot open DB2!\n";
+
+push(@queue, 'http://root.cz/');
+push(@queue, 'http://blog.harvie.cz/');
+
+# Release the plain (non-env) handles before the workers reopen the DBs
+# through a shared BerkeleyDB environment.
+untie @queue;
+untie %crawled;
+
+
+# In-place deduplication.  The \@ prototype makes a call like uniq(@list)
+# pass a reference automatically, so the caller's array is rewritten to keep
+# only the first occurrence of each element; the surviving list is returned.
+sub uniq(\@) {
+    my ($list_ref) = @_;
+    my %seen;
+    @$list_ref = grep { !$seen{$_}++ } @$list_ref;
+    return @$list_ref;
+}
+
+my $threads = 3;
+for(my $i=0; $i<$threads; $i++) {
+	my $thr = threads->create(sub {
+		print("[*] Worker #$i running!\n");
+
+		# Each worker opens its own handle to the shared environment; CDB
+		# (DB_INIT_CDB) provides multi-reader/single-writer locking across
+		# the tied databases.  ->new() replaces the indirect-object
+		# "new BerkeleyDB::Env" syntax.
+		my $env = BerkeleyDB::Env->new(
+			-Home  => "/tmp/",
+			-Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
+		) or die "cannot open environment: $BerkeleyDB::Error\n";
+
+		# FIX: "-Env => $env || die ..." bound || to $env (truthy), so a
+		# failed tie() was never caught; low-precedence "or" checks the
+		# tie() result itself.
+		tie my @queue, "BerkeleyDB::Recno",
+			-Filename => 'queue.db', -Flags => DB_CREATE, -Env => $env
+			or die "Cannot open DB1!\n";
+		tie my %crawled, "BerkeleyDB::Hash",
+			-Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env
+			or die "Cannot open DB2!\n";
+
+		while (1) {
+			# FIX: the original printed shift(@queue) but never captured it,
+			# so the length test and GET below ran on an *uninitialized* $_.
+			$_ = shift(@queue);
+			# Idle briefly on an empty queue or junk entry.
+			if(!defined($_) || length($_) < 3) { sleep(1); next; }
+
+			print "#$i:\tGET $_\n";
+			$_ = LWP::Simple::get($_);
+
+			my @urls;
+			if($_) {
+				@urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
+				#@urls = /(http:\/\/[^\/'" ]*)/g; #domains
+				# "$i &&" makes worker #0 consume-only (never enqueues) --
+				# presumably intentional drain logic, TODO confirm.  The
+				# queue is capped at 1000 pending entries.
+				if($i && @queue < 1000) {
+					#print("#$i:\tENQ: @urls\n");
+					push(@queue, uniq(@urls));
+				}
+			}
+		}
+
+		print("[*] Worker #$i stopped!\n");
+	}) -> detach();
+}
+
+# FIX: @queue was untie'd right after seeding, so this loop was reporting
+# the size of an empty in-memory array and dedup'ing nothing.  Re-attach the
+# lexical to the on-disk queue through the same CDB environment the workers
+# use, so the monitor sees (and prunes) the real shared queue.
+my $mon_env = BerkeleyDB::Env->new(
+	-Home  => "/tmp/",
+	-Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL,
+) or die "cannot open environment: $BerkeleyDB::Error\n";
+tie @queue, "BerkeleyDB::Recno",
+	-Filename => 'queue.db', -Flags => DB_CREATE, -Env => $mon_env
+	or die "Cannot open DB1!\n";
+
+while(1) {
+	print '###: '.scalar(@queue)."\n";	# queue depth before dedup
+	uniq(@queue);				# drop duplicate URLs in place
+	print '###: '.scalar(@queue)."\n";	# queue depth after dedup
+
+	sleep(5);
+}
#!/usr/bin/env perl
+package WebCrawler;
+
use strict;
use warnings;
use threads;
use Thread::Queue;
-use LWP::Simple qw($ua get);
- $ua->timeout(3);
-
-
-package WebCrawler;
-
-my $urldb = './urls.txt';
+use BerkeleyDB;
+use LWP::Simple qw($ua get); $ua->timeout(3);
print "This is libwww-perl-$LWP::VERSION\n";
+
my $urlqueue = Thread::Queue->new();
-#open(my $fp, "<$urldb"); while(<$fp>) { $urlqueue->enqueue($_); } close($fp);
$urlqueue->enqueue("http://root.cz/");
$urlqueue->enqueue("http://blog.harvie.cz/");
return(@$i);
}
-use Data::Dumper;
+# Fetch $url, mark it as visited in the hash referenced by $crawled, and
+# return the deduplicated list of http:// URLs found in the response body.
+# Returns an empty list when the GET fails.
+sub crawl_url {
+#sub crawl_url($ \%) {
+	my ($url, $crawled) = @_;
+	#use Data::Dumper; print(Dumper(%$crawled)."\n");
+	print "#\tGET $url\n";
+	$_ = LWP::Simple::get($url) || return;
+	# FIX: "%$crawled->{$url}=1" dereferenced the hashref with the wrong
+	# sigil (the deprecated "hash as reference" form); plain arrow access
+	# on the reference is what was intended.
+	$crawled->{$url} = 1;
+	my @urls;
+	if($_) {
+		@urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
+		#@urls = /(http:\/\/[^\/'" ]*)/g; #domains
+		return uniq(@urls);
+	}
+}
+
+#use Data::Dumper;
#die(Dumper($urlqueue));
-while(my @urlqueue) {
- print($_."\n");
-} die;
+#while(my @urlqueue) {
+# print($_."\n");
+#} die;
-my $threads = 20;
+my $threads = 3;
for(my $i=0; $i<$threads; $i++) {
my $thr = threads->create(sub {
print("[*] Worker #$i running!\n");
+	# FIX: in "... DB_INIT_MPOOL || die" and "$env || die" the || bound to
+	# the last (truthy) operand, so neither failure was ever caught;
+	# low-precedence "or" checks the whole call's result.  ->new() replaces
+	# the indirect-object "new BerkeleyDB::Env" syntax.
+	my $env = BerkeleyDB::Env->new(-Home => "/tmp/", -Flags => DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL)
+		or die "cannot open environment: $BerkeleyDB::Error\n";
+	my $db = tie my %crawled, "BerkeleyDB::Hash", -Filename => 'urls.db', -Flags => DB_CREATE, -Env => $env
+		or die "Cannot open DB!\n";
+
while (my $url = $urlqueue->dequeue()) {
- #$urlqueue->enqueue($url);
- print "#$i:\tGET $url\n";
- $_ = LWP::Simple::get($url);
- my @urls;
- if($_) {
- @urls = /(http:\/\/[_a-zA-Z0-9\.\-]+\.[a-zA-Z]{2,4}\/{1}[-_~&=\ ?\.a-z0-9\/]*)/g; #urls
- #@urls = /(http:\/\/[^\/'" ]*)/g; #domains
- if($urlqueue->pending() < 1000) {
- #print("#$i:\tENQ: @urls\n");
- $urlqueue->enqueue(uniq(@urls));
- #while(uniq(@urls)) { $urlqueue->enqueue($_); }
- }
+ #print "#$i:\tGET $url\n";
+ my @urls = crawl_url($url, \%crawled);
+ $db->sync();
+ if($urlqueue->pending() < 1000) {
+ #print("#$i:\tENQ: @urls\n");
+ $urlqueue->enqueue(@urls);
}
}