//HSp33der Web Crawler 5.1
//This thing crawls the web and prints found URLs to STDOUT.
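//Usage sketch (not part of the original script; filename is hypothetical): the crawler
//prints each found URL to STDOUT, so a typical CLI run might redirect that into a file:
//  php crawler.php > found-urls.txt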
//Settings
$crawl_file = 'crawl.txt'; //Every URL ever saved (used for duplicate checks)
$lastc_file = 'last.txt';  //URLs to crawl in the current pass
$nextc_file = 'next.txt';  //URLs discovered in this pass, crawled next time
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem';
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//Case-insensitive regex patterns; URLs matching any of them are never crawled
$eregi_url_blacklist = array(
'(W3\.org|W3C)', //Hell knows...
'(shop|xxx|porn|lesbian|hot)', //Commercial sites
'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
'.\.(pdf|swf|flv|cfm)', //Other text files
'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
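//Example (illustration only, URL is hypothetical): a link such as
//http://www.example.com/logo.png is rejected by the static-content pattern when
//check_crawl_url() below matches it case-insensitively; prints int(1):
//  var_dump(preg_match('/.\.(css|ico|gif|jpg|png|bmp|cgi)/i', 'http://www.example.com/logo.png'));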
function in_file($string, $file) { //Like in_array(), but with a file
if($GLOBALS['debug']) echo("in_file\n");
$in = fopen($file, 'r'); //Was hard-coded to $GLOBALS['crawl_file']; use the $file argument
$string = trim($string);
$len = strlen($string);
while(!feof($in)) {
$line = trim(fgets($in));
//if( ($len == strlen($line)) && (strncasecmp($string, $line, $len) == 0) ) return(1);
if( ($len == strlen($line)) && (strcasecmp($string, $line) == 0) ) return(1); //Literal, case-insensitive match (eregi() is deprecated)
}
fclose($in);
return(0);
}
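//Usage sketch (illustration only): returns 1 when the exact URL already appears
//as a whole line of crawl.txt, 0 otherwise:
//  if(in_file('http://example.com/', $GLOBALS['crawl_file'])) echo("already known\n");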
function file_size_check($file, $max, $block = 250) { //If $file is bigger than $max, returns 0, else 1
if($GLOBALS['debug']) echo("fscheck\n");
if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
$fp = @fopen($file, 'r');
if(!$fp) { if($GLOBALS['debug']) echo("**Can't open!!!: $file\n"); return(0); }
$dl = 0;
while(!feof($fp)) {
$dl = $dl + strlen(fgets($fp, $block)); //echo("$dl\n"); //Debug
if($dl > $max) { if($GLOBALS['debug']) echo("**File too big!!!: $file\n"); fclose($fp); return(0); }
}
fclose($fp);
return(1);
}
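//Note (observation, not in the original): because the check reads the URL in $block-byte
//chunks until $max is exceeded, it can download up to $maximum_file_size bytes per URL,
//and the page is then fetched again by crawl_url_once().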
function check_crawl_url($url) { //URL Filter for crawl_url function
if($GLOBALS['debug']) echo("check_crawl_url\n");
foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
if(preg_match('/'.$black_url.'/i', $url)) return(0); //Case-insensitive match (eregi() is deprecated)
}
@fclose($GLOBALS['total']); //Flush crawl.txt so in_file() sees URLs saved earlier in this run
if(in_file($url, $GLOBALS['crawl_file'])) return(0); //Skip URLs we already know
$GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a'); //Reopen for appending
if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
return(1); //1 == disable whitelisting (accept anything not blacklisted), 0 == enable whitelisting
}
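//Usage sketch (illustration only, URL is hypothetical):
//  if(check_crawl_url('http://example.com/page.html')) echo("would crawl it\n");
//Returns 1 for a new, non-blacklisted page under the size limit, 0 otherwise.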
function save_crawled_url($url) { //Saves URL (prints it and appends it to the crawl/next files)
if($GLOBALS['debug']) echo("save_crawled_url\n");
$GLOBALS['total_urls']++;
echo($GLOBALS['total_urls'].':');
echo($url."\n");
@fwrite($GLOBALS['total'], $url."\n"); //Append to crawl.txt (all URLs ever seen)
@fwrite($GLOBALS['next'], $url."\n"); //Append to next.txt (queue for the next pass)
}
function crawl_url_once($url) { //Main crawling function
if($GLOBALS['debug']) echo("crawl_url_once\n");
save_crawled_url($url);
$in = @file($url); if(!$in || !is_array($in)) return(1); //Download the page as an array of lines
foreach($in as $line) {
$line = preg_split('/href="http:\/\//i', $line); //Split on absolute links (spliti() is deprecated)
if(sizeof($line) > 1) {
array_shift($line); //print_r($line); //Debug
foreach($line as $nurl) {
$nurl = preg_split('/(\?|#|\*|")/', $nurl); //Cut the URL at query string, fragment or closing quote
$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
if(check_crawl_url($nurl)) save_crawled_url($nurl);
}
}
}
}
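//Note (observation, not in the original): only absolute links written as href="http://..."
//are extracted; relative links, https:// links and other schemes are ignored.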
//Seed next.txt with the seed URL if it is (nearly) empty
if(@filesize($nextc_file) < 8) {
$next = fopen($nextc_file, 'a');
fwrite($next, $seed_url."\n");
fclose($next);
}
//Rotate the queue: URLs found last time become the URLs to crawl now
if(is_file($nextc_file)) {
@unlink($lastc_file);
copy($nextc_file, $lastc_file);
}
if($debug) echo("\n-LOOP\n");
$last = fopen($lastc_file, 'r');
$next = fopen($nextc_file, 'a');
$total = fopen($crawl_file, 'a');
while(!feof($last)) {
$url = trim(fgets($last));
crawl_url_once($url);
}
@fclose($total); //Close the handles (the original passed the filenames here, which does nothing)
@fclose($last);
@fclose($next);
copy($nextc_file, $lastc_file); //URLs found in this pass become the queue for the next run
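//Note (sketch of intended operation, inferred from the queue rotation above): the script
//is meant to be run repeatedly; each run crawls the URLs discovered during the previous
//run and appends everything it finds to crawl.txt.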