#!/usr/bin/php $max) { fclose($fp); if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n"); return(0); } } fclose($fp); return(1); }
// NOTE(review): the line above appears to be the tail of file_size_check();
// its beginning (and the opening PHP tag / configuration) is not present here,
// so it is left exactly as found.

/**
 * Case-insensitive regular-expression test that stands in for eregi(),
 * which was removed from PHP in version 7.0.
 *
 * The pattern is wrapped in "~" delimiters for PCRE; any "~" inside the
 * pattern is escaped so that it keeps matching a literal tilde.
 *
 * @param string $pattern Regular expression without delimiters.
 * @param string $subject Text to test.
 * @return bool True when the pattern matches somewhere in $subject.
 */
function crawler_pattern_match($pattern, $subject)
{
    $pcre = '~' . str_replace('~', '\~', $pattern) . '~i';

    return preg_match($pcre, $subject) === 1;
}

/**
 * URL filter used by crawl_url().
 *
 * Rejects a URL when it matches any entry of $GLOBALS['eregi_url_blacklist'],
 * when it is already stored in $GLOBALS['url_db'], or when file_size_check()
 * refuses it for $GLOBALS['maximum_file_size'].
 *
 * @param string $url Candidate URL.
 * @return int 1 when the URL may be crawled, 0 otherwise.
 */
function check_crawl_url($url)
{
    foreach ($GLOBALS['eregi_url_blacklist'] as $black_url) {
        if (crawler_pattern_match($black_url, $url)) {
            return 0;
        }
    }

    // Strict comparison: the database holds URL strings only.
    if (in_array($url, $GLOBALS['url_db'], true)) {
        return 0;
    }

    if (!file_size_check($url, $GLOBALS['maximum_file_size'])) {
        return 0;
    }

    foreach ($GLOBALS['eregi_url_whitelist'] as $white_url) {
        if (crawler_pattern_match($white_url, $url)) {
            return 1;
        }
    }

    return 1; //1 == disable whitelisting, 0 == enable whitelisting
}

/**
 * What to do with a found URL: print it on its own line if it can be opened.
 *
 * @param string $url URL that was discovered.
 * @return int|null 1 when the URL cannot be opened, nothing otherwise.
 */
function found_url($url)
{
    $test = @fopen($url, 'r');
    if (!$test) {
        if ($GLOBALS['debug']) {
            echo("> Can't open file!: $url\n");
        }
        return 1;
    }

    // The handle is only a reachability probe; release it right away so the
    // endless crawl does not accumulate open streams.
    fclose($test);

    echo($url."\n");
}

/**
 * Recursive crawler.
 *
 * Reports $url through found_url(), then reads it line by line, extracts
 * every absolute link of the form href="http://..." and recurses into each
 * one accepted by check_crawl_url(). Accepted URLs are appended to
 * $GLOBALS['url_db'] and counted in $GLOBALS['i'] and $GLOBALS['total'].
 * Recursion stops as soon as $GLOBALS['i'] reaches $GLOBALS['cache_size'];
 * the top-level loop then trims the cache.
 *
 * @param string $url URL to fetch and scan.
 * @return int|null 0 when the cache is full, 1 when the URL yields no
 *                  lines, nothing when the page was scanned completely.
 */
function crawl_url($url)
{
    found_url($url);

    if ($GLOBALS['i'] >= $GLOBALS['cache_size']) {
        return 0;
    }

    $in = @file($url);
    if (!$in || !is_array($in)) {
        return 1;
    }

    foreach ($in as $line) {
        // preg_split() with the "i" flag replaces spliti() (removed in PHP 7.0).
        $chunks = preg_split('~href="http://~i', $line);
        if (sizeof($chunks) <= 1) {
            continue;
        }

        // Everything before the first href is not part of any link.
        array_shift($chunks);

        foreach ($chunks as $nurl) {
            // Cut the link at the first query, fragment, asterisk or closing quote.
            $pieces = preg_split('~(\?|#|\*|")~', $nurl);
            $nurl = 'http://'.trim(htmlspecialchars_decode($pieces[0]));

            if (!check_crawl_url($nurl)) {
                continue;
            }

            array_push($GLOBALS['url_db'], $nurl);
            $GLOBALS['i']++;
            $GLOBALS['total']++;
            if ($GLOBALS['debug']) {
                echo("-cache: ".$GLOBALS['i']." +total urls crawled: ".$GLOBALS['total']."\n");
            }

            if ($GLOBALS['i'] < $GLOBALS['cache_size']) {
                crawl_url($nurl);
            }
            if ($GLOBALS['i'] >= $GLOBALS['cache_size']) {
                return 0;
            }
        }
    }
}

//CODE (Cleanup for crawl_url())
if ($debug) {
    echo("!!! \nDEBUG MODE ON !!!\n");
}

// Endless crawl: walk the URL database, and once the cache counter has hit
// its limit, drop the oldest entries until only $min_cache_size remain,
// reverse the remainder and reset the counter before starting over.
while (1) {
    if ($debug) {
        echo("\n+Strating with: ".$url_db[0]."\n");
    }

    foreach ($url_db as $url) {
        if ($i < $cache_size) {
            crawl_url($url);
        }
    }

    //Cache cleanup
    if ($debug) {
        echo("!Cache Cleanup\n");
    }
    while (sizeof($url_db) > $min_cache_size) {
        array_shift($url_db);
    }
    $url_db = array_reverse($url_db);
    $i = $min_cache_size;
}