#!/usr/bin/php
<?php
// NOTE(review): this file reached review with its line breaks collapsed and
// with everything between the shebang and the text "$max) {" missing (the
// opening PHP tag has been restored above). The surviving tail of the
// truncated definition is kept verbatim here so whoever restores the lost
// section can match it up; it has NOT been reconstructed:
//
//   $max) { fclose($fp); if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n"); return(0); } } @fclose($fp); return(1); }
//
// The code below uses the following names that are not defined in what
// survives, so they must come from the restored section:
//   globals:   $debug, $seed_url, $crawl_file, $nextc_file, $lastc_file,
//              $maximum_file_size, $eregi_url_blacklist
//   functions: in_file($url, $file), file_size_check($url, $max)

/**
 * URL filter used by crawl_url_once().
 *
 * Rejects a URL when it matches any pattern in $GLOBALS['eregi_url_blacklist'],
 * when in_file() reports it is already present in $GLOBALS['crawl_file'], or
 * when file_size_check() rejects it against $GLOBALS['maximum_file_size'].
 *
 * Side effect: the crawl-file handle in $GLOBALS['total'] is closed and
 * reopened (append mode) around the in_file() lookup.
 *
 * @param string $url Candidate URL.
 * @return int 1 when the URL may be saved, 0 when it must be skipped.
 */
function check_crawl_url($url)
{
    if ($GLOBALS['debug']) {
        echo("check_crawl_url\n");
    }

    foreach ($GLOBALS['eregi_url_blacklist'] as $black_url) {
        // eregi() was removed in PHP 7.0; preg_match() with the "i" flag is the
        // replacement. "\x01" is used as the delimiter because it cannot occur
        // in a sensible URL pattern, so the pattern needs no extra escaping.
        // NOTE(review): the patterns were written for POSIX ERE; confirm none
        // of them relies on syntax that PCRE interprets differently.
        if (preg_match("\x01" . $black_url . "\x01i", $url)) {
            return 0;
        }
    }

    // The handle is closed before in_file() reads the crawl file -- presumably
    // so that pending appends are visible to the lookup (TODO confirm).
    if (isset($GLOBALS['total']) && is_resource($GLOBALS['total'])) {
        fclose($GLOBALS['total']);
    }
    $already_crawled = in_file($url, $GLOBALS['crawl_file']);

    // Reopen on every path. The original returned early for known URLs and
    // left a closed handle behind, so the next save_crawled_url() write to the
    // crawl file was silently lost.
    $GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a');

    if ($already_crawled) {
        return 0;
    }

    if (!file_size_check($url, $GLOBALS['maximum_file_size'])) {
        return 0;
    }

    return 1; //1 == disable whitelisting, 0 == enable whitelisting
}

/**
 * Records a URL: bumps $GLOBALS['total_urls'], prints "<count>:<url>", and
 * appends the trimmed URL as one line to the handles in $GLOBALS['total'] and
 * $GLOBALS['next'].
 *
 * Writes are best-effort: a handle that is missing or already closed is
 * skipped without an error, as in the original (which used @fwrite).
 *
 * @param string $url URL to record.
 * @return void
 */
function save_crawled_url($url)
{
    if ($GLOBALS['debug']) {
        echo("save_crawled_url\n");
    }

    $GLOBALS['total_urls']++;
    $url = trim($url);

    echo($GLOBALS['total_urls'] . ':');
    echo($url . "\n");

    foreach (array('total', 'next') as $handle_name) {
        if (isset($GLOBALS[$handle_name]) && is_resource($GLOBALS[$handle_name])) {
            fwrite($GLOBALS[$handle_name], $url . "\n");
        }
    }
}

/**
 * Main crawling function: records $url, fetches it, and records every
 * absolute http:// link found in an href="..." attribute that passes
 * check_crawl_url().
 *
 * NOTE(review): save_crawled_url() also appends $url itself to the next-round
 * file, so every URL read from the current round is queued again for the
 * following round -- confirm that this re-queuing is intended.
 *
 * @param string $url URL to fetch (read with file(), so remote URLs need
 *                    allow_url_fopen).
 * @return int|null 1 when the URL could not be read or was empty; nothing
 *                  (null) after a page has been processed.
 */
function crawl_url_once($url)
{
    if ($GLOBALS['debug']) {
        echo("crawl_url_once\n");
    }

    save_crawled_url($url);

    // Fetch failures are expected while crawling, so the warning is suppressed
    // and the failure is reported through the return value instead.
    $in = @file($url);
    if (!$in || !is_array($in)) {
        return 1;
    }

    foreach ($in as $line) {
        // spliti() was removed in PHP 7.0; preg_split() with "i" is equivalent.
        $pieces = preg_split('~href="http://~i', $line);
        if (sizeof($pieces) > 1) {
            // The first piece is the text before the first link; drop it.
            array_shift($pieces);
            foreach ($pieces as $nurl) {
                // Keep only the text up to the first ?, #, * or closing quote.
                $parts = preg_split('~[?#*"]~', $nurl);
                $nurl = 'http://' . trim(htmlspecialchars_decode($parts[0]));
                if (check_crawl_url($nurl)) {
                    save_crawled_url($nurl);
                }
            }
        }
    }
}

//CODE

// Seed the frontier when the next-round file is missing or effectively empty.
if (!is_file($nextc_file) || filesize($nextc_file) < 8) {
    $next = fopen($nextc_file, 'a');
    fwrite($next, $seed_url . "\n");
    fclose($next);
}

// Promote the next-round file to be the current round's input.
if (is_file($nextc_file)) {
    @unlink($lastc_file);
    copy($nextc_file, $lastc_file);
    unlink($nextc_file);
}

$total_urls = 0;

while (1) {
    if ($debug) {
        echo("\n-LOOP\n");
    }

    $last = fopen($lastc_file, 'r');
    $next = fopen($nextc_file, 'a');
    $total = fopen($crawl_file, 'a');

    if ($last === false) {
        // Without the frontier file, feof() below would never make progress.
        fwrite(STDERR, "Cannot open $lastc_file for reading\n");
        exit(1);
    }

    while (!feof($last)) {
        $line = fgets($last);
        if ($line === false) {
            break;
        }
        $url = trim($line);
        if ($url === '') {
            // The original crawled the empty string produced by the final read
            // (and by blank lines), recording a blank URL on every pass.
            continue;
        }
        crawl_url_once($url);
    }

    // Close the handles. The original passed the filename strings to fclose(),
    // which closed nothing and is a TypeError on PHP 8. $total may have been
    // reopened by check_crawl_url(); as a top-level variable it is the same
    // variable as $GLOBALS['total'], so this closes the current handle.
    foreach (array($total, $last, $next) as $handle) {
        if (is_resource($handle)) {
            fclose($handle);
        }
    }

    // Rotate: this round's discoveries become the next round's input.
    unlink($lastc_file);
    copy($nextc_file, $lastc_file);
    unlink($nextc_file);
}