#!/usr/bin/php
<?php
//HSp33der Web Crawler 4.1
//<-Harvie 2oo7
/*
*This thing crawls the web and prints found URLs to STDOUT.
*Use it well...
*/

//Basic settings
//Alternative seed URLs (only the last uncommented assignment is used):
//$first_url = 'http://harvie.stokoruna.cz';
//$first_url = 'http://harvie.ath.cx';
//$first_url = 'http://www.google.cz/search?q=web';
//$first_url = 'http://www.google.com/search?as_q=www&num=10000';
//$first_url = 'http://ftp.mozilla.org/pub/mozilla.org/camino/releases/Camino-1.5.dmg';
$first_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=1000&o=noacc,nostem'; //I will start crawling here
$maximum_file_size = 1000000; //Maximum size of a downloaded page in bytes
$cache_size = 100; //Maximum cache size: crawling pauses when the URL DB holds this many entries
$min_cache_size = 20; //Cache size after cleanup

//Filter settings
$eregi_url_blacklist = array(
	'(W3\.org|W3C)', //Hell knows...
	'(shop|xxx|porn|lesbian|hot)', //Commercial sites
	'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
	'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
	'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
	'.\.(pdf|swf|flv|cfm)', //Documents and Flash files
	'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
$eregi_url_whitelist = array(
	'.\.(html|htm|shtml|php|xml|jsp|asp)' //HTML files only (whitelisting disabled by default...)
);
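//Note: the POSIX ereg*()/spliti() functions this script originally relied on were
//removed in PHP 7, so the patterns above are now applied with preg_match()/preg_split()
//and the /i (case-insensitive) modifier. Illustrative example (hypothetical URL):
//  preg_match('/'.$eregi_url_blacklist[1].'/i', 'http://example.com/shop/'); //1 -> blacklisted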

//Development settings
$debug = false; //Set to true for verbose debug output

//Init
$url_db = array($first_url);
$i = 0;
$total = 0;

//Functions

function file_size_check($file, $max, $block = 256) { //Returns 1 if $file is at most $max bytes, 0 otherwise (or if it can't be opened)
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$fp = @fopen($file, 'r');
	if(!$fp) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	$dl = 0;
	while(!feof($fp)) {
		$dl += strlen(fgets($fp, $block)); //echo("$dl\n"); //Debug
		if($dl > $max) {
			fclose($fp);
			if($GLOBALS['debug']) echo("**File too big!!!: $file\n");
			return(0);
		}
	}
	fclose($fp);
	return(1);
}
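//Illustrative usage (hypothetical URL): file_size_check('http://example.com/', 1000000)
//streams the file in small blocks and returns 0 as soon as the limit is exceeded,
//so oversized files are never downloaded completely.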
function check_crawl_url($url) { //URL filter for the crawl_url() function
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('/'.$black_url.'/i', $url)) return(0); //preg_match() replaces the removed eregi()
	}
	if(in_array($url, $GLOBALS['url_db'])) return(0);
	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
	foreach($GLOBALS['eregi_url_whitelist'] as $white_url) {
		if(preg_match('/'.$white_url.'/i', $url)) return(1);
	}
	return(1); //Return 1 here to disable whitelisting, 0 to enable it
}
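//Illustrative example (hypothetical URLs): check_crawl_url('http://example.com/a.html')
//returns 1 only if the URL passes the blacklist, is not already in $url_db and fits
//within $maximum_file_size; 'http://example.com/a.jpg' is rejected by the blacklist.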

function found_url($url) { //What to do with a found URL
	$test = @fopen($url, 'r');
	if(!$test) {
		if($GLOBALS['debug']) echo("> Can't open file!: $url\n");
		return(1);
	}
	fclose($test); //Don't leak the test handle
	echo($url."\n");
}
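//found_url() is the output hook: by default it just prints the URL to STDOUT, so
//replace the echo() above if you want the URLs to go to a file or database instead.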

function crawl_url($url) { //Recursive crawler (with cleanup of course...)
	found_url($url);
	if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0);
	$in = @file($url); if(!$in || !is_array($in)) return(1);
	foreach($in as $line) {
		$line = preg_split('/href="http:\/\//i', $line); //preg_split() replaces the removed spliti()
		if(sizeof($line) > 1) {
			array_shift($line); //print_r($line); //Debug
			foreach($line as $nurl) {
				$nurl = preg_split('/[?#*"]/', $nurl); //Cut the URL at a query string, anchor or closing quote
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) {
					array_push($GLOBALS['url_db'], $nurl);
					$GLOBALS['i']++; $GLOBALS['total']++;
					if($GLOBALS['debug']) echo("-cache: ".$GLOBALS['i']." +total urls crawled: ".$GLOBALS['total']."\n"); //Debug
					if($GLOBALS['i'] < $GLOBALS['cache_size']) {
						crawl_url($nurl);
					}
					if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0);
				}
			}
		}
	}
}
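//crawl_url() recurses depth-first: $i counts the URLs collected since the last
//cleanup, and once it reaches $cache_size the recursion unwinds back to the main
//loop below, which trims $url_db down to $min_cache_size entries and starts over.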


//Main loop (with cache cleanup for crawl_url())
if($debug) echo("!!! DEBUG MODE ON !!!\n");
while(1) {
	if($debug) echo("\n+Starting with: ".$url_db[0]."\n"); //Debug
	foreach($url_db as $url) {
		if($i < $cache_size) crawl_url($url);
	}
	//Cache cleanup
	if($debug) echo("!Cache Cleanup\n"); //Debug
	while(sizeof($url_db) > $min_cache_size) {
		array_shift($url_db); //Drop the oldest URLs
	}
	$url_db = array_reverse($url_db); //Crawl the most recently found URLs first
	$i = $min_cache_size;
}