| 1 | #!/usr/bin/php |
| 2 | <?php |
| 3 | //HSp33der Web Crawler 5.1 |
| 4 | //<-Harvie 2oo7 |
| 5 | /* |
*This thing crawls the web and prints found URLs to STDOUT.
| 7 | *New technology!!! |
| 8 | *Use it well... |
| 9 | */ |
| 10 | |
//Basic settings
$crawl_file = 'crawl.txt'; //Append-only log of every URL printed so far (scanned by in_file() for duplicate detection)
$lastc_file = 'last.txt'; //URLs being crawled during the current pass
$nextc_file = 'next.txt'; //URLs discovered during the current pass; becomes last.txt on the next pass
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem'; //Starting point used when the queue is (nearly) empty
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//$debug = true; //Uncomment to trace every function call on STDOUT
//Filter settings: case-insensitive regex fragments; a URL matching any of them is rejected
$eregi_url_blacklist = array(
'(W3\.org|W3C)', //Hell knows... standards-body pages
'(shop|xxx|porn|lesbian|hot)', //Commercial sites
'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
'.\.(pdf|swf|flv|cfm)', //Other text files
'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
| 28 | |
| 29 | //Functions |
function in_file($string, $file) { //Returns 1 when $file contains a line equal (case-insensitively) to trimmed $string, else 0
	if($GLOBALS['debug']) echo("in_file\n");
	//BUGFIX: the $file argument was ignored and $GLOBALS['crawl_file'] was opened instead
	$in = @fopen($file, 'r');
	if(!$in) return(0); //Missing/unreadable file means "not found"
	$string = trim($string);
	$found = 0;
	while(!feof($in)) {
		$line = trim(fgets($in));
		//BUGFIX: eregi() was removed in PHP 7 and treated $string (a URL, full of regex
		//metacharacters) as a pattern; a plain case-insensitive equality check is what was
		//intended — see the original commented-out strncasecmp() variant.
		if(strcasecmp($string, $line) == 0) {
			$found = 1;
			break; //BUGFIX: the old early return leaked the file handle
		}
	}
	fclose($in);
	return($found);
}
| 43 | |
function file_size_check($file, $max, $block = 250) { //Returns 1 when $file holds at most $max bytes, 0 when bigger or unreadable
	if($GLOBALS['debug']) echo("fscheck\n");
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$handle = @fopen($file, 'r');
	if(!$handle) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	//Stream the file in small chunks so we can stop as soon as the limit is crossed,
	//instead of reading the whole thing just to learn its size.
	$seen = 0;
	$ok = 1;
	while(!feof($handle)) {
		$seen = $seen + strlen(fgets($handle, $block));
		if($seen > $max) {
			if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n");
			$ok = 0;
			break;
		}
	}
	@fclose($handle);
	return($ok);
}
function check_crawl_url($url) { //URL Filter for crawl_url function: 1 = crawlable, 0 = rejected
	if($GLOBALS['debug']) echo("check_crawl_url\n");
	//BUGFIX: eregi() was removed in PHP 7 — preg_match with the 'i' flag is the replacement.
	//'~' delimiter is safe: no blacklist entry contains a tilde.
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('~'.$black_url.'~i', $url)) return(0);
	}

	//Close the append handle so in_file() sees everything flushed so far, then reopen it.
	//BUGFIX: the handle used to stay closed when a duplicate was found, so the next
	//save_crawled_url() silently lost its write to the dedup log — reopen before returning.
	@fclose($GLOBALS['total']);
	$duplicate = in_file($url, $GLOBALS['crawl_file']);
	$GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a');
	if($duplicate) return(0);

	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0); //Skip oversized pages
	return(1); //1 == disable whitelisting, 0 == enable whitelisting
}
| 77 | |
function save_crawled_url($url) { //Record one discovered URL: bump the counter, print it, append it to the dedup log and the next-pass queue
	if($GLOBALS['debug']) echo("save_crawled_url\n");
	$GLOBALS['total_urls']++;
	$clean = trim($url);
	echo($GLOBALS['total_urls'].':'.$clean."\n");
	@fwrite($GLOBALS['total'], $clean."\n"); //Dedup log ($crawl_file); handle may be closed, hence @
	@fwrite($GLOBALS['next'], $clean."\n"); //Queue crawled on the next pass ($nextc_file)
}
| 87 | |
function crawl_url_once($url) { //Main crawling function: save $url, download it, then save every absolute http:// link found in it
	if($GLOBALS['debug']) echo("crawl_url_once\n");
	save_crawled_url($url);
	$in = @file($url); if(!$in || !is_array($in)) return(1); //Download failed: nothing to parse
	foreach($in as $line) {
		//BUGFIX: spliti() was removed in PHP 7; preg_split with the 'i' flag is the replacement
		$line = preg_split('~href="http://~i', $line);
		if(sizeof($line) > 1) {
			array_shift($line); //Drop the text before the first link //print_r($line); //Debug
			foreach($line as $nurl) {
				//Cut the URL off at the first query string, fragment, wildcard or closing quote
				$nurl = preg_split('~(\?|#|\*|")~', $nurl);
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) save_crawled_url($nurl);
			}
		}
	}
}
| 104 | |
| 105 | |
| 106 | |
| 107 | |
//CODE
//Seed the queue when next.txt is missing or too small to hold a real URL
if(@filesize($nextc_file) < 8) {
	$next = fopen($nextc_file, 'a');
	fwrite($next, $seed_url."\n");
	fclose($next);
}
//Rotate: the queue written during the previous run becomes the list crawled now
if(is_file($nextc_file)) {
	@unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}

$total_urls = 0;
while(1) { //Crawl forever: one queue rotation per pass
	if($debug) echo("\n-LOOP\n");
	$last = fopen($lastc_file, 'r'); //This pass's work list
	$next = fopen($nextc_file, 'a'); //Filled by save_crawled_url() via $GLOBALS['next']
	$total = fopen($crawl_file, 'a'); //Dedup log; check_crawl_url() may reopen it as $GLOBALS['total']
	while(!feof($last)) {
		$url = trim(fgets($last));
		crawl_url_once($url);
	}
	//BUGFIX: fclose() was being called on the filename strings ($crawl_file, $lastc_file,
	//$nextc_file), so the real handles leaked every pass; close the actual handles instead.
	@fclose($total);
	@fclose($last);
	@fclose($next);
	unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}
| 137 | |
| 138 | |