<?php
//HSp33der Web Crawler 4.1
/*
 * This thing crawls the web and prints found URLs to STDOUT.
 */
//Alternative start URLs (uncomment one to use it instead):
//$first_url = 'http://harvie.stokoruna.cz';
//$first_url = 'http://harvie.ath.cx';
//$first_url = 'http://www.google.cz/search?q=web';
//$first_url = 'http://www.google.com/search?as_q=www&num=10000';
$first_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=1000&o=noacc,nostem'; //I will start crawling here
//$first_url = 'http://ftp.mozilla.org/pub/mozilla.org/camino/releases/Camino-1.5.dmg'; //I will start crawling here
$maximum_file_size = 1000000; //Maximum size of a downloaded page, in bytes
$cache_size = 100; //Maximum number of URLs held in the cache ($url_db)
$min_cache_size = 20; //Number of URLs kept after a cache cleanup
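//Note: $url_db (defined below) serves as both crawl queue and duplicate
//filter. It may grow to $cache_size entries before the main loop trims it
//back to $min_cache_size, so memory stays bounded at the cost of possibly
//revisiting URLs whose entries were trimmed away.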
$eregi_url_blacklist = array(
	'(W3\.org|W3C)', //Hell knows...
	'(shop|xxx|porn|lesbian|hot)', //Commercial sites
	'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
	'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
	'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
	'.\.(pdf|swf|flv|cfm)', //Other documents and media
	'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
$eregi_url_whitelist = array(
	'.\.(html|htm|shtml|php|xml|jsp|asp)' //HTML files only (whitelisting is disabled by default, see check_crawl_url())
);
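//Illustration (hypothetical URLs): the patterns are POSIX-style expressions
//matched case-insensitively by eregi(), so 'http://example.com/logo.PNG'
//would be rejected by the '.\.(css|ico|gif|jpg|png|bmp|cgi)' blacklist entry,
//while 'http://example.com/index.html' passes it and matches the whitelist.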
//Development settings
$debug = 0; //Set to 1 for verbose debug output
$url_db = array($first_url); //URL cache/queue
$i = 0; //Current cache fill counter
$total = 0; //Total URLs crawled
function file_size_check($file, $max, $block = 256) { //If $file is bigger than $max bytes, returns 0, else 1
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$fp = @fopen($file, 'r');
	if(!$fp) { if($GLOBALS['debug']) echo("**Can't open!!!: $file\n"); return(0); }
	$dl = 0;
	while(!feof($fp)) { //Read in $block-byte chunks so we can stop early
		$dl = $dl + strlen(fgets($fp, $block)); //echo("$dl\n"); //Debug
		if($dl > $max) { if($GLOBALS['debug']) echo("**File too big!!!: $file\n"); fclose($fp); return(0); }
	}
	fclose($fp); return(1);
}
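//Design note: because the page is read incrementally, an oversized file is
//rejected after roughly $max bytes instead of being downloaded whole. A
//sketch of a call (hypothetical URL):
//  file_size_check('http://example.com/', 1000000); //1 only if under ~1 MB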
function check_crawl_url($url) { //URL filter for crawl_url()
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(eregi($black_url, $url)) return(0);
	}
	if(in_array($url, $GLOBALS['url_db'])) return(0); //Skip already-known URLs
	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
	foreach($GLOBALS['eregi_url_whitelist'] as $white_url) {
		if(eregi($white_url, $url)) return(1);
	}
	return(1); //Final return: 1 == whitelisting disabled, 0 == whitelisting enabled
}
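//The filters run cheapest-first: the regex blacklist and the duplicate check
//are local, while file_size_check() opens a network connection, so it goes
//last. With the final return(1) above, the whitelist is effectively a no-op;
//change that line to return(0) to enforce it. Minimal usage sketch
//(hypothetical URL; uncomment to try):
//  if(check_crawl_url('http://example.com/index.html')) echo("would crawl\n");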
function found_url($url) { //What to do with found URL
	$test = @fopen($url, 'r'); //Skip URLs that cannot be opened
	if(!$test) { if($GLOBALS['debug']) echo("> Can't open file!: $url\n"); return(0); }
	fclose($test);
	echo($url."\n"); //Print the found URL to STDOUT
	return(1);
}
function crawl_url($url) { //Recursive crawler (with cleanup of course...)
	found_url($url); //Report the URL we are about to parse
	if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0); //Cache full, stop recursing
	$in = @file($url); if(!$in || !is_array($in)) return(1); //Fetch page as array of lines
	foreach($in as $line) { //Scan every line of the page for links
		$line = spliti('href="http://', $line);
		if(sizeof($line) > 1) {
			array_shift($line); //Drop the text before the first link //print_r($line); //Debug
			foreach($line as $nurl) {
				$nurl = spliti('(\?|#|\*|")', $nurl); //Cut the URL off at the first delimiter
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) {
					array_push($GLOBALS['url_db'], $nurl);
					$GLOBALS['i']++; $GLOBALS['total']++;
					if($GLOBALS['debug']) echo("-cache: ".$GLOBALS['i']." +total urls crawled: ".$GLOBALS['total']."\n"); //Debug
					if($GLOBALS['i'] < $GLOBALS['cache_size']) {
						crawl_url($nurl); //Recurse into the newly found URL
					}
					if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0); //Cache full
				}
			}
		}
	}
}
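//Extraction sketch: the first spliti() breaks a line on the case-insensitive
//marker 'href="http://', so each following fragment starts with the body of
//an absolute link; the second spliti() truncates it at the first '?', '#',
//'*' or '"'. E.g. a hypothetical line '<a href="http://example.com/a.html?x=1">'
//yields 'http://example.com/a.html'. Relative and single-quoted hrefs are not
//handled, and query strings are stripped.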
//CODE (main loop; cleanup for crawl_url() happens at the end of each pass)
if($debug) echo("!!! DEBUG MODE ON !!!\n");
while(1) { //Crawl forever
	if($debug) echo("\n+Starting with: ".$url_db[0]."\n"); //Debug
	foreach($url_db as $url) {
		if($i < $cache_size) crawl_url($url);
	}
	//Cleanup for crawl_url(): shrink the cache back to $min_cache_size
	if($debug) echo("!Cache Cleanup\n"); //Debug
	while(sizeof($url_db) > $min_cache_size) {
		array_shift($url_db); //Drop the oldest URLs first
	}
	$url_db = array_reverse($url_db); //So the next pass starts from the freshest URLs
	$i = $min_cache_size; //Reset the cache fill counter
}
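//Cleanup arithmetic (with the defaults above): each pass crawls until the
//cache holds 100 URLs, then keeps only the 20 newest and sets $i = 20, so the
//following pass can add 80 more. Trimmed URLs also leave the duplicate
//filter, so the crawler may encounter and print them again later.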