//HSp33der Web Crawler 5.1
//This thing crawls the web and prints found URLs to STDOUT.
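//Usage sketch (not part of the original script; filename is hypothetical): the crawler
//prints each found URL to STDOUT, so a typical CLI run might redirect that into a file:
//  php crawler.php > found-urls.txt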
//Settings
$crawl_file = 'crawl.txt'; //Every URL ever saved (used for duplicate checks)
$lastc_file = 'last.txt';  //URLs to crawl in the current pass
$nextc_file = 'next.txt';  //URLs discovered in this pass, crawled next time
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem';
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//Case-insensitive regex patterns; URLs matching any of them are never crawled
$eregi_url_blacklist = array(
'(W3\.org|W3C)', //Hell knows...
'(shop|xxx|porn|lesbian|hot)', //Commercial sites
'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
'.\.(pdf|swf|flv|cfm)', //Other text files
'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
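//Example (illustration only, URL is hypothetical): a link such as
//http://www.example.com/logo.png is rejected by the static-content pattern when
//check_crawl_url() below matches it case-insensitively; prints int(1):
//  var_dump(preg_match('/.\.(css|ico|gif|jpg|png|bmp|cgi)/i', 'http://www.example.com/logo.png'));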
function in_file($string, $file) { //Like in_array(), but with a file
if($GLOBALS['debug']) echo("in_file\n");
$in = fopen($file, 'r'); //Was hard-coded to $GLOBALS['crawl_file']; use the $file argument
$string = trim($string);
$len = strlen($string);
while(!feof($in)) {
$line = trim(fgets($in));
//if( ($len == strlen($line)) && (strncasecmp($string, $line, $len) == 0) ) return(1);
if( ($len == strlen($line)) && (strcasecmp($string, $line) == 0) ) return(1); //Literal, case-insensitive match (eregi() is deprecated)
}
fclose($in);
return(0);
}
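//Usage sketch (illustration only): returns 1 when the exact URL already appears
//as a whole line of crawl.txt, 0 otherwise:
//  if(in_file('http://example.com/', $GLOBALS['crawl_file'])) echo("already known\n");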
function file_size_check($file, $max, $block = 250) { //If $file is bigger than $max, returns 0, else 1
if($GLOBALS['debug']) echo("fscheck\n");
if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
$fp = @fopen($file, 'r');
if(!$fp) { if($GLOBALS['debug']) echo("**Can't open!!!: $file\n"); return(0); }
$dl = 0;
while(!feof($fp)) {
$dl = $dl + strlen(fgets($fp, $block)); //echo("$dl\n"); //Debug
if($dl > $max) { if($GLOBALS['debug']) echo("**File too big!!!: $file\n"); fclose($fp); return(0); }
}
fclose($fp);
return(1);
}
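//Note (observation, not in the original): because the check reads the URL in $block-byte
//chunks until $max is exceeded, it can download up to $maximum_file_size bytes per URL,
//and the page is then fetched again by crawl_url_once().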
function check_crawl_url($url) { //URL Filter for crawl_url function
if($GLOBALS['debug']) echo("check_crawl_url\n");
foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
if(preg_match('/'.$black_url.'/i', $url)) return(0); //Case-insensitive match (eregi() is deprecated)
}
@fclose($GLOBALS['total']); //Flush crawl.txt so in_file() sees URLs saved earlier in this run
if(in_file($url, $GLOBALS['crawl_file'])) return(0); //Skip URLs we already know
$GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a'); //Reopen for appending
if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
return(1); //1 == disable whitelisting (accept anything not blacklisted), 0 == enable whitelisting
}
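//Usage sketch (illustration only, URL is hypothetical):
//  if(check_crawl_url('http://example.com/page.html')) echo("would crawl it\n");
//Returns 1 for a new, non-blacklisted page under the size limit, 0 otherwise.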
function save_crawled_url($url) { //Saves URL (prints it and appends it to the crawl/next files)
if($GLOBALS['debug']) echo("save_crawled_url\n");
$GLOBALS['total_urls']++;
echo($GLOBALS['total_urls'].':');
echo($url."\n");
@fwrite($GLOBALS['total'], $url."\n"); //Append to crawl.txt (all URLs ever seen)
@fwrite($GLOBALS['next'], $url."\n"); //Append to next.txt (queue for the next pass)
}
function crawl_url_once($url) { //Main crawling function
if($GLOBALS['debug']) echo("crawl_url_once\n");
save_crawled_url($url);
$in = @file($url); if(!$in || !is_array($in)) return(1); //Download the page as an array of lines
foreach($in as $line) {
$line = preg_split('/href="http:\/\//i', $line); //Split on absolute links (spliti() is deprecated)
if(sizeof($line) > 1) {
array_shift($line); //print_r($line); //Debug
foreach($line as $nurl) {
$nurl = preg_split('/(\?|#|\*|")/', $nurl); //Cut the URL at query string, fragment or closing quote
$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
if(check_crawl_url($nurl)) save_crawled_url($nurl);
}
}
}
}
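//Note (observation, not in the original): only absolute links written as href="http://..."
//are extracted; relative links, https:// links and other schemes are ignored.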
//Seed next.txt with the seed URL if it is (nearly) empty
if(@filesize($nextc_file) < 8) {
$next = fopen($nextc_file, 'a');
fwrite($next, $seed_url."\n");
fclose($next);
}
//Rotate the queue: URLs found last time become the URLs to crawl now
if(is_file($nextc_file)) {
@unlink($lastc_file);
copy($nextc_file, $lastc_file);
}
if($debug) echo("\n-LOOP\n");
$last = fopen($lastc_file, 'r');
$next = fopen($nextc_file, 'a');
$total = fopen($crawl_file, 'a');
while(!feof($last)) {
$url = trim(fgets($last));
crawl_url_once($url);
}
@fclose($total); //Close the handles (the original passed the filenames here, which does nothing)
@fclose($last);
@fclose($next);
copy($nextc_file, $lastc_file); //URLs found in this pass become the queue for the next run
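//Note (sketch of intended operation, inferred from the queue rotation above): the script
//is meant to be run repeatedly; each run crawls the URLs discovered during the previous
//run and appends everything it finds to crawl.txt.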