| 1 | #!/usr/bin/php |
| 2 | <?php |
| 3 | //HSp33der Web Crawler 5.1 |
| 4 | //<-Harvie 2oo7 |
| 5 | /* |
*This thing crawls the web and prints found URLs to STDOUT.
| 7 | *New technology!!! |
| 8 | *Use it well... |
| 9 | */ |
| 10 | |
//Basic settings
$crawl_file = 'crawl.txt'; //Append-only log of every URL printed so far (scanned by in_file() for duplicate detection)
$lastc_file = 'last.txt'; //URLs being crawled during the current pass
$nextc_file = 'next.txt'; //URLs discovered during the current pass; becomes last.txt on the next pass
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem'; //Starting point used when the queue is (nearly) empty
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//$debug = true; //Uncomment to trace every function call on STDOUT
//Filter settings: case-insensitive regex fragments; a URL matching any of them is rejected
$eregi_url_blacklist = array(
'(W3\.org|W3C)', //Hell knows... standards-body pages
'(shop|xxx|porn|lesbian|hot)', //Commercial sites
'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
'.\.(pdf|swf|flv|cfm)', //Other text files
'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
| 28 | |
| 29 | //Functions |
function in_file($string, $file) { //Returns 1 when $file contains a line equal (case-insensitively) to trimmed $string, else 0
	if($GLOBALS['debug']) echo("in_file\n");
	//BUGFIX: the $file argument was ignored and $GLOBALS['crawl_file'] was opened instead
	$in = @fopen($file, 'r');
	if(!$in) return(0); //Missing/unreadable file means "not found"
	$string = trim($string);
	$found = 0;
	while(!feof($in)) {
		$line = trim(fgets($in));
		//BUGFIX: eregi() was removed in PHP 7 and treated $string (a URL, full of regex
		//metacharacters) as a pattern; a plain case-insensitive equality check is what was
		//intended — see the original commented-out strncasecmp() variant.
		if(strcasecmp($string, $line) == 0) {
			$found = 1;
			break; //BUGFIX: the old early return leaked the file handle
		}
	}
	fclose($in);
	return($found);
}
| 43 | |
function file_size_check($file, $max, $block = 250) { //Returns 1 when $file holds at most $max bytes, 0 when bigger or unreadable
	if($GLOBALS['debug']) echo("fscheck\n");
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$handle = @fopen($file, 'r');
	if(!$handle) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	//Stream the file in small chunks so we can stop as soon as the limit is crossed,
	//instead of reading the whole thing just to learn its size.
	$seen = 0;
	$ok = 1;
	while(!feof($handle)) {
		$seen = $seen + strlen(fgets($handle, $block));
		if($seen > $max) {
			if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n");
			$ok = 0;
			break;
		}
	}
	@fclose($handle);
	return($ok);
}
function check_crawl_url($url) { //URL Filter for crawl_url function: 1 = crawlable, 0 = rejected
	if($GLOBALS['debug']) echo("check_crawl_url\n");
	//BUGFIX: eregi() was removed in PHP 7 — preg_match with the 'i' flag is the replacement.
	//'~' delimiter is safe: no blacklist entry contains a tilde.
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('~'.$black_url.'~i', $url)) return(0);
	}

	//Close the append handle so in_file() sees everything flushed so far, then reopen it.
	//BUGFIX: the handle used to stay closed when a duplicate was found, so the next
	//save_crawled_url() silently lost its write to the dedup log — reopen before returning.
	@fclose($GLOBALS['total']);
	$duplicate = in_file($url, $GLOBALS['crawl_file']);
	$GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a');
	if($duplicate) return(0);

	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0); //Skip oversized pages
	return(1); //1 == disable whitelisting, 0 == enable whitelisting
}
| 77 | |
function save_crawled_url($url) { //Record one discovered URL: bump the counter, print it, append it to the dedup log and the next-pass queue
	if($GLOBALS['debug']) echo("save_crawled_url\n");
	$GLOBALS['total_urls']++;
	$clean = trim($url);
	echo($GLOBALS['total_urls'].':'.$clean."\n");
	@fwrite($GLOBALS['total'], $clean."\n"); //Dedup log ($crawl_file); handle may be closed, hence @
	@fwrite($GLOBALS['next'], $clean."\n"); //Queue crawled on the next pass ($nextc_file)
}
| 87 | |
function crawl_url_once($url) { //Main crawling function: save $url, download it, then save every absolute http:// link found in it
	if($GLOBALS['debug']) echo("crawl_url_once\n");
	save_crawled_url($url);
	$in = @file($url); if(!$in || !is_array($in)) return(1); //Download failed: nothing to parse
	foreach($in as $line) {
		//BUGFIX: spliti() was removed in PHP 7; preg_split with the 'i' flag is the replacement
		$line = preg_split('~href="http://~i', $line);
		if(sizeof($line) > 1) {
			array_shift($line); //Drop the text before the first link //print_r($line); //Debug
			foreach($line as $nurl) {
				//Cut the URL off at the first query string, fragment, wildcard or closing quote
				$nurl = preg_split('~(\?|#|\*|")~', $nurl);
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) save_crawled_url($nurl);
			}
		}
	}
}
| 104 | |
| 105 | |
| 106 | |
| 107 | |
//CODE
//Seed the queue when next.txt is missing or too small to hold a real URL
if(@filesize($nextc_file) < 8) {
	$next = fopen($nextc_file, 'a');
	fwrite($next, $seed_url."\n");
	fclose($next);
}
//Rotate: the queue written during the previous run becomes the list crawled now
if(is_file($nextc_file)) {
	@unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}

$total_urls = 0;
while(1) { //Crawl forever: one queue rotation per pass
	if($debug) echo("\n-LOOP\n");
	$last = fopen($lastc_file, 'r'); //This pass's work list
	$next = fopen($nextc_file, 'a'); //Filled by save_crawled_url() via $GLOBALS['next']
	$total = fopen($crawl_file, 'a'); //Dedup log; check_crawl_url() may reopen it as $GLOBALS['total']
	while(!feof($last)) {
		$url = trim(fgets($last));
		crawl_url_once($url);
	}
	//BUGFIX: fclose() was being called on the filename strings ($crawl_file, $lastc_file,
	//$nextc_file), so the real handles leaked every pass; close the actual handles instead.
	@fclose($total);
	@fclose($last);
	@fclose($next);
	unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}
| 137 | |
| 138 | |