Added lot of junk laying around on harvie.cz
[mirrors/Programs.git] / php / crawler / old / crawler.php
CommitLineData
8de51304
H
#!/usr/bin/php
<?php
//HSp33der Web Crawler 5.1
//<-Harvie 2oo7
/*
 * Crawls the web and prints every URL it finds to STDOUT.
 * New technology!!!
 * Use it well...
 */

//Basic settings
$crawl_file = 'crawl.txt'; //Database of every URL ever seen (used for duplicate detection)
$lastc_file = 'last.txt';  //Queue of URLs being crawled in the current pass
$nextc_file = 'next.txt';  //Queue of URLs discovered during the current pass (crawled next)
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem'; //Starting point when the queue is empty
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//$debug = true; //Uncomment to enable verbose tracing to STDOUT
//Filter settings
//Case-insensitive regex fragments; any URL matching one of these is skipped.
$eregi_url_blacklist = array(
	'(W3\.org|W3C)', //Hell knows...
	'(shop|xxx|porn|lesbian|hot)', //Commercial sites
	'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
	'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
	'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
	'.\.(pdf|swf|flv|cfm)', //Other text files
	'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
29//Functions
//Returns 1 if $string occurs as a whole line (case-insensitive) in $file, else 0.
//FIX: previously this ignored the $file parameter and always read
//$GLOBALS['crawl_file'], leaked the handle on a match, and used eregi()
//(removed in PHP 7) which treated the URL as a regex — dots and other
//metacharacters caused false positives. Now uses an exact, case-insensitive
//line comparison and always closes the handle.
function in_file($string, $file) { //Like in_array(), but with file
	if($GLOBALS['debug']) echo("in_file\n");
	$in = @fopen($file, 'r');
	if(!$in) return(0); //Unreadable/missing file: treat as "not found"
	$string = trim($string);
	while(!feof($in)) {
		$line = trim(fgets($in));
		if(strcasecmp($string, $line) == 0) {
			fclose($in); //Close before the early return (was leaked before)
			return(1);
		}
	}
	fclose($in);
	return(0);
}
43
//Streams $file in chunks of up to $block-1 bytes and counts what it reads.
//Returns 0 as soon as the running total exceeds $max bytes (or when the
//file cannot be opened at all); returns 1 when the whole file fits.
//Works on URLs too, since fopen() supports HTTP stream wrappers.
function file_size_check($file, $max, $block = 250) { //If $file is bigger than $max, returns 0 else 1
	if($GLOBALS['debug']) echo("fscheck\n");
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$handle = @fopen($file, 'r');
	if(!$handle) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	$bytes_seen = 0;
	while(!feof($handle)) {
		$chunk = fgets($handle, $block);
		$bytes_seen += strlen($chunk); //echo("$bytes_seen\n"); //Debug
		if($bytes_seen <= $max) continue;
		//Over the limit: stop downloading immediately
		fclose($handle);
		if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n");
		return(0);
	}
	@fclose($handle);
	return(1);
}
//URL filter for crawl_url_once(): returns 1 when $url should be crawled,
//0 when it is blacklisted, already known, or too large.
//FIX: eregi() was removed in PHP 7 — the blacklist patterns are now run
//through preg_match() with '~' delimiters and the 'i' (case-insensitive)
//modifier, which interprets these simple alternation patterns identically.
//Also guards the fclose() of the shared handle so the first call (when
//$GLOBALS['total'] is not yet set) cannot raise a TypeError on PHP 8.
function check_crawl_url($url) { //URL Filter for crawl_url function
	if($GLOBALS['debug']) echo("check_crawl_url\n");
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('~'.$black_url.'~i', $url)) return(0);
	}

	//Close the shared crawl-db handle so in_file() sees fully-flushed data
	if(isset($GLOBALS['total'])) @fclose($GLOBALS['total']);
	if(in_file($url, $GLOBALS['crawl_file'])) return(0);
	$GLOBALS['total'] = fopen($GLOBALS['crawl_file'], 'a');

	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
	return(1); //1 == disable whitelisting, 0 == enable whitelisting
}
77
//Records one discovered URL: bumps the global counter, prints
//"<count>:<url>" to STDOUT, and appends the URL to both the全 crawl
//database ($GLOBALS['total']) and the next-pass queue ($GLOBALS['next']).
function save_crawled_url($url) { //Saves URL to database, etc...
	if($GLOBALS['debug']) echo("save_crawled_url\n");
	$clean = trim($url);
	$GLOBALS['total_urls']++;
	echo($GLOBALS['total_urls'].':'.$clean."\n");
	@fwrite($GLOBALS['total'], $clean."\n");
	@fwrite($GLOBALS['next'], $clean."\n");
}
87
//Downloads $url, extracts every href="http://..." link, and records each
//link that passes check_crawl_url(). Returns 1 on download failure,
//nothing otherwise (return value is unused by the caller).
//FIX: spliti() was removed in PHP 7 — replaced with the equivalent
//case-insensitive preg_split() calls so the script runs on modern PHP.
function crawl_url_once($url) { //Main crawling function
	if($GLOBALS['debug']) echo("crawl_url_once\n");
	save_crawled_url($url);
	$in = @file($url); if(!$in || !is_array($in)) return(1);
	foreach($in as $line) {
		//Split the line on every link opening; piece [0] precedes the first link
		$line = preg_split('~href="http://~i', $line);
		if(sizeof($line) > 1) {
			array_shift($line); //print_r($line); //Debug
			foreach($line as $nurl) {
				//Cut the URL at the first ?, #, * or closing quote
				$nurl = preg_split('~[?#*"]~', $nurl);
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) save_crawled_url($nurl);
			}
		}
	}
}
104
105
106
107
//CODE
//Seed the queue: if next.txt is missing or nearly empty (< 8 bytes — too
//short to hold a real URL), append the seed URL so the crawler has a start.
if(@filesize($nextc_file) < 8) {
	$next = fopen($nextc_file, 'a');
	fwrite($next, $seed_url."\n");
	fclose($next);
}
//Rotate queues: the "next" queue left over from the previous run becomes
//the "last" queue that this run will read from.
if(is_file($nextc_file)) {
	@unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}

$total_urls = 0; //Running count of URLs printed, used by save_crawled_url()
//Main loop: crawl every URL queued in last.txt, then promote the freshly
//written next.txt to last.txt and start over. Runs forever.
while(1) {
	if($debug) echo("\n-LOOP\n");
	$last = fopen($lastc_file, 'r');  //Queue being consumed this pass
	$next = fopen($nextc_file, 'a');  //Queue being produced this pass
	$total = fopen($crawl_file, 'a'); //Append-only database of all URLs
	while(!feof($last)) {
		$url = trim(fgets($last));
		crawl_url_once($url);
	}
	//FIX: fclose() was being passed the filename strings ($crawl_file etc.)
	//instead of the handles, so nothing was ever closed and buffered writes
	//were not guaranteed to hit disk before the copy() below. Close the
	//actual handles; $total is suppressed because check_crawl_url() may have
	//already closed and reopened it via $GLOBALS.
	@fclose($GLOBALS['total']);
	@fclose($last);
	@fclose($next);
	unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}
137
138
This page took 0.268562 seconds and 4 git commands to generate.