#!/usr/bin/php
<?php
//HSp33der Web Crawler 4.1
//<-Harvie 2oo7
/*
 *This thing crawls the web and prints found URLs to STDOUT.
 *Use it well...
 */

//Basic settings
//Alternative starting points (only the last uncommented assignment is used):
//$first_url = 'http://harvie.stokoruna.cz';
//$first_url = 'http://harvie.ath.cx';
//$first_url = 'http://www.google.cz/search?q=web';
//$first_url = 'http://www.google.com/search?as_q=www&num=10000';
$first_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=1000&o=noacc,nostem'; //I will start crawling here
//$first_url = 'http://ftp.mozilla.org/pub/mozilla.org/camino/releases/Camino-1.5.dmg'; //I will start crawling here
$maximum_file_size = 1000000; //Maximum size of one downloaded page in bytes
$cache_size = 100; //Maximum cache size
$min_cache_size = 20; //Cache size after cleanup

//Filter settings
$eregi_url_blacklist = array(
	'(W3\.org|W3C)', //Hell knows...
	'(shop|xxx|porn|lesbian|hot)', //Commercial sites
	'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
	'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
	'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
	'.\.(pdf|swf|flv|cfm)', //Other document formats
	'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
$eregi_url_whitelist = array(
	'.\.(html|htm|shtml|php|xml|jsp|asp)' //HTML files only (whitelisting disabled by default...)
);

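/*
 * Note: the entries above are used as case-insensitive regular expressions
 * by check_crawl_url() below. An illustrative sketch of how one blacklist
 * entry fires -- the URL is made up:
 *
 *   preg_match('/.\.(css|ico|gif|jpg|png|bmp|cgi)/i', 'http://example.com/style.CSS'); //-> 1, URL is filtered out
 */
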
//Development settings
$debug = false; //Set to true for verbose output

//Init
$url_db = array($first_url);
$i = 0;
$total = 0;

//Functions

function file_size_check($file, $max, $block = 256) { //Returns 0 if $file is bigger than $max, else 1
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$fp = @fopen($file, 'r');
	if(!$fp) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	$dl = 0;
	while(!feof($fp)) {
		$chunk = fgets($fp, $block);
		if($chunk === false) break; //Read error or EOF
		$dl = $dl + strlen($chunk); //echo("$dl\n"); //Debug
		if($dl > $max) {
			fclose($fp);
			if($GLOBALS['debug']) echo("**File too big!!!: $file\n");
			return(0);
		}
	}
	fclose($fp);
	return(1);
}
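
/*
 * Usage sketch: file_size_check() streams the target in $block-byte chunks
 * instead of trusting a Content-Length header, so it also bounds dynamically
 * generated pages. The URL below is illustrative only:
 *
 *   if(file_size_check('http://example.com/', 1000000)) echo("small enough to crawl\n");
 */
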
function check_crawl_url($url) { //URL filter for crawl_url function
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('/'.$black_url.'/i', $url)) return(0); //preg_match() replaces eregi(), which was removed in PHP 7
	}
	if(in_array($url, $GLOBALS['url_db'])) return(0);
	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
	foreach($GLOBALS['eregi_url_whitelist'] as $white_url) {
		if(preg_match('/'.$white_url.'/i', $url)) return(1);
	}
	return(1); //1 == disable whitelisting, 0 == enable whitelisting
}

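/*
 * The checks above run in order: blacklist, already-seen, size limit, whitelist.
 * To keep whitelisted URLs only, change the final return(1) to return(0), as the
 * comment notes. Illustrative matches (both URLs are made up):
 *
 *   preg_match('/.\.(html|htm|shtml|php|xml|jsp|asp)/i', 'http://example.com/index.html'); //-> 1, kept
 *   preg_match('/.\.(html|htm|shtml|php|xml|jsp|asp)/i', 'http://example.com/data.json');  //-> 0, dropped when whitelisting is on
 */
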
function found_url($url) { //What to do with found URL
	$test = @fopen($url, 'r');
	if(!$test) {
		if($GLOBALS['debug']) echo("> Can't open file!: $url\n");
		return(1);
	}
	fclose($test); //Close the probe handle so it doesn't leak

	echo($url."\n");
}

function crawl_url($url) { //Recursive crawler (with cleanup of course...)
	found_url($url);
	if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0);
	$in = @file($url); if(!$in || !is_array($in)) return(1);
	foreach($in as $line) {
		$line = preg_split('#href="http://#i', $line); //preg_split() replaces spliti(), which was removed in PHP 7
		if(sizeof($line) > 1) {
			array_shift($line); //print_r($line); //Debug
			foreach($line as $nurl) {
				$nurl = preg_split('/(\?|#|\*|")/', $nurl);
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0])); //echo($nurl."\n"); //Debug
				if(check_crawl_url($nurl)) {
					array_push($GLOBALS['url_db'], $nurl);
					$GLOBALS['i']++; $GLOBALS['total']++;
					if($GLOBALS['debug']) echo("-cache: ".$GLOBALS['i']." +total urls crawled: ".$GLOBALS['total']."\n"); //Debug
					if($GLOBALS['i'] < $GLOBALS['cache_size']) {
						crawl_url($nurl);
					}
					if($GLOBALS['i'] >= $GLOBALS['cache_size']) return(0);
				}
			}
		}
	}
}
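
/*
 * Note on the "cache": $url_db doubles as the seen-URL list and the crawl
 * frontier. crawl_url() recurses depth-first until $cache_size URLs have been
 * collected, then unwinds; the main loop below trims the list back down to
 * $min_cache_size and starts another round. Roughly:
 *
 *   crawl_url($url)  //pushes and recurses until $i reaches $cache_size (100)
 *   //cleanup then keeps the $min_cache_size (20) newest URLs and resets $i
 */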

//CODE (Cleanup for crawl_url())
if($debug) echo("!!! DEBUG MODE ON !!!\n");
while(1) {
	if($debug) echo("\n+Starting with: ".$url_db[0]."\n"); //Debug
	foreach($url_db as $url) {
		if($i < $cache_size) crawl_url($url);
	}
	//Cache cleanup
	if($debug) echo("!Cache Cleanup\n"); //Debug
	while(sizeof($url_db) > $min_cache_size) {
		array_shift($url_db);
	}
	$url_db = array_reverse($url_db);
	$i = $min_cache_size;
}
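
/*
 * Usage sketch (the filename is an assumption; use whatever you saved this as):
 *
 *   chmod +x crawler.php
 *   ./crawler.php > found-urls.txt        #crawl and collect URLs
 *   ./crawler.php | grep '\.cz/'          #or filter the stream on the fly
 *
 * The script loops forever, so kill it with Ctrl+C and redirect or pipe STDOUT.
 */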