Commit | Line | Data |
---|---|---|
8de51304 H |
#!/usr/bin/php
<?php
//HSp33der Web Crawler 5.1
//<-Harvie 2oo7
/*
 *This thing crawls the web and printing found URLs to STDOUT.
 *New technology!!!
 *Use it well...
 */

//Basic settings
$crawl_file = 'crawl.txt'; //Database of every URL crawled so far
$lastc_file = 'last.txt';  //URLs being processed in the current round
$nextc_file = 'next.txt';  //URLs queued up for the next round
$seed_url = 'http://jyxo.cz/s?q=%23linux+format%3Ahtml&d=cz&cnt=3&o=noacc,nostem'; //Starting point when the queue is empty
$maximum_file_size = 500000; //Maximum filesize of downloaded page in bytes
//$debug = true;

//Filter settings: case-insensitive patterns; any URL matching one is skipped
$eregi_url_blacklist = array(
	'(W3\.org|W3C)', //Hell knows...
	'(shop|xxx|porn|lesbian|hot)', //Commercial sites
	'(google|209.85.135.104|amazon.com|seznam.cz|centrum.cz|atlas.cz|zoznam.sk|quick.cz)', //Big sites
	'.\.(css|ico|gif|jpg|png|bmp|cgi)', //Misc. webpage content
	'.\.(avi|mpg|mpeg|mov|wmv|wm|mp3|ogg|wma)', //Multimedia files
	'.\.(pdf|swf|flv|cfm)', //Other text files
	'.\.(exe|zip|rar|gz|bz|bz2|tar)' //Big and binary files
);
29 | //Functions | |
//Returns 1 when $string occurs as a whole line (case-insensitive,
//whitespace-trimmed) in $file, else 0. Like in_array(), but with file.
function in_file($string, $file) {
	if($GLOBALS['debug']) echo("in_file\n");
	//BUGFIX: used to open $GLOBALS['crawl_file'] unconditionally, ignoring
	//the $file argument callers pass in.
	$in = @fopen($file, 'r');
	if(!$in) return(0); //Unreadable/missing file -> treat as "not found"
	$string = trim($string);
	while(!feof($in)) {
		$line = trim(fgets($in));
		//Exact case-insensitive comparison. The old eregi() call treated the
		//URL as a regex (dots, slashes etc. became metacharacters) and eregi()
		//itself was removed in PHP 7; this matches the intent of the original
		//commented-out strncasecmp() line.
		if(strcasecmp($string, $line) == 0) {
			fclose($in); //BUGFIX: handle used to leak on a successful match
			return(1);
		}
	}
	fclose($in);
	return(0);
}
43 | ||
//Returns 1 when $file is no bigger than $max bytes, 0 when it exceeds
//$max or cannot be opened. Reads in chunks of up to $block bytes so a
//huge download is abandoned early instead of being pulled into memory.
function file_size_check($file, $max, $block = 250) {
	if($GLOBALS['debug']) echo("fscheck\n");
	if($GLOBALS['debug']) echo("*FileSizeCheck!: $file\n");
	$handle = @fopen($file, 'r');
	if(!$handle) {
		if($GLOBALS['debug']) echo("**Can't open!!!: $file\n");
		return(0);
	}
	$bytes_seen = 0;
	while(!feof($handle)) {
		$bytes_seen += strlen(fgets($handle, $block));
		if($bytes_seen > $max) {
			//Over the limit - stop reading right away
			fclose($handle);
			if($GLOBALS['debug']) echo("**Too much big file!!!: $file\n");
			return(0);
		}
	}
	@fclose($handle);
	return(1);
}
//URL filter for the crawler: returns 1 when $url should be crawled,
//0 when it is blacklisted, already known, or too big to download.
function check_crawl_url($url) {
	if($GLOBALS['debug']) echo("check_crawl_url\n");
	//Blacklist check. eregi() was removed in PHP 7, so the same patterns are
	//run through PCRE with the case-insensitive flag ('~' delimiter keeps the
	//patterns usable as-is; none of them contain '~').
	foreach($GLOBALS['eregi_url_blacklist'] as $black_url) {
		if(preg_match('~'.$black_url.'~i', $url)) return(0);
	}

	//Flush pending appends so in_file() sees every URL written so far.
	//BUGFIX: the old fclose()/fopen() dance left $GLOBALS['total'] closed
	//whenever the duplicate check below returned early.
	@fflush($GLOBALS['total']);
	if(in_file($url, $GLOBALS['crawl_file'])) return(0);

	if(!file_size_check($url, $GLOBALS['maximum_file_size'])) return(0);
	return(1); //1 == disable whitelisting, 0 == enable whitelisting
}
77 | ||
//Records one discovered URL: bumps the global counter, prints
//"<count>:<url>" to STDOUT and appends the URL to both the total-crawled
//file and the next-round queue (write failures are silently ignored).
function save_crawled_url($url) {
	if($GLOBALS['debug']) echo("save_crawled_url\n");
	$GLOBALS['total_urls']++;
	$clean = trim($url);
	echo($GLOBALS['total_urls'].':'.$clean."\n");
	@fwrite($GLOBALS['total'], $clean."\n");
	@fwrite($GLOBALS['next'], $clean."\n");
}
87 | ||
//Main crawling function: records $url, downloads the page and feeds every
//absolute http:// link found in its markup through the filter. Returns 1
//when the page could not be fetched; otherwise returns nothing.
function crawl_url_once($url) {
	if($GLOBALS['debug']) echo("crawl_url_once\n");
	save_crawled_url($url);
	$in = @file($url); if(!$in || !is_array($in)) return(1);
	foreach($in as $line) {
		//spliti() was removed in PHP 7; preg_split with the 'i' flag is the
		//direct case-insensitive replacement for both split points below.
		$line = preg_split('~href="http://~i', $line);
		if(sizeof($line) > 1) {
			array_shift($line); //Drop the text before the first link
			foreach($line as $nurl) {
				//Cut the URL at a query string, fragment, '*' or the closing quote
				$nurl = preg_split('~(\?|#|\*|")~i', $nurl);
				$nurl = 'http://'.trim(htmlspecialchars_decode($nurl[0]));
				if(check_crawl_url($nurl)) save_crawled_url($nurl);
			}
		}
	}
}
104 | ||
105 | ||
106 | ||
107 | ||
//CODE
//Seed the next-round queue with $seed_url when it is empty or trivially small
if(@filesize($nextc_file) < 8) {
	$next = fopen($nextc_file, 'a');
	fwrite($next, $seed_url."\n");
	fclose($next);
}
//Rotate: queued URLs become the current round's work list
if(is_file($nextc_file)) {
	@unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}

$total_urls = 0;
while(1) { //Crawl forever: each pass consumes last.txt and refills next.txt
	if($debug) echo("\n-LOOP\n");
	$last = fopen($lastc_file, 'r');
	$next = fopen($nextc_file, 'a');
	$total = fopen($crawl_file, 'a');
	while(!feof($last)) {
		$url = trim(fgets($last));
		crawl_url_once($url);
	}
	//BUGFIX: fclose() was being called on the filename strings
	//($crawl_file etc.) instead of the open handles, so all three
	//descriptors leaked on every loop iteration.
	@fclose($total);
	@fclose($last);
	@fclose($next);
	unlink($lastc_file);
	copy($nextc_file, $lastc_file);
	unlink($nextc_file);
}
137 | ||
138 |