Commit | Line | Data |
---|---|---|
bc13d5d6 H |
1 | <?php |
2 | ||
3 | /************************************************* | |
4 | ||
5 | Snoopy - the PHP net client | |
6 | Author: Monte Ohrt <monte@ispi.net> | |
7 | Copyright (c): 1999-2000 ispi, all rights reserved | |
8 | Version: 1.0 | |
9 | ||
10 | * This library is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License as published by the Free Software Foundation; either | |
13 | * version 2.1 of the License, or (at your option) any later version. | |
14 | * | |
15 | * This library is distributed in the hope that it will be useful, | |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | * Lesser General Public License for more details. | |
19 | * | |
20 | * You should have received a copy of the GNU Lesser General Public | |
21 | * License along with this library; if not, write to the Free Software | |
22 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
23 | ||
24 | You may contact the author of Snoopy by e-mail at: | |
25 | monte@ispi.net | |
26 | ||
27 | Or, write to: | |
28 | Monte Ohrt | |
29 | CTO, ispi | |
30 | 237 S. 70th suite 220 | |
31 | Lincoln, NE 68510 | |
32 | ||
33 | The latest version of Snoopy can be obtained from: | |
34 | http://snoopy.sourceforge.com | |
35 | ||
36 | *************************************************/ | |
37 | ||
38 | class Snoopy | |
39 | { | |
40 | /**** Public variables ****/ | |
41 | ||
42 | /* user definable vars */ | |
43 | ||
44 | var $host = "www.php.net"; // host name we are connecting to | |
45 | var $port = 80; // port we are connecting to | |
46 | var $proxy_host = ""; // proxy host to use | |
47 | var $proxy_port = ""; // proxy port to use | |
48 | var $agent = "Snoopy v1.0"; // agent we masquerade as | |
49 | var $referer = ""; // referer info to pass | |
50 | var $cookies = array(); // array of cookies to pass | |
51 | // $cookies["username"]="joe"; | |
52 | var $rawheaders = array(); // array of raw headers to send | |
53 | // $rawheaders["Content-type"]="text/html"; | |
54 | ||
55 | var $maxredirs = 5; // http redirection depth maximum. 0 = disallow | |
56 | var $lastredirectaddr = ""; // contains address of last redirected address | |
57 | var $offsiteok = true; // allows redirection off-site | |
58 | var $maxframes = 0; // frame content depth maximum. 0 = disallow | |
59 | var $expandlinks = true; // expand links to fully qualified URLs. | |
60 | // this only applies to fetchlinks() | |
61 | // or submitlinks() | |
62 | var $passcookies = true; // pass set cookies back through redirects | |
63 | // NOTE: this currently does not respect | |
64 | // dates, domains or paths. | |
65 | ||
66 | var $user = ""; // user for http authentication | |
67 | var $pass = ""; // password for http authentication | |
68 | ||
69 | // http accept types | |
70 | var $accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*"; | |
71 | ||
72 | var $results = ""; // where the content is put | |
73 | ||
74 | var $error = ""; // error messages sent here | |
75 | var $response_code = ""; // response code returned from server | |
76 | var $headers = array(); // headers returned from server sent here | |
77 | var $maxlength = 500000; // max return data length (body) | |
78 | var $read_timeout = 0; // timeout on read operations, in seconds | |
79 | // supported only since PHP 4 Beta 4 | |
80 | // set to 0 to disallow timeouts | |
81 | var $timed_out = false; // if a read operation timed out | |
82 | var $status = 0; // http request status | |
83 | ||
84 | var $curl_path = "/usr/local/bin/curl"; | |
85 | // Snoopy will use cURL for fetching | |
86 | // SSL content if a full system path to | |
87 | // the cURL binary is supplied here. | |
88 | // set to false if you do not have | |
89 | // cURL installed. See http://curl.haxx.se | |
90 | // for details on installing cURL. | |
91 | // Snoopy does *not* use the cURL | |
92 | // library functions built into php, | |
93 | // as these functions are not stable | |
94 | // as of this Snoopy release. | |
95 | ||
96 | /**** Private variables ****/ | |
97 | ||
98 | var $_maxlinelen = 4096; // max line length (headers) | |
99 | ||
100 | var $_httpmethod = "GET"; // default http request method | |
101 | var $_httpversion = "HTTP/1.0"; // default http request version | |
102 | var $_submit_method = "POST"; // default submit method | |
103 | var $_submit_type = "application/x-www-form-urlencoded"; // default submit type | |
104 | var $_mime_boundary = ""; // MIME boundary for multipart/form-data submit type | |
105 | var $_redirectaddr = false; // will be set if page fetched is a redirect | |
106 | var $_redirectdepth = 0; // increments on an http redirect | |
107 | var $_frameurls = array(); // frame src urls | |
108 | var $_framedepth = 0; // increments on frame depth | |
109 | ||
110 | var $_isproxy = false; // set if using a proxy server | |
111 | var $_fp_timeout = 30; // timeout for socket connection | |
112 | ||
113 | /*======================================================================*\ | |
114 | Function: fetch | |
115 | Purpose: fetch the contents of a web page | |
116 | (and possibly other protocols in the | |
117 | future like ftp, nntp, gopher, etc.) | |
118 | Input: $URI the location of the page to fetch | |
119 | Output: $this->results the output text from the fetch | |
120 | \*======================================================================*/ | |
121 | ||
function fetch($URI)
{
    // Fetch $URI over http (raw socket) or https (external cURL binary) and
    // follow redirects / frames within $this->maxredirs and $this->maxframes.
    // Input:   $URI   absolute URL; user/pass embedded in it override
    //                 $this->user and $this->pass
    // Output:  true when a request was issued; false when the connection
    //          failed, the cURL binary is unusable, or the scheme is not
    //          http/https (in that last case $this->error is set).
    //          The page content is stored by the request helpers.

    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS))
        $URI_PARTS = array();   // malformed URL: reported below as invalid protocol

    if (!empty($URI_PARTS["user"]))
        $this->user = $URI_PARTS["user"];
    if (!empty($URI_PARTS["pass"]))
        $this->pass = $URI_PARTS["pass"];

    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    if ($scheme !== "http" && $scheme !== "https")
    {
        // not a valid protocol (double quotes so the newline is a real newline)
        $this->error = 'Invalid protocol "'.$scheme.'"'."\n";
        return false;
    }

    $is_https = ($scheme === "https");

    // https is delegated to an external cURL binary; give up early without it
    if ($is_https && (!$this->curl_path || !is_executable($this->curl_path)))
        return false;

    $this->host = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
    if (!empty($URI_PARTS["port"]))
        $this->port = $URI_PARTS["port"];

    if ($this->_isproxy)
    {
        // using proxy, send entire URI
        $target = $URI;
    }
    else
    {
        // no proxy, send only the path (plus query string when present)
        $target = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
        if (isset($URI_PARTS["query"]) && $URI_PARTS["query"] !== "")
            $target .= "?".$URI_PARTS["query"];
    }

    if ($is_https)
    {
        $this->_httpsrequest($target, $URI, $this->_httpmethod);
    }
    else
    {
        if (!$this->_connect($fp))
            return false;
        $this->_httprequest($target, $fp, $URI, $this->_httpmethod);
        $this->_disconnect($fp);
    }

    /* url was redirected, check if we've hit the max depth */
    if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
    {
        // same site = same host over http or https; the host must end at a
        // URL delimiter so "host.evil.com" does not pass as "host"
        $same_site = preg_match("|^https?://".preg_quote($this->host).'(?:[:/?#]|$)|i', $this->_redirectaddr);

        // only follow redirect if it's on this site, or offsiteok is true
        if ($same_site || $this->offsiteok)
        {
            $this->_redirectdepth++;
            $this->lastredirectaddr = $this->_redirectaddr;
            $this->fetch($this->_redirectaddr);
        }
    }

    if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
    {
        // take a private copy: the nested fetch() calls refill $this->_frameurls
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        foreach ($frameurls as $frameurl)
        {
            if ($this->_framedepth >= $this->maxframes)
                break;
            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

    return true;
}
253 | ||
254 | /*======================================================================*\ | |
255 | Function: submit | |
256 | Purpose: submit an http form | |
257 | Input: $URI the location to post the data | |
258 | $formvars the formvars to use. | |
259 | format: $formvars["var"] = "val"; | |
260 | Output: $this->results the text output from the post | |
261 | \*======================================================================*/ | |
262 | ||
function submit($URI, $formvars="", $formfiles="")
{
    // Submit a form to $URI over http (raw socket) or https (external cURL
    // binary), re-submitting to redirect targets and fetching frames within
    // $this->maxredirs and $this->maxframes.
    // Input:   $URI        absolute URL to post to
    //          $formvars   form variables, $formvars["var"] = "val"
    //          $formfiles  files to attach (handed to _prepare_post_body)
    // Output:  true when a request was issued; false when the connection
    //          failed, the cURL binary is unusable, or the scheme is not
    //          http/https (in that last case $this->error is set).

    $postdata = $this->_prepare_post_body($formvars, $formfiles);

    $URI_PARTS = parse_url($URI);
    if (!is_array($URI_PARTS))
        $URI_PARTS = array();   // malformed URL: reported below as invalid protocol

    if (!empty($URI_PARTS["user"]))
        $this->user = $URI_PARTS["user"];
    if (!empty($URI_PARTS["pass"]))
        $this->pass = $URI_PARTS["pass"];

    $scheme = isset($URI_PARTS["scheme"]) ? $URI_PARTS["scheme"] : "";
    if ($scheme !== "http" && $scheme !== "https")
    {
        // not a valid protocol (double quotes so the newline is a real newline)
        $this->error = 'Invalid protocol "'.$scheme.'"'."\n";
        return false;
    }

    $is_https = ($scheme === "https");

    // https is delegated to an external cURL binary; give up early without it
    if ($is_https && (!$this->curl_path || !is_executable($this->curl_path)))
        return false;

    $host = isset($URI_PARTS["host"]) ? $URI_PARTS["host"] : "";
    $this->host = $host;
    if (!empty($URI_PARTS["port"]))
        $this->port = $URI_PARTS["port"];

    if ($this->_isproxy)
    {
        // using proxy, send entire URI
        $target = $URI;
    }
    else
    {
        // no proxy, send only the path (plus query string when present)
        $target = isset($URI_PARTS["path"]) ? $URI_PARTS["path"] : "";
        if (isset($URI_PARTS["query"]) && $URI_PARTS["query"] !== "")
            $target .= "?".$URI_PARTS["query"];
    }

    if ($is_https)
    {
        $this->_httpsrequest($target, $URI, $this->_submit_method, $this->_submit_type, $postdata);
    }
    else
    {
        if (!$this->_connect($fp))
            return false;
        $this->_httprequest($target, $fp, $URI, $this->_submit_method, $this->_submit_type, $postdata);
        $this->_disconnect($fp);
    }

    /* url was redirected, check if we've hit the max depth */
    if ($this->_redirectaddr && $this->maxredirs > $this->_redirectdepth)
    {
        // a redirect target without this request's scheme is expanded against the host
        if (!preg_match("|^".$scheme."://|", $this->_redirectaddr))
            $this->_redirectaddr = $this->_expandlinks($this->_redirectaddr, $scheme."://".$host);

        // same site = same host over http or https; the host must end at a
        // URL delimiter so "host.evil.com" does not pass as "host"
        $same_site = preg_match("|^https?://".preg_quote($this->host).'(?:[:/?#]|$)|i', $this->_redirectaddr);

        // only follow redirect if it's on this site, or offsiteok is true
        if ($same_site || $this->offsiteok)
        {
            $this->_redirectdepth++;
            $this->lastredirectaddr = $this->_redirectaddr;
            $this->submit($this->_redirectaddr, $formvars, $formfiles);
        }
    }

    if ($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
    {
        // take a private copy: the nested fetch() calls refill $this->_frameurls
        $frameurls = $this->_frameurls;
        $this->_frameurls = array();

        foreach ($frameurls as $frameurl)
        {
            if ($this->_framedepth >= $this->maxframes)
                break;
            $this->fetch($frameurl);
            $this->_framedepth++;
        }
    }

    return true;
}
404 | ||
405 | /*======================================================================*\ | |
406 | Function: fetchlinks | |
407 | Purpose: fetch the links from a web page | |
408 | Input: $URI where you are fetching from | |
409 | Output: $this->results an array of the URLs | |
410 | \*======================================================================*/ | |
411 | ||
function fetchlinks($URI)
{
    // Fetch $URI and replace $this->results with the links found in it.
    // Input:   $URI   where you are fetching from
    // Output:  true on success, false when fetch() fails.
    //          $this->results is a list of URLs, or - when framed content
    //          was fetched - one list of URLs per fetched document.

    if (!$this->fetch($URI))
        return false;

    if (is_array($this->results))
    {
        // framed content: strip and expand each document separately, so
        // _expandlinks() always receives a flat list of links
        for ($x = 0; $x < count($this->results); $x++)
        {
            $this->results[$x] = $this->_striplinks($this->results[$x]);
            if ($this->expandlinks)
                $this->results[$x] = $this->_expandlinks($this->results[$x], $URI);
        }
    }
    else
    {
        $this->results = $this->_striplinks($this->results);
        if ($this->expandlinks)
            $this->results = $this->_expandlinks($this->results, $URI);
    }

    return true;
}
432 | ||
433 | /*======================================================================*\ | |
434 | Function: fetchform | |
435 | Purpose: fetch the form elements from a web page | |
436 | Input: $URI where you are fetching from | |
437 | Output: $this->results the resulting html form | |
438 | \*======================================================================*/ | |
439 | ||
function fetchform($URI)
{
    // Fetch $URI and reduce $this->results to its form markup.
    // Input:   $URI   where you are fetching from
    // Output:  true on success, false when fetch() fails.
    //          $this->results holds the form elements (one entry per
    //          fetched document when framed content was collected).

    if (!$this->fetch($URI))
        return false;

    // single document: results is a plain string
    if (!is_array($this->results))
    {
        $this->results = $this->_stripform($this->results);
        return true;
    }

    // framed content: process every collected document in place
    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_stripform($this->results[$idx]);
        $idx++;
    }

    return true;
}
459 | ||
460 | ||
461 | /*======================================================================*\ | |
462 | Function: fetchtext | |
463 | Purpose: fetch the text from a web page, stripping the links | |
464 | Input: $URI where you are fetching from | |
465 | Output: $this->results the text from the web page | |
466 | \*======================================================================*/ | |
467 | ||
function fetchtext($URI)
{
    // Fetch $URI and reduce $this->results to plain text.
    // Input:   $URI   where you are fetching from
    // Output:  true on success, false when fetch() fails.
    //          $this->results holds the text (one entry per fetched
    //          document when framed content was collected).

    if (!$this->fetch($URI))
        return false;

    // single document: results is a plain string
    if (!is_array($this->results))
    {
        $this->results = $this->_striptext($this->results);
        return true;
    }

    // framed content: process every collected document in place
    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_striptext($this->results[$idx]);
        $idx++;
    }

    return true;
}
484 | ||
485 | /*======================================================================*\ | |
486 | Function: submitlinks | |
487 | Purpose: grab links from a form submission | |
488 | Input: $URI where you are submitting from | |
489 | Output: $this->results an array of the links from the post | |
490 | \*======================================================================*/ | |
491 | ||
function submitlinks($URI, $formvars="", $formfiles="")
{
    // Submit a form to $URI and replace $this->results with the links
    // found in the response.
    // Input:   $URI        where you are submitting to
    //          $formvars   form variables passed on to submit()
    //          $formfiles  form files passed on to submit()
    // Output:  true on success, false when submit() fails.

    if (!$this->submit($URI, $formvars, $formfiles))
        return false;

    // single document: results is a plain string
    if (!is_array($this->results))
    {
        $this->results = $this->_striplinks($this->results);
        if ($this->expandlinks)
            $this->results = $this->_expandlinks($this->results, $URI);
        return true;
    }

    // framed content: strip (and optionally expand) each document in place
    $idx = 0;
    while ($idx < count($this->results))
    {
        $this->results[$idx] = $this->_striplinks($this->results[$idx]);
        if ($this->expandlinks)
            $this->results[$idx] = $this->_expandlinks($this->results[$idx], $URI);
        $idx++;
    }

    return true;
}
516 | ||
517 | /*======================================================================*\ | |
518 | Function: submittext | |
519 | Purpose: grab text from a form submission | |
520 | Input: $URI where you are submitting from | |
521 | Output: $this->results the text from the web page | |
522 | \*======================================================================*/ | |
523 | ||
function submittext($URI, $formvars = "", $formfiles = "")
{
    // Submit a form to $URI and reduce $this->results to plain text.
    // Input:   $URI        where you are submitting to
    //          $formvars   form variables passed on to submit()
    //          $formfiles  form files passed on to submit()
    // Output:  true on success, false when submit() fails.
    //          $this->results holds the text (one entry per fetched
    //          document when framed content was collected).
    //
    // The text is deliberately NOT run through _expandlinks(): that helper
    // rewrites the start of each string as if it were a link, which would
    // glue the base URL in front of the page text. This matches fetchtext().

    if (!$this->submit($URI, $formvars, $formfiles))
        return false;

    if (is_array($this->results))
    {
        for ($x = 0; $x < count($this->results); $x++)
            $this->results[$x] = $this->_striptext($this->results[$x]);
    }
    else
        $this->results = $this->_striptext($this->results);

    return true;
}
548 | ||
549 | ||
550 | ||
551 | /*======================================================================*\ | |
552 | Function: set_submit_multipart | |
553 | Purpose: Set the form submission content type to | |
554 | multipart/form-data | |
555 | \*======================================================================*/ | |
function set_submit_multipart()
{
    // Later submissions are sent as multipart/form-data.
    $submit_type = "multipart/form-data";
    $this->_submit_type = $submit_type;
}
560 | ||
561 | ||
562 | /*======================================================================*\ | |
563 | Function: set_submit_normal | |
564 | Purpose: Set the form submission content type to | |
565 | application/x-www-form-urlencoded | |
566 | \*======================================================================*/ | |
function set_submit_normal()
{
    // Later submissions are sent as application/x-www-form-urlencoded.
    $submit_type = "application/x-www-form-urlencoded";
    $this->_submit_type = $submit_type;
}
571 | ||
572 | ||
573 | ||
574 | ||
575 | /*======================================================================*\ | |
576 | Private functions | |
577 | \*======================================================================*/ | |
578 | ||
579 | ||
580 | /*======================================================================*\ | |
581 | Function: _striplinks | |
582 | Purpose: strip the hyperlinks from an html document | |
583 | Input: $document document to strip. | |
584 | Output: $match an array of the links | |
585 | \*======================================================================*/ | |
586 | ||
function _striplinks($document)
{
    // Collect the href targets of all <a> tags in an html document.
    // Input:   $document   document to strip
    // Output:  list of link targets (empty array when there are none);
    //          quoted targets come first, then unquoted ones.

    // The attribute scan is lazy and confined to one tag ([^>]*?). A greedy
    // .* with the s flag would run from the first <a to the last href in the
    // document and report only that last link.
    preg_match_all("'<\s*a\s+[^>]*?href\s*=\s*       # find <a ... href= within one tag
                    ([\"\'])?                        # find single or double quote
                    (?(1) (.*?)\\1 | ([^\s\>]+))     # if quote found, match up to next matching
                                                     # quote, otherwise match up to next space
                    'isx",$document,$links);

    // catenate the non-empty matches from the conditional subpattern
    $match = array();

    foreach ($links[2] as $val)
    {
        if (!empty($val))
            $match[] = $val;
    }

    foreach ($links[3] as $val)
    {
        if (!empty($val))
            $match[] = $val;
    }

    // return the links
    return $match;
}
613 | ||
614 | /*======================================================================*\ | |
615 | Function: _stripform | |
616 | Purpose: strip the form elements from an html document | |
617 | Input: $document document to strip. | |
618 | Output: $match an array of the links | |
619 | \*======================================================================*/ | |
620 | ||
function _stripform($document)
{
    // Extract the form-related tags from an html document.
    // Input:   $document   document to strip
    // Output:  the matched elements joined with CRLF
    //          (empty string when nothing matches).

    $pattern = "'<\/?(FORM|INPUT|SELECT|TEXTAREA|(OPTION))[^<>]*>(?(2)(.*(?=<\/?(option|select)[^<>]*>[\r\n]*)|(?=[\r\n]*))|(?=[\r\n]*))'Usi";

    $found = array();
    preg_match_all($pattern, $document, $found);

    // index 0 holds the full text of every match
    $joined = implode("\r\n", $found[0]);

    return $joined;
}
631 | ||
632 | ||
633 | ||
634 | /*======================================================================*\ | |
635 | Function: _striptext | |
636 | Purpose: strip the text from an html document | |
637 | Input: $document document to strip. | |
638 | Output: $text the resulting text | |
639 | \*======================================================================*/ | |
640 | ||
function _striptext($document)
{
    // Reduce an html document to plain text: drop scripts and tags,
    // collapse whitespace after line breaks, decode a few common entities.
    // Input:   $document   document to strip
    // Output:  the resulting text

    // I didn't use preg eval (//e) since that is only available in PHP 4.0.
    // so, list your entities one by one here. I included some of the
    // more common ones.
    //
    // &amp; is decoded LAST: decoding it first would turn "&amp;lt;" into
    // "&lt;" and then into "<", i.e. decode the text twice.

    $search = array("'<script[^>]*?>.*?</script>'si",   // strip out javascript
                    "'<[\/\!]*?[^<>]*?>'si",            // strip out html tags
                    "'([\r\n])[\s]+'",                  // strip out white space
                    "'&(quot|quote|#34);'i",            // replace html entities
                                                        // (the entity is &quot;; the
                                                        // old "quote" spelling is kept)
                    "'&(lt|#60);'i",
                    "'&(gt|#62);'i",
                    "'&(nbsp|#160);'i",
                    "'&(iexcl|#161);'i",
                    "'&(cent|#162);'i",
                    "'&(pound|#163);'i",
                    "'&(copy|#169);'i",
                    "'&(amp|#38);'i"
                    );
    $replace = array(   "",
                        "",
                        "\\1",
                        "\"",
                        "<",
                        ">",
                        " ",
                        chr(161),
                        chr(162),
                        chr(163),
                        chr(169),
                        "&");

    $text = preg_replace($search,$replace,$document);

    return $text;
}
678 | ||
679 | /*======================================================================*\ | |
680 | Function: _expandlinks | |
681 | Purpose: expand each link into a fully qualified URL | |
682 | Input: $links the links to qualify | |
683 | $URI the full URI to get the base from | |
684 | Output: $expandedLinks the expanded links | |
685 | \*======================================================================*/ | |
686 | ||
function _expandlinks($links,$URI)
{
    // Expand each link into a fully qualified URL.
    // Input:   $links   a link or an array of links to qualify
    //          $URI     the full URI to take the base from
    // Output:  the expanded link(s), same shape as $links

    // base = $URI without its query string ("" when $URI is empty)
    $base = preg_match("/^[^\?]+/",$URI,$match) ? $match[0] : "";

    // Set scheme://authority aside before removing a trailing file name,
    // otherwise a two-label host is taken for "name.ext" and
    // "http://example.com" collapses to "http:/".
    $root = "";
    if (preg_match("|^[a-z][a-z0-9+.\-]*://[^/]*|i",$base,$authority))
    {
        $root = $authority[0];
        $base = (string)substr($base, strlen($root));
    }
    $base = $root.preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$base);

    $search = array(    "|^http://".preg_quote($this->host)."|i",   // drop our own host prefix
                        "|^(?!https?://)(\/)?(?!mailto:)|i",        // relative link: prepend the base
                                                                    // (http, https and mailto links are left alone)
                        "|/\./|",                                   // collapse "/./"
                        "|/[^\/]+/\.\./|"                           // collapse "/dir/../"
                    );

    $replace = array(   "",
                        $base."/",
                        "/",
                        "/"
                    );

    $expandedLinks = preg_replace($search,$replace,$links);

    return $expandedLinks;
}
710 | ||
711 | /*======================================================================*\ | |
712 | Function: _httprequest | |
713 | Purpose: go get the http data from the server | |
714 | Input: $url the url to fetch | |
715 | $fp the current open file pointer | |
716 | $URI the full URI | |
717 | $body body contents to send if any (POST) | |
718 | Output: | |
719 | \*======================================================================*/ | |
720 | ||
function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
{
    // Send one http request over the open socket $fp and read the response.
    // Input:   $url            request target (path, or full URI via a proxy)
    //          $fp             the current open file pointer
    //          $URI            the full URI
    //          $http_method    request method, e.g. GET or POST
    //          $content_type   Content-type header value, if any
    //          $body           body contents to send if any (POST)
    // Output:  true on success; false when a read timed out (status -100).
    //          Sets $this->headers, $this->status, $this->response_code,
    //          $this->_redirectaddr, $this->_frameurls and $this->results.

    if($this->passcookies && $this->_redirectaddr)
        $this->setcookies();

    $URI_PARTS = parse_url($URI);
    $scheme = (is_array($URI_PARTS) && isset($URI_PARTS["scheme"])) ? $URI_PARTS["scheme"] : "";

    if(empty($url))
        $url = "/";
    $headers = $http_method." ".$url." ".$this->_httpversion."\r\n";
    if(!empty($this->agent))
        $headers .= "User-Agent: ".$this->agent."\r\n";
    if(!empty($this->host) && !isset($this->rawheaders['Host']))
        $headers .= "Host: ".$this->host."\r\n";
    if(!empty($this->accept))
        $headers .= "Accept: ".$this->accept."\r\n";
    if(!empty($this->referer))
        $headers .= "Referer: ".$this->referer."\r\n";
    if(!empty($this->cookies))
    {
        if(!is_array($this->cookies))
            $this->cookies = (array)$this->cookies;

        // one Cookie header, pairs separated by "; "
        $cookie_pairs = array();
        foreach ( $this->cookies as $cookieKey => $cookieVal )
            $cookie_pairs[] = $cookieKey."=".urlencode($cookieVal);
        $headers .= "Cookie: ".implode("; ", $cookie_pairs)."\r\n";
    }
    if(!empty($this->rawheaders))
    {
        if(!is_array($this->rawheaders))
            $this->rawheaders = (array)$this->rawheaders;
        // foreach always starts at the first element, so the raw headers are
        // sent again on every follow-up request (redirects, frames)
        foreach ( $this->rawheaders as $headerKey => $headerVal )
            $headers .= $headerKey.": ".$headerVal."\r\n";
    }
    if(!empty($content_type)) {
        $headers .= "Content-type: $content_type";
        if ($content_type == "multipart/form-data")
            $headers .= "; boundary=".$this->_mime_boundary;
        $headers .= "\r\n";
    }
    if(!empty($body))
        $headers .= "Content-length: ".strlen($body)."\r\n";
    if(!empty($this->user) || !empty($this->pass))
        $headers .= "Authorization: BASIC ".base64_encode($this->user.":".$this->pass)."\r\n";

    $headers .= "\r\n";

    // set the read timeout if needed
    if ($this->read_timeout > 0)
        socket_set_timeout($fp, $this->read_timeout);
    $this->timed_out = false;

    fwrite($fp,$headers.$body,strlen($headers.$body));

    $this->_redirectaddr = false;
    $this->headers = array();

    while($currentHeader = fgets($fp,$this->_maxlinelen))
    {
        if ($this->read_timeout > 0 && $this->_check_timeout($fp))
        {
            $this->status=-100;
            return false;
        }

        // a blank line ends the header section
        if($currentHeader == "\r\n" || $currentHeader == "\n")
            break;

        // if a header begins with Location: or URI:, set the redirect
        // (header names are case-insensitive; skip an empty value)
        if(preg_match("/^(Location:|URI:)\s*(.*)/i",chop($currentHeader),$matches) && $matches[2] != "")
        {
            // look for :// in the Location header to see if hostname is included
            if(!preg_match("|\:\/\/|",$matches[2]))
            {
                // no host in the path, so prepend
                $this->_redirectaddr = $scheme."://".$this->host.":".$this->port;
                // eliminate double slash
                if(!preg_match("|^/|",$matches[2]))
                    $this->_redirectaddr .= "/".$matches[2];
                else
                    $this->_redirectaddr .= $matches[2];
            }
            else
                $this->_redirectaddr = $matches[2];
        }

        if(preg_match("|^HTTP/|",$currentHeader))
        {
            if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$currentHeader, $status))
            {
                $this->status= $status[1];
            }
            $this->response_code = $currentHeader;
        }

        $this->headers[] = $currentHeader;
    }

    // Read the body in a loop: one fread() on a socket returns at most one
    // chunk, so a single call would truncate most pages. Stop at EOF, at
    // $this->maxlength bytes, or when a read comes back empty (timeout).
    $results = "";
    while(!feof($fp) && strlen($results) < $this->maxlength)
    {
        $chunk = fread($fp, $this->maxlength - strlen($results));
        if($chunk === false || $chunk === "")
            break;
        $results .= $chunk;
    }

    if ($this->read_timeout > 0 && $this->_check_timeout($fp))
    {
        $this->status=-100;
        return false;
    }

    // check if there is a redirect meta tag

    if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
    {
        $this->_redirectaddr = $this->_expandlinks($match[1],$URI);
    }

    // have we hit our frame depth and is there frame src to fetch?
    if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame\s+.*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
    {
        // results may still be a string here; [] on a string is a fatal error
        if(!is_array($this->results))
            $this->results = array();
        $this->results[] = $results;
        for($x=0; $x<count($match[1]); $x++)
            $this->_frameurls[] = $this->_expandlinks($match[1][$x],$scheme."://".$this->host);
    }
    // have we already fetched framed content?
    elseif(is_array($this->results))
        $this->results[] = $results;
    // no framed content
    else
        $this->results = $results;

    return true;
}
856 | ||
857 | /*======================================================================*\ | |
858 | Function: _httpsrequest | |
859 | Purpose: go get the https data from the server using curl | |
860 | Input: $url the url to fetch | |
861 | $URI the full URI | |
862 | $body body contents to send if any (POST) | |
863 | Output: | |
864 | \*======================================================================*/ | |
865 | ||
/**
 * Fetch an https document by shelling out to the cURL binary.
 *
 * Builds the request headers from the object's settings, runs cURL with
 * the response headers dumped to a temporary file, then parses those
 * headers (redirects, status line) and the body (meta refresh, frames).
 *
 * @param string $url          path portion of the request; unused here
 *                             because cURL derives it from $URI
 * @param string $URI          the full URI to fetch
 * @param string $http_method  request method; unused here (cURL picks
 *                             GET, or POST when a body is supplied)
 * @param string $content_type value for the Content-type header, if any
 * @param string $body         body contents to send, if any (POST)
 * @return bool  true on success; false when cURL fails or no temporary
 *               file can be created ($this->error holds the message)
 */
function _httpsrequest($url,$URI,$http_method,$content_type="",$body="")
{
	if($this->passcookies && $this->_redirectaddr)
		$this->setcookies();

	$headers = array();
	$URI_PARTS = parse_url($URI);

	// the "GET <url> HTTP/x" request line is not needed: cURL writes it
	if(!empty($this->agent))
		$headers[] = "User-Agent: ".$this->agent;
	if(!empty($this->host))
		$headers[] = "Host: ".$this->host;
	if(!empty($this->accept))
		$headers[] = "Accept: ".$this->accept;
	if(!empty($this->referer))
		$headers[] = "Referer: ".$this->referer;
	if(!empty($this->cookies))
	{
		if(!is_array($this->cookies))
			$this->cookies = (array)$this->cookies;

		if(count($this->cookies) > 0)
		{
			$cookie_pairs = array();
			foreach($this->cookies as $cookieKey => $cookieVal)
				$cookie_pairs[] = $cookieKey."=".urlencode($cookieVal);
			$headers[] = "Cookie: ".implode("; ",$cookie_pairs);
		}
	}
	if(!empty($this->rawheaders))
	{
		if(!is_array($this->rawheaders))
			$this->rawheaders = (array)$this->rawheaders;
		// foreach instead of each(): each() was removed in PHP 8
		foreach($this->rawheaders as $headerKey => $headerVal)
			$headers[] = $headerKey.": ".$headerVal;
	}
	if(!empty($content_type))
	{
		if($content_type == "multipart/form-data")
			$headers[] = "Content-type: $content_type; boundary=".$this->_mime_boundary;
		else
			$headers[] = "Content-type: $content_type";
	}
	if(!empty($body))
		$headers[] = "Content-length: ".strlen($body);
	if(!empty($this->user) || !empty($this->pass))
		$headers[] = "Authorization: BASIC ".base64_encode($this->user.":".$this->pass);

	// Every value that reaches the shell is quoted with escapeshellarg();
	// headers, body and URI may all carry attacker-influenced text
	// (redirect targets, cookies, form data).
	$cmdline_params = "";
	foreach($headers as $header_line)
		$cmdline_params .= " -H ".escapeshellarg($header_line);

	if(!empty($body))
		$cmdline_params .= " -d ".escapeshellarg($body);

	if($this->read_timeout > 0)
		$cmdline_params .= " -m ".escapeshellarg((string)$this->read_timeout);

	// tempnam() creates the file itself, so the name cannot be guessed or
	// pre-created by another local user the way a uniqid() name could.
	$headerfile = tempnam(sys_get_temp_dir(),"snoopy");
	if($headerfile === false)
	{
		$this->error = "Error: could not create a temporary file for the cURL headers.";
		return false;
	}

	// curl_path is configuration, not request data, and is used as given
	exec($this->curl_path." -D ".escapeshellarg($headerfile).$cmdline_params." ".escapeshellarg($URI),$results,$return);

	if($return)
	{
		// do not leave the header dump behind on failure
		if(is_file($headerfile))
			unlink($headerfile);
		$this->error = "Error: cURL could not retrieve the document, error $return.";
		return false;
	}

	$results = implode("\r\n",$results);

	$result_headers = file($headerfile);
	unlink($headerfile);
	if(!is_array($result_headers))
		$result_headers = array();

	$this->_redirectaddr = false;
	$this->headers = array();

	foreach($result_headers as $header_line)
	{
		// a Location: or URI: header (any letter case) sets the redirect
		if(preg_match("/^(?:Location|URI):\s*(.*)$/i",chop($header_line),$matches))
		{
			$target = $matches[1];
			// look for :// to see whether the hostname is included
			if(!preg_match("|\:\/\/|",$target))
			{
				// no host in the path, so prepend
				$this->_redirectaddr = $URI_PARTS["scheme"]."://".$this->host.":".$this->port;
				// eliminate double slash
				if(!preg_match("|^/|",$target))
					$this->_redirectaddr .= "/".$target;
				else
					$this->_redirectaddr .= $target;
			}
			else
				$this->_redirectaddr = $target;
		}

		if(preg_match("|^HTTP/|",$header_line))
			$this->response_code = $header_line;

		$this->headers[] = $header_line;
	}

	// check if there is a redirect meta tag
	if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
	{
		$this->_redirectaddr = $this->_expandlinks($match[1],$URI);
	}

	// have we hit our frame depth and is there frame src to fetch?
	if(($this->_framedepth < $this->maxframes) && preg_match_all("'<frame\s+.*src[\s]*=[\'\"]?([^\'\"\>]+)'i",$results,$match))
	{
		$this->results[] = $results;
		foreach($match[1] as $frame_src)
			$this->_frameurls[] = $this->_expandlinks($frame_src,$URI_PARTS["scheme"]."://".$this->host);
	}
	// have we already fetched framed content?
	elseif(is_array($this->results))
		$this->results[] = $results;
	// no framed content
	else
		$this->results = $results;

	return true;
}
999 | ||
1000 | /*======================================================================*\ | |
1001 | Function: setcookies() | |
1002 | Purpose: set cookies for a redirection | |
1003 | \*======================================================================*/ | |
1004 | ||
/**
 * Copy cookies from the last response's Set-Cookie headers into
 * $this->cookies so they are sent along with a redirected request.
 *
 * Does nothing when no response headers have been stored yet.
 */
function setcookies()
{
	// headers may be unset or not yet an array; count()/foreach on that
	// is an error in PHP 8
	if(!is_array($this->headers))
		return;

	if(!is_array($this->cookies))
		$this->cookies = (array)$this->cookies;

	foreach($this->headers as $header_line)
	{
		// chop() drops the trailing CR/LF so it cannot end up in the value
		// when the cookie has no ";attribute" part; whitespace after the
		// colon is optional in HTTP
		if(preg_match("/^set-cookie:[\s]*([^=]+)=([^;]+)/i",chop($header_line),$match))
		{
			// stored decoded: the request code urlencode()s cookie values
			// when sending, so keeping them encoded would double-encode
			$this->cookies[$match[1]] = urldecode($match[2]);
		}
	}
}
1013 | ||
1014 | ||
1015 | /*======================================================================*\ | |
1016 | Function: _check_timeout | |
1017 | Purpose: checks whether timeout has occurred | |
1018 | Input: $fp file pointer | |
1019 | \*======================================================================*/ | |
1020 | ||
/**
 * Report whether the stream has hit its read timeout.
 *
 * Only consults the stream when a read timeout is configured; on a
 * timeout it also records the fact in $this->timed_out.
 *
 * @param resource $fp file pointer to inspect
 * @return bool true if the stream timed out, false otherwise
 */
function _check_timeout($fp)
{
	// no read timeout configured: nothing to check
	if(!($this->read_timeout > 0))
		return false;

	$stream_info = socket_get_status($fp);
	if(!$stream_info["timed_out"])
		return false;

	$this->timed_out = true;
	return true;
}
1032 | ||
1033 | /*======================================================================*\ | |
1034 | Function: _connect | |
1035 | Purpose: make a socket connection | |
1036 | Input: $fp file pointer | |
1037 | \*======================================================================*/ | |
1038 | ||
/**
 * Open a socket connection to the target host, or to the proxy when
 * both proxy_host and proxy_port are set.
 *
 * @param resource $fp receives the opened file pointer (by reference)
 * @return bool true when connected; false on failure, with
 *              $this->status set to the fsockopen error number and
 *              $this->error to a description
 */
function _connect(&$fp)
{
	if(!empty($this->proxy_host) && !empty($this->proxy_port))
	{
		$this->_isproxy = true;
		$host = $this->proxy_host;
		$port = $this->proxy_port;
	}
	else
	{
		$host = $this->host;
		$port = $this->port;
	}

	$this->status = 0;

	$fp = fsockopen($host,$port,$errno,$errstr,$this->_fp_timeout);
	if($fp)
	{
		// socket connection succeeded
		return true;
	}

	// socket connection failed
	$this->status = $errno;
	switch($errno)
	{
		// each case needs its break: without it control falls through and
		// the default message overwrites the specific one
		case -3:
			$this->error="socket creation failed (-3)";
			break;
		case -4:
			$this->error="dns lookup failure (-4)";
			break;
		case -5:
			$this->error="connection refused or timed out (-5)";
			break;
		default:
			$this->error="connection failed (".$errno.")";
	}
	return false;
}
1085 | /*======================================================================*\ | |
1086 | Function: _disconnect | |
1087 | Purpose: disconnect a socket connection | |
1088 | Input: $fp file pointer | |
1089 | \*======================================================================*/ | |
1090 | ||
/**
 * Close a socket connection.
 *
 * @param resource $fp file pointer to close
 * @return bool the result of fclose()
 */
function _disconnect($fp)
{
	$closed = fclose($fp);
	return $closed;
}
1095 | ||
1096 | ||
1097 | /*======================================================================*\ | |
1098 | Function: _prepare_post_body | |
1099 | Purpose: Prepare post body according to encoding type | |
1100 | Input: $formvars - form variables | |
1101 | $formfiles - form upload files | |
1102 | Output: post body | |
1103 | \*======================================================================*/ | |
1104 | ||
/**
 * Build the POST body for the current submit type ($this->_submit_type).
 *
 * For "multipart/form-data" a fresh boundary is generated and stored in
 * $this->_mime_boundary; unreadable upload files are skipped.
 *
 * @param mixed $formvars  form variables (name => value, or name => list
 *                         of values, which are sent as name[])
 * @param mixed $formfiles upload files (field name => path or list of paths)
 * @return string|null the encoded body; null when there is nothing to send
 */
function _prepare_post_body($formvars, $formfiles)
{
	settype($formvars, "array");
	settype($formfiles, "array");

	if (count($formvars) == 0 && count($formfiles) == 0)
		return;

	// start from an empty string so the .= below never touches an
	// undefined variable
	$postdata = "";

	switch ($this->_submit_type) {
		case "application/x-www-form-urlencoded":
			// foreach instead of each(): each() was removed in PHP 8
			foreach ($formvars as $key => $val) {
				if (is_array($val) || is_object($val)) {
					foreach ($val as $cur_val) {
						$postdata .= urlencode($key)."[]=".urlencode($cur_val)."&";
					}
				} else
					$postdata .= urlencode($key)."=".urlencode($val)."&";
			}
			break;

		case "multipart/form-data":
			$this->_mime_boundary = "Snoopy".md5(uniqid(microtime()));

			foreach ($formvars as $key => $val) {
				if (is_array($val) || is_object($val)) {
					foreach ($val as $cur_val) {
						$postdata .= "--".$this->_mime_boundary."\r\n";
						// {$key}[] yields name="key[]"; writing \[\] in a
						// double-quoted string would emit the backslashes too
						$postdata .= "Content-Disposition: form-data; name=\"{$key}[]\"\r\n\r\n";
						$postdata .= "$cur_val\r\n";
					}
				} else {
					$postdata .= "--".$this->_mime_boundary."\r\n";
					$postdata .= "Content-Disposition: form-data; name=\"$key\"\r\n\r\n";
					$postdata .= "$val\r\n";
				}
			}

			foreach ($formfiles as $field_name => $file_names) {
				settype($file_names, "array");
				foreach ($file_names as $file_name) {
					if (!is_readable($file_name)) continue;

					// binary-safe, and copes with empty files (a zero-length
					// fread() is an error in PHP 8)
					$file_content = file_get_contents($file_name);
					if ($file_content === false) continue;
					$base_name = basename($file_name);

					$postdata .= "--".$this->_mime_boundary."\r\n";
					$postdata .= "Content-Disposition: form-data; name=\"$field_name\"; filename=\"$base_name\"\r\n\r\n";
					$postdata .= "$file_content\r\n";
				}
			}
			$postdata .= "--".$this->_mime_boundary."--\r\n";
			break;
	}

	return $postdata;
}
1166 | } | |
1167 | ||
1168 | ?> |