Subversion-Projekte lars-tiefland.php_share

Revision

Details | Letzte Änderung | Log anzeigen | RSS feed

Revision Autor Zeilennr. Zeile
1 lars 1
<?php
2
 
3
require_once(HTML2PS_DIR.'fetcher._interface.class.php');
4
 
5
define('HTTP_OK',200);
6
 
7
/**
8
 * @TODO send authorization headers only if they have been required by the server;
9
 */
10
class FetcherUrl extends Fetcher {
11
  // Set to an empty array by the constructor; not referenced anywhere else
  // in this class.
  var $_connections;

  // Components of the URL currently being processed; filled in by fetch().
  var $protocol;
  var $host;
  var $port;
  // URL path; fetch() appends "?query" to it when the URL has a query string.
  var $path;

  // URL most recently passed to fetch() (redirect targets included).
  var $url;

  // Outcome of the last request: the header part of the raw response
  // (everything before the first empty line), the body and the status code.
  var $headers;
  var $content;
  var $code;

  // Number of redirects followed since the last get_data() call.
  var $redirects;

  // Authorization
  var $user;
  var $pass;

  // The properties below are assigned by the constructor / fetch() but were
  // never declared; creating properties dynamically is deprecated as of
  // PHP 8.2, so they are declared explicitly here.

  // Accumulated human-readable error text, see error_message().
  var $error_message = "";
  // Value of the User-Agent request header; the constructor sets it to
  // DEFAULT_USER_AGENT.
  var $user_agent;
  // Reset to "" by fetch(); the query string itself is stored in $path.
  var $query;
30
 
31
  // ---------------------------------------------
32
  // FetcherURL - PUBLIC methods
33
  // ---------------------------------------------
34
 
35
  // "Fetcher" interface implementation
36
 
37
  /**
   * "Fetcher" interface: reports the URL that was most recently handed to
   * fetch(). After a redirect this is the redirect target, because fetch()
   * stores every URL it processes.
   *
   * @return string
   */
  function get_base_url() {
    $base_url = $this->url;

    return $base_url;
  }
40
 
41
  /**
   * "Fetcher" interface: retrieves the resource identified by $data_id.
   *
   * On failure an HTML fragment rendered from one of the error templates is
   * appended to the fetcher's error message, the problem is written to the
   * error log and null is returned.
   *
   * @param string $data_id URL of the resource to fetch
   * @return FetchedDataURL|null fetched data, or null if it could not be retrieved
   */
  function get_data($data_id) {
    $this->redirects = 0;

    if ($this->fetch($data_id)) {
      // The code is either the integer HTTP_OK or a numeric string taken from
      // the response, hence the loose comparison.
      if ($this->code != HTTP_OK) {
        $this->_append_error_template('templates/error._http.tpl', $data_id, $this->code);
        error_log("Cannot open $data_id, HTTP result code is: ".$this->code);

        return null;
      }

      // Headers may never have been set (e.g. for file:// URLs), so make sure
      // explode() always receives a string.
      return new FetchedDataURL($this->content,
                                explode("\r\n", (string)$this->headers),
                                $this->url);
    }

    if ($this->redirects > MAX_REDIRECTS) {
      $this->_append_error_template('templates/error._redirects.tpl', $data_id);
      error_log(sprintf("Cannot open %s, too many redirects",
                        $data_id));

      return null;
    }

    $this->_append_error_template('templates/error._connection.tpl', $data_id);
    error_log(sprintf("Cannot open %s",
                      $data_id));

    return null;
  }

  /**
   * Renders an error template and appends its output to the error message.
   *
   * The template is included from this method's scope, so it sees the same
   * variables the inline code used to provide: $data_id, $_server_response,
   * $_url and, for HTTP errors only, $_http_error.
   *
   * @access private
   * @param string $template path of the template file to include
   * @param string $data_id URL that could not be fetched
   * @param mixed $http_error HTTP status code, or null when not applicable
   */
  function _append_error_template($template, $data_id, $http_error = null) {
    $_server_response = $this->headers;
    $_url = htmlspecialchars($data_id);
    if (!is_null($http_error)) {
      $_http_error = $http_error;
    }

    ob_start();
    include($template);
    $this->error_message .= ob_get_contents();
    ob_end_clean();
  }
92
 
93
  /**
   * "Fetcher" interface: returns the error text collected so far by this
   * fetcher (connection problems, rendered error templates, ...).
   *
   * @return string
   */
  function error_message() {
    $collected = $this->error_message;

    return $collected;
  }
96
 
97
  // FetcherURL - constructor
98
 
99
  /**
   * Initialises the fetcher state: no connections, empty error message,
   * zero redirects, port 80 and the default user agent.
   *
   * PHP 8 no longer treats a method named after the class as a constructor,
   * so the initialisation lives in __construct().
   */
  function __construct() {
    $this->_connections = array();

    $this->error_message = "";

    $this->redirects = 0;
    $this->port = 80;

    // Default encoding
    //    $this->encoding = "iso-8859-1";

    $this->user_agent = DEFAULT_USER_AGENT;
  }

  /**
   * PHP 4-style constructor, kept so that code calling FetcherURL()
   * explicitly (for instance from a subclass constructor) keeps working.
   */
  function FetcherURL() {
    $this->__construct();
  }
112
 
113
  // ---------------------------------------------
114
  // FetcherURL - PRIVATE methods
115
  // ---------------------------------------------
116
 
117
  /**
   * Connects to the target host using either HTTP or HTTPS protocol;
   * returns handle to connection socket or 'null' in case connection failed.
   *
   * On failure the reason is written to the error log and stored as the
   * fetcher's error message.
   *
   * @access private
   * @final
   * @return resource
   */
  function _connect() {
    // fetch() dispatches on the lower-cased scheme, so the comparison here
    // must be case-insensitive too; otherwise "HTTPS://..." would be
    // contacted over plain HTTP.
    if (strtolower($this->protocol) == "https") {
      return $this->_connect_ssl();
    }

    $errno = 0;
    $errstr = "";
    $fp = @fsockopen($this->host,$this->port,$errno,$errstr,HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    }

    return $fp;
  }
146
 
147
  /**
   * Opens an SSL connection to the current host and port.
   *
   * @access private
   * @return resource connection handle, or null if the openssl extension is
   *                  missing or the connection could not be established
   */
  function _connect_ssl() {
    /**
     * Check if there's SSL support library loaded
     *
     * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box)
     * openssl extension IS present, but fsockopen still complains "No SSL support in this build".
     * (probably PHP bug?)
     */
    if (!extension_loaded('openssl')) {
      $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
                         $this->host,
                         $this->port);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }

    // Use the same configurable timeout as plain HTTP connections; the
    // former hard-coded value (5 seconds) remains the fallback.
    $timeout = defined('HTML2PS_CONNECTION_TIMEOUT') ? HTML2PS_CONNECTION_TIMEOUT : 5;

    $errno = 0;
    $errstr = "";
    $fp = @fsockopen("ssl://$this->host", $this->port, $errno, $errstr, $timeout);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?",
                         $this->host,
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    }

    return $fp;
  }
179
 
180
  /**
   * Extracts the HTTP status code from a raw server response.
   *
   * The code is taken to be the first run of digits that has whitespace on
   * both sides. Responses without such a token do occur in practice; they
   * are reported as "200".
   *
   * @access private
   * @param string $res raw response text
   * @return string status code as a string of digits
   */
  function _extract_code($res) {
    $found = array();

    return preg_match('/\s(\d+)\s/', $res, $found) ? $found[1] : "200";
  }
195
 
196
  /**
   * Turns the value of a "Location" header into an absolute URL, resolving
   * it against the URL that is currently being fetched.
   *
   * @access private
   * @param string $location value of the Location header
   * @return string absolute URL
   */
  function _fix_location($location) {
    $location = (string)$location;

    // Already absolute (the scheme is case-insensitive)
    if (preg_match('#^https?://#i', $location)) {
      return $location;
    }

    // scheme://host[:port] of the current URL. The port is kept only when the
    // current URL names one explicitly, so default ports are never spelled out.
    $origin = $this->protocol."://".$this->host;
    $current = @parse_url($this->url);
    if (is_array($current) && isset($current['port'])) {
      $origin .= ":".$current['port'];
    }

    // No usable value: point back to the current resource
    if ($location === "") {
      return $origin.$this->path;
    }

    // Network-path reference ("//host/path"): only the scheme is inherited
    if (substr($location, 0, 2) == "//") {
      return $this->protocol.":".$location;
    }

    // Absolute path on the same host
    if ($location[0] == "/") {
      return $origin.$location;
    }

    // $this->path may carry a query string (see fetch()); it is not part of
    // the base path a relative reference is resolved against.
    $base_path = $this->path;
    $query_pos = strpos($base_path, '?');
    if ($query_pos !== false) {
      $base_path = substr($base_path, 0, $query_pos);
    }

    // Query-only reference: same path, new query string
    if ($location[0] == "?") {
      return $origin.$base_path.$location;
    }

    // Relative path: replace the last segment of the current path
    $slash_pos = strrpos($base_path, '/');
    $directory = ($slash_pos === false) ? "/" : substr($base_path, 0, $slash_pos + 1);

    return $origin.$directory.$location;
  }
206
 
207
  /**
   * Splits $url into its components, stores them on the fetcher and hands
   * control to the protocol-specific fetch method.
   *
   * @param string $url URL to fetch (http, https or file scheme)
   * @return mixed true on success (a URL that cannot be parsed counts as
   *               success with empty content); otherwise whatever the
   *               protocol-specific method returned, or null for an empty
   *               URL or an unsupported scheme
   */
  function fetch($url) {
    /**
     * Handle empty $url value; unfortunately, parse_url will treat empty value as valid
     * URL, so fetcher will attempt to fetch something from the localhost instead of
     * passing control to subsequent user-defined fetchers (which probably will know
     * how to handle this).
     */
    if ($url === "" || is_null($url)) {
      return null;
    }

    $this->url = $url;

    $parts = @parse_url($this->url);

    /**
     * If a malformed URL has been specified, add a message to the log file and
     * continue processing (as such URLs may be found in otherwise good HTML file -
     * for example, invalid image or CSS reference)
     */
    if ($parts == false) {
      error_log(sprintf("The URL '%s' could not be parsed", $this->url));

      $this->content = '';
      $this->code = HTTP_OK;
      return true;
    }

    /**
     * Setup default values
     */
    $this->protocol = 'http';
    $this->host = 'localhost';
    $this->user = "";
    $this->pass = "";
    $this->port = 80;
    $this->path = "/";
    $this->query = "";

    // URL schemes are case-insensitive. Store the lower-cased form so that
    // the later comparisons against "https" work for "HTTPS://..." as well.
    if (isset($parts['scheme']))   { $this->protocol  = strtolower($parts['scheme']); }
    if (isset($parts['host']))     { $this->host      = $parts['host'];      }
    if (isset($parts['user']))     { $this->user      = $parts['user'];      }
    if (isset($parts['pass']))     { $this->pass      = $parts['pass'];      }
    if (isset($parts['port']))     { $this->port      = $parts['port'];      }
    if (isset($parts['path']))     { $this->path      = $parts['path'];      }
    // The query string travels as part of the request path
    if (isset($parts['query']))    { $this->path     .= '?'.$parts['query']; }

    switch ($this->protocol) {
    case 'http':
      return $this->fetch_http();
    case 'https':
      return $this->fetch_https();
    case 'file':
      $this->host = "";
      return $this->fetch_file();
    default:
      $message = sprintf("Unsupported protocol: %s", $this->protocol);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }
  }
269
 
270
  /**
   * Fetches the current URL over plain HTTP: issues a HEAD request, records
   * the status code of the response and lets _process_code() act on it.
   *
   * @return mixed null when no response could be obtained, otherwise the
   *               result of _process_code()
   */
  function fetch_http() {
    $response = $this->_head();
    if ($response === null) {
      return null;
    }

    $this->code = $this->_extract_code($response);

    return $this->_process_code($response);
  }
278
 
279
  /**
   * Fetches the current URL over HTTPS: issues a HEAD request, records the
   * status code of the response and lets _process_code() act on it.
   *
   * @return mixed null when no response could be obtained, otherwise the
   *               result of _process_code()
   */
  function fetch_https() {
    /**
     * SSL works via port 443, unless the URL names a port explicitly.
     * The parsed URL is not available in this scope, so parse it again;
     * the former check looked at an undefined variable and therefore
     * always forced port 443.
     */
    $parts = @parse_url($this->url);
    if (!is_array($parts) || !isset($parts['port'])) {
      $this->port = 443;
    }

    $res = $this->_head();

    if (is_null($res)) { return null; }
    $this->code = $this->_extract_code($res);

    return $this->_process_code($res);
  }
294
 
295
  /**
   * Reads a local file referenced by a file:// URL.
   *
   * Only files located under FILE_PROTOCOL_RESTRICT may be read. For any
   * other path (or a path that does not resolve to an existing file) the
   * attempt is logged and empty content is delivered instead of an error.
   *
   * @return bool always true
   */
  function fetch_file() {
    // Drop the leading "file://" (7 characters)
    $path = (string)substr($this->url, 7);

    // On Windows "file:///C:/dir/file" leaves "/C:/dir/file" at this point;
    // the leading slash is not part of the path.
    if (PHP_OS == "WINNT" && substr($path, 0, 1) == "/") {
      $path = (string)substr($path, 1);
    }

    $requested_path = urldecode($path);
    // realpath() resolves ".." and symbolic links; false if the file does not exist
    $normalized_path = realpath($requested_path);

    $restrict = (string)FILE_PROTOCOL_RESTRICT;
    $restrict_len = strlen($restrict);

    $allowed = false;
    if ($normalized_path !== false &&
        strncmp($normalized_path, $restrict, $restrict_len) === 0) {
      // A bare string-prefix match is not sufficient: "/var/www" must not
      // grant access to "/var/www-private". The match has to end at a
      // directory boundary, unless no restriction is configured at all.
      $last = ($restrict_len > 0) ? substr($restrict, -1) : "";
      $next = (strlen($normalized_path) > $restrict_len) ? $normalized_path[$restrict_len] : "";

      $allowed =
        $restrict_len == 0 ||
        $last == "/" || $last == "\\" ||
        $next === "" || $next == "/" || $next == "\\";
    }

    if (!$allowed) {
      error_log(sprintf("Access denied to file '%s'",
                        ($normalized_path === false) ? $requested_path : $normalized_path));

      $this->content = "";
      $this->code = HTTP_OK;
      return true;
    }

    $content = @file_get_contents($normalized_path);
    if ($content === false) {
      // Unreadable file: deliver empty content, just like a denied one
      error_log(sprintf("Cannot read file '%s'", $normalized_path));
      $content = "";
    }

    $this->content = $content;
    $this->code = HTTP_OK;

    return true;
  }
318
 
319
  /**
   * Sends a GET request for the current URL and returns the beginning of the
   * raw response. Callers use the result to inspect the status line and the
   * headers only.
   *
   * @access private
   * @return string raw response data, or null if the connection failed
   */
  function _get() {
    $socket = $this->_connect();
    if (is_null($socket)) { return null; }

    // The Host header has to name the port whenever the URL does; otherwise
    // the server assumes the default port of the scheme.
    $host = $this->host;
    $parts = @parse_url($this->url);
    if (is_array($parts) && isset($parts['port'])) {
      $host .= ":".$parts['port'];
    }

    // Build the GET request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
    $header  = "GET ".$this->path." HTTP/1.1\r\n";
    $header .= "Host: ".$host."\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    $header .= "Connection: keep-alive\r\n";
    $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n";
    $header .= $this->_header_basic_authorization();
    $header .= "\r\n";

    // Send the request
    fputs ($socket, $header);

    // Get the response.
    //
    // The PHP-recommended construction
    //    while (!feof($fp)) { $res .= fread($fp, 4096); };
    // hangs indefinitely on www.searchscout.com, for example.
    // It seems that they do not close the connection on their side or something similar;
    // so a single read is done instead. Let's assume that there will be no
    // HTML pages greater than 1 Mb.
    $res = fread($socket, 1024*1024);

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }
351
 
352
  /**
   * Sends a HEAD request for the current URL and returns the beginning of
   * the raw response (at most 4096 bytes). Callers use the result to inspect
   * the status line and the headers.
   *
   * @access private
   * @return string raw response data, or null if the connection failed
   */
  function _head() {
    $socket = $this->_connect();

    if (is_null($socket)) { return null; }

    // The Host header has to name the port whenever the URL does; otherwise
    // the server assumes the default port of the scheme.
    $host = $this->host;
    $parts = @parse_url($this->url);
    if (is_array($parts) && isset($parts['port'])) {
      $host .= ":".$parts['port'];
    }

    // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
    $header  = "HEAD ".$this->path." HTTP/1.1\r\n";
    $header .= "Host: ".$host."\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    $header .= "Connection: keep-alive\r\n";
    $header .= "Accept: text/html\r\n";
    $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n";

    $header .= $this->_header_basic_authorization();

    $header .= "\r\n";

    // Send the header
    fputs ($socket, $header);

    // Get the response.
    //
    // The PHP-recommended construction
    //    while (!feof($fp)) { $res .= fread($fp, 4096); };
    // hangs indefinitely on www.searchscout.com, for example.
    // It seems that they do not close the connection on their side or something similar;
    // so a single read is done instead. A response to HEAD carries headers
    // only, 4096 bytes are expected to be enough for them.
    $res = fread($socket, 4096);

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }
389
 
390
  /**
   * Acts on the status code of a response: loads the content, follows a
   * redirect, retries with GET or gives up.
   *
   * @access private
   * @param string $res raw response (status line and headers)
   * @param bool $used_get true if $res is the answer to a GET request, i.e.
   *                       the GET fallback for status 405 has been used already
   * @return mixed true if headers and content have been stored, false if the
   *               response cannot be handled (including too many redirects),
   *               null if the GET fallback could not connect
   */
  function _process_code($res, $used_get = false) {
    switch ($this->code) {
    case '200': // OK
      if (preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
        $this->headers = $matches[1];
      }

      /**
       * @todo add error processing here
       *
       * Note: file_get_contents is smart enough to use basic authorization headers provided
       * user name / password are given in the URL.
       */
      $this->content = @file_get_contents($this->url);

      return true;
    case '301': // Moved Permanently
    case '302': // Found
    case '303': // See Other
    case '307': // Temporary Redirect
    case '308': // Permanent Redirect
      $this->redirects++;
      if ($this->redirects > MAX_REDIRECTS) { return false; }

      // Anchor the match at the beginning of a header line so that
      // "Content-Location:" is not mistaken for "Location:".
      if (!preg_match('/^Location:[ \t]*(\S+)/im',$res,$matches)) {
        // Without a target there is nothing to follow; re-fetching the same
        // URL would only repeat until the redirect limit is hit.
        error_log("Redirect response without Location header");
        return false;
      }
      error_log('Redirected to:'.$matches[1]);

      return $this->fetch($this->_fix_location($matches[1]));
    case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
      // Try to get URL information using GET request (if we didn't try it before)
      if (!$used_get) {
        $res = $this->_get();
        if (is_null($res)) { return null; }
        $this->code = $this->_extract_code($res);
        return $this->_process_code($res, true);
      }
      // GET has been refused as well: handle it like the other client
      // errors (intentional fall-through).
    case '400': // Bad request
    case '401': // Unauthorized
    case '402': // Payment required
    case '403': // Forbidden
    case '404': // Not found - but should return some html content - error page
    case '406': // Not acceptable
      if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
        error_log("Unrecognized HTTP response");
        return false;
      }
      $this->headers = $matches[1];
      $this->content = @file_get_contents($this->url);
      return true;
    default:
      error_log("Unrecognized HTTP result code:".$this->code);
      return false;
    }
  }
453
 
454
  /**
   * Builds the "Authorization" request header for HTTP basic authentication
   * from the credentials found in the URL.
   *
   * @access private
   * @return string complete header line including the trailing CRLF, or an
   *                empty string if the URL contains no user name
   */
  function _header_basic_authorization() {
    if (is_null($this->user) || $this->user == "") {
      return "";
    }

    // parse_url() leaves the credentials percent-encoded ("p%40ss"); the
    // server expects the decoded values.
    $credentials = rawurldecode($this->user).":".rawurldecode((string)$this->pass);

    return sprintf("Authorization: Basic %s\r\n", base64_encode($credentials));
  }
459
}
460
?>