Location: PHPKode > projects > Html2ps > html2ps-2.0.43/fetcher.url.class.php
<?php

require_once(HTML2PS_DIR.'fetcher._interface.class.php');

define('HTTP_OK',200);

/**
 * @TODO send authorization headers only if they have been required by the server;
 */
class FetcherUrl extends Fetcher {
  var $_connections;

  var $protocol;
  var $host;
  var $port;
  var $path;

  var $url;

  var $headers;
  var $content;
  var $code;

  var $redirects;

  // Authorization

  var $user;
  var $pass;

  // ---------------------------------------------
  // FetcherURL - PUBLIC methods
  // ---------------------------------------------

  // "Fetcher" interface implementation

  /**
   * Returns the URL recorded by the most recent fetch() call.
   *
   * The value is whatever fetch() last stored in $this->url; nothing is
   * stored before the first fetch() has run.
   *
   * @return string
   */
  function get_base_url() {
    $base_url = $this->url;

    return $base_url;
  }

  /**
   * Fetches the document identified by $data_id.
   *
   * On success (fetch() reported success and the status code equals HTTP_OK)
   * a FetchedDataURL object built from the content, the header lines and the
   * final URL is returned. In every other case an error template is rendered
   * into $this->error_message, a line is written to the error log and null is
   * returned.
   *
   * @param string $data_id URL to fetch
   * @return FetchedDataURL|null
   */
  function get_data($data_id) {
    $this->redirects = 0;

    $fetched = $this->fetch($data_id);

    if ($fetched && $this->code == HTTP_OK) {
      return new FetchedDataURL($this->content,
                                explode("\r\n",$this->headers),
                                $this->url);
    }

    // The underscore-prefixed variables are presumably read by the included
    // template (templates are not part of this file - verify before renaming).
    $_server_response = $this->headers;
    $_url = htmlspecialchars($data_id);

    if ($fetched) {
      // Server answered, but not with "200 OK"
      $_http_error = $this->code;
      $_tpl_file = 'templates/error._http.tpl';
      $_log_line = "Cannot open $data_id, HTTP result code is: ".$this->code;
    } elseif ($this->redirects > MAX_REDIRECTS) {
      // Redirect chain exceeded the limit
      $_tpl_file = 'templates/error._redirects.tpl';
      $_log_line = sprintf("Cannot open %s, too many redirects",
                           $data_id);
    } else {
      // Anything else is reported as a connection problem
      $_tpl_file = 'templates/error._connection.tpl';
      $_log_line = sprintf("Cannot open %s",
                           $data_id);
    }

    // Render the template into the accumulated error message
    ob_start();
    include($_tpl_file);
    $this->error_message .= ob_get_contents();
    ob_end_clean();

    error_log($_log_line);

    return null;
  }

  /**
   * Returns the error text accumulated so far in $this->error_message.
   *
   * @return string
   */
  function error_message() {
    $accumulated = $this->error_message;

    return $accumulated;
  }

  // FetcherURL - constructor

  /**
   * Initializes the fetcher state: empty connection list, empty error
   * message, zero redirects, port 80 and the default user agent.
   *
   * A method named after the class is no longer treated as a constructor
   * since PHP 8.0, so the initialization lives in __construct(); the
   * old-style FetcherURL() method below is kept for code that calls it
   * explicitly (and for PHP 4, where it still acts as the constructor).
   */
  function __construct() {
    $this->_connections = array();

    $this->error_message = "";

    $this->redirects = 0;
    $this->port = 80;

    $this->user_agent = DEFAULT_USER_AGENT;
  }

  /**
   * Old-style (PHP 4) constructor; forwards to __construct().
   */
  function FetcherURL() {
    $this->__construct();
  }

  // ---------------------------------------------
  // FetcherURL - PRIVATE methods
  // ---------------------------------------------

  /**
   * Connects to the target host using either HTTP or HTTPS protocol;
   * returns handle to connection socket or 'null' in case connection failed.
   *
   * @access private
   * @final
   * @return resource
   */
  function _connect() {
    // HTTPS needs the SSL transport; that path is handled separately
    if ($this->protocol == "https") {
      return $this->_connect_ssl();
    }

    $socket = @fsockopen($this->host,
                         $this->port,
                         $error_code,
                         $error_text,
                         HTML2PS_CONNECTION_TIMEOUT);

    if ($socket) {
      return $socket;
    }

    // Connection failed: log the reason and remember it for the caller
    $failure = sprintf("Cannot connect to %s:%d - (%d) %s", 
                       $this->host,
                       $this->port,
                       $error_code,
                       $error_text);
    error_log($failure);
    $this->error_message = $failure;

    return null;
  }

  /**
   * Opens an SSL connection to $this->host:$this->port.
   *
   * Uses the same HTML2PS_CONNECTION_TIMEOUT as the plain HTTP path in
   * _connect() instead of a separate hard-coded value.
   *
   * @access private
   * @return resource|null socket handle, or null if the connection failed
   */
  function _connect_ssl() {
    /**
     * Check if there's SSL support library loaded 
     * 
     * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on the
     * original author's development box) the openssl extension IS present, but
     * fsockopen still complains "No SSL support in this build" (probably a PHP bug?)
     */
    if (!extension_loaded('openssl')) {
      $message = sprintf("Cannot connect to %s:%d. SSL Extension missing", 
                         $this->host, 
                         $this->port);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }

    $fp = @fsockopen("ssl://".$this->host,
                     $this->port,
                     $errno,
                     $errstr,
                     HTML2PS_CONNECTION_TIMEOUT);

    if (!$fp) {
      $message = sprintf("Cannot connect to %s:%d - (%d) %s<br/>Missing SSL support?", 
                         $this->host, 
                         $this->port,
                         $errno,
                         $errstr);
      error_log($message);
      $this->error_message = $message;
      return null;
    }

    return $fp;
  }

  /**
   * Extracts the HTTP status code from a raw server response.
   *
   * The code is taken from the first run of digits surrounded by whitespace.
   * Some responses contain no such run; "200" is assumed for them.
   *
   * @param string $res raw response text
   * @return string status code as a string of digits
   */
  function _extract_code($res) {
    $found = array();
    $has_code = preg_match('/\s(\d+)\s/', $res, $found);

    return $has_code ? $found[1] : "200";
  }

  /**
   * Converts the value of a "Location" header into an absolute URL, using the
   * protocol, host, port and path of the request currently being processed.
   *
   * Handled forms:
   *  - absolute http/https URL (scheme compared case-insensitively): returned as is;
   *  - empty value: the current URL is returned;
   *  - "//host/path": the current protocol is prepended;
   *  - "/path": resolved against the current protocol/host/port;
   *  - "?query": replaces the query of the current path;
   *  - anything else: resolved against the directory of the current path.
   *
   * Note that fetch() appends the query string to $this->path, so the query
   * (and the file name) must be cut off before a relative reference is added.
   * "." and ".." segments are not collapsed here.
   *
   * @param string $location raw Location header value
   * @return string absolute URL
   */
  function _fix_location($location) {
    $location = (string)$location;

    if (preg_match('#^https?://#i', $location)) {
      return $location;
    }

    // Keep an explicitly used non-default port in the rebuilt URL
    $authority = $this->host;
    $default_port = (strtolower($this->protocol) == "https") ? 443 : 80;
    if (!empty($this->port) && $this->port != $default_port) {
      $authority .= ":".$this->port;
    }
    $base = $this->protocol."://".$authority;

    if ($location === "") {
      return $base.$this->path;
    }

    // Protocol-relative reference
    if (substr($location, 0, 2) == "//") {
      return $this->protocol.":".$location;
    }

    // Host-relative reference
    if ($location[0] == "/") {
      return $base.$location;
    }

    // Path without the query string appended by fetch()
    $path = $this->path;
    $query_pos = strpos($path, "?");
    if ($query_pos !== false) {
      $path = substr($path, 0, $query_pos);
    }

    // Query-only reference keeps the current path
    if ($location[0] == "?") {
      return $base.$path.$location;
    }

    // Document-relative reference: resolve against the current directory
    $slash_pos = strrpos($path, "/");
    $directory = ($slash_pos === false) ? "/" : substr($path, 0, $slash_pos + 1);

    return $base.$directory.$location;
  }

  /**
   * Parses $url, stores its components in the object and dispatches to the
   * protocol-specific fetch method (http, https or file).
   *
   * @param string $url URL to fetch
   * @return mixed true when a result (possibly an HTTP error page or empty
   *               content) was stored in $this->content / $this->code;
   *               null or false when nothing could be fetched
   */
  function fetch($url) {
    /**
     * Handle empty $url value; unfortunately, parse_url will treat an empty value as a valid
     * URL, so the fetcher would attempt to fetch something from the localhost instead of 
     * passing control to subsequent user-defined fetchers (which probably will know
     * how to handle this). A null value is parsed like an empty string, so it is
     * rejected here as well.
     */
    if ($url === "" || is_null($url)) {
      return null;
    }

    $this->url = $url;

    /**
     * Forget the headers of a previously fetched document: not every path below
     * stores new headers (file protocol, malformed URL), and stale ones would be
     * handed to the caller together with the new content.
     */
    $this->headers = "";

    $parts = @parse_url($this->url);

    /**
     * If a malformed URL has been specified, add a message to the log file and 
     * continue processing (as such URLs may be found in an otherwise good HTML file - 
     * for example, an invalid image or CSS reference)
     */
    if ($parts == false) {
      error_log(sprintf("The URL '%s' could not be parsed", $this->url));

      $this->content = '';
      $this->code = HTTP_OK;
      return true;
    }

    /**
     * Setup default values
     */
    $this->protocol = 'http';
    $this->host = 'localhost';
    $this->user = "";
    $this->pass = "";
    $this->port = 80;
    $this->path = "/";
    $this->query = "";

    /**
     * The scheme is case-insensitive, but the rest of the class compares
     * $this->protocol against lowercase literals - normalize it once here.
     */
    if (isset($parts['scheme']))   { $this->protocol  = strtolower($parts['scheme']); }
    if (isset($parts['host']))     { $this->host      = $parts['host'];      }
    if (isset($parts['user']))     { $this->user      = $parts['user'];      }
    if (isset($parts['pass']))     { $this->pass      = $parts['pass'];      }
    if (isset($parts['port']))     { $this->port      = $parts['port'];      }
    if (isset($parts['path']))     { $this->path      = $parts['path'];      }
    // The query string travels as a part of the path
    if (isset($parts['query']))    { $this->path     .= '?'.$parts['query']; }

    switch ($this->protocol) {
    case 'http':
      return $this->fetch_http();
    case 'https':
      return $this->fetch_https();
    case 'file':
      $this->host = "";
      return $this->fetch_file();
    default:
      $message = sprintf("Unsupported protocol: %s", $this->protocol);
      error_log($message);
      $this->error_message .= $message;
      return null;
    }
  }

  /**
   * Fetches the current URL over plain HTTP: probes it with a HEAD request,
   * stores the status code and lets _process_code() do the rest.
   *
   * @return mixed null if the HEAD request could not be sent, otherwise
   *               the result of _process_code()
   */
  function fetch_http() {
    $response = $this->_head();

    if (is_null($response)) {
      return null;
    }

    $this->code = $this->_extract_code($response);

    return $this->_process_code($response);
  }

  /**
   * Fetches the current URL over HTTPS: probes it with a HEAD request,
   * stores the status code and lets _process_code() do the rest.
   *
   * @return mixed null if the HEAD request could not be sent, otherwise
   *               the result of _process_code()
   */
  function fetch_https() {
    /**
     * SSL works via port 443 unless the URL names a port explicitly.
     *
     * The parsed URL parts are local to fetch() and not visible here, so the
     * URL is parsed again to find out whether a port was given; without this
     * an explicit port (https://host:8443/) would always be replaced by 443.
     */
    $parts = @parse_url($this->url);
    if (!is_array($parts) || !isset($parts['port'])) {
      $this->port = 443;
    }

    $res = $this->_head();

    if (is_null($res)) { return null; }
    $this->code = $this->_extract_code($res);

    return $this->_process_code($res);
  }

  /**
   * Reads a local file referenced by a "file://" URL into $this->content.
   *
   * Only files located under FILE_PROTOCOL_RESTRICT may be read. When access
   * is denied, or the file does not exist or cannot be read, a message is
   * logged and empty content is stored, so that processing of the enclosing
   * document can continue.
   *
   * @return bool always true; $this->code is always set to HTTP_OK
   */
  function fetch_file() {
    // Strip the leading "file://" (7 characters)
    $path = substr($this->url, 7);

    // On Windows "file:///C:/dir/file" leaves "/C:/dir/file"; drop the slash
    if (PHP_OS == "WINNT" && substr($path, 0, 1) == "/") {
      $path = substr($path, 1);
    }

    // realpath() resolves "..", symlinks etc.; it returns false for missing files
    $normalized_path = realpath(urldecode($path));

    /**
     * The resolved path must start with FILE_PROTOCOL_RESTRICT, and the match
     * must end on a directory boundary - otherwise a restriction to "/var/www"
     * would also let "/var/www-private/..." through. An empty restriction
     * allows every existing file.
     */
    $restrict = FILE_PROTOCOL_RESTRICT;
    $restrict_length = strlen($restrict);
    $allowed = false;

    if ($normalized_path !== false) {
      if ($restrict_length == 0) {
        $allowed = true;
      } elseif (substr($normalized_path, 0, $restrict_length) === $restrict) {
        $last = substr($restrict, -1);
        $next = substr($normalized_path, $restrict_length, 1);

        $allowed = ($last == "/" || $last == "\\" ||
                    $next === "" || $next === false ||
                    $next == "/" || $next == "\\");
      }
    }

    if (!$allowed) {
      if ($normalized_path === false) {
        error_log(sprintf("Access denied to file '%s' (path could not be resolved)", $path));
      } else {
        error_log(sprintf("Access denied to file '%s'", $normalized_path));
      }

      $this->content = "";
      $this->code = HTTP_OK;
      return true;
    }

    $content = @file_get_contents($normalized_path);
    if ($content === false) {
      error_log(sprintf("Cannot read file '%s'", $normalized_path));
      $content = "";
    }

    $this->content = $content;
    $this->code = HTTP_OK;

    return true;
  }

  /**
   * Sends a GET request for the current URL and returns the beginning of the
   * raw response (status line, headers and whatever part of the body arrived
   * with the first read).
   *
   * @access private
   * @return mixed null if the connection failed, otherwise the result of
   *               fread() on the connection
   */
  function _get() {
    $socket = $this->_connect();
    if (is_null($socket)) { return null; }

    // A non-default port is a part of the Host header value
    $authority = $this->host;
    $default_port = (strtolower($this->protocol) == "https") ? 443 : 80;
    if (!empty($this->port) && $this->port != $default_port) {
      $authority .= ":".$this->port;
    }

    // Build the GET request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
    $header  = "GET ".$this->path." HTTP/1.1\r\n";
    $header .= "Host: ".$authority."\r\n";
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    // The connection is used for this single request only and closed below
    $header .= "Connection: close\r\n";
    $header .= "Referer: ".$this->protocol."://".$authority.$this->path."\r\n";
    $header .= $this->_header_basic_authorization();
    $header .= "\r\n";

    // Send the header
    fputs($socket, $header);

    // Get the response.
    //
    // The PHP-recommended construction
    //    while (!feof($fp)) { $res .= fread($fp, 4096); };
    // hangs indefinitely on www.searchscout.com, for example;
    // it seems that they do not close the connection on their side.
    //
    // A single read of at most 1 Mb is done instead.
    $res = fread($socket, 1024*1024);

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }

  /**
   * Sends a HEAD request for the current URL and returns the raw response
   * (status line and headers, as far as they arrived with the first read).
   *
   * @access private
   * @return mixed null if the connection failed, otherwise the result of
   *               fread() on the connection
   */
  function _head() {
    $socket = $this->_connect();

    if (is_null($socket)) { return null; }

    // A non-default port is a part of the Host header value
    $authority = $this->host;
    $default_port = (strtolower($this->protocol) == "https") ? 443 : 80;
    if (!empty($this->port) && $this->port != $default_port) {
      $authority .= ":".$this->port;
    }

    // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents)
    $header  = "HEAD ".$this->path." HTTP/1.1\r\n";
    $header .= "Host: ".$authority."\r\n";
    // A single Accept header; "*/*" already covers "text/html"
    $header .= "Accept: */*\r\n";
    $header .= "User-Agent: ".$this->user_agent."\r\n";
    // The connection is used for this single request only and closed below
    $header .= "Connection: close\r\n";
    $header .= "Referer: ".$this->protocol."://".$authority.$this->path."\r\n";
    $header .= $this->_header_basic_authorization();
    $header .= "\r\n";

    // Send the header
    fputs($socket, $header);

    // Get the response.
    //
    // The PHP-recommended construction
    //    while (!feof($fp)) { $res .= fread($fp, 4096); };
    // hangs indefinitely on www.searchscout.com, for example;
    // it seems that they do not close the connection on their side.
    //
    // A single read of at most 4096 bytes is done instead.
    $res = fread($socket, 4096);

    // Close connection handle, we do not need it anymore
    fclose($socket);

    return $res;
  }

  /**
   * Acts on the HTTP status code stored in $this->code.
   *
   *  - 200: stores the response headers and downloads the content;
   *  - 301, 302, 303, 307, 308: follows the redirect (up to MAX_REDIRECTS);
   *  - 400-404, 406: stores the response headers and tries to download the content;
   *  - 405: repeats the probe with GET instead of HEAD once, then is treated
   *    like the other 4xx codes;
   *  - anything else: logged as unrecognized.
   *
   * @access private
   * @param string $res      raw response of the probe request
   * @param bool   $used_get true if $res was obtained with GET rather than HEAD
   * @return mixed true if a result was stored, false or null otherwise
   */
  function _process_code($res, $used_get = false) {
    switch ($this->code) {
    case '200': // OK
      if (preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
        $this->headers = $matches[1];
      }

      /**
       * @todo add error processing here
       * 
       * Note: file_get_contents is smart enough to use basic authorization headers provided 
       * user name / password are given in the URL.
       */
      $this->content = @file_get_contents($this->url);

      return true;
    case '301': // Moved Permanently
    case '302': // Found
    case '303': // See Other
    case '307': // Temporary Redirect
    case '308': // Permanent Redirect
      return $this->_follow_redirect($res);
    case '400': // Bad request
    case '401': // Unauthorized
    case '402': // Payment required
    case '403': // Forbidden
    case '404': // Not found - but should return some html content - error page
    case '406': // Not acceptable
      return $this->_store_error_response($res);
    case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
      // Try to get URL information using GET request (if we didn't try it before)
      if (!$used_get) {
        $res = $this->_get();
        if (is_null($res)) { return null; }
        $this->code = $this->_extract_code($res);
        return $this->_process_code($res, true);
      }

      return $this->_store_error_response($res);
    default:
      error_log("Unrecognized HTTP result code:".$this->code);
      return false;
    }
  }

  /**
   * Follows the redirect announced by the "Location" header of $res.
   *
   * The header name is matched at the beginning of a line only, so that a
   * "Content-Location" header is not mistaken for it. A redirect response
   * without a Location header is reported as a failure instead of fetching
   * the same URL again.
   *
   * @access private
   * @param string $res raw response containing the redirect
   * @return mixed false if the redirect limit is exceeded or no target is
   *               given, otherwise the result of fetching the target
   */
  function _follow_redirect($res) {
    $this->redirects++;
    if ($this->redirects > MAX_REDIRECTS) { return false; }

    if (!preg_match('/^Location:[ \t]*(\S+)/im',$res,$matches)) {
      error_log("Redirect response without Location header");
      return false;
    }

    error_log('Redirected to:'.$matches[1]);

    return $this->fetch($this->_fix_location($matches[1]));
  }

  /**
   * Stores the headers of an HTTP error response and tries to download the
   * content of the current URL.
   *
   * @access private
   * @param string $res raw response
   * @return bool false if $res has no header/body separator, true otherwise
   */
  function _store_error_response($res) {
    if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
      error_log("Unrecognized HTTP response");
      return false;
    }

    $this->headers = $matches[1];
    $this->content = @file_get_contents($this->url);

    return true;
  }

  /**
   * Builds the "Authorization: Basic ..." request header line for the
   * credentials taken from the URL.
   *
   * parse_url() leaves the user name and password percent-encoded, so they
   * are decoded before being base64-encoded.
   *
   * @access private
   * @return string header line terminated by CRLF, or an empty string when
   *                no user name is set
   */
  function _header_basic_authorization() {
    if (is_null($this->user) || $this->user == "") {
      return "";
    }

    $credentials = rawurldecode($this->user).":".rawurldecode((string)$this->pass);

    return sprintf("Authorization: Basic %s\r\n", base64_encode($credentials));
  }
}
?>
Return current item: Html2ps