<?php
/////////////////////////////////////////////////////////
// PHPCrawl
// - class PHPCrawler:
//
// The main-class, version 0.7,
// 2007/01/03
//
// Copyright (C) 2003 Uwe Hunfeld (hide@address.com)
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the
// Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
//
// You should have received a copy of the GNU General Public License along with this
// program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
// Boston, MA 02111-1307, USA.
//
/////////////////////////////////////////////////////////
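
// Typical usage (a minimal sketch; "MyCrawler" and the start-URL are
// hypothetical, the called methods are the public API defined below):
//
//   class MyCrawler extends PHPCrawler
//   {
//     function handlePageData(&$page_data)
//     {
//       // Gets called for every crawled page/file
//       echo $page_data["http_status_code"]."\n";
//     }
//   }
//
//   $crawler = new MyCrawler();
//   $crawler->setURL("www.foo.com");  // "http://" gets prepended automatically
//   $crawler->setFollowMode(2);       // stay on the exact same host
//   $crawler->go();
//   $report = $crawler->getReport();  // links_followed, files_received, ...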

class PHPCrawler
{  
  // Version
  var $class_version = "0.7";
  
  // Base Infos
  var $url_to_crawl = "http://www.foo.com";
  
  // Limits
  var $page_limit_all = 0;
  var $page_limit_count_ct_only = true;  // Which pages to count towards the limit:
                                         // true = only received pages (matching content-type),
                                         // false = all followed pages
  // Follow-option-vars
  var $general_follow_mode = 2; // General follow mode
                                // 0: Follow EVERYTHING
                                // 1: Follow only links to the same domain
                                // 2: Follow only links to the exact same HOST
                                // 3: Follow only links to the exact same HOST and path
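                                //
                                // Example (start-URL http://www.foo.com/stuff/index.htm):
                                // mode 1 also follows links to e.g. sub.foo.com,
                                // mode 2 only follows links to www.foo.com,
                                // mode 3 only follows links below www.foo.com/stuff/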
  
  var $follow_redirects_till_content = true; // Follow redirects until REAL content was found,
                                             // no matter which follow-mode was set
  
  var $not_follow_matches = array();
  var $follow_matches = array();
  var $follow_redirects = true;
  
  var $link_priorities = array(); // Will contain all link-priority-matches set
                                  // by the user
    
  var $store_extended_linkinfo = true; // Decides if the crawler should store extended-linkinfo like linktext,
                                       // linkcode etc. for the user
                                       
  var $parse_robots_txt = false; // Should the robots.txt-file be parsed?
  
  // INTERNAL VARS, DON'T TOUCH!
    
  var $pageRequest; // An instance of the PHPCrawlerPageRequest-Object
  
  var $base_file;
  var $base_path;  // http://www.foo.com/stuff/index.htm -> /stuff
  var $base_host;  // http://www.foo.com/stuff/index.htm -> www.foo.com
  var $base_domain; //  http://www.foo.com/stuff/index.htm -> foo.com
  var $base_port; // http://www.foo.com:443/stuff/ -> 443
  
    
  var $urls_to_crawl = array(); // Walking array, will contain all links
                                // that should be followed.
                                // It's built like this (an example):
                                // $urls_to_crawl[6][2]["url_rebuild"]
                                // -> the URL with key 2 in priority-array number 6.
                                // Each element is an array again that later on contains
                                // the elements ["link_raw"], ["url_rebuild"], ["referer_url"],
                                // ["linktext"] and ["linkcode"].
                                // IMPORTANT: All URLs in here WILL BE CRAWLED;
                                // all "operations" like filtering, manipulating and so on
                                // are done BEFORE the links/URLs are put in here!

  var $url_map = array(); // This array will contain the md5-hashes of all URLs that
                          // were put into $urls_to_crawl, BUT AS KEYS, e.g.
                          // $url_map[md5("http://www.foo.com/bar.html")] = true.
                          // So checking whether a found URL is already known is a
                          // simple key-lookup, which improves performance A LOT.
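  
  // A minimal sketch of the lookup this enables (hypothetical URL; the
  // actual bookkeeping happens inside PHPCrawlerUtils::addToArray()):
  //
  //   $hash = md5("http://www.foo.com/bar.html");
  //   if (!isset($this->url_map[$hash]))
  //   {
  //     $this->url_map[$hash] = true; // mark URL as known before queueing it
  //   }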
    
  // KICKED OUT:
  // it's all in $urls_to_crawl now
  // var $referers_to_urls_to_crawl = array(); // The referers of the URLs to crawl
  // var $linktexts_of_urls_to_crawl = array();
  
  var $content_found = false; // Just a flag that switches to TRUE if ANY content was found
  
  var $status_return = array(); // Status-array to return after process finished
  
  var $max_priority_level = 0; // Will contain the highest priority_level set by the user
  
  var $benchmark = false; // internal
  
  // Constructor
  function PHPCrawler()
  {
    $this->initCrawler();
  }
  
  function initCrawler()
  {
    // Include needed class-files
    $classpath = dirname(__FILE__);
    
    // Utils-class
    if (!class_exists("PHPCrawlerUtils"))
    {
      include_once($classpath."/phpcrawlerutils.class.php");
    }
    
    // PageRequest-class
    if (!class_exists("PHPCrawlerPageRequest"))
    {
      include_once($classpath."/phpcrawlerpagerequest.class.php");
    }
    
    // Instantiate a new PageRequest-object (only once per crawler-instance,
    // even if the class-file was already included elsewhere)
    if (!is_object($this->pageRequest))
    {
      $this->pageRequest = &new PHPCrawlerPageRequest();
    }
  }
  
  // For debugging and stuff ONLY !
  function getmicrotime()
  { 
    list($usec, $sec) = explode(" ",microtime()); 
    return ((float)$usec + (float)$sec); 
  } 
  
  function go()
  {  
    $starting_time = $this->getmicrotime();
    
    // Init, split the given URL into host, port, path, file and so on
    $url_parts = PHPCrawlerUtils::splitURL($this->url_to_crawl);
    
    // Set base-host, base-path and base-domain "globally" for this class,
    // we need them often
    $this->base_path = $url_parts["path"];
    $this->base_host = $url_parts["host"];
    $this->base_domain = $url_parts["domain"];
    
    // If the base-port wasn't set by the user ->
    // take the one from the given start-URL.
    if ($this->base_port == "") $this->base_port = $url_parts["port"];
    
    // Apply the base-port to the URL-parts
    // (covers the case that the port WAS set by the user)
    $url_parts["port"] = $this->base_port;
    
    // Rebuild and normalize the start-URL
    $this->url_to_crawl = PHPCrawlerUtils::rebuildURL($url_parts);
    $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($this->url_to_crawl);
    
    // Init counters
    $links_followed = 0;
    $files_received = 0;
    
    // Put the first URL into our main-array
    $tmp[0]["url_rebuild"] = $this->url_to_crawl;
    PHPCrawlerUtils::removeMatchingLinks($tmp, $this->not_follow_matches);
    
    if (isset($tmp[0]["url_rebuild"]) &&  $tmp[0]["url_rebuild"] != "")
    {
      PHPCrawlerUtils::addToArray($tmp, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
    }
    
    // MAIN-LOOP -------------------------------------------------------------------
    
    // It works like this:
    // The outer loop looks through all the priority-arrays and checks if any
    // of these arrays is filled with URLs.
    
    for ($pri_level = $this->max_priority_level+1; $pri_level > -1; $pri_level--)
    {
      // Yep. Found a priority-array with at least one URL
      if (isset($this->urls_to_crawl[$pri_level]) && !isset($stop_crawling))
      {
        $stop_crawling_this_level = false; // init (also defined if the array turns out empty)
        
        // Now "process" all URLs in this priority-array
        @reset($this->urls_to_crawl[$pri_level]);
        while (list($key) = @each($this->urls_to_crawl[$pri_level]))
        {
          $all_start = $this->getmicrotime();
          
          // Request the URL
          unset($page_data);
          
          if (!isset($this->urls_to_crawl[$pri_level][$key]["referer_url"])) 
          {
            $this->urls_to_crawl[$pri_level][$key]["referer_url"] = "";
          }
          
          $page_data = $this->pageRequest->receivePage($this->urls_to_crawl[$pri_level][$key]["url_rebuild"],
                                                       $this->urls_to_crawl[$pri_level][$key]["referer_url"]);

          // If the request-object just ignored the URL ->
          // -> stop and remove the URL from the array
          if ($page_data == false)
          {
            unset($this->urls_to_crawl[$pri_level][$key]);
            continue; 
          }
          
          $links_followed++;
          
          // Now $page_data["links_found"] contains all links found at this point
          
          // Check if a "<base href..."-tag is given in the source and extract
          // the base-URL.
          // !! It doesn't have to be rebuilt because it can only be a
          // fully qualified URL !!
          $base_url = PHPCrawlerUtils::getBasePathFromTag($page_data["source"]);
          if ($base_url == "") $actual_url = &$this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
          else $actual_url = $base_url;
          
          // Set the flag "content_found" if content was found (HTTP status-code 200)
          if (isset($page_data["http_status_code"]) && $page_data["http_status_code"]==200) $content_found = true;
          
          // Check for a REDIRECT-header and if wanted, put it into the array of found links
          $redirect = PHPCrawlerUtils::getRedirectLocation($page_data["header"]);
          if ($redirect && $this->follow_redirects==true)
          {
            $tmp_array["link_raw"] = $redirect;
            $tmp_array["referer_url"] = $this->urls_to_crawl[$pri_level][$key]["url_rebuild"];
            $page_data["links_found"][] = $tmp_array;
          }
          
          // Count files that have been received completely
          if ($page_data["received"] == true) $files_received++;
          
          // If traffic-limit is reached -> stop crawling
          if ($page_data["traffic_limit_reached"] == true) $stop_crawling = true;
          
          // Check if pagelimit is reached if set
          // (and check WHICH page-limit was set)
          if ($this->page_limit_all > 0)
          {
            if ($this->page_limit_count_ct_only==true && $files_received >= $this->page_limit_all)
            {
              $stop_crawling = true;
            }
            elseif ($this->page_limit_count_ct_only==false && $links_followed >= $this->page_limit_all)
            {
              $stop_crawling = true;
            }
          }
          
          // Add the current referer-info to the page_data-array for the handlePageData-method
          $page_data["refering_linktext"] = &$this->urls_to_crawl[$pri_level][$key]["linktext"];
          $page_data["refering_link_raw"] = &$this->urls_to_crawl[$pri_level][$key]["link_raw"];
          $page_data["refering_linkcode"] = &$this->urls_to_crawl[$pri_level][$key]["linkcode"];
           
          // Build absolute URLs from the found links
          $page_data["links_found"] = PHPCrawlerUtils::buildURLs($page_data["links_found"], $actual_url);
          
          // Call the overridable user-function here, but first
          // "save" a copy of the found links to protect them from user-manipulation
          $links_found = $page_data["links_found"];
          $user_return = $this->handlePageData($page_data);
          
          // Stop crawling if user returned a negative value
          if ($user_return < 0)
          {
            $stop_crawling=true;
            $page_data["user_abort"] = true;
          }
          
          // Compare the found links with link-priorities set by the user
          // and add the priority-level to our array $links_found
          if ($this->benchmark==true) $bm_start = $this->getmicrotime();
          PHPCrawlerUtils::addURLPriorities ($links_found, $this->link_priorities);
          if ($this->benchmark==true) echo "addUrlPriorities(): ".($this->getmicrotime() - $bm_start)."<br>";
          
          // Delete the tmp-file the pageRequest-object may have created
          if (file_exists($this->pageRequest->tmp_file)) @unlink($this->pageRequest->tmp_file);
          
          // Stop everything if a limit was reached
          // ($stop_crawling also makes the outer loop skip all remaining priority-arrays)
          if (isset($stop_crawling))
          {
            break;
          }
          
          // Remove links to other hosts if follow_mode is 2 or 3
          if ($this->general_follow_mode == 2 || $this->general_follow_mode == 3)
          {
            PHPCrawlerUtils::removeURLsToOtherHosts($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
          }
          
          // Remove links to other domains if follow_mode=1
          if ($this->general_follow_mode == 1)
          {
            PHPCrawlerUtils::removeURLsToOtherDomains($links_found, $this->urls_to_crawl[$pri_level][$key]["url_rebuild"]);
          }
       
          // Remove "pathUp"-links if follow_mode=3
          // (fe: base-site: www.foo.com/bar/index.htm -> dont follow: www.foo.com/anotherbar/xyz)
          if ($this->general_follow_mode == 3)
          {
            PHPCrawlerUtils::removePathUpLinks($links_found, $this->url_to_crawl);
          }
          
          // If given, don't follow "not matching"-links
          // (don't follow links matching the given preg-patterns)
          if (count($this->not_follow_matches) > 0)
          {
            PHPCrawlerUtils::removeMatchingLinks($links_found, $this->not_follow_matches);
          }
          
          // If given, only follow "matching"-links
          // (only follow links matching the given preg-patterns)
          if (count($this->follow_matches) > 0)
          {
            $links_found = PHPCrawlerUtils::removeNotMatchingLinks($links_found, $this->follow_matches);
          }
          
          // Add found and filtered links to the main_array urls_to_crawl
          if ($this->benchmark == true) $bm_start = $this->getmicrotime();
          PHPCrawlerUtils::addToArray($links_found, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
          if ($this->benchmark == true) echo "addToArray(): ".($this->getmicrotime() - $bm_start)."<br>";
          
          // If no content (code 200) was found so far and there's
          // a redirect-location
          // -> follow it, no matter which follow-mode was chosen!
          // (put it into the main-array!)
          if (!isset($content_found) && $redirect != "" && $this->follow_redirects_till_content == true)
          {
            $rd[0]["url_rebuild"] = phpcrawlerutils::buildURL($redirect, $actual_url);
            $rd[0]["priority_level"] = 0;
            PHPCrawlerUtils::addToArray($rd, $this->urls_to_crawl, $this->url_map, $this->store_extended_linkinfo);
          }
    
          // Now we remove the current URL from the priority-array
          unset($this->urls_to_crawl[$pri_level][$key]);
          
          // Now we check if a priority-array with a higher priority
          // contains URLs, and if so, stop processing this priority-array
          // and "switch" to the higher one
          for ($pri_level_check = $this->max_priority_level+1; $pri_level_check > $pri_level; $pri_level_check--)
          {
            if (isset($this->urls_to_crawl[$pri_level_check]))
            {
              $stop_crawling_this_level = true;
            }
          }
          
          // Stop crawling this level
          if ($stop_crawling_this_level == true) 
          {
            $pri_level = $this->max_priority_level+1;
            break;
          }
          
          
          // echo "All:".($this->getmicrotime()-$all_start);
          
        } // end of loop over priority-array
        
        // If a priority_level was crawled completely -> unset the whole array
        if ($stop_crawling_this_level == false)
        {
          unset($this->urls_to_crawl[$pri_level]);
        }
        
      } // end if priority-level exists
    
    } // end of main loop
    
    
    // Loop stopped here, build report-array (status_return)
    
    $this->status_return["links_followed"] = $links_followed;
    $this->status_return["files_received"] = $files_received;
    $this->status_return["bytes_received"] = $this->pageRequest->traffic_all;
    
    $this->status_return["traffic_limit_reached"] = $page_data["traffic_limit_reached"];
    
    if (isset($page_data["file_limit_reached"]))
    {
      $this->status_return["file_limit_reached"] = $page_data["file_limit_reached"];
    }
    else $this->status_return["file_limit_reached"] = false;
    
    if (isset($page_data["user_abort"]))
    {
      $this->status_return["user_abort"] = $page_data["user_abort"];
    }
    else $this->status_return["user_abort"] = false;
    
    if (isset($stop_crawling))
    {
      $this->status_return["limit_reached"] = true;
    }
    else {
      $this->status_return["limit_reached"] = false;
    }
    
    // Process-time
    $this->status_return["process_runtime"] = $this->getmicrotime() - $starting_time;
    
    // Average bandwidth / throughput
    $this->status_return["data_throughput"] = round($this->status_return["bytes_received"] / $this->status_return["process_runtime"]);
  }
  
  // Overridable method, gets called after every crawled page.
  // Return a negative value to abort the crawling-process.
  function handlePageData(&$page_data)
  {
    // No default action here; override this method in a subclass.
  }
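  
  // Example override (a sketch; the fields shown are ones this class itself
  // sets or reads on $page_data in go()):
  //
  //   function handlePageData(&$page_data)
  //   {
  //     if ($page_data["received"] == true)
  //     {
  //       // ... process $page_data["source"] and $page_data["header"] ...
  //     }
  //     // return -1;  // a negative return-value aborts the crawling-process
  //   }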
  
  // public methods -------------------------------------------------------------------
  
  // Start-URL
  function setURL($url)
  {
    $this->initCrawler();
    
    $url = trim($url);
    if ($url!="" && is_string($url))
    {
      if (substr($url,0,7) != "http://" && substr($url,0,8) != "https://")
      {
        $url = "http://".$url;
      }
      $this->url_to_crawl = PHPCrawlerUtils::normalizeURL($url);
      return true;
    }
    else return false;
  }
  
  // Set port of base URL
  function setPort($port)
  {
    $this->initCrawler();
    
    // Check argument
    if (preg_match("/^[0-9]{1,5}$/", $port))
    {
      $this->base_port = trim($port);
      return true;
    }
    else return false;
  }
  
  // TMP-file to use
  function setTmpFile($path_to_file)
  {
    $this->initCrawler();
    
    // Check if the given file is writable
    $fp = @fopen($path_to_file, "w");
    if (!$fp) return false;
    else
    {
      fclose($fp);
      $this->pageRequest->tmp_file = trim($path_to_file);
      return true;
    }
  }
  
  // Set the follow mode
  function setFollowMode($mode)
  {
    $this->initCrawler();
    
    // Check argument
    if (preg_match("/^[0-3]{1}$/", $mode))
    {
      $this->general_follow_mode=$mode;
      return true;
    }
    else return false;
  }
  
  // How many pages to crawl MAX (limit), and which pages to count
  // towards the limit (mode true = only completely received pages,
  // mode false = all followed pages/URLs). limit=0 means NO LIMIT.
  function setPageLimit($limit, $mode=false)
  {
    $this->initCrawler();
    
    // Check argument
    if (preg_match("/^[0-9]*$/", $limit))
    {
      $this->page_limit_count_ct_only = $mode;
      $this->page_limit_all = $limit;
      return true;
    }
    else return false;
  }
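  
  // Example: $crawler->setPageLimit(100, true);
  // -> stop after 100 completely received pages
  //    (with mode=false, stop after 100 followed URLs)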
  
  // How many bytes of traffic to receive MAX
  // limit=0 means NO LIMIT
  function setTrafficLimit($limit, $mode=true)
  {
    $this->initCrawler();
    if (preg_match("/^[0-9]*$/", $limit))
    {
      $this->pageRequest->traffic_limit_all = $limit;
      $this->pageRequest->traffic_limit_complete_page = $mode;
      return true;
    }
    else return false;
  }
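  
  // Example: $crawler->setTrafficLimit(1024 * 1024);
  // -> stop crawling after about 1 MB of received traffic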
  
  // Set the limit of bytes per page/file
  function setContentSizeLimit($limit)
  {
    $this->initCrawler();
    if (preg_match("/^[0-9]*$/", $limit))
    {
      $this->pageRequest->pagesize_limit = $limit;
      return true;
    }
    else return false;
  }
  
  // Which content-types should be received/followed? (preg-pattern)
  // If not specified, everything will be received.
  function addReceiveContentType($expression)
  {
    $this->initCrawler();
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check == true)
    {
      $this->pageRequest->follow_content_type[] = trim(strtolower($expression));
    }
    return $check;
  }
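  
  // Example: $crawler->addReceiveContentType("/text\/html/");
  // -> only pages/files with a "text/html" content-type get received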
  
  // Which links should NOT be followed? (preg-pattern)
  // If not specified, no links will be excluded.
  function addNonFollowMatch ($expression)
  {
    $this->initCrawler();
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check == true)
    {
      $this->not_follow_matches[]=trim($expression);
    }
    return $check;
  }
  
  // Which links should be followed? (preg-pattern)
  // If not specified, everything will be followed.
  function addFollowMatch ($expression)
  {
    $this->initCrawler();
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check == true)
    {
      $this->follow_matches[]=trim($expression);
    }
    return $check;
  }
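  
  // Example (hypothetical patterns):
  //   $crawler->addFollowMatch("/\.htm(l)?$/");    // only follow .htm/.html-URLs ...
  //   $crawler->addNonFollowMatch("/\/print\//");  // ... but skip "print"-versions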
  
  // which content-types (preg_match) should
  // be streamed to memory directly ?
  function addReceiveToMemoryMatch ($expression)
  {
    $this->initCrawler();
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check == true)
    {
      $this->pageRequest->receive_to_memory_matches[] = trim($expression);
    }
    
    return $check;
  }
  
  // which content-types (preg_match) should
  // be streamed to the tmp-file ?
  function addReceiveToTmpFileMatch ($expression)
  {
    $this->initCrawler();
    
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check == true)
    {
      $this->pageRequest->receive_to_file_matches[] = trim($expression);
    }
    return $check;
  }
  
  // Follow redirects ? (Header)
  function setFollowRedirects ($mode)
  {
    $this->initCrawler();
    
    if (is_bool($mode))
    {
      $this->follow_redirects = $mode;
      return true;
    }
    else return false;
  }
  
  // Follow redirects until some content was found, no matter which follow-mode was set?
  function setFollowRedirectsTillContent ($mode)
  {
    $this->initCrawler();
    
    if (is_bool($mode))
    {
      $this->follow_redirects_till_content = $mode;
      return true;
    }
    else return false;
  }
  
  // Enable/disable cookies
  function setCookieHandling ($mode)
  {
    $this->initCrawler();
    if (is_bool($mode))
    {
      $this->pageRequest->handle_cookies = $mode;
      return true;
    }
    else return false;
  }
  
  // Socket-connection-timeout
  function setConnectionTimeout($timeout) 
  {
    $this->initCrawler();
    if (preg_match("/^[0-9]*\.{0,1}[0-9]*$/", $timeout))
    {
      $this->pageRequest->socket_mean_timeout = $timeout;
      return true;
    }
    else return false;
  }
  
  // Stream timeout
  function setStreamTimeout($timeout)
  {
    $this->initCrawler();
    if (preg_match("/^[0-9]*\.{0,1}[0-9]*$/", $timeout))
    {
      $this->pageRequest->socket_read_timeout = $timeout;
      return true;
    }
    else return false;
  }
  
  // Return Status-array after crawling-process
  function getReport()
  {
    return $this->status_return;
  }
  
  // Method adds link-priorities
  function addLinkPriority ($expression, $level)
  {
    $this->initCrawler();
    
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    if ($check==true && preg_match("/^[0-9]*$/", $level))
    {
      $c = count($this->link_priorities);
      $this->link_priorities[$c]["match"] = trim($expression);
      $this->link_priorities[$c]["level"] = trim($level);
      
      // Set the maximum-priority-level
      if ($this->max_priority_level < $level)
      {
        $this->max_priority_level=$level;
      }
      return true;
    }
    else return false;
  }
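  
  // Example (hypothetical pattern): $crawler->addLinkPriority("/news/", 10);
  // -> URLs containing "news" go into priority-array 10 and get crawled
  //    before URLs with a lower priority-level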
  
  // Method adds an authentication-login for special URLs given in expression
  // (PCRE)
  function addBasicAuthentication($expression, $username, $password)
  {
    $this->initCrawler();
    $check = PHPCrawlerUtils::checkExpressionPattern($expression); // Check pattern
    
    if ($check == true)
    {
      $c = count($this->pageRequest->basic_authentications);
      $this->pageRequest->basic_authentications[$c]["match"] = $expression;
      $this->pageRequest->basic_authentications[$c]["username"] = $username;
      $this->pageRequest->basic_authentications[$c]["password"] = $password;
      return true;
    }
    else return false; 
  }
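  
  // Example (hypothetical values):
  //   $crawler->addBasicAuthentication("/^http:\/\/www\.foo\.com\/protected\//", "user", "pass");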
  
  // Method adds linktags to extract links from
  function addLinkExtractionTags ()
  { 
    $this->initCrawler();
    $tags = func_get_args();

    for ($x=0; $x<count($tags); $x++)
    {
      if (trim($tags[$x])!="" && is_string($tags[$x]))
      {
        if (!in_array($tags[$x], $this->pageRequest->linktags_to_extract))
        {
          $this->pageRequest->linktags_to_extract[] = $tags[$x];
        }
      }
      else $error = true;
    }
    
    if (isset($error)) return false;
    else return true;
  }
  
  // Set aggressive link-extraction true/false
  function setAggressiveLinkExtraction ($mode)
  {
    $this->initCrawler();
    if (is_bool($mode))
    {
      $this->pageRequest->aggressive_link_extraction = $mode;
      return true;
    }
    else return false;
  }
  
  // Sets the user-agent string
  function setUserAgentString ($string)
  {
    $this->initCrawler();
    $this->pageRequest->user_agent_string = $string;
    return true;
  }
  
  // Method enables/disables storage of extended link-information
  // (reduces memory-usage)
  function disableExtendedLinkInfo ($mode)
  {
    $this->initCrawler();
    if (is_bool($mode)) {
      if ($mode==true) $this->store_extended_linkinfo=false;
      else $this->store_extended_linkinfo=true;
      return true;
    }
    else return false;
  }
  
  // Enables/Disables parsing of robots.txt-file.
  function obeyRobotsTxt($mode)
  {
    if (is_bool($mode))
    {
      $this->pageRequest->use_robots_txt_files = $mode;
      return true;
    }
    else
    {
      return false;
    }
  } 
}
?>