Location: PHPKode > projects > PHPCrawl > PHPCrawl_080/libs/PHPCrawlerURLFilter.class.php
<?php
/**
 * Class for filtering URLs by given filter-rules.
 *
 * @package phpcrawl
 * @internal
 */
class PHPCrawlerURLFilter
{
  /**
   * The fully qualified and normalized URL the crawling-process was started with.
   *
   * @var string
   */
  protected $starting_url = "";
  
  /**
   * The URL-parts of the starting-url.
   *
   * @var array The URL-parts as returned by PHPCrawlerUtils::splitURL()
   */
  protected $starting_url_parts = array();
  
  /**
   * Array containing regex-rules for URLs that should be followed.
   *
   * @var array
   */
  protected $url_follow_rules = array();
  
  /**
   * Array containing regex-rules for URLs that should NOT be followed.
   *
   * @var array
   */
  protected $url_filter_rules = array();
  
  /**
   * Defines whether nofollow-tags should get obeyed.
   *
   * @var bool
   */
  public $obey_nofollow_tags = false;
  
  /**
   * The general follow-mode of the crawler
   *
   * @var int The follow-mode
   *
   *          0 -> follow every link
   *          1 -> stay in domain
   *          2 -> stay in host
   *          3 -> stay in path
   */
  public $general_follow_mode = 2;
 
  /**
   * PHPCrawlerDocumentInfo-object of the document whose links are currently being filtered
   * (set by filterUrls(), read by the nofollow-check).
   *
   * @var PHPCrawlerDocumentInfo
   */
  protected $CurrentDocumentInfo = null;
  
  /**
   * Sets the base-URL of the crawling process some rules relate to
   *
   * @param string $starting_url The URL the crawling-process was started with.
   */
  public function setBaseURL($starting_url)
  {
    $this->starting_url = $starting_url;
    
    // Split once here so every later URL-check can compare against the cached parts
    $this->starting_url_parts = PHPCrawlerUtils::splitURL($starting_url);
  }
  
  /**
   * Filters the given URLs (contained in the given PHPCrawlerDocumentInfo-object) by the given rules.
   *
   * Every URL-descriptor that does not match the rules gets replaced by NULL in
   * $DocumentInfo->links_found_url_descriptors (the array keeps its keys and length).
   * Entries that already are NULL are left untouched.
   *
   * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
   */
  public function filterUrls(PHPCrawlerDocumentInfo $DocumentInfo)
  {
    PHPCrawlerBenchmark::start("filtering_urls");
    
    $this->CurrentDocumentInfo = $DocumentInfo;
    
    foreach ($DocumentInfo->links_found_url_descriptors as $index => $descriptor)
    {
      // Entries may already have been nulled by an earlier filter-run (e.g. keepRedirectUrls())
      if ($descriptor === null) continue;
      
      if (!$this->urlMatchesRules($descriptor))
      {
        $DocumentInfo->links_found_url_descriptors[$index] = null;
      }
    }
    
    PHPCrawlerBenchmark::stop("filtering_urls");
  }
  
  /**
   * Filters out all non-redirect-URLs from the URLs given in the PHPCrawlerDocumentInfo-object
   *
   * Non-redirect descriptors get replaced by NULL; entries that already are NULL are left untouched.
   *
   * @param PHPCrawlerDocumentInfo $DocumentInfo PHPCrawlerDocumentInfo-object containing all found links of the current document.
   */
  public static function keepRedirectUrls(PHPCrawlerDocumentInfo $DocumentInfo)
  {
    foreach ($DocumentInfo->links_found_url_descriptors as $index => $descriptor)
    {
      // Entries may already have been nulled by an earlier filter-run (e.g. filterUrls())
      if ($descriptor === null) continue;
      
      if (!$descriptor->is_redirect_url)
      {
        $DocumentInfo->links_found_url_descriptors[$index] = null;
      }
    }
  }
  
  /**
   * Checks whether a given URL matches the rules.
   *
   * The checks run in this order: protocol (http/https only), nofollow-hints
   * (only if obey_nofollow_tags is set), general follow-mode, user-defined
   * filter-rules and finally user-defined follow-rules.
   *
   * @param PHPCrawlerURLDescriptor $url The URL as a PHPCrawlerURLDescriptor-object
   * @return bool TRUE if the URL matches the defined rules.
   */
  protected function urlMatchesRules(PHPCrawlerURLDescriptor $url)
  { 
    // URL-parts of the URL to check against the filter-rules
    $url_parts = PHPCrawlerUtils::splitURL($url->url_rebuild);
    
    // Kick out all links that are NOT of protocol "http" or "https"
    if ($url_parts["protocol"] !== "http://" && $url_parts["protocol"] !== "https://")
    {
      return false;
    }
    
    if ($this->obey_nofollow_tags && $this->isNofollowLink($url))
    {
      return false;
    }
    
    if (!$this->urlMatchesFollowMode($url_parts))
    {
      return false;
    }
    
    return $this->urlMatchesUserRules($url->url_rebuild);
  }
  
  /**
   * Checks whether the current document or the link itself asks crawlers not to follow the link.
   *
   * @param PHPCrawlerURLDescriptor $url The URL-descriptor of the link
   * @return bool TRUE if a "nofollow"-hint was found
   */
  private function isNofollowLink(PHPCrawlerURLDescriptor $url)
  {
    // Meta-tag "robots" containing "nofollow" applies to every link of the current document
    if (isset($this->CurrentDocumentInfo->meta_attributes["robots"]) &&
        preg_match('#nofollow#i', $this->CurrentDocumentInfo->meta_attributes["robots"]))
    {
      return true;
    }
    
    // rel-attribute of the opening tag: "rel" is a whitespace-separated, case-insensitive
    // token-list, so "nofollow" has to be found as ONE of the tokens (rel="external nofollow"),
    // quoted or unquoted. The lookbehind prevents attributes like "data-rel" from matching.
    $rel_nofollow_pattern = '#^<[^>]*(?<![\w\-])rel\s*=\s*(?:"(?:[^"]*\s)?nofollow(?:\s[^"]*)?"|\'(?:[^\']*\s)?nofollow(?:\s[^\']*)?\'|nofollow(?=[\s/>]))[^>]*>#i';
    
    // Cast, so a descriptor without linkcode does not hand NULL to preg_match()
    return preg_match($rel_nofollow_pattern, (string)$url->linkcode) === 1;
  }
  
  /**
   * Checks the URL-parts against the general follow-mode (stay in domain / host / path).
   *
   * @param array $url_parts The URL-parts of the URL to check, as returned by PHPCrawlerUtils::splitURL()
   * @return bool TRUE if the URL is allowed by the current general_follow_mode
   */
  private function urlMatchesFollowMode(array $url_parts)
  {
    $mode  = $this->general_follow_mode;
    $start = $this->starting_url_parts;
    
    // Mode 1 and above: stay in domain
    if ($mode >= 1 && $url_parts["domain"] !== $start["domain"])
    {
      return false;
    }
    
    // Mode 2 and above: stay in host
    if ($mode >= 2 && self::stripWwwPrefix($url_parts["host"]) !== self::stripWwwPrefix($start["host"]))
    {
      return false;
    }
    
    // Mode 3: stay in path (same protocol, and the path has to start with the path of the starting-URL).
    // The host was already compared by the mode-2-check above.
    if ($mode == 3)
    {
      if ($url_parts["protocol"] !== $start["protocol"] ||
          strncmp($url_parts["path"], $start["path"], strlen($start["path"])) !== 0)
      {
        return false;
      }
    }
    
    return true;
  }
  
  /**
   * Checks the URL against the user-defined filter-rules and follow-rules.
   *
   * @param string $url_rebuild The full URL to check
   * @return bool FALSE if any filter-rule matches, or if follow-rules exist and none of them matches
   */
  private function urlMatchesUserRules($url_rebuild)
  {
    // A single matching filter-rule kicks the URL out
    foreach ($this->url_filter_rules as $filter_rule)
    {
      if (preg_match($filter_rule, $url_rebuild)) return false;
    }
    
    // Without any follow-rules every remaining URL is accepted
    if (count($this->url_follow_rules) == 0) return true;
    
    // Otherwise at least one follow-rule has to match
    foreach ($this->url_follow_rules as $follow_rule)
    {
      if (preg_match($follow_rule, $url_rebuild)) return true;
    }
    
    return false;
  }
  
  /**
   * Removes a leading "www." from a host, because "www.foo.com" is treated as the same host as "foo.com".
   *
   * @param string $host
   * @return string
   */
  private static function stripWwwPrefix($host)
  {
    return preg_replace("#^www\.#", "", $host);
  }
  
  /**
   * Adds a rule to the list of rules that decide which URLs found on a page should be followed by the crawler.
   *
   * As soon as at least one follow-rule is set, only URLs matching one of these rules pass the filter.
   *
   * @param string $regex The rule as a regular expression (PCRE, including delimiters)
   * @return mixed The result of PHPCrawlerUtils::checkRegexPattern(); the rule only gets added if it is truthy.
   */
  public function addURLFollowRule($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
    
    if ($check)
    {
      $this->url_follow_rules[] = trim($regex);
    }
    
    return $check;
  }
  
  /**
   * Adds a rule to the list of rules that decide which URLs found on a page should be ignored by the crawler. 
   *
   * @param string $regex The rule as a regular expression (PCRE, including delimiters)
   * @return mixed The result of PHPCrawlerUtils::checkRegexPattern(); the rule only gets added if it is truthy.
   */
  public function addURLFilterRule($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern
    
    if ($check)
    {
      $this->url_filter_rules[] = trim($regex);
    }
    
    return $check;
  }
  
  /**
   * Adds a bunch of rules to the list of rules that decide which URLs found on a page should be ignored by the crawler. 
   *
   * Invalid rules are skipped silently (see addURLFilterRule()).
   *
   * @param array $regex_array The rules as regular expressions; the keys of the array are irrelevant.
   */
  public function addURLFilterRules($regex_array)
  {
    // foreach instead of an index-loop, so associative and sparse arrays work as well
    foreach ($regex_array as $regex)
    {
      $this->addURLFilterRule($regex);
    }
  }
}
?>
Return current item: PHPCrawl