Location: PHPKode > projects > PHPCrawl > PHPCrawl_080/libs/UrlCache/PHPCrawlerSQLiteURLCache.class.php
<?php
/**
 * Class for caching/storing URLs/links in a SQLite-database-file.
 *
 * @package phpcrawl
 * @internal
 */
class PHPCrawlerSQLiteURLCache extends PHPCrawlerURLCacheBase
{
  /**
   * PDO-object for querying SQLite-file.
   *
   * @var PDO
   */
  protected $PDO;
  
  /**
   * Prepared statement for inserting URLS into the db-file as PDOStatement-object.
   *
   * @var PDOStatement
   */
  protected $PreparedInsertStatement;
  
  protected $sqlite_db_file;
  
  protected $db_analyzed = false;
  
  /**
   * Initiates an SQLite-URL-cache.
   *
   * @param string $file            The SQLite-fiel to use.
   * @param bool   $create_tables   Defines whether all necessary tables should be created
   */
  public function __construct($file, $create_tables = false)
  {
    $this->sqlite_db_file = $file;
    $this->openConnection($create_tables);
  }
  
  /**
   * Returns the next URL from the cache that should be crawled.
   *
   * @return PhpCrawlerURLDescriptor An PhpCrawlerURLDescriptor or NULL if currently no
   *                                 URL to process.
   */
  public function getNextUrl()
  {
    PHPCrawlerBenchmark::start("fetching_next_url_from_sqlitecache"); 
    
    $ok = $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION");
    
    // Get row with max priority-level
    $Result = $this->PDO->query("SELECT max(priority_level) AS max_priority_level FROM urls WHERE in_process = 0 AND processed = 0;");
    $row = $Result->fetch(PDO::FETCH_ASSOC);
    
    if ($row["max_priority_level"] == null) 
    {
      $Result->closeCursor();
      $this->PDO->exec("COMMIT;");
      return null;
    }
    
    $Result = $this->PDO->query("SELECT * FROM urls WHERE priority_level = ".$row["max_priority_level"]." and in_process = 0 AND processed = 0;");
    $row = $Result->fetch(PDO::FETCH_ASSOC);
    $Result->closeCursor();
     
    // Update row (set in process-flag)
    $this->PDO->exec("UPDATE urls SET in_process = 1 WHERE id = ".$row["id"].";");
    
    $this->PDO->exec("COMMIT;");
    
    PHPCrawlerBenchmark::stop("fetching_next_url_from_sqlitecache");
     
    // Return URL
    return new PHPCrawlerURLDescriptor($row["url_rebuild"], $row["link_raw"], $row["linkcode"], $row["linktext"], $row["refering_url"]);
  }
  
  /**
   * Has no function in this class
   */
  public function getAllURLs()
  {
  }
  
  /**
   * Removes all URLs and all priority-rules from the URL-cache.
   */
  public function clear()
  {
    $this->PDO->exec("DELETE FROM urls;");
    $this->PDO->exec("VACUUM;");
  }
  
  /**
   * Adds an URL to the url-cache
   *
   * @param PHPCrawlerURLDescriptor $UrlDescriptor      
   */
  public function addURL(PHPCrawlerURLDescriptor $UrlDescriptor)
  {
    if ($UrlDescriptor == null) return;
    
    // Hash of the URL
    $map_key = md5($UrlDescriptor->url_rebuild);
    
    // Get priority of URL
    $priority_level = $this->getUrlPriority($UrlDescriptor->url_rebuild);
    
    $this->createPreparedInsertStatement();
                                                                    
    // Insert URL via prepared statement
    $this->PreparedInsertStatement->execute(array(":priority_level" => $priority_level,
                                                  ":distinct_hash" => $map_key,
                                                  ":link_raw" => $UrlDescriptor->link_raw,
                                                  ":linkcode" => $UrlDescriptor->linkcode,
                                                  ":linktext" => $UrlDescriptor->linktext,
                                                  ":refering_url" => $UrlDescriptor->refering_url,
                                                  ":url_rebuild" => $UrlDescriptor->url_rebuild,
                                                  ":is_redirect_url" => $UrlDescriptor->is_redirect_url));
  }
  
  /**
   * Adds an bunch of URLs to the url-cache
   *
   * @param array $urls  A numeric array containing the URLs as PHPCrawlerURLDescriptor-objects
   */
  public function addURLs($urls)
  {
    PHPCrawlerBenchmark::start("adding_urls_to_sqlitecache"); 
    
    $this->PDO->exec("BEGIN EXCLUSIVE TRANSACTION;");
    
    $cnt = count($urls);
    for ($x=0; $x<$cnt; $x++)
    {
      if ($urls[$x] != null)
      {
        $this->addURL($urls[$x]);
      }
    }
    
    $this->PDO->exec("COMMIT;");
    $this->PreparedInsertStatement->closeCursor();
        
    if ($this->db_analyzed == false)
    {
      $this->PDO->exec("ANALYZE;");
      $this->db_analyzed = true;
    }
    
    PHPCrawlerBenchmark::stop("adding_urls_to_sqlitecache"); 
  }
  
  /**
   * Marks the given URL in the cache as "followed"
   *
   * @param PHPCrawlerURLDescriptor $UrlDescriptor
   */
  public function markUrlAsFollowed(PHPCrawlerURLDescriptor $UrlDescriptor)
  {
    PHPCrawlerBenchmark::start("marking_url_as_followes");
    $hash = md5($UrlDescriptor->url_rebuild);
    $this->PDO->exec("UPDATE urls SET processed = 1, in_process = 0 WHERE distinct_hash = '".$hash."';");
    PHPCrawlerBenchmark::stop("marking_url_as_followes"); 
  }
  
  /**
   * Checks whether there are URLs left in the cache that should be processed or not.
   *
   * @return bool
   */
  public function containsURLs()
  {
    PHPCrawlerBenchmark::start("checking_for_urls_in_cache");
    
    $Result = $this->PDO->query("SELECT id FROM urls WHERE processed = 0 OR in_process = 1 LIMIT 1;");
    
    $has_columns = $Result->fetchColumn();
    
    $Result->closeCursor();
    
    PHPCrawlerBenchmark::stop("checking_for_urls_in_cache");
    
    if ($has_columns != false)
    {
      return true;
    }
    else return false;
  }
  
  /**
   * Creates the sqlite-db-file and opens connection to it.
   *
   * @param bool $create_tables Defines whether all necessary tables should be created
   */
  protected function openConnection($create_tables = false)
  {
    PHPCrawlerBenchmark::start("connecting_to_sqlite_db");
    
    // Open sqlite-file
    try
    {
      $this->PDO = new PDO("sqlite:".$this->sqlite_db_file);
    }
    catch (Exception $e)
    {
      throw new Exception("Error creating SQLite-cache-file, ".$e->getMessage().", try installing sqlite3-extension for PHP.");
    }
    
    $this->PDO->exec("PRAGMA journal_mode = OFF");
    
    $this->PDO->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
    $this->PDO->setAttribute(PDO::ATTR_TIMEOUT, 100);
    
    if ($create_tables == true)
    {
      // Create url-table (if not exists)
      $this->PDO->exec("CREATE TABLE IF NOT EXISTS urls (id integer PRIMARY KEY AUTOINCREMENT,
                                                         in_process bool DEFAULT 0,
                                                         processed bool DEFAULT 0,
                                                         priority_level integer,
                                                         distinct_hash TEXT UNIQUE,
                                                         link_raw TEXT,
                                                         linkcode TEXT,
                                                         linktext TEXT,
                                                         refering_url TEXT,
                                                         url_rebuild TEXT,
                                                         is_redirect_url bool);");
      
      // Create indexes (seems that indexes make the whole thingy slower)
      $this->PDO->exec("CREATE INDEX IF NOT EXISTS priority_level ON urls (priority_level);");
      $this->PDO->exec("CREATE INDEX IF NOT EXISTS distinct_hash ON urls (distinct_hash);");
      $this->PDO->exec("CREATE INDEX IF NOT EXISTS in_process ON urls (in_process);");
      $this->PDO->exec("CREATE INDEX IF NOT EXISTS processed ON urls (processed);");
      
      $this->PDO->exec("ANALYZE;");
    }
    
    PHPCrawlerBenchmark::stop("connecting_to_sqlite_db");
  }
  
  /**
   * Creates the prepared statement for insterting URLs into database (if not done yet)
   */
  protected function createPreparedInsertStatement()
  {
    if ($this->PreparedInsertStatement == null)
    {
      // Prepared statement for URL-inserts                                      
      $this->PreparedInsertStatement = $this->PDO->prepare("INSERT OR IGNORE INTO urls (priority_level, distinct_hash, link_raw, linkcode, linktext, refering_url, url_rebuild, is_redirect_url)
                                                            VALUES(:priority_level,
                                                                   :distinct_hash,
                                                                   :link_raw,
                                                                   :linkcode,
                                                                   :linktext,
                                                                   :refering_url,
                                                                   :url_rebuild,
                                                                   :is_redirect_url);");
    }
  }
  
  /**
   * Cleans up the cache after is it not needed anymore.
   */
  public function cleanup()
  {
    unlink($this->sqlite_db_file);
  }
}
?>
Return current item: PHPCrawl