Location: PHPKode > projects > PhpSERA > phpsera-0.3alpha1/stuff/class-searchengine.inc.php
<?php

/*
  SearchEngine describes one search engine (host, script path, query
  template and the regular-expression fragments used to cut the result
  list out of its HTML) and can determine the ranking position of a
  website for a keyphrase by fetching and scanning a result page.
*/
class SearchEngine {
  var $m_id;
  var $m_title;
  var $m_startkey;    // regexp fragment matching the start of the result list
  var $m_endkey;      // regexp fragment matching the end of the result list
  var $m_separator;   // literal string separating two result items
  var $m_host;
  var $m_scriptpath;
  var $m_querydata;   // query template containing "[__KEYPHRASE__]"
  var $m_addurl;
  var $m_url;
  var $m_langcode;
  var $m_noresult;    // string which marks an empty resultlist

  // getRanking stops as soon as a sane ranking code (0 or higher) was
  // determined. If the connection fails, it retries up to
  // $max_connect_attempts times. If that fails as well (or the page
  // cannot be parsed), the whole procedure is done ONE more time,
  // resulting in a default maximum of 6 connect attempts per ranking.
  // Which should be enough, unless the search engine is down or you're
  // having serious network problems.
  var $max_connect_attempts = 3;

  // socket connect/read timeout in seconds
  var $timeout = 10;
  // HTTP method used by getPage(): "GET" or "POST"
  var $method = "GET";
  // when true, getRanking() echoes the scanned result items as an <ol>
  var $verbose = false;


  /* constructor */
  function __construct($id, $title, $startkey, $endkey, $separator, $host, $scriptpath, $querydata, $addurl, $url, $langcode, $noresult) {
    $this->m_id = $id;
    $this->set_title($title);
    $this->set_startkey($startkey);
    $this->set_endkey($endkey);
    $this->set_separator($separator);
    $this->set_host($host);
    $this->set_scriptpath($scriptpath);
    $this->set_querydata($querydata);
    $this->set_addurl($addurl);
    $this->set_url($url);
    $this->set_langcode($langcode);
    $this->set_noresult($noresult);
  }

  /*
    Old-style constructor, kept so that code calling it by name (and very
    old PHP versions) keeps working. PHP 8 no longer treats a method named
    after the class as a constructor, hence __construct() above.
  */
  function SearchEngine($id, $title, $startkey, $endkey, $separator, $host, $scriptpath, $querydata, $addurl, $url, $langcode, $noresult) {
    $this->__construct($id, $title, $startkey, $endkey, $separator, $host, $scriptpath, $querydata, $addurl, $url, $langcode, $noresult);
  }


  /*
    name   :  getRanking()
    desc   :  determine website position in SE rankings
    params :  $website    --> website object to look for (needs get_url())
              $keyphrase  --> keyphrase object to use (needs get_phrase())
    return :  array with key "position" and, for every case except a
              plain hit / plain miss, a key "message":
              - position # (if listed in search results)
              - 0  (not listed, or empty resultlist)
              - -1 (regexp error, or empty separator)
              - -2 (host connect error, only after all attempts failed)
              - -3 (regexp error, but website URL found in the HTML)

    phpSERA performs at most 2 ranking attempts, each consisting of at
    most $max_connect_attempts connect attempts. The result always
    reflects the LAST attempt; the loop ends as soon as an attempt
    yields a position of 0 or higher.

    TODO   : return "-2" on seemingly different SE layout; perhaps
             based on sizeof($searchresults)?
             Perhaps a new field [searchengines].[results]
             Less than [results] = error, more than [results]+2 error or so...
  */
  function getRanking($website, $keyphrase) {
    $url     = $website->get_url();
    $ranking = array("position" => 0); // init to "not listed"

    for ($ranking_attempt = 1; $ranking_attempt <= 2; $ranking_attempt++) {
      /* max. N attempts to connect */
      $html = null;
      for ($connect_attempt = 1; $connect_attempt <= $this->max_connect_attempts; $connect_attempt++) {
        $html = $this->getPage($keyphrase);
        // getPage() hands back an array only when the connect failed
        if (!is_array($html)) break;
      }

      /* if all attempts failed */
      if (!is_string($html)) {
        $message = (is_array($html) && isset($html["message"])) ? $html["message"] : "";
        $ranking = array("position" => -2, "message" => $message); // i.e. socket timeout
        continue; // next $ranking_attempt
      }

      $ranking = $this->_rankFromHtml($html, $url);

      // Houston, we have a sane ranking code: no need for another attempt
      if ($ranking["position"] >= 0) break;
    } // for $ranking_attempt

    return $ranking;
  }


  /*
    name   :  _rankFromHtml()   (internal helper of getRanking)
    desc   :  cut the result list out of a fetched page and look for
              the website URL in it
    params :  $html  --> raw response text as returned by getPage()
              $url   --> website URL to look for
    return :  ranking array, see getRanking()
  */
  function _rankFromHtml($html, $url) {
    $result = preg_match("/".$this->get_startkey()."(.+)".$this->get_endkey()."/s", $html, $templist);

    if (!$result) {
      /*
        ERROR in regular expressions due to:
          1) SE has a zero result list for this query, and returns
             a webpage which doesn't comply with the (otherwise)
             correct regexp's
          2) SE changed its layout
      */
      $noresult = (string) $this->get_noresult();

      // SITUATION 1
      if (strlen($noresult) > 3 && strpos($html, $noresult) !== false) {
        return array("position" => 0, "message" => "No results");
      }
      // SITUATION 2
      if ($this->_containsUrl($html, $url)) {
        return array("position" => -3, "message" => "Invalid regexp, but website seems to be listed");
      }
      return array("position" => -1, "message" => "Invalid regexp");
    }

    // explode() cannot split on an empty string; report it as a
    // configuration error instead of crashing
    $separator = (string) $this->get_separator();
    if ($separator === "") {
      return array("position" => -1, "message" => "Empty separator");
    }

    /*
      Hmm... we seem to have HTML output and valid regex's.
      Let's find out the ranking position!
    */
    $searchresults = explode($separator, $templist[1]);

    // If the list starts with the separator, item 0 is empty and the
    // array index already equals the 1-based position.
    $offset   = ($searchresults[0] === "") ? 0 : 1;
    $position = 0;
    $output   = "";

    // loop through the search result items
    for ($i = 0; $i < sizeof($searchresults); $i++) {
      // TODO: improve this check
      if ($this->verbose) {
        $output .= "<li>[index $i, looking for '".$url."']".$searchresults[$i]."</li>";
      }
      if ($this->_containsUrl($searchresults[$i], $url)) {
        $position = $i + $offset;
        break;
      }
    } // for

    if ($this->verbose) echo "<ol>$output</ol>";

    return array("position" => $position);
  }


  /*
    name   :  _containsUrl()   (internal helper)
    desc   :  check whether $haystack contains $url, either literally
              or in its urlencode()d form
    return :  true/false; an empty $url never matches
  */
  function _containsUrl($haystack, $url) {
    $url = (string) $url;
    if ($url === "") return false;

    return strpos($haystack, $url) !== false
        || strpos($haystack, urlencode($url)) !== false;
  }


  /*
    name   :  getPage()
    desc   :  grab the result page for a keyphrase over a plain socket
              connection to port 80 of the search engine host
    params :  $keyphrase   --> keyphrase object (needs get_phrase())
    return :  - string: the complete raw response (headers + body)
              - array: on connect failure, with keys "exitcode" (-1),
                "error" (errno) and "message" (error text)

    NOTE(review): the request is sent as HTTP/1.1; a chunked response
    is returned as-is and is not decoded here.
  */
  function getPage($keyphrase) {
    $query   = str_replace(
                    "[__KEYPHRASE__]",
                    urlencode($keyphrase->get_phrase()),
                    $this->get_querydata()
                  );

    // try to open socket, exit on error
    $ret = array("error" => 0, "message" => "");
    $fp  = fsockopen(
             $this->get_host(),
             80,
             $ret["error"],
             $ret["message"],
             $this->timeout
           );
    if (!$fp) {
      $ret["exitcode"] = -1;
      return $ret;
    }

    // also limit the time we wait for data, not just for the connect
    stream_set_timeout($fp, $this->timeout);

    // GET carries the query in the request target, POST in the body
    $is_post = ($this->method == "POST");
    if ($is_post) {
      $target = $this->get_scriptpath();
      $body   = $query;
    } else {
      $target = $this->get_scriptpath()."?".$query;
      $body   = "";
    }

    // send our request header (HTTP requires CRLF line endings)
    fputs($fp, $this->method." ".$target." HTTP/1.1\r\n");
    fputs($fp, "Host: ".$this->get_host()."\r\n");
    if ($is_post) {
      fputs($fp, "Content-type: application/x-www-form-urlencoded\r\n");
      fputs($fp, "Content-length: " . strlen($body) . "\r\n");
    }
    fputs($fp, "User-Agent: MSIE\r\n");
    fputs($fp, "Connection: close\r\n\r\n");
    if ($is_post) {
      fputs($fp, $body);
    }

    // retrieve the result page
    $buffer = "";
    while (!feof($fp)) {
      $line = fgets($fp, 128);
      // false means read error or timeout; stop instead of spinning
      if ($line === false) break;
      $buffer .= $line;
    }

    // close socket
    fclose($fp);
    return $buffer;
  }




  /* GET functions */

  function get_id() {
    return $this->m_id;
  }

  function get_title() {
    return $this->m_title;
  }

  function get_startkey() {
    return $this->m_startkey;
  }

  function get_endkey() {
    return $this->m_endkey;
  }

  function get_separator() {
    return $this->m_separator;
  }

  function get_host() {
    return $this->m_host;
  }

  function get_scriptpath() {
    return $this->m_scriptpath;
  }

  function get_querydata() {
    return $this->m_querydata;
  }

  function get_addurl() {
    return $this->m_addurl;
  }

  function get_url() {
    return $this->m_url;
  }

  function get_langcode() {
    return $this->m_langcode;
  }

  function get_noresult() {
    return $this->m_noresult;
  }




  /* SET functions */

  function set_title($title) {
    $this->m_title = $title;
  }

  function set_startkey($startkey) {
    $this->m_startkey = $startkey;
  }

  function set_endkey($endkey) {
    $this->m_endkey = $endkey;
  }

  function set_separator($separator) {
    $this->m_separator = $separator;
  }

  function set_host($host) {
    $this->m_host = $host;
  }

  function set_scriptpath($scriptpath) {
    // stored in m_scriptpath so that get_scriptpath() can read it back
    $this->m_scriptpath = $scriptpath;
  }

  function set_querydata($querydata) {
    $this->m_querydata = $querydata;
  }

  function set_addurl($addurl) {
    $this->m_addurl = $addurl;
  }

  function set_url($url) {
    $this->m_url = $url;
  }

  function set_langcode($langcode) {
    $this->m_langcode = $langcode;
  }

  function set_noresult($noresult) {
    $this->m_noresult = $noresult;
  }

}

?>
Return current item: PhpSERA