<?php
/**
* Class for performing HTTP-requests.
*
* @package phpcrawl
* @internal
*/
class PHPCrawlerHTTPRequest
{
  /**
   * The user-agent-string sent in the "User-Agent" request-header.
   *
   * @var string
   */
  public $userAgentString = "PHPCrawl";

  /**
   * Timeout-value for establishing the socket-connection (passed to fsockopen()).
   *
   * @var int Timeout in seconds
   */
  public $socketConnectTimeout = 5;

  /**
   * Socket-read-timeout, applied to the socket before every read.
   *
   * @var int Timeout in seconds
   */
  public $socketReadTimeout = 2;

  /**
   * Limit for content-size to receive.
   *
   * @var int The limit in bytes. A value of 0 (or less) disables the limit.
   */
  protected $content_size_limit = 0;

  /**
   * Global counter for traffic this instance of the HTTPRequest-class caused.
   *
   * @var int Traffic in bytes
   */
  protected $global_traffic_count = 0;

  /**
   * The time it took to receive the data-packets of the last request
   * (as reported by the "data_transfer_time"-benchmark).
   *
   * @var float Time in seconds
   */
  protected $data_transfer_time = 0;

  /**
   * Contains all rules defining the content-types that should be received.
   *
   * @var array Numeric array containing the regex-rules
   */
  protected $receive_content_types = array();

  /**
   * Contains all rules defining the content-types of pages/files that should be streamed directly to
   * a temporary file (instead of to memory).
   *
   * @var array Numeric array containing the regex-rules
   */
  protected $receive_to_file_content_types = array();

  /**
   * Contains all rules defining which documents (by content-type) should get checked for links.
   *
   * @var array Numeric array containing the regex-rules
   */
  protected $linksearch_content_types = array("#text/html# i");

  /**
   * The TMP-file to use when a page/file should be streamed to file.
   *
   * @var string
   */
  protected $tmpFile = "phpcrawl.tmp";

  /**
   * The URL for the request as PHPCrawlerURLDescriptor-object.
   *
   * @var PHPCrawlerURLDescriptor
   */
  protected $UrlDescriptor;

  /**
   * The parts of the URL for the request as returned by PHPCrawlerUtils::splitURL().
   *
   * @var array
   */
  protected $url_parts = array();

  /**
   * DNS-cache
   *
   * @var PHPCrawlerDNSCache
   */
  public $DNSCache;

  /**
   * Link-finder object
   *
   * @var PHPCrawlerLinkFinder
   */
  protected $LinkFinder;

  /**
   * The last response-header this request-instance received.
   *
   * @var PHPCrawlerResponseHeader
   */
  protected $lastResponseHeader;

  /**
   * Array containing cookies to send with the request (cookie-name => cookie-value).
   *
   * @var array
   */
  protected $cookie_array = array();

  /**
   * Array containing POST-data to send with the request (field-name => value).
   *
   * @var array
   */
  protected $post_data = array();

  /**
   * The proxy to use, NULL if no proxy was set.
   *
   * @var array Array containing the keys "proxy_host", "proxy_port", "proxy_username", "proxy_password".
   */
  protected $proxy;

  /**
   * The socket used for HTTP-requests.
   *
   * @var resource
   */
  protected $socket;

  /**
   * Callback (array(object, method-name)) that gets called with the received response-header
   * before any content is read. NULL if no callback was set.
   *
   * @var array
   */
  protected $header_check_callback_function = null;

  /**
   * Loads the helper-classes this class depends on (if not loaded yet) and creates
   * the link-finder and the DNS-cache.
   */
  public function __construct()
  {
    // Init LinkFinder
    if (!class_exists("PHPCrawlerLinkFinder"))
    {
      include_once(dirname(__FILE__)."/PHPCrawlerLinkFinder.class.php");
    }
    $this->LinkFinder = new PHPCrawlerLinkFinder();

    // Init DNS-cache
    if (!class_exists("PHPCrawlerDNSCache"))
    {
      include_once(dirname(__FILE__)."/PHPCrawlerDNSCache.class.php");
    }
    $this->DNSCache = new PHPCrawlerDNSCache();

    // Cookie-Descriptor
    if (!class_exists("PHPCrawlerCookieDescriptor"))
    {
      include_once(dirname(__FILE__)."/PHPCrawlerCookieDescriptor.class.php");
    }

    // ResponseHeader-class
    if (!class_exists("PHPCrawlerResponseHeader"))
    {
      include_once(dirname(__FILE__)."/PHPCrawlerResponseHeader.class.php");
    }
  }

  /**
   * Sets the URL for the request.
   *
   * Note that this replaces all stored URL-parts, including authentication-data
   * set by a previous call to setBasicAuthentication().
   *
   * @param PHPCrawlerURLDescriptor $UrlDescriptor A PHPCrawlerURLDescriptor-object containing the URL to request
   */
  public function setUrl(PHPCrawlerURLDescriptor $UrlDescriptor)
  {
    $this->UrlDescriptor = $UrlDescriptor;

    // Split the URL into its parts
    $this->url_parts = PHPCrawlerUtils::splitURL($UrlDescriptor->url_rebuild);
  }

  /**
   * Adds a cookie to send with the request. A cookie with the same name gets overwritten.
   *
   * @param string $name Cookie-name
   * @param string $value Cookie-value
   */
  public function addCookie($name, $value)
  {
    $this->cookie_array[$name] = $value;
  }

  /**
   * Adds a cookie to send with the request.
   *
   * @param PHPCrawlerCookieDescriptor $Cookie
   */
  public function addCookieDescriptor(PHPCrawlerCookieDescriptor $Cookie)
  {
    $this->addCookie($Cookie->name, $Cookie->value);
  }

  /**
   * Adds a bunch of cookies to send with the request.
   *
   * @param array $cookies Array containing cookies as PHPCrawlerCookieDescriptor-objects.
   *                       The array-keys are irrelevant, so non-sequential arrays work as well.
   */
  public function addCookieDescriptors($cookies)
  {
    foreach ($cookies as $Cookie)
    {
      $this->addCookieDescriptor($Cookie);
    }
  }

  /**
   * Removes all cookies to send with the request.
   */
  public function clearCookies()
  {
    $this->cookie_array = array();
  }

  /**
   * Sets the html-tags from which to extract/find links from.
   *
   * @param array $tag_array Numeric array containing the tags, e.g. array("href", "src", "url", ...)
   * @return bool FALSE if the given value is not an array, otherwise TRUE
   */
  public function setLinkExtractionTags($tag_array)
  {
    if (!is_array($tag_array)) return false;

    $this->LinkFinder->extract_tags = $tag_array;
    return true;
  }

  /**
   * Specifies whether redirect-links set in http-headers should get searched for.
   *
   * @param bool $mode
   * @return bool FALSE if the given value is not a boolean, otherwise TRUE
   */
  public function setFindRedirectURLs($mode)
  {
    if (!is_bool($mode)) return false;

    $this->LinkFinder->find_redirect_urls = $mode;
    return true;
  }

  /**
   * Adds post-data to send with the request. As soon as post-data is present,
   * the request gets sent as a POST-request.
   *
   * @param string $key   Name of the form-field
   * @param string $value Value of the form-field
   */
  public function addPostData($key, $value)
  {
    $this->post_data[$key] = $value;
  }

  /**
   * Removes all post-data to send with the request.
   */
  public function clearPostData()
  {
    $this->post_data = array();
  }

  /**
   * Sets the proxy to send all requests through.
   *
   * @param string $proxy_host
   * @param int    $proxy_port
   * @param string $proxy_username Optional username for proxy-authentication
   * @param string $proxy_password Optional password for proxy-authentication
   */
  public function setProxy($proxy_host, $proxy_port, $proxy_username = null, $proxy_password = null)
  {
    $this->proxy = array(
      "proxy_host" => $proxy_host,
      "proxy_port" => $proxy_port,
      "proxy_username" => $proxy_username,
      "proxy_password" => $proxy_password
    );
  }

  /**
   * Sets basic-authentication login-data for protected URLs.
   *
   * The data is stored together with the URL-parts, so it has to be set AFTER setUrl() was called
   * (setUrl() replaces the URL-parts).
   *
   * @param string $username
   * @param string $password
   */
  public function setBasicAuthentication($username, $password)
  {
    $this->url_parts["auth_username"] = $username;
    $this->url_parts["auth_password"] = $password;
  }

  /**
   * Enables/disables aggressive linksearch.
   *
   * @param bool $mode
   * @return bool FALSE if the given value is not a boolean, otherwise TRUE
   */
  public function enableAggressiveLinkSearch($mode)
  {
    if (!is_bool($mode)) return false;

    $this->LinkFinder->aggressive_search = $mode;
    return true;
  }

  /**
   * Registers a callback-method that gets called with the PHPCrawlerResponseHeader-object right after
   * the response-header was received. If the callback returns a value less than 0, the content
   * of the document won't be received.
   *
   * @param object &$obj        The object providing the method
   * @param string $method_name Name of the method to call
   */
  public function setHeaderCheckCallbackFunction(&$obj, $method_name)
  {
    $this->header_check_callback_function = array($obj, $method_name);
  }

  /**
   * Sends the HTTP-request and receives the page/file.
   *
   * @return PHPCrawlerDocumentInfo A PHPCrawlerDocumentInfo-object containing all information about the received page/file
   * @throws Exception If a proxy was set and it couldn't be reached
   */
  public function sendRequest()
  {
    // Prepare LinkFinder
    $this->LinkFinder->resetLinkCache();
    $this->LinkFinder->setSourceUrl($this->UrlDescriptor);

    // Initiate the Response-object and pass base-infos
    $PageInfo = new PHPCrawlerDocumentInfo();
    $PageInfo->url = $this->UrlDescriptor->url_rebuild;
    $PageInfo->protocol = $this->url_parts["protocol"];
    $PageInfo->host = $this->url_parts["host"];
    $PageInfo->path = $this->url_parts["path"];
    $PageInfo->file = $this->url_parts["file"];
    $PageInfo->query = $this->url_parts["query"];
    $PageInfo->port = $this->url_parts["port"];

    // Create header to send
    $request_header_lines = $this->buildRequestHeader();
    $PageInfo->header_send = trim(implode("", $request_header_lines));

    // Open socket
    $this->openSocket($PageInfo->error_code, $PageInfo->error_string);

    // If error occured
    if ($PageInfo->error_code != null)
    {
      // If proxy-error -> throw exception
      if ($PageInfo->error_code == PHPCrawlerRequestErrors::ERROR_PROXY_UNREACHABLE)
      {
        throw new Exception("Unable to connect to proxy '".$this->proxy["proxy_host"]."' on port '".$this->proxy["proxy_port"]."'");
      }

      $PageInfo->error_occured = true;
      return $PageInfo;
    }

    // Send request
    $this->sendRequestHeader($request_header_lines);

    // Read response-header
    $response_header = $this->readResponseHeader($PageInfo->error_code, $PageInfo->error_string);

    // If error occured
    if ($PageInfo->error_code != null)
    {
      // The socket was opened successfully above, so it has to be released here as well
      $this->closeSocket();

      $PageInfo->error_occured = true;
      return $PageInfo;
    }

    // Set header-infos
    $this->lastResponseHeader = new PHPCrawlerResponseHeader($response_header, $this->UrlDescriptor->url_rebuild);
    $PageInfo->responseHeader = $this->lastResponseHeader;
    $PageInfo->header = $this->lastResponseHeader->header_raw;
    $PageInfo->http_status_code = $this->lastResponseHeader->http_status_code;
    $PageInfo->content_type = $this->lastResponseHeader->content_type;
    $PageInfo->cookies = $this->lastResponseHeader->cookies;

    // Referer-Infos
    if ($this->UrlDescriptor->refering_url != null)
    {
      $PageInfo->referer_url = $this->UrlDescriptor->refering_url;
      $PageInfo->refering_linkcode = $this->UrlDescriptor->linkcode;
      $PageInfo->refering_link_raw = $this->UrlDescriptor->link_raw;
      $PageInfo->refering_linktext = $this->UrlDescriptor->linktext;
    }

    // Call header-check-callback
    $ret = 0;
    if ($this->header_check_callback_function != null)
    {
      $ret = call_user_func($this->header_check_callback_function, $this->lastResponseHeader);
    }

    // Check if content should be received
    $receive = $this->decideRecevieContent($this->lastResponseHeader);

    if ($ret < 0 || $receive == false)
    {
      $this->closeSocket();

      $PageInfo->received = false;
      $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs(); // Maybe found a link/redirect in the header
      $PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes();
      return $PageInfo;
    }

    $PageInfo->received = true;

    // Check if content should be streamed to file
    $stream_to_file = $this->decideStreamToFile($response_header);

    // Read content
    $response_content = $this->readResponseContent($stream_to_file, $PageInfo->error_code, $PageInfo->error_string, $PageInfo->received_completely, $PageInfo->bytes_received);

    // If error occured
    if ($PageInfo->error_code != null)
    {
      $PageInfo->error_occured = true;
    }

    $this->closeSocket();

    // Complete ResponseObject
    $PageInfo->content = $PageInfo->source = $response_content;
    $PageInfo->received_completly = $PageInfo->received_completely; // Misspelled property kept for backward-compatibility
    $PageInfo->data_transfer_time = $this->data_transfer_time;

    // A measured transfer-time of 0 would cause a division by zero (an error since PHP 8)
    if ($this->data_transfer_time > 0)
    {
      $PageInfo->data_transfer_rate = $PageInfo->bytes_received / $this->data_transfer_time;
    }
    else
    {
      $PageInfo->data_transfer_rate = 0;
    }

    if ($stream_to_file == true)
    {
      $PageInfo->received_to_file = true;
      $PageInfo->content_tmp_file = $this->tmpFile;
    }
    else
    {
      $PageInfo->received_to_memory = true;
    }

    $PageInfo->links_found_url_descriptors = $this->LinkFinder->getAllURLs();
    $PageInfo->meta_attributes = $this->LinkFinder->getAllMetaAttributes();
    $PageInfo->setLinksFoundArray();

    return $PageInfo;
  }

  /**
   * Closes the request-socket if it is (still) an open resource.
   */
  private function closeSocket()
  {
    if (is_resource($this->socket))
    {
      fclose($this->socket);
    }
  }

  /**
   * Opens the socket to the host (or to the proxy if one was set).
   *
   * On success the passed $error_code and $error_string are left untouched.
   *
   * @param int    &$error_code   Error-code by reference if an error occured.
   * @param string &$error_string Error-string by reference
   * @return bool TRUE if socket could be opened, otherwise FALSE.
   */
  protected function openSocket(&$error_code, &$error_string)
  {
    PHPCrawlerBenchmark::start("connecting_server");

    $use_proxy = ($this->proxy != null);

    // SSL or not?
    $protocol_prefix = ($this->url_parts["protocol"] == "https://") ? "ssl://" : "";

    // A direct SSL-connection is impossible without the openssl-extension, so stop right here.
    // (Going on would let fsockopen() fail and misreport the problem as "host unreachable".)
    // Connections to a proxy never use the ssl-transport and therefore aren't affected.
    if ($use_proxy == false && $protocol_prefix == "ssl://" && !extension_loaded("openssl"))
    {
      PHPCrawlerBenchmark::stop("connecting_server");

      $error_code = PHPCrawlerRequestErrors::ERROR_SSL_NOT_SUPPORTED;
      $error_string = "Error connecting to ".$this->url_parts["protocol"].$this->url_parts["host"].": SSL/HTTPS-requests not supported, extension openssl not installed.";
      return false;
    }

    // Get IP for hostname
    $ip_address = $this->DNSCache->getIP($this->url_parts["host"]);

    // Open socket. fsockopen() reports its own error-number into a local variable so that a
    // successful connect doesn't overwrite the callers error-code.
    // The @ is intended: connection-failures get reported through the error-code below.
    $errno = 0;
    $error_str = "";

    if ($use_proxy == true)
    {
      $this->socket = @fsockopen($this->proxy["proxy_host"], $this->proxy["proxy_port"], $errno, $error_str, $this->socketConnectTimeout);
    }
    else
    {
      $this->socket = @fsockopen($protocol_prefix.$ip_address, $this->url_parts["port"], $errno, $error_str, $this->socketConnectTimeout);
    }

    PHPCrawlerBenchmark::stop("connecting_server");

    if ($this->socket != false) return true;

    // Socket not opened -> report error
    if ($use_proxy == true)
    {
      $error_code = PHPCrawlerRequestErrors::ERROR_PROXY_UNREACHABLE;
      $error_string = "Error connecting to proxy ".$this->proxy["proxy_host"].": Host unreachable (".$error_str.").";
    }
    else
    {
      $error_code = PHPCrawlerRequestErrors::ERROR_HOST_UNREACHABLE;
      $error_string = "Error connecting to ".$this->url_parts["protocol"].$this->url_parts["host"].": Host unreachable (".$error_str.").";
    }

    return false;
  }

  /**
   * Sends the request-header through the opened socket.
   *
   * @param array $request_header_lines Numeric array containing the lines of the request-header
   */
  protected function sendRequestHeader($request_header_lines)
  {
    PHPCrawlerBenchmark::start("sending_header");

    // Send header
    $cnt = count($request_header_lines);
    for ($x = 0; $x < $cnt; $x++)
    {
      fputs($this->socket, $request_header_lines[$x]);
    }

    PHPCrawlerBenchmark::stop("sending_header");
  }

  /**
   * Reads the response-header.
   *
   * @param int    &$error_code   Error-code by reference if an error occured.
   * @param string &$error_string Error-string by reference
   * @return string The response-header. If an error occured, an empty string (timeout, protocol-error)
   *                or NULL (connection closed before a complete header was received) gets returned.
   */
  protected function readResponseHeader(&$error_code, &$error_string)
  {
    PHPCrawlerBenchmark::start("server_response_time");
    PHPCrawlerBenchmark::start("data_transfer_time", true);

    $status = socket_get_status($this->socket);

    $source_read = "";
    $header = "";
    $server_responded = false;

    while ($status["eof"] == false)
    {
      socket_set_timeout($this->socket, $this->socketReadTimeout);

      // Read from socket
      $line_read = fgets($this->socket, 1024);
      if ($line_read === false) $line_read = "";

      // The first return of fgets() marks the moment the server responded
      if ($server_responded == false)
      {
        $server_responded = true;
        PHPCrawlerBenchmark::stop("server_response_time");
        PHPCrawlerBenchmark::start("retreiving_header");
      }

      $source_read .= $line_read;
      $this->global_traffic_count += strlen($line_read);

      $status = socket_get_status($this->socket);

      // Socket timed out
      if ($status["timed_out"] == true)
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
        $error_string = "Socket-stream timed out (timeout set to ".$this->socketReadTimeout." sec).";
        return $header;
      }

      // No "HTTP" at beginning of response
      if (strtolower(substr($source_read, 0, 4)) != "http")
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_NO_HTTP_HEADER;
        $error_string = "HTTP-protocol error.";
        return $header;
      }

      // An empty line terminates the header
      if (substr($source_read, -4, 4) == "\r\n\r\n")
      {
        // Cut off the trailing empty line
        $header = substr($source_read, 0, strlen($source_read) - 2);

        // Search for links (redirects) in the header
        $this->LinkFinder->processHTTPHeader($header);

        PHPCrawlerBenchmark::stop("retreiving_header");
        PHPCrawlerBenchmark::stop("data_transfer_time");

        return $header;
      }
    }

    // Getting here means the stream ended before a complete header was received
    $error_code = PHPCrawlerRequestErrors::ERROR_NO_HTTP_HEADER;
    $error_string = "Host doesn't respond with a HTTP-header.";
    return null;
  }

  /**
   * Reads the response-content.
   *
   * @param bool   $stream_to_file                If TRUE, the content will be streamed directly to the temporary file and
   *                                              this method will not return the content as a string.
   * @param int    &$error_code                   Error-code by reference if an error occured.
   * @param string &$error_string                 Error-string by reference
   * @param bool   &$document_received_completely Flag indicating whether the content was received completely, passed by reference
   * @param int    &$bytes_received               Number of bytes received, passed by reference
   * @return string The response-content/source. May be empty if an error occured or data was streamed to the tmp-file.
   */
  protected function readResponseContent($stream_to_file, &$error_code, &$error_string, &$document_received_completely, &$bytes_received)
  {
    PHPCrawlerBenchmark::start("retreiving_content");
    PHPCrawlerBenchmark::start("data_transfer_time", true);

    // If content should be streamed to file
    if ($stream_to_file == true)
    {
      $fp = @fopen($this->tmpFile, "w");

      if ($fp == false)
      {
        $error_code = PHPCrawlerRequestErrors::ERROR_TMP_FILE_NOT_WRITEABLE;
        $error_string = "Couldn't open the temporary file ".$this->tmpFile." for writing.";
        return "";
      }
    }

    // Init
    $status = socket_get_status($this->socket);
    $source_portion = "";   // Buffer that gets handed over to the link-finder chunk by chunk
    $source_complete = "";  // The complete content (only filled if not streamed to file)
    $bytes_received = 0;
    $document_received_completely = true;

    $stop_receving = false;
    while ($stop_receving == false)
    {
      socket_set_timeout($this->socket, $this->socketReadTimeout);

      // Read from socket. The @ suppresses the pointless "SSL fatal protocol error"-warning.
      $line_read = @fread($this->socket, 1024);

      // Check socket-status
      $status = socket_get_status($this->socket);

      // Check for EOF
      if ($status["eof"] == true) $stop_receving = true;

      // Socket timed out
      if ($status["timed_out"] == true)
      {
        $stop_receving = true;
        $error_code = PHPCrawlerRequestErrors::ERROR_SOCKET_TIMEOUT;
        $error_string = "Socket-stream timed out (timeout set to ".$this->socketReadTimeout." sec).";
        $document_received_completely = false;
      }
      else
      {
        $source_portion .= $line_read;
        $bytes_received += strlen($line_read);
        $this->global_traffic_count += strlen($line_read);

        // Stream to file or store source in memory
        if ($stream_to_file == true)
        {
          @fwrite($fp, $line_read);
        }
        else
        {
          $source_complete .= $line_read;
        }
      }

      // Check if content-length stated in the header is reached
      if ($this->lastResponseHeader->content_length == $bytes_received)
      {
        $stop_receving = true;
      }

      // Check if contentsize-limit is reached
      if ($this->content_size_limit > 0 && $this->content_size_limit <= $bytes_received)
      {
        $stop_receving = true;
      }

      // Find links in portion of the source
      if (strlen($source_portion) >= 100000 || $stop_receving == true)
      {
        if (PHPCrawlerUtils::checkStringAgainstRegexArray($this->lastResponseHeader->content_type, $this->linksearch_content_types))
        {
          // Searching for links shouldn't count as transfer-time
          PHPCrawlerBenchmark::stop("retreiving_content");
          PHPCrawlerBenchmark::stop("data_transfer_time");

          $this->LinkFinder->findLinksInHTMLChunk($source_portion);

          // Keep the tail of the chunk so that links crossing the chunk-border are still found
          $source_portion = substr($source_portion, -1500);

          PHPCrawlerBenchmark::start("retreiving_content");
          PHPCrawlerBenchmark::start("data_transfer_time", true);
        }
      }
    }

    if ($stream_to_file == true) @fclose($fp);

    PHPCrawlerBenchmark::stop("retreiving_content");
    PHPCrawlerBenchmark::stop("data_transfer_time");

    $this->data_transfer_time = PHPCrawlerBenchmark::getElapsedTime("data_transfer_time");
    PHPCrawlerBenchmark::reset("data_transfer_time");

    return $source_complete;
  }

  /**
   * Builds the request-header from the given settings.
   *
   * For POST-requests the last element of the returned array is the request-body.
   *
   * @return array Numeric array containing the lines of the request-header
   */
  protected function buildRequestHeader()
  {
    // Create header
    $headerlines = array();

    // Method (GET or POST)
    $request_type = (count($this->post_data) > 0) ? "POST" : "GET";

    if ($this->proxy != null)
    {
      // A proxy needs the full qualified URL in the GET or POST headerline.
      $headerlines[] = $request_type." ".$this->UrlDescriptor->url_rebuild ." HTTP/1.0\r\n";
    }
    else
    {
      $query = $this->prepareHTTPRequestQuery($this->url_parts["path"].$this->url_parts["file"].$this->url_parts["query"]);
      $headerlines[] = $request_type." ".$query." HTTP/1.0\r\n";
    }

    $headerlines[] = "HOST: ".$this->url_parts["host"]."\r\n";

    // Neither CR nor LF may be part of a header-value, otherwise the header-line would get split
    $headerlines[] = "User-Agent: ".str_replace(array("\r", "\n"), "", $this->userAgentString)."\r\n";

    // Referer
    if ($this->UrlDescriptor->refering_url != null)
    {
      $headerlines[] = "Referer: ".$this->UrlDescriptor->refering_url."\r\n";
    }

    // Cookies (only if there are any)
    $cookie_header = $this->buildCookieHeader();
    if ($cookie_header != "")
    {
      $headerlines[] = $cookie_header;
    }

    // Authentication (the keys only exist if the URL contained login-data or setBasicAuthentication() was called)
    if (isset($this->url_parts["auth_username"], $this->url_parts["auth_password"]) &&
        $this->url_parts["auth_username"] != "" && $this->url_parts["auth_password"] != "")
    {
      $auth_string = base64_encode($this->url_parts["auth_username"].":".$this->url_parts["auth_password"]);
      $headerlines[] = "Authorization: Basic ".$auth_string."\r\n";
    }

    // Proxy authentication
    if ($this->proxy != null && $this->proxy["proxy_username"] != null)
    {
      $auth_string = base64_encode($this->proxy["proxy_username"].":".$this->proxy["proxy_password"]);
      $headerlines[] = "Proxy-Authorization: Basic ".$auth_string."\r\n";
    }

    $headerlines[] = "Connection: close\r\n";

    if ($request_type == "POST")
    {
      // Build the post-content and append it right after the empty line terminating the header
      $post_content = $this->buildPostContent();

      $headerlines[] = "Content-Type: multipart/form-data; boundary=---------------------------10786153015124\r\n";
      $headerlines[] = "Content-Length: ".strlen($post_content)."\r\n\r\n";
      $headerlines[] = $post_content;
    }
    else
    {
      $headerlines[] = "\r\n";
    }

    return $headerlines;
  }

  /**
   * Prepares the given HTTP-query-string for the HTTP-request.
   *
   * HTTP-query-strings always should be utf8-encoded and urlencoded afterwards.
   * So "/path/file?test=tatütata" will be converted to "/path/file?test=tat%C3%BCtata".
   * Strings that are not valid UTF-8 are treated as ISO-8859-1.
   *
   * @param string $query The query-string (like "/path/file?test=tatütata")
   * @return string
   */
  protected function prepareHTTPRequestQuery($query)
  {
    // If string already is URL-encoded -> do nothing
    if (PHPCrawlerUtils::isUrlEncodedString($query)) return $query;

    // If query isn't utf-8 encoded yet -> encode it to utf8 first
    if (PHPCrawlerUtils::isUTF8String($query) != true)
    {
      // utf8_encode() is deprecated as of PHP 8.2, so prefer the mbstring-equivalent if available
      if (function_exists("mb_convert_encoding"))
      {
        $query = mb_convert_encoding($query, "UTF-8", "ISO-8859-1");
      }
      else
      {
        $query = utf8_encode($query);
      }
    }

    $query = rawurlencode($query);

    // Replace url-specific signs back
    return str_replace(array("%2F", "%3F", "%3D", "%26"), array("/", "?", "=", "&"), $query);
  }

  /**
   * Builds the post-content (request-body) from the postdata-array (MIME-style, multipart/form-data).
   *
   * @return string The complete POST-content
   */
  protected function buildPostContent()
  {
    $post_content = "";

    // One MIME-part per POST-field
    foreach ($this->post_data as $key => $value)
    {
      $post_content .= "-----------------------------10786153015124\r\n";
      $post_content .= "Content-Disposition: form-data; name=\"".$key."\"\r\n\r\n";
      $post_content .= $value."\r\n";
    }

    // The closing delimiter of a multipart-body carries two trailing hyphens (RFC 2046)
    $post_content .= "-----------------------------10786153015124--\r\n";

    return $post_content;
  }

  /**
   * Builds the cookie-header-part for the header to send.
   *
   * @return string The cookie-header-part, i.e. "Cookie: test=bla; palimm=palaber" (followed by a line-break),
   *                or an empty string if there are no cookies to send.
   */
  protected function buildCookieHeader()
  {
    $cookie_pairs = array();

    foreach ($this->cookie_array as $key => $value)
    {
      $cookie_pairs[] = $key."=".$value;
    }

    if (count($cookie_pairs) == 0) return "";

    return "Cookie: ".implode("; ", $cookie_pairs)."\r\n";
  }

  /**
   * Checks whether the content of this page/file should be received (based on the content-type
   * and the applied rules).
   *
   * @param PHPCrawlerResponseHeader $responseHeader The response-header as a PHPCrawlerResponseHeader-object
   * @return bool TRUE if the content should be received
   */
  protected function decideRecevieContent(PHPCrawlerResponseHeader $responseHeader)
  {
    // Get Content-Type from header
    $content_type = $responseHeader->content_type;

    // No Content-Type given
    if ($content_type == null) return false;

    // Check against the given rules
    return PHPCrawlerUtils::checkStringAgainstRegexArray($content_type, $this->receive_content_types);
  }

  /**
   * Checks whether the content of this page/file should be streamed directly to file.
   *
   * @param string $response_header The (raw) response-header
   * @return bool TRUE if the content should be streamed to TMP-file
   */
  protected function decideStreamToFile($response_header)
  {
    // Without any rules nothing gets streamed to file
    if (count($this->receive_to_file_content_types) == 0) return false;

    // Get Content-Type from header
    $content_type = PHPCrawlerUtils::getHeaderValue($response_header, "content-type");

    // No Content-Type given
    if ($content_type == null) return false;

    // Check against the given rules
    return PHPCrawlerUtils::checkStringAgainstRegexArray($content_type, $this->receive_to_file_content_types);
  }

  /**
   * Adds a rule to the list of rules that decides which pages or files - regarding their content-type - should be received
   *
   * If the content-type of a requested document doesn't match with the given rules, the request will be aborted after the header
   * was received.
   *
   * Note that the rule gets stored in lower-case.
   *
   * @param string $regex The rule as a regular-expression
   * @return bool TRUE if the rule was added to the list.
   *              FALSE if the given regex is not valid.
   */
  public function addReceiveContentType($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

    if ($check == true)
    {
      $this->receive_content_types[] = trim(strtolower($regex));
    }

    return $check;
  }

  /**
   * Adds a rule to the list of rules that decides what types of content should be streamed directly to the temporary file.
   *
   * If a content-type of a page or file matches with one of these rules, the content will be streamed directly into the temporary file
   * given in setTmpFile() without claiming local RAM.
   *
   * @param string $regex The rule as a regular-expression
   * @return bool TRUE if the rule was added to the list and the regex is valid.
   */
  public function addStreamToFileContentType($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

    if ($check == true)
    {
      $this->receive_to_file_content_types[] = trim($regex);
    }

    return $check;
  }

  /**
   * Sets the temporary file to use when content of found documents should be streamed directly into a temporary file.
   *
   * The file gets opened for writing in order to check whether it is writable, so an
   * already existing file will be truncated.
   *
   * @param string $tmp_file The TMP-file to use.
   * @return bool TRUE if the file is writable and was set, otherwise FALSE.
   */
  public function setTmpFile($tmp_file)
  {
    // Check if writable
    $fp = @fopen($tmp_file, "w");

    if (!$fp) return false;

    fclose($fp);
    $this->tmpFile = $tmp_file;
    return true;
  }

  /**
   * Sets the size-limit in bytes for content the request should receive.
   *
   * @param int $bytes The limit as a non-negative integer (or a string consisting of digits only).
   *                   The value gets stored as an integer, 0 means "no limit".
   * @return bool TRUE if the limit was set, FALSE if the given value is not valid.
   */
  public function setContentSizeLimit($bytes)
  {
    $is_valid = ($bytes === null || is_scalar($bytes)) && preg_match("#^[0-9]*$#", (string)$bytes);

    if (!$is_valid) return false;

    $this->content_size_limit = (int)$bytes;
    return true;
  }

  /**
   * Returns the global traffic this instance of the HTTPRequest-class caused so far.
   *
   * @return int The traffic in bytes.
   */
  public function getGlobalTrafficCount()
  {
    return $this->global_traffic_count;
  }

  /**
   * Adds a rule to the list of rules that decide what kind of documents should get
   * checked for links in (regarding their content-type)
   *
   * @param string $regex Regular-expression defining the rule
   * @return bool TRUE if the rule was successfully added
   */
  public function addLinkSearchContentType($regex)
  {
    $check = PHPCrawlerUtils::checkRegexPattern($regex); // Check pattern

    if ($check == true)
    {
      $this->linksearch_content_types[] = trim($regex);
    }

    return $check;
  }
}
?>