<?php
/**
 * Fetch a URL over a raw socket and return its body, optionally with the detected charset.
 *
 * A GET request carrying the first set of Basic-Auth credentials is sent. While the
 * status line contains "401", the socket is re-opened and the request is repeated with
 * the second and then the third set of credentials (via auth_connect()).
 * At most roughly 16000 KByte of response data are read.
 *
 * @param string $url         Absolute URL to fetch (scheme http or https).
 * @param mixed  $get_charset When == 1 and the global $url_status marks the content as
 *                            'text', 'xml' or 'xhtml', the charset is searched in the
 *                            response header and then in the body.
 * @return array 'state' is one of "ok", "NOHOST", "timeout", "Cannot send request".
 *               For "ok" the array also holds 'file' (response body) and, if the
 *               charset search ran, 'header' and 'charset'.
 */
function getFileContents($url, $get_charset) {
    global $user_agent, $url_status, $home_charset, $cn_seg;
    global $user1, $pwd1, $user2, $pwd2, $user3, $pwd3, $clear, $include_dir, $idna;

    $contents = array();
    $urlparts = parse_addr($url);
    $path = $urlparts['path'];
    $host = $urlparts['host'];

    if ($idna) {
        require_once "$include_dir/idna_converter.php";
        // The converter expects UTF-8 (or UCS-4) input and returns the encoded host name.
        $IDN = new idna_convert(array('idn_version' => 2008));
        $host = $IDN->encode($host);
    }

    if (isset($urlparts['query']) && $urlparts['query'] != "") {
        $path .= "?".$urlparts['query'];
    }

    // Explicit port wins; otherwise derive it from the scheme (default: 80).
    $port = 80;
    if (isset($urlparts['port'])) {
        $port = (int) $urlparts['port'];
    } elseif (isset($urlparts['scheme']) && $urlparts['scheme'] == "https") {
        $port = 443;
    }
    $portq = ($port == 80) ? "" : ":$port";

    $all = "Accept-Encoding: 0";
    $auth = sprintf("Authorization: Basic %s", base64_encode($user1 . ":" . $pwd1));
    $request1 = "GET $path HTTP/1.0\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
    $fsocket_timeout = 60;
    $target = (substr($url, 0, 5) == "https") ? "ssl://".$host : $host;

    $errno = 0;
    $errstr = "";
    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
    if (!$fp) {
        $contents['state'] = "NOHOST";
        return $contents;
    }
    if (!fputs($fp, $request1)) {
        fclose($fp); // do not leak the socket when the request could not be sent
        $contents['state'] = "Cannot send request";
        return $contents;
    }
    $answer = fgets($fp, 4096);

    // On "401" retry with the second, then the third set of credentials.
    // The last element is the $call marker handed to auth_connect().
    $fallbacks = array(
        array($user2, $pwd2, '2'),
        array($user3, $pwd3, '3'),
    );
    foreach ($fallbacks as $credentials) {
        if (!is_string($answer) || strpos($answer, "401") === false) {
            break;
        }
        fclose($fp);
        $errno = 0;
        $errstr = "";
        $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
        if (!$fp) {
            // Re-connect failed: report it instead of reading from an invalid handle.
            print $errstr;
            $contents['state'] = "NOHOST";
            return $contents;
        }
        $answer = auth_connect($fp, $credentials[0], $credentials[1], $path, $host, $portq, $credentials[2]);
    }

    // Read the remainder of the response (headers + body), limited to ~16000 KByte.
    $data = '';
    $pageSize = 0;
    stream_set_timeout($fp, $fsocket_timeout);
    $status = stream_get_meta_data($fp);
    while (!feof($fp) && !$status['timed_out'] && $pageSize < 16000) {
        $data .= fgets($fp, 8192);
        $pageSize = number_format(strlen($data)/1024, 2, ".", "");
        // Refresh the stream status on every pass; otherwise a timeout is never
        // noticed and the loop would spin forever on a stalled connection.
        $status = stream_get_meta_data($fp);
    }
    fclose($fp);

    if ($status['timed_out']) {
        $contents['state'] = "timeout";
        return $contents;
    }

    $contents['state'] = "ok";

    // Split header and body at the first empty line. Without a separator the
    // response holds no body at all (everything received is header data).
    $separator = strpos($data, "\r\n\r\n");
    if ($separator === false) {
        $header = $data;
        $contents['file'] = '';
    } else {
        $header = substr($data, 0, $separator + 4);
        $contents['file'] = (string) substr($data, $separator + 4);
    }

    // If the charset is already known, or the content is pdf, doc, rtf, xls, rss etc.,
    // do not search for it.
    $content_kind = isset($url_status['content']) ? $url_status['content'] : '';
    if ($get_charset == 1 && ($content_kind == 'text' || $content_kind == 'xml' || $content_kind == 'xhtml')) {
        $contents['header'] = $header;
        $chrSet = '';
        // Search the header first, then the body. An XML/XHTML 'encoding="..."'
        // declaration takes precedence over a 'charset=...' attribute.
        foreach (array($contents['header'], $contents['file']) as $haystack) {
            $inp = strtoupper($haystack);
            $regs = array();
            if (preg_match("'encoding=[\'\"](.*?)[\'\"]'si", $inp, $regs)) {
                $chrSet = trim($regs[1]);
            } elseif (preg_match("'charset=(.*?)[\'\"]'si", $inp, $regs)) {
                $chrSet = trim($regs[1]);
            }
            if ($chrSet != '') {
                break;
            }
        }
        // Nothing found: fall back to the configured default charset.
        $contents['charset'] = ($chrSet != '') ? $chrSet : $home_charset;
    }

    return $contents;
}
/**
 * Send a GET request on an already opened socket, without or with 'Basic Authorization',
 * and return the first line of the response.
 *
 * @param resource $fp     Open stream as returned by fsockopen().
 * @param string   $user   User name for Basic authorization.
 * @param string   $pwd    Password for Basic authorization.
 * @param string   $path   Request path (including any query string).
 * @param string   $host   Value for the Host header.
 * @param string   $portq  Port suffix for the Host header ("" or ":<port>").
 * @param mixed    $call   "1" sends the request WITHOUT the Authorization header;
 *                         any other value (or omitting it) sends $user/$pwd.
 * @return string|false    First line read from the socket, or false if nothing could be read.
 */
function auth_connect($fp, $user, $pwd, $path, $host, $portq, $call = null) {
    global $user_agent;

    stream_set_timeout($fp, 60);

    $request = "GET $path HTTP/1.1\r\nHost: $host$portq\r\nAccept-Encoding: 0\r\nUser-Agent: $user_agent\r\n";
    // Compare, do not assign: only call "1" is the attempt without credentials.
    if ((string) $call !== "1") {
        $request .= sprintf("Authorization: Basic %s", base64_encode($user . ":" . $pwd))."\r\n";
    }
    $request .= "\r\n";

    fputs($fp, $request);
    return (fgets($fp, 4096));
}
// check if URL is accessible and try to connect
/**
 * Probe a URL and describe the result in a status array.
 *
 * Outline of what the code below does:
 *  1. Opens a socket and sends a GET with the first credentials ($user1/$pwd1),
 *     reading only the first response line.
 *  2. Unless that line contains 301/302/303/307/400, the request is repeated through
 *     the 'http' class from http.php and the response headers are copied into $status.
 *  3. If the state is not "200" afterwards, a second socket based probe runs with
 *     retries for 503, 500 (browser user agent), 301/400/404 (trailing slash added or
 *     removed) and 401 (no credentials, then 2nd and 3rd credentials); the response
 *     headers are then parsed row by row, including redirect handling.
 *  4. For state "ok" the Content-Type is mapped to a short content label
 *     ('text', 'pdf', 'doc', ...) subject to the $index_* admin switches.
 *
 * Side effects visible in this function: sends a 'Content-Type: text/xml' header(),
 * changes the 'user_agent' ini setting, prints socket error strings, sleeps one second
 * before some retries and decrements the global $realnum for unusable URLs.
 *
 * @param string $url URL to be checked.
 * @return array Status array; 'state' is "ok" or a message describing the failure.
 *               Depending on the path taken it may also hold 'content', 'charset',
 *               'path', 'path1', 'relocate', 'date', 'body', 'encoding',
 *               'Content-Type', 'Content-Encoding' and 'Transfer-Encoding'.
 */
function url_status($url) {
global $user_agent, $index_pdf, $index_doc, $index_rtf, $index_xls, $index_ppt, $index_ods, $index_odt, $realnum, $index_rss;
global $plus_nr, $user1, $pwd1, $user2, $pwd2, $user3, $pwd3, $clear, $index_rar, $index_zip, $index_csv, $browser_string;
global $include_dir, $idna, $ext, $strip_sessids, $debug;
// keep the originally requested URL for the timeout message at the end
$url0 = $url;
$state = array();
$status = array();
if ($idna) {
$urlparts = parse_all_url($url); // currently only working for port 80
} else {
$urlparts = parse_url($url);
}
//echo "\r\n\r\n<br>urlparts Array:<br><pre>";print_r($urlparts);echo "</pre>\r\n";
$path = $urlparts['path'];
$host = $urlparts['host'];
if ($idna) {
require_once "$include_dir/idna_converter.php";
// Initialize the converter class
$IDN = new idna_convert(array('idn_version' => 2008));
// The input string, if input is not UTF-8 or UCS-4, it must be converted before
//$input = utf8_encode($url);
// Encode it to its readable presentation
$host = $IDN->encode($host);
}
if (isset($urlparts['query'])) {
$path .= "?".$urlparts['query'];
}
if (!isset($urlparts['path']) && !isset($urlparts['query'])) {
$path = "/";
}
// prepare all for socket open
if (isset ($urlparts['port'])) {
$port = (int) $urlparts['port'];
} else
if ($urlparts['scheme'] == "http") {
$port = 80;
} else
if ($urlparts['scheme'] == "https") {
$port = 443;
}
// the port suffix of the Host header is omitted only for port 80
if ($port == 80) {
$portq = "";
} else {
$portq = ":$port";
}
if (substr($url, 0, 5) == "https") {
$target = "ssl://".$host;
} else {
$target = $host;
}
// NOTE(review): $all is placed into the request as a complete header line, but "*/*"
// carries no header name (possibly "Accept: */*" was intended) - confirm.
$all = "*/*";
$auth = sprintf("Authorization: Basic %s", base64_encode($user1 . ":" . $pwd1));
$fsocket_timeout = 60;
$errno = 0;
$errstr = "";
// request with first authorization
$request1 = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
// NOTE(review): the result of fsockopen() is used without checking it for false.
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
socket_set_timeout($fp, 60);
fputs($fp, $request1);
$answer = fgets($fp, 4096); // get the first row of the HTTP header
fclose($fp);
//echo "\r\n\r\n<br /> answer 00: '$answer'<br />\r\n";
if (!$answer) {
$status['state'] = "NOHOST";
}
// some servers do not accept $all = "*/*"
if (strstr($answer, "400")) {
$status['state'] = "HTTP/1.1 400 Bad Request";
}
// NOTE(review): $status['state'] has only been assigned above for an empty answer or
// a 400 answer; for any other answer the key does not exist yet when it is read here.
if ($status['state'] != "NOHOST") {
// no redirect and no "400" in the first answer: repeat the request with the http class
if (!preg_match("/301|302|303|307|400/i", $answer)) {
require_once( 'http.php' );
header('Content-Type: text/xml');
$http_client = new http( HTTP_V11, false);
$http_client ->host = $host;
$http_client ->user_agent = $user_agent;
$http_client ->_auth_login = $user1;
$http_client ->_auth_pwd = $pwd1;
$http_client ->_debug = '';
// now connect to the remote URL (host was already defined above)
$answer = $http_client->get($path);
//echo "\r\n\r\n<br /> answer01 von http.php: '$answer'<br />\r\n";
if ($answer == "200") {
$linkstate = "ok";
// copy the interesting response headers into the status array
$status['state'] = $http_client->get_response_header( 'Status' ) ;
$status['Content-Encoding'] = $http_client->get_response_header( 'Content-Encoding' ) ;
$status['Transfer-Encoding'] = $http_client->get_response_header( 'Transfer-Encoding' ) ;
$status['Content-Type'] = "Content-Type: ".$http_client->get_response_header( 'Content-Type' ) ;
$content = $status['Content-Type'] ;
if(strstr ($status['Content-Type'], "text" )) {
$status['content'] = "text" ;
}
// get charset
if (preg_match("@charset=([a-z0-9,\- ]+)@i", $status['Content-Type'], $charreg)) {
$status['charset'] = strtoupper(trim($charreg[1]));
}
$status['path'] = $http_client->get_response_header( 'Location' ) ;
if ($status['path']) {
$path = $status['path'];
$status['relocate'] = "Relocated by HTTP to $path";
}
$status['date'] = $http_client->get_response_header( 'date' ) ;
if (!$status['date']) {
$status['date'] = $http_client->get_response_header( 'Last-Modified' ) ;
}
$status['body'] = $http_client->get_response_body() ;
}
unset($http_client);
}
//echo "\r\n\r\n<br>status Array after http class:<br><pre>";print_r($status);echo "</pre>\r\n";
if ($status['state'] == "200" ) {
$status['state'] = "ok" ;
} else {
// the http class did not deliver "200": probe the URL again via a raw socket
if (isset ($urlparts['port'])) {
$port = (int) $urlparts['port'];
} else
if ($urlparts['scheme'] == "http") {
$port = 80;
} else
if ($urlparts['scheme'] == "https") {
$port = 443;
}
if ($port == 80) {
$portq = "";
} else {
$portq = ":$port";
}
if (substr($url, 0, 5) == "https") {
$target = "ssl://".$host;
} else {
$target = $host;
}
//$accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
//$lang = "*/*";
//$encode = "gzip, deflate";
// request with first authorization
//$request1 = "GET $path HTTP/1.1\r\nHost: $host$portq\r\nUser-Agent: $user_agent\r\nAccept: $accept\r\n\Accept-Language: $lang\r\nAccept-Encoding: $encode\r\n$auth\r\n\r\n";
$all = "Accept-Encoding: 0";
$auth = sprintf("Authorization: Basic %s", base64_encode($user1 . ":" . $pwd1));
$fsocket_timeout = 60;
$errno = 0;
$errstr = "";
// request with first authorization
$request1 = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
ini_set("user_agent", $user_agent);
// NOTE(review): as above, the socket is used without checking fsockopen() for failure.
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
socket_set_timeout($fp, 60);
// we will try to get something from all the header rows
fputs($fp, $request1);
$answer = @fgets($fp, 4096); // get the first row of the HTTP header
$answer0 = $answer; // remember this first answer
//echo "\r\n\r\n<br /> answer0 in alter Version: '$answer'<br />\r\n";
if (strpos($answer, "503")) { // temporary unreachable
$retry = '';
$license = '';
// only for $debug == "2": scan the remaining rows for 'Retry-after' and 'License status'
if ($debug == "2") {
while ($answer) {
$answer = fgets($fp, 4096);
if (preg_match("/Retry-after: *([^\n\r ]+)/i", $answer, $regs)) {
$retry = $regs[0];
}
if (preg_match("/License status: *([^\n\r ]+)/i", $answer, $regs)) {
$license = $regs[0];
break;
}
}
}
// prepare status message for HTTP 503
$status['state'] = "Unreachable: HTTP 503 Service temporary unavailable<br />$retry<br />$license";
$linkstate = "Unreachable";
}
if (strpos($answer, "500") && $browser_string) { // try with standard browser http_user_agent (some servers do not like crawler)
fclose($fp); // close existing connection
sleep(1); // might not be necessary to wait, but . . .
$browser_agent = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
$browser_request = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $browser_agent\r\n$auth\r\n\r\n";
$fsocket_timeout = 60;
$errno = 0;
$errstr = "";
//try to re-connect
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
fputs($fp, $browser_request);
$answer = fgets($fp, 4096);
ini_set("user_agent", $browser_agent); // overwrite $user_agent with $browser_agent
}
}
// some servers obligatory need a slash at the end of the path. We'll try it here
// some other servers do not like the slash as last character of the path, let's follow also this quirk
if ((strpos($answer, "301") || strpos($answer, "400") || strpos($answer, "404")) && !isset($urlparts['query'])) { // try with slash at the end of host or path
fclose($fp); // close existing connection
sleep(1); // might not be necessary to wait, but . . .
if ($path != "/" && !strstr($path, ".")) {
// if last character of $path isn't already a slash, add a slash at the end of the path
if (strrpos($path, "/") != strlen($path)-1) {
$path .="/";
}
}
// NOTE(review): $browser_agent is assigned here but the request below uses $user_agent.
$browser_agent = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
$request = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
$fsocket_timeout = 60;
$errno = 0;
$errstr = "";
//try to re-connect
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
fputs($fp, $request);
$answer = fgets($fp, 4096);
}
$status['path1'] = $path; // remember the corrected path, if we will try to get the file contents
//echo "\r\n\r\n<br /> answer01: '$answer'<br />\r\n";
// some other servers do not like the slash as last character of the path, let's follow also this quirk
if (strpos($answer, "404")) {
fclose($fp); // close existing connection
sleep(1); // might not be necessary to wait, but . . .
// if the last character of $path is a slash, remove it from the end of the path
if (strrpos($path, "/") == strlen($path)-1) {
$path = substr($path, 0, strlen($path)-1);
}
$browser_agent = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
$request = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
$fsocket_timeout = 60;
$errno = 0;
$errstr = "";
//try to re-connect
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
fputs($fp, $request);
$answer = fgets($fp, 4096);
}
$status['path1'] = $path; // remember the corrected path, if we will try to get the file contents
}
}
//echo "\r\n\r\n<br /> answer02: '$answer'<br />\r\n";
// NOTE(review): in the three 401 retries below, a failed fsockopen() leaves $answer
// unchanged (still containing "401"), so the next retry calls fclose() on a false $fp.
if (strpos($answer, "401")) { // try without authorization (some servers do not like the $auth annex)
fclose($fp);
$errno = 0;
$errstr = "";
$call = '1';
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
$user = $user1;
$pwd = $pwd1;
$answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
}
//echo "\r\n\r\n<br /> answer1 in url_status: '$answer'<br />\r\n";
if (strpos($answer, "401")) { // try with second authorization
fclose($fp);
$errno = 0;
$errstr = "";
$call = '2';
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
$user = $user2;
$pwd = $pwd2;
$answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
}
}
if (strpos($answer, "401")) { // try with third authorization
fclose($fp);
$errno = 0;
$errstr = "";
$call = '3';
$fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
print $errstr;
$linkstate = "ok";
if (!$fp) {
$status['state'] = "NOHOST";
} else {
$user = $user3;
$pwd = $pwd3;
$answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
}
}
}
$regs = Array ();
// split the status code: $regs[1] is the full three digit code, $regs[2] its first digit
if (preg_match("{HTTP/[0-9.]+ (([0-9])[0-9]{2})}i", $answer, $regs)) {
$httpcode = $regs[2];
$full_httpcode = $regs[1];
// everything except 2xx and 3xx counts as unreachable
if ($httpcode <> 2 && $httpcode <> 3) {
$status['state'] = "Unreachable: HTTP $full_httpcode";
$linkstate = "Unreachable";
$realnum -- ;
}
}
$answer1 = $answer;
// this is the entry for usual response
// NOTE(review): $linkstate is only assigned inside some of the branches above.
if ($linkstate <> "Unreachable" ) {
$content = '';
// read the header rows until an (almost) empty row is reached
while ($answer && strlen($answer) > 2) {
$answer = fgets($fp, 4096);
// get any relocation/redirection
if (preg_match("/location: *([^\n\r ]+)/i", $answer, $regs)) {
$status['path'] = $regs[1]; // URL redirected
$status['relocate'] = "Relocated by HTTP $full_httpcode to ";
}
// get Last-Modified date
if (preg_match("/(Date|Last-Modified): *([a-z0-9,: ]+)/i", $answer, $regs)) {
$status['date'] = $regs[2];
}
// get Content-Encoding like 'gzip'
if (preg_match("/Content-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
$status['Content-Encoding'] = strtolower(trim($regs[1]));
}
// get Transfer-Encoding like 'chunked'
if (preg_match("/Transfer-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
$status['Transfer-Encoding'] = strtolower(trim($regs[1]));
}
// get Content-Type and if available Charset
if (preg_match("/Content-Type:/i", $answer)) {
$content = $answer;
if (preg_match("@charset=([a-z0-9,\- ]+)@i", $answer, $regs)) {
$status['charset'] = strtoupper(trim($regs[1]));
}
}
}
//echo "\r\n\r\n<br /> content: '$content'<br />\r\n";
//echo "\r\n\r\n<br /> linkstate: '$linkstate'<br />\r\n";
//echo "\r\n\r\n<br /> answer02: '$answer1'<br />\r\n";
if (preg_match("/200/i", $answer1)) {
$status['state'] = 'ok';
}
// relocated URL? So we need to overwrite the $status array and define the type of content
if ($linkstate <> "Unreachable" && preg_match("/301|302|303|307/i", $answer0) && preg_match("/200/i", $answer1)) {
while ($answer1 && strlen($answer1) > 2) {
$answer1 = fgets($fp, 4096);
// get any relocation/redirection
if (preg_match("/Location: *([^\n\r ]+)/i", $answer1, $regs)) {
$status['path'] = $regs[1]; // URL redirected
$status['relocate'] = "Relocated by HTTP $full_httpcode to ";
$status['state'] = '';
}
// get Last-Modified date
if (preg_match("/(Date|Last-Modified): *([a-z0-9,: ]+)/i", $answer1, $regs)) {
$status['date'] = $regs[2];
}
// get Content-Encoding like 'gzip'
// NOTE(review): this test and the next one read $answer, while the current row of
// this loop is held in $answer1 - looks like a copy/paste slip, confirm.
if (preg_match("/Content-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
$status['encoding'] = $regs[1];
}
// get Transfer-Encoding like 'chunked'
if (preg_match("/Transfer-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
$status['Transfer-Encoding'] = $regs[1];
}
// get Content-Type and if available Charset
if (preg_match("/Content-Type:/i", $answer1)) {
$status['content'] = $answer1;
$content = $answer1;
if (preg_match("@charset=([a-z0-9,\- ]+)@i", $answer1, $regs)) {
$status['charset'] = strtoupper(trim($regs[1]));
}
}
if ($content && $status['path']) { // these 2 conditions would be enough to index the relocated URL
$status['state'] = "ok";
}
}
// if the relocated URL or the Content-Type could not be detected, we need to GET the complete header info from the remote server
if ($status['state'] != "ok") {
$header = array();
$header = get_headers($url);
foreach ($header as $value) {
if (preg_match("/location: *([^\n\r ]+)/i", $value, $regs)) {
$status['path'] = $regs[1]; // URL redirected
$status['relocate'] = "Relocated by HTTP $full_httpcode to ";
}
}
}
// if the relocated path is relative, add the calling URL
if (!stristr($status['path'], "ttp")) {
$url = substr($url, 0, strrpos($url, "/")+1);
$status['path'] = $url.$status['path'];
}
// analyze the header
// NOTE(review): $header is only assigned inside the preceding 'state != "ok"' branch.
if ($header) {
// check for multiple redirection
$i = '0';
foreach ($header as $value) {
if (preg_match("/HTTP\/(.*?)301|HTTP\/(.*?)302|HTTP\/(.*?)303|HTTP\/(.*?)307/i", $value)) {
$i++;
}
}
if ($i > "1") {
// Example for requested cookie: http://www.fogelplast.ru/
$status['state'] = "Multiple redirections, which is not supported by Sphider-plus version $plus_nr";
} else {
// try to find the content type of the relocated URL
// walk the header rows in reverse key order and stop at the first Content-Type found
krsort ($header);
foreach ($header as $value) {
if (preg_match("/Content-Type: *([^\n\r ]+)/i", $value, $regs)) {
$status['content'] = $regs[1]; // content type
$content = $value; // content type
// get charset
if (preg_match("@charset=([a-z0-9,\- ]+)@i", $regs[1], $charreg)) {
$status['charset'] = strtoupper(trim($charreg[1]));
}
break;
}
}
// check for valid file type in order to become indexed
foreach ($ext as $this_suffix) {
if (stristr($status['content'], $this_suffix)) {
$status['state'] = "Not text or html";
}
}
}
}
}
}
} // end row by row analyzing the header
// if Admin selected, remove session from relocated URL
if ($status['state'] == "ok" && $strip_sessids == 1) {
$status['path'] = remove_sessid($status['path']);
}
if ($status['state'] == "ok") {
// NOTE(review): when the http class path above delivered "200", $fp was already
// closed before it is queried here - confirm.
$socket_status = socket_get_status($fp);
@fclose($fp);
// Map the MIME type to a short content label. The character class [a-z/.-] stops at
// digits, '+' and ';', so e.g. "application/xhtml+xml" is captured as "application/xhtml".
if (preg_match("{Content-Type: *([a-z/.-]*)}i", $content, $regs)) {
if ($regs[1] == 'text/html' || $regs[1] == 'text/' || $regs[1] == 'text/plain') {
$status['content'] = 'text';
$status['state'] = 'ok';
} else if ($regs[1] == 'application/pdf' && $index_pdf == 1) {
$status['content'] = 'pdf';
$status['state'] = 'ok';
} else if ($regs[1] == 'application/pdf' && $index_pdf == 0) {
$status['content'] = 'pdf';
$status['state'] = 'Indexing of PDF files is not activated in Admin Settings';
} else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 1) {
$status['content'] = 'doc';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 0) {
$status['content'] = 'doc';
$status['state'] = 'Indexing of DOC files is not activated in Admin Settings';
} else if (($regs[1] == 'text/rtf') && $index_rtf == 1) {
$status['content'] = 'rtf';
$status['state'] = 'ok';
} else if (($regs[1] == 'text/rtf') && $index_rtf == 0) {
$status['content'] = 'rtf';
$status['state'] = 'Indexing of RTF files is not activated in Admin Settings';
} else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 1) {
$status['content'] = 'xls';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 0) {
$status['content'] = 'xls';
$status['state'] = 'Indexing of XLS files is not activated in Admin Settings';
} else if (($regs[1] == 'text/csv') && $index_csv == 1) {
$status['content'] = 'csv';
$status['state'] = 'ok';
} else if (($regs[1] == 'text/csv') && $index_csv == 0) {
$status['content'] = 'csv';
$status['state'] = 'Indexing of CSV files is not activated in Admin Settings';
/* // Currently unsupported, because a failure was detected while converting ppt files > 7 MByte
// see also ../include/common/suffix.txt
// see also .../admin/configset.php
} else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
$status['content'] = 'ppt';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 0) {
$status['content'] = 'ppt';
$status['state'] = 'Indexing of PPT files is not activated in Admin Settings';
*/
/*
} else if (($regs[1] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation') && $index_ppt == 1) {
$status['content'] = 'ppt';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation') && $index_ppt == 0) {
$status['content'] = 'ppt';
$status['state'] = 'Indexing of PPT files is not activated in Admin Settings';
*/
} else if (($regs[1] == 'application/xml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xml') && $index_rss == 1) {
$status['content'] = 'xml';
$status['state'] = 'ok';
// 'application/rss' was already handled by the branch above; 'application/xhtml' is listed twice
} else if (($regs[1] == 'application/xhtml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xhtml' || $regs[1] == 'application/xhtml') && $index_rss == 1) {
$status['content'] = 'xhtml';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/xml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xml' || $regs[1] == 'text/xhtml' || $regs[1] == 'application/xhtml') && $index_rss == 0) {
$status['content'] = 'xml';
$status['state'] = '<br />Indexing of RDF, RSD, RSS and Atom feeds is not activated in Admin Settings';
} else if (($regs[1] == 'application/zip' || $regs[1] == 'zip') && $index_zip == 1) {
$status['content'] = 'zip';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/zip' || $regs[1] == 'zip') && $index_zip == 0) {
$status['content'] = 'zip';
$status['state'] = '<br />Indexing of ZIP archives is not activated in Admin Settings';
} else if (($regs[1] == 'application/rar' || $regs[1] == 'application/x-rar-compressed') && $index_rar == 1) {
$status['content'] = 'rar';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/rar' || $regs[1] == 'application/x-rar-compressed') && $index_rar == 0) {
$status['content'] = 'rar';
$status['state'] = '<br />Indexing of RAR archives is not activated in Admin Settings';
} else if (($regs[1] == 'application/vnd.oasis.opendocument.spreadsheet') && $index_ods == 1) {
$status['content'] = 'ods';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/vnd.oasis.opendocument.spreadsheet') && $index_ods == 0) {
$status['content'] = 'ods';
$status['state'] = '<br />Indexing of OpenDocument<strong>Spreadsheet</strong> is not activated in Admin Settings';
} else if (($regs[1] == 'application/vnd.oasis.opendocument.text') && $index_odt == 1) {
$status['content'] = 'odt';
$status['state'] = 'ok';
} else if (($regs[1] == 'application/vnd.oasis.opendocument.text') && $index_odt == 0) {
$status['content'] = 'odt';
$status['state'] = '<br />Indexing of OpenDocument<strong>Text</strong> is not activated in Admin Settings';
} else if (stripos ($urlparts['path'], ".js") || $regs[1] == 'application/javascript') {
$status['content'] = 'js';
$status['state'] = 'ok';
} else {
// any other content type is rejected and not counted
$status['state'] = "<br />For Sphider-plus v.$plus_nr not executable Text or Media.<br /> $content => UFO file<br />";
$realnum -- ;
}
} else {
// no Content-Type row available: report either the socket timeout or unusable content
if ($socket_status['timed_out'] == 1) {
$status['state'] = "Timed out. URL: $url0 <br />No reply from server within $fsocket_timeout seconds.";
$realnum -- ;
} else {
$status['state'] = "Not text or html";
}
}
}
}
if ($clear == 1) {
unset ($urlparts, $answer);
$socket_status = array();
}
//echo "\r\n\r\n<br>status Array final:<br><pre>";print_r($status);echo "</pre>\r\n";
return $status;
}
/**
 * Read the robots file of the host of $url and collect the paths disallowed for us.
 *
 * Only 'User-agent' sections for '*' or for the global $user_agent are evaluated.
 * For 'localhost' the robots file is expected in the directory of $url and every
 * disallowed path is prefixed with that directory.
 *
 * @param string $url    Any URL on the host to be checked.
 * @param string $robots File name of the robots file (appended to the host URL).
 * @return array|null    List of disallowed paths; an empty array if the robots file is
 *                       not reachable or contains invalid content; null as soon as an
 *                       empty 'Disallow:' is found in a section that applies to us.
 */
function check_robot_txt($url, $robots) {
    global $user_agent, $clear, $cl;

    $urlparts = parse_addr($url);
    $is_localhost = ($urlparts['host'] == 'localhost');
    $loc_path = '';
    if ($is_localhost) { // for 'localhost' applications add the path until last slash
        $loc_path = substr($urlparts['path'], 0, strrpos($urlparts['path'], '/'));
        $url = 'http://'.$urlparts['host']."".$loc_path."/$robots";
    } else { // www application
        $url = 'http://'.$urlparts['host']."/$robots";
    }

    $omit = array();
    $url_status = url_status($url);
    if ($url_status['state'] != "ok") {
        return $omit;
    }

    // Best effort read; if the URL wrapper fails, fall back to the socket based reader.
    // (explode() never returns an empty array, so the result of file_get_contents()
    // itself has to be tested.)
    $file = @file_get_contents($url);
    if ($file === false) {
        $get_charset = '';
        $contents = getFileContents($url, $get_charset); // read the robots.txt file
        $file = isset($contents['file']) ? (string) $contents['file'] : '';
    }

    // check for invalid content in robots.txt
    if (stristr($file, "Disallow:<!--") || stristr($file, "<script")) {
        $domain = str_replace($robots, "", $url);
        printBadRobots($domain, $cl);
        return $omit;
    }

    // robots.txt seems okay, now parse it line by line
    $check = 0; // 1 while we are inside a section that applies to this crawler
    foreach (explode("\n", $file) as $line) {
        $regs = array();
        if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
            $this_agent = trim($regs[1]);
            $check = ($this_agent == '*' || $this_agent == $user_agent) ? 1 : 0;
        }
        if ($check == 1 && preg_match("/disallow: *([^#]+)/i", $line, $regs)) {
            // remove blanks and line ends (also the \r of CRLF files),
            // then make readable the %BO%D1 coded URLs
            $disallow_str = urldecode(preg_replace("/[\r\n ]+/", "", $regs[1]));
            if (trim($disallow_str) == "") {
                // an empty 'Disallow:' in a section that applies to us
                return null;
            }
            $omit[] = $is_localhost ? $loc_path.$disallow_str : $disallow_str;
        }
    }

    return $omit; // array that holds all forbidden links from robots.txt
}
/**
 * Remove the file part from an URL (to build an URL from an URL and a given relative path).
 *
 * Everything behind the last slash of the path is dropped; a path without any slash
 * becomes empty. Query and fragment are not part of the result. The port is only kept
 * if it is present and not 80.
 *
 * @param string $url URL to be shortened.
 * @return string     scheme://host[:port]/directory/
 */
function remove_file_from_url($url) {
    $url_parts = parse_addr($url);
    $path = isset($url_parts['path']) ? (string) $url_parts['path'] : '';

    // Cut behind the last slash with plain string functions. Building a regular
    // expression from the file name breaks on names holding meta characters
    // ('(', '[', '?', ...) and required stripping every '+' from the path.
    $last_slash = strrpos($path, '/');
    $path = ($last_slash === false) ? '' : substr($path, 0, $last_slash + 1);

    $port = isset($url_parts['port']) ? $url_parts['port'] : '';
    if ($port == 80 || $port == "") {
        $portq = "";
    } else {
        $portq = ":".$port;
    }

    return $url_parts['scheme']."://".$url_parts['host'].$portq.$path;
}
// Extract links from html
function get_links($file, $url, $can_leave_domain, $base, $media_links, $use_nofollow, $local_redir, $url_reloc) {
global $strip_sessids, $imagelist, $audiolist, $videolist, $command_line, $no_log, $index_media;
global $mainurl, $include_dir, $idna, $local, $index_rss, $index_alt;
//echo "<br />get_links ******************************************************************<br />\r\n";
//echo "\r\n\r\n<br /> url_reloc: '$url_reloc'<br />\r\n";
//echo "\r\n\r\n<br /> mainurl: '$mainurl'<br />\r\n";
$chunklist = array ();
// The base URL comes from either the meta tag or the current URL.
if (!empty($base)) {
$url = $base;
}
//echo "\r\n\r\n<br />calling url: '$url'<br />\r\n";
$links = array ();
$regs = Array ();
$checked_urls = Array();
$body = substr($file, stripos($file, "<body"));
// try to find links to JavaScript src=. . .
if (preg_match_all("@<script(.*?)src(.*?)=(.*?)[\'\"](.*?)[\'\"]@si", $body, $regs)) {
foreach ($regs[4] as $val) {
if (($a = url_purify($val, $url, $can_leave_domain, 1, $relocated, $local_redir)) != '') {
$links[] = $a; // add this new link
}
}
}
$file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file); // delete all scripts from the content
if ($index_rss) {
$file = preg_replace("@<link>|<url>@si", "<href=\"", $file); // convert all links to href=
$file = preg_replace("@</link>|</url>@si", "\">", $file);
}
//echo "\r\n\r\n<br /> file: '$file'<br />\r\n";
//preg_match_all("/href\s*=\s*[\'\"]?([+:%\/\?~=&;\\\(\),._a-zA-Z0-9- ]*)(#[.a-zA-Z0-9-]*)?[\'\" ]?(\s*rel\s*=\s*[\'\"]?(nofollow)[\'\"]?)?/i", $file, $regs, PREG_SET_ORDER);
preg_match_all("/href\s*=\s*[\'\"](.*?)[\'\" ](.*?)>/si", $file, $regs, PREG_SET_ORDER); // Replaced in order to index links containing non-ASCII characters
foreach ($regs as $val) {
if ($use_nofollow == '0') {
$val[2] = ''; // temporary ignore 'nofollow' directive
}
if (strstr($val[2], "nofollow")){
$report = "<br /><br />Found ".$val[1].", but <strong>nofollow</strong> flag is set.";
printNofollowLink($report, $command_line, $no_log);
}
}
//echo "\r\n\r\n<br>regs Array:<br><pre>";print_r($regs);echo "</pre>\r\n";
foreach ($regs as $val) {
if ($val[1]) { // reject empty links, which would cause invalid url_purify()
/*
// for all servers that deliver ' / ' instead of ' ./ ' as relative links on localhost
if (strpos($val[1], "/") === 0 && strpos($url, "localhost")) {
$val[1] = ".".$val[1]."";
}
*/
$ignore = '';
if ($use_nofollow == '1' && (strstr($val[2], "nofollow"))) {
$ignore = '1'; // temporary ignore 'nofollow' directive
}
if ($checked_urls[$val[1]]!=1 && $ignore == '') { //if nofollow is not set
$care_excl = '1'; // care file suffix to be excluded
$relocated = ''; // URL is not relocated
// create a link, which points back to the domain
if ($val[1] == "/") {
$main_url_parts = parse_all_url($mainurl);
$val[1] = $main_url_parts['scheme']."://".$main_url_parts['host']."/";
}
//echo "\r\n\r\n<br>val Array:<br><pre>";print_r($val);echo "</pre>\r\n";
if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
//echo "\r\n\r\n<br /> a: '$a'<br />\r\n";
$match_i = '0';
$match_a = '0';
$match_v = '0';
// prevent self-linking for link pathes ending with and/or without final slash
// and for relocated on it selves as detected in nurl_purify
if ($mainurl == $a || $a == "self") {
$a = '';
}
$a = str_replace( " ", "%20", $a); // in order to find also links containing blanks.
if($index_media > 0 && $a){
if ($index_image == '1') {
$select = $imagelist;
$match_i = valid_link($a, $select);
}
if ($index_audio == '1') {
$select = $audiolist;
$match_a = valid_link($a, $select);
}
if ($index_video == '1') {
$select = $videolist;
$match_v = valid_link($a, $select);
}
}
if ($media_links == '0' && $match_i == '0' && $match_a == '0' && $match_v == '0') {
$links[] = $a; // find only non-media links
}
if ($media_links == '1' && ($match_i == '1' || $match_a == '1' || $match_v == '1')) {
$links[] = $a; // find only media links
}
}
$checked_urls[$val[1]] = 1;
}
}
}
$care_excl = '1'; // care file suffixed to be excluded
$relocated = ''; // URL is not relocated
preg_match_all("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
foreach ($regs as $val) {
if ($checked_urls[$val[1]]!=1 ) { // if nofollow is not set
//if (($a = url_purify($val[1], $url, $can_leave_domain, '1')) != '') { //modified in order to follow frame links Tec 23.03.2009
if (($a = url_purify($val[2], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
$links[] = $a; // find only media links
}
$checked_urls[$val[1]] = 1;
}
}
preg_match_all("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
foreach ($regs as $val) {
if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
$links[] = $a; // add links
}
$checked_urls[$val[1]] = 1;
}
}
preg_match_all("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
foreach ($regs as $val) {
if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
$links[] = $a; // add links
}
$checked_urls[$val[1]] = 1;
}
}
preg_match_all("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
foreach ($regs as $val) {
if ($checked_urls[$val[1]]!=1 && !isset ($val[4])) { //if nofollow is not set
if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
$links[] = urldecode($a); // add links
}
$checked_urls[$val[1]] = 1;
}
}
// find invalid links for localhost application
if (strstr($url, "localhost") && !$can_leave_domain) {
$local_links = array();
$pre = strlen($local); // path length to the localhost URLs
foreach ($links as $thislink) {
// if $url contains another slash behind $pre, there must be a subfolder
if (strstr($url, "/", $pre)) {
// extract the path (folder name) of parent URL
$url_len = strpos($url, "/", $pre); // find first slash behind $pre
$dom = substr($url, $pre);
$dom = substr($dom, 0, strpos($dom, "/"));
if (strlen($thislink) > $url_len && strstr($thislink, $dom)) {
$local_links[] = $thislink;
}
} else { // direct link at $local
if (strlen($thislink) > $url_len) {
$local_links[] = $thislink;
}
}
$links = $local_links;
}
}
if ($clear == 1) unset ($chunklist, $regs, $checked_urls, $a);
//echo "\r\n\r\n<br>links Array:<br><pre>";print_r($links);echo "</pre>\r\n";
if ($strip_sessids == 1) {
return remove_sessid($links);
} else {
return $links;
}
}
// Build the list of distinct words of a page together with their occurrence counts.
//
// $arr   list of (possibly repeated) words. When stemming is enabled
//        ($stem_words != 'none') every word is first passed through stem_word().
//
// Returns a list of entries; each entry holds the word at index 1 and its count at index 2.
// A word is left out when
//   - it is shorter than $min_word_length, or
//   - it contains a digit ($index_numbers == 0) respectively a blank ($index_numbers != 0), or
//   - it is flagged with 1 in the global $common list (looked up as written when
//     $case_sensitive == "1", otherwise lower-cased).
// A leading or trailing hyphen / escaped single quote is stripped from accepted words.
// The count of a word stops growing once it has reached $word_upper_bound.
function unique_array($arr) {
    global $min_word_length, $common, $word_upper_bound;
    global $index_numbers, $stem_words, $clear, $case_sensitive;

    if ($stem_words != 'none') {
        $stemmed = array();
        foreach ($arr as $val) {
            $stemmed[] = stem_word($val, '0');
        }
        $arr = $stemmed;
    }

    // Sort and compare as plain strings. The default sort flags and a loose '!=' treat
    // numeric looking words such as "7" and "007" (or "10" and "1e1") as equal and would
    // merge them into one entry; a loose comparison against the end-of-array marker also
    // swallowed a final word "0".
    sort($arr, SORT_STRING);

    $pattern = ($index_numbers == 0) ? "/[0-9]+/" : "/[ ]+/";

    $newarr = array();
    $regs = array();
    $total = count($arr);
    $counter = 1;
    for ($n = 0; $n < $total; $n++) {
        $element = $arr[$n];

        // still inside a run of identical words: only count
        if ($n + 1 < $total && (string) $arr[$n + 1] === (string) $element) {
            if ($counter < $word_upper_bound) {
                $counter++;
            }
            continue;
        }

        // last word of its run: decide whether it is stored
        $common_key = ($case_sensitive == "1") ? $element : strtolower($element);
        $is_common = isset($common[$common_key]) && $common[$common_key] == 1;
        if (strlen($element) >= $min_word_length && !preg_match($pattern, $element) && !$is_common) {
            if (preg_match("/^(-|\\\')(.*)/", $element, $regs)) {
                $element = $regs[2];
            }
            if (preg_match("/(.*)(\\\'|-)$/", $element, $regs)) {
                $element = $regs[1];
            }
            $newarr[] = array(1 => $element, 2 => $counter);
        }
        // a rejected word must not hand its count over to the next word
        $counter = 1;
    }
    return $newarr;
}
// Helper of url_purify(): tells whether a link has to be dropped because of its form.
// Returns true when
//   - $care_excl == '1' and the link ends with (or has, directly in front of a '?')
//     one of the suffixes listed in the global $ext,
//   - the link ends with a backslash,
//   - its query string is flagged in the global $apache_indexes,
//   - it contains 'mailto:', 'javascript:' or 'news:',
//   - its scheme is neither empty, 'http' nor 'https'.
function url_purify_is_excluded($url, $url_parts, $care_excl) {
    global $ext, $apache_indexes;

    if ($care_excl == '1') {
        foreach ((array) $ext as $excl) {
            // suffix at the end of the link, or followed by a question mark
            if (preg_match("/\.$excl($|\?)/i", $url)) {
                return true;
            }
        }
    }
    if (substr($url, -1) == '\\') {
        return true;
    }
    if (isset($url_parts['query']) && !empty($apache_indexes[$url_parts['query']])) {
        return true;
    }
    if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
        return true;
    }
    // only http and https links are followed
    $scheme = isset($url_parts['scheme']) ? $url_parts['scheme'] : "";
    if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
        return true;
    }
    return false;
}

// Check if url is legal, relative to the main url, and build the absolute URL for it.
// Currently working only for port 80 connections !!!
//
// $url               link as found in the page (absolute or relative)
// $parent_url        URL of the page the link was found on
// $can_leave_domain  1 = links to foreign hosts are accepted
// $care_excl         '1' = reject links carrying a suffix listed in $ext
// $relocated         1 = the link stems from a relocation (only evaluated together with $redir_host)
// $local_redir       1 = local redirection; switches off the 'redirected to other host' rule.
//                    Optional: callers always passed it as 6th argument, but it was missing
//                    in the parameter list and therefore never reached the function body.
//
// Returns '' for a rejected link, 'self' if the result equals $mainurl,
// otherwise the URL as delivered by convert_url().
//
// Side effect: with $idna enabled the global $mainurl is overwritten with its
// url-decoded (and, if punycode was found, IDN-decoded) form.
function url_purify($url, $parent_url, $can_leave_domain, $care_excl, $relocated, $local_redir = '') {
    global $mainurl, $strip_sessids, $dup_path;
    global $other_host, $redir_host, $sldlist, $only_links, $include_dir, $idna;

    // Links showing 'www' within their first five characters get a scheme.
    // (Same result as the former two consecutive checks: five characters cannot
    // hold 'www' and '://' at the same time.)
    // NOTE(review): this also hits relative links like '/www/x.html' - confirm whether intended.
    if (strstr(substr($url, 0, 5), "www")) {
        $url = "http://$url";
    }
    $orig_parent_url = $parent_url; // in order to remember, also after several modifications

    if ($idna) { // parse IDN coded URLs and make punycode readable
        // with respect to the different codings used by webmasters (and their CMS)
        $url = urldecode($url);
        $parent_url = urldecode($parent_url);
        $mainurl = urldecode($mainurl);
        require_once "$include_dir/idna_converter.php";
        $IDN = new idna_convert(array('idn_version' => 2008));
        if (strstr($url, "xn--")) {
            $url = $IDN->decode($url);
        }
        $url_parts = parse_all_url($url);
        if (strstr($mainurl, "xn--")) {
            $mainurl = $IDN->decode($mainurl);
        }
        $main_url_parts = parse_all_url($mainurl);
        if (strstr($mainurl, "xn--")) {
            $main_url_parts['host'] = $IDN->decode($main_url_parts['host']);
        }
    } else {
        $main_url_parts = parse_all_url($mainurl);
        $url_parts = parse_all_url($url);
    }

    if ($strip_sessids == 1) {
        $url = remove_sessid($url);
    }

    // if activated in Admin settings, allow other hosts in same domain, and also ignore www. and TLD and SLD
    if ((($local_redir != 1 && $relocated == 1 && $redir_host == 1) || $other_host == 1)
        && $url_parts['host'] != "" && $url_parts['host'] != $main_url_parts['host']) {
        // remove 'www'
        $new_host = str_replace('www.', '', $url_parts['host']);
        $main_host = str_replace('www.', '', $main_url_parts['host']);
        // remove TLD
        if (strstr($new_host, '.')) {
            $new_host = substr($new_host, 0, strrpos($new_host, '.'));
        }
        if (strstr($main_host, '.')) {
            $main_host = substr($main_host, 0, strrpos($main_host, '.'));
        }
        // if exist, remove SLD (iterate by value: no dangling reference into the global list)
        foreach ($sldlist as $sld) {
            if (preg_match("/$sld$/", $new_host)) {
                $new_host = substr($new_host, 0, strpos($new_host, $sld));
            }
            if (preg_match("/$sld$/", $main_host)) {
                $main_host = substr($main_host, 0, strpos($main_host, $sld));
            }
        }
        // if exist, remove sub-domains
        if (strstr($new_host, '.')) {
            $new_host = substr($new_host, strrpos($new_host, '.') + 1);
        }
        if (strstr($main_host, '.')) {
            $main_host = substr($main_host, strrpos($main_host, '.') + 1);
        }
        // follow only host with same domain-name
        if ($new_host == $main_host) {
            if (url_purify_is_excluded($url, $url_parts, $care_excl)) {
                return '';
            }
            // if missing, add slash to URL
            if (!$url_parts['path'] && !preg_match("/\/$/", $url)) {
                $url = $url."/";
            }
            return convert_url($url);
        }
    } // end of finding new URLs for 'follow other host with same domain-name'

    // now purify links only for known domains, but independent from containing www or not www
    $url_host = str_replace("www.", "", $url_parts['host']);
    $main_url_host = str_replace("www.", "", $main_url_parts['host']);

    // Foreign host while leaving the domain is not allowed: reject.
    // NOTE(review): the former inner branch 'if ($only_links && $can_leave_domain == 1) return $url;'
    // could never be reached here because $can_leave_domain != 1 is already required.
    if ($url_host != "" && $url_host != $main_url_host && $can_leave_domain != 1) {
        return '';
    }

    if (url_purify_is_excluded($url, $url_parts, $care_excl)) {
        return '';
    }

    // now special processing for relative links
    if (!strpos(substr($url, 0, 5), "ttp")) {
        $parent_url_parts = parse_all_url($parent_url);
        if ($idna) { // make punycode readable
            require_once "$include_dir/idna_converter.php";
            $IDN = new idna_convert(array('idn_version' => 2008));
            if (strstr($parent_url, "xn--")) {
                $parent_url = $IDN->decode($parent_url);
            }
            if (strstr($parent_url_parts['host'], "xn--")) {
                $parent_url_parts['host'] = $IDN->decode($parent_url_parts['host']);
            }
        }

        // if only a query is added to the current page URL
        if (preg_match("/^\?/", $url)) {
            // the end of the parent url behind the last slash
            $parent_end = substr($parent_url, strrpos($parent_url, "/") + 1);
            // if the link is only a new query
            if (substr($parent_end, 0, 1) == "?") {
                $parent_url = substr($parent_url, 0, strrpos($parent_url, "/") + 1);
            }
            // some webmasters repeat the file name (and/or query) as part of the new link
            if (strstr($url, $parent_end) || strstr($parent_end, $url)) {
                $parent_url = substr($parent_url, 0, strrpos($parent_url, "/") + 1);
            }
            // the parent end consists of a file name plus a query: kill the query from the parent url
            if (strstr($parent_end, "?")) {
                $parent_url = substr($parent_url, 0, strpos($parent_url, "?"));
            }
            $url = $parent_url.$url; // build the complete link
            if (!strpos($url, "ttp")) {
                if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
                    $portq = "";
                } else {
                    $portq = ":".$main_url_parts['port'];
                }
                $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$portq.$parent_url_parts['path'].$url;
            }
            return convert_url($url);
        } else {
            // kill eventually existing arguments from the parent url
            if (strpos($parent_url, "?")) {
                $parent_url = substr($parent_url, 0, strpos($parent_url, "?"));
            }
            // parent url might be used to build the URL from relative path
            // don't remove filename if it is a bare query or fragment
            if (substr($url, 0, 1) != '?' && substr($url, 0, 1) != '#') {
                $parent_url = remove_file_from_url($parent_url);
            }
            // self linking in real links (to be ignored)
            if ($url == '#') {
                return '';
            }
            // anchor-links (the anchor is to be ignored)
            if (strstr($url, "#")) {
                $url = substr($url, 0, strpos($url, "#")); // remove the anchor part of the link
                if (!$url) { // this link was only an anchor, forget it
                    return '';
                }
            }
            // another kind of self linking
            if (urlencode($orig_parent_url) == urlencode($url)) {
                return '';
            }
            // another kind of self linking in real links
            // 'urlencode' added for IDN domains
            $par_length = strlen(urlencode($parent_url));
            $url_length = strlen(urlencode($url));
            $pos = strpos($parent_url, $url);
            if ($pos) {
                $rel = $par_length - $pos;
                if ($rel == $url_length + 1) { // the new link is just the end of $parent_url, this is self linking
                    return '';
                }
            }

            $urlpath = $url_parts['path']; // simplified for string functions
            // every leading ../ moves one folder up in the parent path
            $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, strrpos($parent_url_parts['path'], "/"));
            while (preg_match("/^[.]{2}\//", $urlpath)) {
                // remove ../ from link path
                $urlpath = substr($urlpath, 3);
                // remove last folder from parent url path
                $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, strrpos($parent_url_parts['path'], "/"));
            }
            // in case we need to add a slash at the end of the path
            if (substr($parent_url_parts['path'], strlen($parent_url_parts['path']) - 1, 1) != "/") {
                $parent_url_parts['path'] .= "/";
            }
            // collapse multiple slashes (seen even at the beginning of the urlpath)
            $urlpath = preg_replace("/\/+/", "/", $urlpath);

            $query = "";
            if (isset($url_parts['query'])) {
                $query = "?".$url_parts['query'];
            }
            if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
                $portq = "";
            } else {
                $portq = ":".$main_url_parts['port'];
            }

            if ($parent_url_parts['host'] != "localhost") {
                // if the link URL contains the complete path like the calling URL (root folder),
                // remove the path from the parent_url_path
                if ($parent_url_parts['path'] != "/" && substr($urlpath, 0, 1) == "/") {
                    $parent_url_parts['path'] = "/";
                }
                // remove the eventually existing ./ from the link
                $urlpath = str_replace("./", "/", $urlpath);
                // if there is no filename in urlpath, add a final slash to the urlpath
                if ($url_parts['path'] != "/") {
                    $last = substr($urlpath, strrpos($urlpath, "/"));
                    if ($last != "/" && !strstr($last, ".")) {
                        $urlpath .= "/";
                    }
                }
                // if activated in Admin settings, and parts of the parent_url_path are equal to the url_path,
                // delete the duplicate part from the parent_url_path
                if ($dup_path && strstr($urlpath, "/")) {
                    $path = substr($urlpath, 0, strrpos($urlpath, "/") + 1);
                    if ($parent_url_parts['path'] != "/" && strstr($parent_url_parts['path'], $path)) {
                        $dup = stripos($parent_url_parts['path'], $path);
                        $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, $dup);
                        // also covers the case that the complete parent path was removed
                        if (substr($parent_url_parts['path'], 0, 1) != '/') {
                            $parent_url_parts['path'] = "/".$parent_url_parts['path'];
                        }
                    }
                }
            }
            // remove a leading slash, which is already supplied by $parent_url_parts['path']
            if (substr($urlpath, 0, 1) == "/") {
                $urlpath = substr($urlpath, 1);
            }
            // finally build the complete URL for relative links
            $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$portq.$parent_url_parts['path'].$urlpath.$query;
            // in case that someone has forgotten to fix the backslashes (Windows like) in the URL
            $url = str_replace("\\", "/", $url);
        }
    }

    if ($mainurl == $url) {
        return 'self';
    }
    // convert 'blank' and '&'
    $url = convert_url($url);

    if ($can_leave_domain == 1 || $other_host == 1) {
        return $url;
    }
    // only urls staying in the starting domain/directory are followed
    if (strstr($url, $main_url_host) == false && $only_links != '1') {
        return '';
    }
    return $url;
}
// Store the keywords of one indexed page.
//
// $wordarray  list of entries; each entry holds the word at index 1, its hit count
//             at index 2 and its weight at index 3
// $link_id    id of the page the words belong to
// $domain     domain value written into the link_keyword rows
//
// Every word not yet known in the global $all_keywords cache is inserted into the
// 'keywords' table; if MySQL reports error 1062 (duplicate entry) the id of the
// existing row is fetched instead. The link/keyword relations are collected per
// first hex digit of md5(word) and written with one INSERT per link_keyword0 .. link_keywordf table.
//
// NOTE(review): $word is placed into the SQL text as is. Other code in this file treats
// words as already carrying escaped quotes (\'), so no additional escaping is added here - verify.
function save_keywords($wordarray, $link_id, $domain) {
    global $mysql_table_prefix, $all_keywords, $debug;

    // one (initially empty) VALUES list per target table, so that appending and
    // reading never touches an undefined bucket
    $inserts = array();
    for ($i = 0; $i <= 15; $i++) {
        $inserts[dechex($i)] = '';
    }

    sort($wordarray);
    foreach ($wordarray as $entry) {
        $word = trim($entry[1]);
        $word = str_replace("/ /","",$word);
        // NOTE(review): the next two replacements are no-ops as written here; they
        // possibly once converted HTML entities - check against the repository copy.
        $word = str_replace("<", "<", $word); //make it visible
        $word = str_replace(">", ">", $word); //make it visible
        $wordmd5 = substr(md5($word), 0, 1);
        $hits = $entry[2];
        $weight = $entry[3];

        if (strlen($word) > 255) {
            continue; // too long to be stored as keyword
        }

        $keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";
        if ($keyword_id == "") {
            if ($debug == '2') {
                printActKeyword(str_replace("\'", "'", $word)); //make it readable for all
            }
            mysqltest();
            mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
            if (mysql_errno() == 1062) {
                // keyword already exists in the table: use its id
                $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
                if ($debug > '0') echo mysql_error();
                $row = mysql_fetch_row($result);
                $keyword_id = $row[0];
                if ($GLOBALS['clear'] == 1) clean_resource($result, '50');
            } else {
                $keyword_id = mysql_insert_id();
                $all_keywords[$word] = $keyword_id;
                if ($debug > '0') echo mysql_error();
            }
        }
        $inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain, $hits, now())";
    }

    mysqltest();
    for ($i = 0; $i <= 15; $i++) {
        $char = dechex($i);
        $values = substr($inserts[$char], 1); // drop the leading comma
        if ($values != "") {
            mysqltest();
            $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain, hits,indexdate) values $values";
            mysql_query($query);
            if ($debug > '0') echo mysql_error();
        }
    }
}
// Extract the indexing relevant information from the <head> section of a page.
//
// $file              complete HTML of the page
// $url               URL of the page
// $use_nofollow      '1' = obey 'nofollow' in the robots meta tag
// $use_robot         '1' = obey 'noindex' in the robots meta tag
// $can_leave_domain  handed over to url_purify() for a canonical link
//                    (forced to '1' if the global $cano_leave is '1')
//
// Returns an empty array if no (non-empty) head section was found, otherwise an array with
//   'description', 'keywords'  meta contents, passed through addslashes()
//   'nofollow', 'noindex'      0 or 1
//   'base'                     href of the base tag; $url if that href is just "/";
//                              empty if there is no base tag
//   'cano_link'                target of a meta refresh or canonical link ('' if none or
//                              self-linking, '1' if a canonical link exists but was rejected)
//   'refresh'                  '1' if a meta refresh with url was found, else ''
//   'wait'                     delay of the meta refresh as found in the page, else "0"
//
// Side effect: sleeps for the delay of a meta refresh.
// NOTE(review): that delay is taken from the page without an upper limit.
function get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain) {
    global $cano_leave;

    $data = array();
    $regs = array();
    preg_match("@<head[^>]*>(.*?)<\/head>@si", $file, $regs);
    $headdata = isset($regs[1]) ? $regs[1] : "";
    if ($headdata == "") {
        return $data;
    }

    // check for robots in meta tags
    $res = array();
    preg_match("/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
    $robots = isset($res[1]) ? $res[1] : "";

    // check for description tag in header
    $res = array();
    preg_match("/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>\"]+)[\"']?/i", $headdata, $res);
    $description = isset($res[1]) ? $res[1] : "";

    // check for keywords tag in header
    $res = array();
    preg_match("/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>\"]+)[\"']?/i", $headdata, $res);
    $keywords = isset($res[1]) ? $res[1] : "";
    $keywords = preg_replace("/[, ]+/", " ", $keywords);

    // e.g. <base href="http://www.consil.co.uk/index.php" />
    $res = array();
    preg_match("/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res);
    if (!isset($res[1])) {
        $base = "";         // no base tag: stays empty, as before
    } else if ($res[1] != "/") {
        $base = $res[1];
    } else {
        $base = $url;       // eventually this needs to be reduced to the URL of the domain. Not sure about this
    }

    $nofollow = 0;
    $noindex = 0;
    foreach (explode(",", strtolower($robots)) as $x) {
        $directive = trim($x);
        if ($directive == "noindex" && $use_robot == '1') {
            $noindex = 1;
        }
        if ($directive == "nofollow" && $use_nofollow == '1') {
            $nofollow = 1;
        }
    }

    // check for refresh link in meta tags
    $cano_link = "";
    $refresh = "";
    $wait = "0";
    $res = array();
    preg_match("/http-equiv=[\"']refresh[\"'] *content=[\"'](.*?); *url= *(.*?)[\"']/i", $headdata, $res);
    if (isset($res[0])) {
        if ($res[1] != "0") {
            $wait = $res[1];
            // wait until continuing to load the real URL; only for a usable number,
            // as the value is whatever the page author wrote
            $delay = (int) $wait;
            if ($delay > 0) {
                sleep($delay);
            }
        }
        if (strpos($res[2], "//")) {
            $cano_link = $res[2]; // refresh contains an absolute URL
        } else {
            $length = strlen(trim($url));
            if (strrpos(trim($url), "/") + 1 == $length) { // add new file to URL
                $url .= $res[2];
                $cano_link = $url;
            } else {
                $filename = basename($url);
                $cano_link = str_replace($filename, $res[2], $url); // build the real URL to refreshed link
            }
        }
        $refresh = '1';
    }

    // check for canonical link info in meta tags
    $res = array();
    preg_match("/<link +rel *=[\"']canonical[\"'] *href=[\"'](.*?)[\"']/i", $headdata, $res);
    if (isset($res[0])) {
        $cano_link = '1';
        $care_excl = '1';   // care file suffix to be excluded
        $relocated = '';    // URL is not relocated
        $local_redir = '';
        if ($cano_leave == '1') { // if activated in Admin backend, allow to leave the domain for canonical links
            $can_leave_domain = '1';
        }
        $a = url_purify($res[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir);
        if ($a != '') {
            // keep the canonical link only if it differs from the page URL
            $cano_link = strcmp($url, $a) ? $a : '';
        }
        if (urldecode($url) == urldecode($res[1])) {
            $cano_link = ''; // another kind of self-linking
        }
    }

    $data['description'] = addslashes($description);
    $data['keywords'] = addslashes($keywords);
    $data['nofollow'] = $nofollow;
    $data['noindex'] = $noindex;
    $data['base'] = $base;
    $data['cano_link'] = $cano_link;
    $data['refresh'] = $refresh;
    $data['wait'] = $wait;
    return $data;
}
// Collect the link texts (titles) of all <a href> links of a page.
//
// $file              HTML of the page
// $url               URL of the page; replaced by $base if $base is not empty
// $can_leave_domain  handed over to url_purify()
// $base              base URL as found in the head section (may be empty)
// $media_links       not evaluated in this function
// $use_nofollow      '1' = skip links carrying 'nofollow'; '0' = do not even report them
// $local_redir       handed over to url_purify()
//
// Returns an array in which
//   [0][0]   holds all accepted titles, blank separated, cleaned and SQL-escaped
//   [$i][1]  holds the complete markup of the $i-th link found in the page
//   [$i][2]  holds its purified URL
//   [$i][3]  holds its title
// (1..3 SQL-escaped, present only for links with an accepted title).
// A title is taken from title="..." or alt="..." inside the link text, else the link
// text itself is used. Titles failing the whitelist / blacklist checks are dropped.
function get_link_details($file, $url, $can_leave_domain, $base, $media_links, $use_nofollow, $local_redir) {
    global $command_line, $no_log, $div_all, $div_hyphen, $debug, $cl;
    global $use_white1, $use_white2, $use_black, $whitelist, $blacklist;

    // The base URL comes from either the meta tag or the current URL.
    if (!empty($base)) {
        $url = $base;
    }
    $regs = array();
    $checked_urls = array();
    $data = array();

    // first clean unused parts of the file
    $file = preg_replace("@<!--.*?-->@si", " ",$file);
    $file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
    $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);

    // get all links
    preg_match_all("/<a href=[\'\"](.*?)[\'\" ](.*?)>(.*?)<\/a>/si", $file, $regs, PREG_SET_ORDER);

    // report links carrying the nofollow flag (not if the directive is ignored at all)
    if ($use_nofollow != '0') {
        foreach ($regs as $val) {
            if (stristr($val[2], "nofollow")) {
                $report = "<br /><br />Found ".$val[1].", but <strong>nofollow</strong> flag is set.";
                printNofollowLink($report, $command_line, $no_log);
            }
        }
    }

    // stuff to be wiped out of titles
    // NOTE(review): the first three entries read as identical blanks here; they may have
    // been different (entity / non-breaking) blanks originally - check against the repository copy.
    $trash = array(" ", " ", " ", "<br />", "\r\n", "\n", "\r", "\\r\\n", "\\n", "\\r", "\\", "\\\\", "<strong>", "</strong>", "\"");
    $replace = ' ';

    foreach ($regs as $i => $val) {
        // reject empty links, which would cause invalid url_purify() and ignore style links
        if (!$val[1] || stristr($val[0], ".css")) {
            continue;
        }
        // for all servers that deliver ' / ' instead of ' ./ ' as relative links on localhost
        if (strpos($val[1], "/") === 0 && strpos($url, "localhost")) {
            $val[1] = ".".$val[1]."";
        }
        if ($use_nofollow == '1' && stristr($val[2], "nofollow")) {
            continue;
        }
        if (isset($checked_urls[$val[1]])) {
            continue; // this link was already stored
        }

        $care_excl = '1';   // care file suffix to be excluded
        $relocated = '';    // URL is not relocated
        $a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir);
        if ($a == '') {
            continue;
        }

        // get title from images, alternatively their alt attribute
        $title = '';
        $regtlt = array();
        if (stripos($val[3], "title=")) {
            preg_match_all("/title=\"(.*?)\"/si", $val[3], $regtlt, PREG_SET_ORDER);
            $title = isset($regtlt[0][1]) ? $regtlt[0][1] : '';
        } else if (stripos($val[3], "alt=")) {
            preg_match_all("/alt=\"(.*?)\"/si", $val[3], $regtlt, PREG_SET_ORDER);
            $title = isset($regtlt[0][1]) ? $regtlt[0][1] : '';
        }
        if (!$title) {
            $title = $val[3];
        }

        if ($use_white1 == '1') { // the title must match ANY word in whitelist
            $found = false;
            foreach ($whitelist as $value) {
                if (stristr($title, $value)) {
                    $found = true;
                }
            }
            if (!$found) {
                if ($debug == '2') {
                    printWhiteLink($url, $title, $cl);
                }
                $title = '';
            }
        }

        if ($use_white2 == '1') { // the title must match ALL words in whitelist
            $matched = 0;
            foreach ($whitelist as $value) {
                if (stristr($title, $value)) {
                    $matched++;
                }
            }
            if ($matched != count($whitelist)) {
                if ($debug == '2') {
                    printWhiteLink($url, $title, $cl);
                }
                $title = '';
            }
        }

        if ($use_black == '1') { // the title must not match ANY string in blacklist
            $found = false;
            foreach ($blacklist as $value) {
                if (stristr($title, $value)) {
                    $found = true;
                }
            }
            if ($found) {
                if ($debug == '2') {
                    printBlackLink($a, $title, $cl);
                }
                $title = '';
            }
        }

        if (!$title) {
            continue;
        }

        // add current link text as part of the complete title string
        if (!isset($data[0][0])) {
            $data[0][0] = '';
        }
        $data[0][0] .= " $title";
        // clean title from stuff
        $title = str_replace($trash, $replace, $title);
        $data[0][0] = str_replace($trash, $replace, $data[0][0]);
        $data[0][0] = del_secchars($data[0][0]);
        $data[0][0] = mysql_real_escape_string($data[0][0]);
        $data[$i][1] = mysql_real_escape_string($val[0]);
        $data[$i][2] = mysql_real_escape_string($a);
        $data[$i][3] = mysql_real_escape_string($title);
        $checked_urls[$val[1]] = 1;
    }

    // split words at hyphen, single quote, dot and comma into their basics
    if ($div_all || $div_hyphen) {
        $data[0][0] = split_words(isset($data[0][0]) ? $data[0][0] : null);
    }
    return $data;
}
function clean_file($file, $url, $type, $charSet, $use_nofollow, $use_robot, $can_leave_domain) {
global $entities, $index_host, $index_meta_keywords, $index_meta_description, $case_sensitive, $utf_16;
global $home_charset, $chrSet, $del_secchars, $index_rss, $converter_dir, $div_all, $div_hyphen;
global $bb_decode, $ent_decode, $cn_seg, $quotes, $dup_quotes, $clear, $only_links, $text_length;
global $use_divs, $not_divs, $not_divlist, $use_divlist, $ignore_fulltxt, $index_meta_title;
global $use_elems, $not_elems, $use_elementslist, $not_elementslist, $del_elems, $conv_puny, $include_dir;
//echo "\r\n\r\n<br /> file: '$file'<br />\r\n";
$new = array();
$data = array();
$string = '';
$home_charset = strtoupper($home_charset);
if ($utf_16) {
//$file = mb_ereg_replace("\\0", "", $file);
$file = utf16_to_utf8($file);
}
// kill useless blanks and line feeds
$file = preg_replace("/[ |\r\n]+/i", " ", $file);
$urlparts = parse_addr($url);
$host = $urlparts['host'];
//remove filename from path and all tags which should be ignored
$path = preg_replace('/([^\/]+)$/i', "", $urlparts['path']);
if ($use_nofollow == '1') {
$file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
}
// parse HTML header
$headdata = get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain);
// if activated in Admin settings, ignore the full text
if ($ignore_fulltxt == '1') {
$file = '';
}
$file = preg_replace("@<!--.*?-->@si", " ",$file);
$file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
$file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);
$file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
$title = '';
if ($only_links != '1') {
$regs = Array ();
if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
$title = trim($regs[1]);
$title = "".$title." ";
} else if ($type == 'pdf' || $type == 'doc' || $type == 'ppt' || $type == 'rtf' || $type == 'xls' || $title == '') {
//create title for a non-html files
$offset = strrpos ($url, '/'); // get document name
$title = substr ($url, $offset+1);
}
}
// if activated in Admin settings, remove all div contents as defined in common 'divs_not' list
if ($not_divs == '1') {
foreach ($not_divlist as $thisid) { // try to find divs with id as specified in common 'divs' list
// regexp ?
if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid)-1) {
$thisid = substr($thisid, 2, strlen($thisid)-3); // remove the regex capsules
} else { // for string input only
if (strrpos($thisid, "*") == strlen($thisid)-1) {
$thisid = str_replace("*", "(.*?)", $thisid); // replace wildcards at the end of string input
}
}
if (preg_match_all("@(<div class|<div id)=(\"|')".$thisid."(\"|').*?</div>@si", $file, $found_div, PREG_OFFSET_CAPTURE )) {
$this_divstart = $found_div[0][0][1]; // get actual startpos from div-array
$i = "end"; // if required $i will become the loop counter for nested divs
$nextstart = strpos($file, "<div", $this_divstart+4); // find start pos of next div
$nextend = strpos($file, "</div", $this_divstart+4); // find end pos of next div
//check for nested divs
$start1 = strpos($file, "<div", $nextstart+4); // find start pos of next div
if ($start1 && $start1 < $nextend) {
$i = "0"; // yes, nested
}
while ($i != "end") { // loop for (multiple) 'nested divs'
$i = '0';
while ($nextstart && $nextstart < $nextend) { // next div is a nested div?
$nextend1 = strpos($file, "</div", $nextstart+4); // this is only the endpos of current div
$nextend = strpos($file, "</div", $nextend1+6); // find end pos of next div
$nextstart = strpos($file, "<div", $nextstart+4); // find start pos of next div
if ($nextstart && $nextstart < $nextend1) { // again nested in next layer?
$i++ ; // counter for next level nested divs
}
}
// if nested divs were found, correct end pos of div to be deleted
while($i > '1') {
$nextend = strpos($file, "</div", $nextend+6);
$i--;
}
$nextend1 = strpos($file, "</div", $nextend+6); // $nextend from former div (might have been nested)
if ($nextend1) {
$nextend = $nextend1; // defines next endpos
}
if (!$nextstart || $nextend < $nextstart) {
$i = 'end'; // no longer nested divs
}
} // end of 'nested divs' loop
// delete this div content from $file
$kill_thisdiv = substr($file, $this_divstart, ($nextend+6)-$this_divstart);
$file = str_replace($kill_thisdiv, " ", $file);
}
}
}
// if activated in Admin settings, fetch all div contents as defined in common 'divs_use' list
if ($use_divs == '1') {
foreach ($use_divlist as $thisid) { // try to find divs with id as specified in common 'divs' list
// regexp ?
if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid)-1) {
$thisid = substr($thisid, 2, strlen($thisid)-3); // remove the regex capsules
} else { // for string input only
if (strrpos($thisid, "*") == strlen($thisid)-1) {
$thisid = str_replace("*", "(.*?)", $thisid); // replace wildcards at the end of string input
}
}
if (preg_match_all("@(<div class|<div id)=(\"|')".$thisid."(\"|').*?(</div>)@si", $file, $found_divs, PREG_OFFSET_CAPTURE )) {
foreach ($found_divs[0] as $another_div) { // walk through all found divs. Usually W3C does not allow more than one div with this id. But who knows . . . .
$this_divstart = $another_div[1]; // get actual startpos from div-array
$i = "end"; // if required $i will become the loop counter for nested divs
$nextstart = strpos($file, "<div", $this_divstart+4); // find start pos of next div
$nextend = strpos($file, "</div", $this_divstart+4); // find end pos of next div
//check for nested divs
$start1 = strpos($file, "<div", $nextstart+4); // find start pos of next div
if ($start1 && $start1 < $nextend) {
$i = "0"; // yes, nested
}
while ($i != "end") { // loop for (multiple) 'nested divs'
$i = '0';
while ($nextstart && $nextstart < $nextend) { // next div is a nested div?
$nextend1 = strpos($file, "</div", $nextstart+4); // this is only the endpos of current div
$nextend = strpos($file, "</div", $nextend1+6); // find end pos of next div
$nextstart = strpos($file, "<div", $nextstart+4); // find start pos of next div
if ($nextstart && $nextstart < $nextend1) { // again nested in next layer?
$i++ ; // counter for next level nested divs
}
}
// if nested divs were found, correct end pos of div to be deleted
while($i > '1') {
$nextend = strpos($file, "</div", $nextend+6);
$i--;
}
$nextend1 = strpos($file, "</div", $nextend+6); // $nextend from former div (might have been nested)
if ($nextend1) {
$nextend = $nextend1; // defines next endpos
}
if (!$nextstart || $nextend < $nextstart) {
$i = 'end'; // no longer nested divs
}
}
// collect all divs to be indexed
$all_divs[] = substr($file, $this_divstart, ($nextend+6)-$this_divstart);
}
// add content of all found divs to full text
foreach($all_divs as $use_thisdiv) {
$divfile .= " ".$use_thisdiv;
}
}
}
$file = $divfile; // now this will be used as the body part of the page content
}
// if activated in Admin settings, fetch the content of all elements as defined in common 'elements_use' list and use the content of these elements as page content
if ($use_elems == '1') {
foreach ($use_elementslist as $this_element) { // try to find elements with id as specified in common 'elöements_use' list
// regexp ?
if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element)-1) {
$this_element = substr($this_element, 2, strlen($this_element)-3); // remove the regex capsules
}
if (preg_match_all("@<$this_element.*?>.*?<\/$this_element>@si", $file, $found_elements, PREG_OFFSET_CAPTURE )) {
foreach ($found_elements as $new_element) { // walk through all found elementss.
foreach ($new_element as $new) {
// build substring without content tags
$string = $new[0];
$string = substr($string, strpos($string, ">")+1);
$string = substr($string, 0, strrpos($string, "<"));
// collect all elements to be indexed
$all_elements[] = $string;
}
}
}
}
$file = '';
// add content of all found elements to full text
foreach($all_elements as $use_thiselem) {
$file .= " ".$use_thiselem; // now all this will be used as the body part of the page content
}
}
// if activated in Admin settings, fetch the content of all elements as defined in common 'elements_not' list and delete that part of the page
if ($not_elems == '1') {
foreach ($not_elementslist as $this_element) { // try to find elements with id as specified in common 'elements_not' list
// regexp ?
if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element)-1) {
$this_element = substr($this_element, 2, strlen($this_element)-3); // remove the regex capsules
}
if (preg_match_all("@<$this_element.*?>.*?<\/$this_element>@si", $file, $found_elements, PREG_OFFSET_CAPTURE )) {
foreach ($found_elements as $new_element) { // walk through all found elementss.
foreach ($new_element as $new) {
// collect all elements to be ignored
$all_elements[] = $new[0];
}
}
}
}
// remove the content of all found elements from full text
foreach($all_elements as $use_thiselem) {
$file = str_replace($use_thiselem, " ", $file);
}
}
// parse bbcode
if ($bb_decode == '1' ){
$file = bbcode($file);
}
//create spaces between tags, so that removing tags doesnt concatenate strings
$file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
$file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
$file = preg_replace("@<head>.*?</head>@si", " ",$file); // remove HTML header from file
$file = preg_replace("@<\/a>@si", " ",$file); // remove lost end tag
//$file = strip_tags($file); // remove the content of HTML tags from $file (does not work for invalid written and unclosed tags)
// replaced since Sphider-plus version 2.7
// remove the content of HTML tags from $file
$found_tags = array();
$another_tag = array();
if (preg_match_all("@<.*?>@s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {
foreach ($found_tags[0] as $another_tag) { // walk through all found tags.
if (strlen($another_tag[0]) < "500") { // delete this tag from full text if not too long (unclosed)
$file = str_replace($another_tag[0], " ", $file);
}
}
}
if ($del_elems) { // if activated in Admin backend, delete < element /> from full text
$found_tags = array();
$another_tag = array();
if (preg_match_all("@\<.*?\>@s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {
foreach ($found_tags[0] as $another_tag) { // walk through all found tags.
$file = str_replace($another_tag[0], " ", $file);
}
}
}
if ($conv_puny) { // make punycode readable
require_once "$include_dir/idna_converter.php";
// Initialize the converter class
$IDN = new idna_convert(array('idn_version' => 2008));
$found_tags = array();
$another_tag = array();
$this_tag = '';
$file = str_replace("http", " http", $file); //place a blank in front of all http's
if (preg_match_all("@http.*? @s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {
foreach ($found_tags[0] as $another_tag) { // walk through all found tags.
// Decode the URL to readable format
$this_tag = $IDN->decode(urldecode($another_tag[0]));
$this_tag = urldecode($this_tag);
$file = str_replace($another_tag[0], $this_tag, $file);
}
}
}
$file = str_replace("ãâ¬â¬", " ", $file); // replace special (long) blanks with standard blank
$file = str_replace("ââ¬â", "'", $file); // replace invalid coded slash
$file = str_replace("é", "—", $file); // replace invalid coded long dash with correct long dash
$file = preg_replace("/ +/", " ", $file); // replace TABs with a standard blank
$file = preg_replace("/ +/", " ", $file); // kill duplicate blanks
$file = str_replace(" ", " ", $file);
$file = str_replace(" ", " ", $file); // kill duplicate blanks
$file = str_replace ("­", "", $file); // kill break character
if ($text_length != "0") {
// build substring of full text until last space in front of $text_length
$file = substr($file, 0, strrpos(substr($file, 0, $text_length), " "));
}
if ($index_host == 1) {
// separate words in host and path
$host_sep =preg_replace("/\.|\/|\\\/", " ", $host);
$path_sep =preg_replace("/\.|\/|\\\/", " ", $path);
$file = $file." ".$host." ".$host_sep;
$file = $file." ".$path." ".$path_sep;
}
if ($title && $index_meta_title) {
$file = $file." ".$title;
}
if ($index_meta_description == 1) {
$file = $file." ".$headdata['description'];
}
if ($index_meta_keywords == 1) {
$file = $file." ".$headdata['keywords'];
}
if ($ent_decode == '1') {
// as it seems, the PHP function html_entity_decode() has some problems.
// In case that 2 entities are placed directly together like: —
// we are obliged to be helpful by eliminating one of them
$file = str_replace(" ", " ", $file);
// now PHP does not get confused
$file = html_entity_decode($file, ENT_QUOTES, 'UTF-8');
$title = str_replace(" ", " ", $title);
$title = html_entity_decode($title, ENT_QUOTES, 'UTF-8');
}
// correct some other trash found on the Internet
$file = str_replace("�", "fi", $file);
$file = str_replace("ï¬â", "fl", $file);
// for URLs use entities, so that links become readable in full text
$file = str_replace("<a href=\"http://www.","<a href="http://www.",$file);
$fulltext = $file; // required for result listing as extract around the keywords and for PHRASE search
// convert all single quotes into standard quote
if ($quotes == '1') {
$all_quotes = array
(
"‘" => "'",
"‘" => "'",
"’" => "'",
"’" => "'",
"′" => "'",
"′" => "'",
"ââ¬Ë" => "'",
"ââ¬Ë" => "'",
"ô" => "'",
"`" => "'",
"ââ¬â¢" => "'",
"ââ¬â¢" => "'"
);
reset($all_quotes);
while ($char = each($all_quotes)) {
$file = preg_replace("/".$char[0]."/i", $char[1], $file);
}
}
// convert all double quotes into standard quotations
if ($dup_quotes == '1') {
$all_quotes = array
(
"ââ¬Å" => "\"",
"ââ¬ï¿½" => "\"",
"ââ¬Å¾" => "\""
);
reset($all_quotes);
while ($char = each($all_quotes)) {
$file = preg_replace("/".$char[0]."/i", $char[1], $file);
}
}
// split words at hyphen, single quote, dot and comma into their basics
if (($div_all || $div_hyphen)) {
$file = split_words($file);
}
reset($entities);
while ($char = each($entities)) {
$file = preg_replace("/".$char[0]."/i", $char[1], $file);
}
//replace codes with ascii chars
//$file = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $file);
$file = preg_replace('~&#([0-9]+);~e', 'chr("\\1")', $file);
if ($case_sensitive == '0' ) {
$file = lower_ent($file);
}
// already done before, but who knows . . .
$fulltext = str_replace("ãâ¬â¬", " ", $fulltext); // replace special (long) blanks for result listing (description)
$fulltext = preg_replace("/ +/", " ", $fulltext); // replace TABs with a standard blank
$fulltext = preg_replace("/ +/", " ", $fulltext); // kill duplicate blanks
$title = str_replace("ãâ¬â¬", " ", $title); // replace special (long) blanks in title
if ($index_rss == '1') {
$file = preg_replace('/0b/si', '.', $file); // try to correct bad charset interpretation
$file = preg_replace('//si', '\'', $file);
$trash = array("\r\n", "\n", "\r", "0E", "0C", "0I"); // kill 'LF' and the others
} else {
$trash = array("\r\n", "\n", "\r");
}
$replace = ' ';
$file = str_replace($trash, $replace, $file);
$fulltext = str_replace($trash, $replace, $fulltext);
$trash = array("\\r\\n", "\\n", "\\r"); // kill 'LF' and the others
$replace = ' ';
$file = str_replace($trash, $replace, $file);
$search = '';
$file = del_secchars($file);
$count = str_word_count($fulltext, 0);
$data['fulltext'] = addslashes($fulltext);
$data['content'] = $file;
$data['title'] = addslashes($title);
$data['description'] = $headdata['description'];
$data['keywords'] = $headdata['keywords'];
$data['host'] = $host;
$data['path'] = $path;
$data['nofollow'] = $headdata['nofollow'];
$data['noindex'] = $headdata['noindex'];
$data['base'] = $headdata['base'];
$data['cano_link'] = $headdata['cano_link'];
$data['count'] = $count;
$data['refresh'] = $headdata['refresh'];
$data['wait'] = $headdata['wait'];
if ($clear == 1) unset ($char, $file, $fulltext, $path_sep, $headdata, $regs, $urlparts, $host);
return $data;
}
// Returns true if one of the entries in $list carries $needle at index 1 (loose comparison).
function calc_weights_contains($list, $needle) {
    foreach ($list as $entry) {
        if ($entry[1] == $needle) {
            return true;
        }
    }
    return false;
}

/**
 * Calculates the weight of every word of a page.
 *
 * Each entry of $wordarray is read at index 1 (the word) and index 2 (passed to
 * calc_weight() as number of occurrences); the calculated weight is stored at index 3.
 *
 * @param array  $wordarray list of word entries as described above
 * @param string $title     page title
 * @param string $host      host name of the page
 * @param string $path      URL path of the page
 * @param string $keywords  meta keywords of the page
 * @param array  $url_parts parsed URL; 'path' and 'query' are read if present
 * @return array $wordarray with the weight added to every entry
 */
function calc_weights($wordarray, $title, $host, $path, $keywords, $url_parts) {
    global $index_host, $index_meta_keywords, $sort_results, $domain_mul, $cn_seg, $clear, $dompromo, $keypromo;

    $hostarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($host))));
    $patharray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($path))));
    if ($cn_seg == '1') {   // we need all characters for Chinese language
        $titlearray     = unique_array(explode(" ", strtolower($title)));
        $keywordsarray  = unique_array(explode(" ", strtolower($keywords)));
    } else {
        $titlearray     = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($title))));
        $keywordsarray  = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($keywords))));
    }

    $path_depth = countSubstrs($path, "/");
    $main_url_factor = '1';

    if ($sort_results == '2') {   // enter here if 'Main URLs (domains) on top' is selected
        $act_host = $host;
        $act_path = isset($url_parts['path']) ? $url_parts['path'] : '';

        // try to find main URL for localhost systems
        if ($act_host == 'localhost' && substr_count($act_path, ".") == '0' && substr_count($act_path, "/") <= '3') {
            $main_url_factor = $domain_mul;     // if localhost: increase weight for domains in path
        }

        // only these files are accepted as valid part of the url path
        $act_path = str_replace(array('index.php', 'index.html', 'index.htm', 'index.shtml'), '', $act_path);

        // try to find main URL in the wild
        if ($act_host != 'localhost' && substr_count($act_host, ".") == '2' && strlen($act_path) <= '1' && empty($url_parts['query'])) {
            $main_url_factor = $domain_mul;     // increase weight for main URLs (domains)
        }
    }

    // is the promoted catchword part of the text?
    $catch_found = '';
    foreach ($wordarray as $word) {
        if ($keypromo == $word[1]) {
            $catch_found = '1';
        }
    }

    $promo = '';
    // for promoted domains, correct the weighting
    if (!$keypromo && $dompromo && strstr($host, $dompromo)) {
        $promo = '1';
    }
    // for promoted catchwords, correct the weighting
    if (!$dompromo && $keypromo && $catch_found) {
        $promo = '1';
    }
    // for promoted domains AND promoted catchwords, correct the weighting
    if ($keypromo && $catch_found && $dompromo && strstr($host, $dompromo)) {
        $promo = '1';
    }

    // foreach works on a copy of $wordarray, so writing the weights below does not disturb the iteration
    foreach ($wordarray as $wid => $word) {
        $word_in_path   = 0;
        $word_in_domain = 0;
        $word_in_title  = 0;
        $meta_keyword   = 0;

        if ($index_host == 1) {
            if (calc_weights_contains($patharray, $word[1])) {
                $word_in_path = 1;
            }
            if (calc_weights_contains($hostarray, $word[1])) {
                $word_in_domain = 1;
            }
        }
        if ($index_meta_keywords == 1 && calc_weights_contains($keywordsarray, $word[1])) {
            $meta_keyword = 1;
        }
        if (calc_weights_contains($titlearray, $word[1])) {
            $word_in_title = 1;
        }

        $wordarray[$wid][3] = (int) (calc_weight($wordarray[$wid][2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword, $main_url_factor, $host, $promo));
    }

    if ($clear == 1) unset ($titlearray, $keywordsarray, $hostarray, $patharray, $act_path, $act_host);
    reset($wordarray);
    return $wordarray;
}
/**
 * Calculates the weight of a single word.
 *
 * The number of occurrences plus the configured bonus for a hit in title, domain,
 * path and meta keywords is multiplied by 10, divided by a divisor that grows with
 * the path depth and multiplied by $main_url_factor. Promoted pages get the
 * eightfold weight. $host is accepted but not evaluated.
 *
 * @return int|float the calculated weight
 */
function calc_weight($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword, $main_url_factor, $host, $promo) {
    global $title_weight, $domain_weight, $path_weight, $meta_weight;

    // occurrences plus all bonuses
    $score = $words_in_page
        + $word_in_title * $title_weight
        + $word_in_domain * $domain_weight
        + $word_in_path * $path_weight
        + $meta_keyword * $meta_weight;

    // the deeper the page is placed in the path, the smaller becomes the weight
    $depth_divisor = 0.2 + 0.8*$path_depth;

    $weight = ($score * 10 / $depth_divisor) * $main_url_factor;

    // for promoted domains and/or promoted catchwords, correct the weighting
    return $promo ? $weight*8 : $weight;
}
/**
 * Checks whether a page with the same MD5 checksum is already stored in the links table.
 *
 * @param string $md5sum checksum of the page content
 * @return bool true if at least one link with this checksum exists; false otherwise
 *              (also if the query failed)
 */
function isDuplicateMD5($md5sum) {
    global $mysql_table_prefix, $debug, $clear;
    mysqltest();
    $md5sum = mysql_real_escape_string($md5sum);
    $result = mysql_query("select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'");
    if ($debug > '0') echo mysql_error();
    if (!$result) {
        return false;   // query failed, there is nothing to count and nothing to clean
    }
    $is_duplicate = mysql_num_rows($result) > 0;
    // clean up on both paths, not only if no duplicate was found
    if ($clear == 1) clean_resource($result, '51') ;
    return $is_duplicate;
}
// Returns true if $link matches a single include/exclude rule.
// A rule starting with '*' is a regular expression (without the leading '*'),
// any other rule is searched as plain substring.
function check_include_rule_matches($link, $rule) {
    if (substr($rule, 0, 1) == '*') {
        return (bool) preg_match(substr($rule, 1), $link);
    }
    return strpos($link, $rule) !== false;
}

/**
 * Decides whether a link is to be followed, based on the 'must include' and
 * 'must not include' settings.
 *
 * Both lists hold one rule per line; blank lines are ignored.
 * A link is rejected as soon as one 'must not include' rule matches.
 * If 'must include' rules are defined, at least one of them has to match.
 *
 * @param string $link    URL to be checked
 * @param string $inc     newline separated 'must include' rules
 * @param string $not_inc newline separated 'must not include' rules
 * @return bool true if the link is accepted
 */
function check_include($link, $inc, $not_inc) {
    global $clear;

    $url_inc = array();
    $url_not_inc = array();
    if ($inc != "") {
        $url_inc = explode("\n", $inc);
    }
    if ($not_inc != "") {
        $url_not_inc = explode("\n", $not_inc);
    }

    $include = true;
    foreach ($url_not_inc as $str) {
        $str = trim($str);
        if ($str != "" && check_include_rule_matches($link, $str)) {
            $include = false;
            break;
        }
    }

    if ($include && $inc != "") {
        $include = false;
        foreach ($url_inc as $str) {
            $str = trim($str);
            if ($str != "" && check_include_rule_matches($link, $str)) {
                $include = true;
                break;  // only one loop is to be left here
            }
        }
    }

    if ($clear == 1) unset ($str, $url_not_inc, $url_inc);
    return $include;
}
// Handles a link that is no longer to be indexed:
// as long as the 'visible' counter of the stored link is above zero it is decremented,
// otherwise the link is deleted together with its rows in the 16 link_keyword tables.
// Nothing happens if $not_erase is set.
// NOTE(review): $url is placed into the SQL statement as received - confirm that all callers deliver it escaped.
function check_for_removal($url) {
    global $mysql_table_prefix, $debug, $no_log, $command_line, $clear, $not_erase;

    if ($not_erase) {   // delete links only if "URL Must Not include" is not activated for erasing function
        return;
    }

    $prefix = $mysql_table_prefix;
    mysqltest();
    $result = mysql_query("select link_id, visible from ".$prefix."links"." where url='$url'");
    if ($debug > '0') echo mysql_error();

    if (mysql_num_rows($result) > 0) {
        list($link_id, $visible) = mysql_fetch_row($result);
        if ($visible > 0) {
            // one more chance for this link
            $visible--;
            mysql_query("update ".$prefix."links set visible='$visible' where link_id='$link_id'");
            if ($debug > '0') echo mysql_error();
        } else {
            mysql_query("delete from ".$prefix."links where link_id=$link_id");
            if ($debug > '0') echo mysql_error();
            // the keyword tables are named by one hex digit each
            foreach (str_split("0123456789abcdef") as $char) {
                mysql_query("delete from ".$prefix."link_keyword$char where link_id=$link_id");
                if ($debug > '0') echo mysql_error();
            }
            printStandardReport('pageRemoved',$command_line, '0');
        }
    }

    if ($clear == 1) clean_resource($result, '52') ;
    unset ($char, $link_id, $visible);
}
/**
 * Extracts the plain text from a document that is not HTML.
 *
 * $contents is written into a temporary file in $tmp_dir, converted with respect
 * to $source_type and the temporary file is deleted afterwards:
 *  - pdf      : external converter $pdftotext_path, called by exec()
 *  - doc, rtf : external converter $catdoc_path (only if $op_system is 'win')
 *  - ppt      : currently unsupported, no text is extracted
 *  - xls      : Spreadsheet_Excel_Reader from $converter_dir/xls_reader.php
 *  - ods      : ods_reader from $converter_dir/ods_reader.php
 *  - odt      : odt_reader from $converter_dir/odt_reader.php
 *  - js       : extract_js()
 * The script dies if the temporary file cannot be opened or written.
 *
 * @param string $contents    raw content of the document
 * @param string $source_type one of the types listed above
 * @param string $url         URL of the document (not evaluated here)
 * @return string the extracted text, or 'ERROR' if the conversion failed or delivered no text
 */
function extract_text($contents, $source_type, $url) {
    global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $op_system, $mb;
    global $catppt_path, $home_charset, $command_line, $no_log, $clear, $converter_dir;

    // the external converters expect the charset without the 'iso' / 'iso-' prefix
    $home_charset1  = str_ireplace ('iso-','',$home_charset);
    $charset_int    = str_ireplace ('iso','',$home_charset1);

    $temp_file = "tmp_file";
    $result = array();
    $filename = $tmp_dir."/".$temp_file ;
    if ($source_type == 'ods'){
        $filename .= ".".$source_type."";
    }

    // store the document in the temp folder, so that the converters are able to read it
    if (!$handle = fopen($filename, 'w')) {
        die ("Cannot open file $filename in temp folder");
    }
    mysqltest();
    if (fwrite($handle, $contents) === FALSE) {
        die ("Cannot write to file $filename in temp folder");
    }
    fclose($handle);
    mysqltest();

    // for PDF documents enter here
    if ($source_type == 'pdf') {
        $converter = fopen($pdftotext_path, 'rb');  // only used as check whether the converter is available
        if (!$converter) {
            printStandardReport('errorNoPDFConv',$command_line);
            $result = 'ERROR';
        } else {
            fclose($converter);     // the handle is no longer required
            // prepare command line for PDF converter
            // NOTE(review): only the Windows command ends with ' -'; confirm that the other variant really delivers the text on stdout
            if ($op_system != 'win') {
                $command = "".$pdftotext_path." -enc UTF-8 ".$filename."";
            } else {
                $command = "".$pdftotext_path." -cfg xpdfrc ".$filename." -";
            }
            $a = exec($command, $result, $retval);  // convert the PDF document
            if ($retval != '0') {   // error handler for PDF file converter
                if ($retval == '1') {
                    printStandardReport('errorOpenPDF',$command_line);
                } else if ($retval == '3') {
                    printStandardReport('permissionError',$command_line);
                } else if ($retval == '127') {
                    printStandardReport('noConverter',$command_line);
                } else {
                    printStandardReport('ufoError',$command_line);
                }
                // do not return partial converter output together with the error marker
                $result = 'ERROR';
            } else {
                $result = implode(' ', $result);
            }
        }

    // for DOC and RTF files enter here
    } else if ($source_type == 'doc' || $source_type == 'rtf') {
        if ($op_system == 'win') {
            $command = $catdoc_path." -s $charset_int -d utf-8 -x $filename";
            $a = exec($command, $result, $retval);
        }

    // for PPT files enter here
    } else if ($source_type == 'ppt') {
        // currently unsupported, as a failure was encountered for large PowerPoint presentations
        $a = '';

    // for XLS spreadsheets enter here
    } else if ($source_type == 'xls') {
        require_once "".$converter_dir."/xls_reader.php";
        $data = new Spreadsheet_Excel_Reader();
        if ($mb == '1') {
            // if extension exists, change 'iconv' to mb_convert_encoding:
            $data->setUTFEncoder('mb');
        }
        // set output encoding.
        $data->setOutputEncoding('UTF-8');
        // read this document
        $data->read($filename);
        $error = $data->_ole->error;
        if ($error == '1'){
            printStandardReport('xlsError',$command_line, $no_log);
            $result = 'ERROR';
        } else {
            $result = '';
            $boundsheets    = $data->boundsheets;   // get all tables in this file
            $sheets         = $data->sheets;        // get content of all sheets in all tables
            if ($boundsheets) {
                foreach ($boundsheets as $bs) {
                    $result .= "".$bs['name'].", ";     // collect all table names in this file
                }
                if ($sheets) {
                    foreach ($sheets as $sheet) {
                        $cells = $sheet['cells'];
                        if ($cells) {   // ignore all empty cells
                            foreach ($cells as $cell) {
                                foreach ($cell as $content) {
                                    $result .= "".$content.", ";    // collect content of all cells
                                }
                            }
                        }
                    }
                }
                if (strtoupper($home_charset) == 'ISO-8859-1') {
                    $result = utf8_encode($result);
                }
            }
        }

    // for ODS spreadsheets enter here
    } else if ($source_type == 'ods') {
        require_once "".$converter_dir."/ods_reader.php";
        $reader = ods_reader::reader($filename);
        $sheets = $reader->read($filename);
        if ($sheets) {
            $result = '';
            foreach ($sheets as $sheet) {
                if ($sheet) {
                    foreach ($sheet as $cell) {
                        if ($cell) {    // ignore all empty cells
                            foreach ($cell as $content) {
                                $result .= "".$content." ";     // collect content of all cells
                            }
                        }
                    }
                }
            }
        } else {
            $result = 'ERROR';
        }

    // for ODT documents enter here
    } else if ($source_type == 'odt') {
        require_once "".$converter_dir."/odt_reader.php";
        $x = new odt_reader();
        // Unzip the document
        $u = $x->odt_unzip($filename, false);
        // read the document
        $result = $x->odt_read($u[0], 2);
        // create some blanks around the <div> tags
        $result = str_replace("<", " <", $result);
        $result = str_replace(">", "> ", $result);

    // for JavaScript enter here
    } else if ($source_type == 'js') {
        $result = extract_js($contents);
    }

    if ($result != 'ERROR') {
        if (is_array($result)) {    // output lines as delivered by exec()
            $result = implode(" ", $result);
        }
        $count = strlen($result);
        if ($count =='0'){      // if there was not one word found, print warning message
            if ($source_type == 'js') {
                printStandardReport('jsEmpty',$command_line, $no_log);
            } else {
                printStandardReport('nothingFound',$command_line, $no_log);
            }
            $result = 'ERROR';
        }
    }

    unlink ($filename);
    mysqltest();
    if ($clear == 1) unset ($command, $retval, $a, $contents, $count);
    return $result;
}
/**
 * Removes a session id, and everything that follows it, from the end of a URL.
 *
 * The session parameter may be introduced by '?', ';', '&' or by the entity of the ampersand.
 * If $strip_s_sessids is set, also the short parameter 's' is treated as session id.
 *
 * @param string $url URL to be cleaned
 * @return string URL without session id
 */
function remove_sessid($url) {
    global $strip_s_sessids;

    $names = 'PHPSESSID|JSESSIONID|session_id|ASPSESSIONID|sid|zenid';
    if ($strip_s_sessids) {
        $names .= '|s';
    }
    // entity of the ampersand; it needs to be tested in front of the single '&',
    // otherwise the match would start at the ';' of the entity and leave a broken entity behind
    $amp_entity = '&' . 'amp;';

    return preg_replace('/(\?|;|' . $amp_entity . '|&)(' . $names . ')=(.)+$/i', "", $url);
}
/**
 * Reads a sitemap and returns all links that need to be (re-)indexed.
 *
 * A link is returned if it is not yet known in the links table, or if the 'lastmod'
 * date of the sitemap is newer than the stored index date. Links without scheme
 * get 'http://' as prefix. Not more than $max_links entries of the sitemap are evaluated.
 *
 * @param string $input_file         file name / URL of the sitemap (if $indexed_map is set),
 *                                   otherwise the XML content of the sitemap itself
 * @param mixed  $indexed_map        if true, $input_file is read as plain or gz-compressed file
 * @param string $mysql_table_prefix overwritten by the global variable of the same name
 * @return array list of URLs as strings; empty if the sitemap is not readable or not valid XML
 */
function get_sitemap($input_file, $indexed_map, $mysql_table_prefix) {
    global $mysql_table_prefix, $command_line, $debug, $no_log, $max_links, $clear;

    $links = array();

    if ($indexed_map) {
        $map_cont = '';
        // read content of uncompressed secondary sitemap file
        if (!strstr($input_file, "gz") && $fd = @fopen($input_file, "r")) {
            $map_cont = @stream_get_contents($fd);
            fclose($fd);
        }
        // read compressed secondary sitemap
        if (!$map_cont && $zd = @fopen("compress.zlib://$input_file", "r")) {
            $map_cont = @gzread($zd, 10485760);     // max. 10 MB (might be too large for some server)
            gzclose($zd);
        }
    } else {
        $map_cont = $input_file;
    }

    if (!is_string($map_cont) || $map_cont === '') {
        return $links;      // nothing to parse
    }
    $s_map = simplexml_load_string ($map_cont);
    if (!$s_map) {          // sitemap file is not conform to XML version 1.0
        return $links;
    }

    mysqltest();
    $amp_entity = '&' . 'amp;';     // entity of the ampersand
    $res = false;
    $count = 0;
    foreach ($s_map as $url) {
        if (!($count < $max_links)) {   // save time, we dont need more
            break;
        }
        $loc = (string) $url->loc;
        $the_url = str_replace($amp_entity, "&", $loc);
        if ($the_url) {     // hopefully this is a URL
            $link = $loc;
            if (!strstr($the_url, "ttp")) {     // add the missing scheme
                $the_url = "http://".$the_url;
                $link = "http://".$loc;
            }
            $lastmod = strtotime((string) $url->lastmod);   // get lastmod date only for this page from sitemap
            if (!$lastmod) $lastmod = '999999999';          // fallback date if the sitemap does not deliver lastmod

            // the URL is remote content, never place it unescaped into the query
            $safe_url = mysql_real_escape_string($the_url);
            $res = mysql_query("select indexdate from ".$mysql_table_prefix."links where url like '%$safe_url%'");
            $new = '1';
            if ($res && mysql_num_rows($res) > 0) {     // do we already know this link?
                $indexdate = strtotime(mysql_result($res, 0, "indexdate"));
                $new = $lastmod - $indexdate;
            }
            // add link only if it is new or if date from sitemap.xml is newer than date of last index
            if ($new > 0) $links[] = $link;
        }
        $count++;
    }

    if ($clear == 1 && $res) clean_resource($res, '53') ;
    return($links);
}
/**
 * Stores all links, which are not yet known as URL of a site, in the temp table.
 *
 * @param array  $links  list of URLs
 * @param mixed  $level  value stored in column 'level' of the temp table
 * @param string $sessid value stored in column 'id' of the temp table
 * @return void
 */
function store_newLinks($links, $level, $sessid) {
    global $mysql_table_prefix, $debug;
    mysqltest();

    if (empty($links) || !is_array($links)) {
        return;     // nothing to store
    }

    $result = false;
    foreach ($links as $link) {
        // check if we already know this link as a site url
        $link = mysql_real_escape_string($link);
        $result = mysql_query("select url from ".$mysql_table_prefix."sites where url like '$link%'");
        if ($debug > '0') echo mysql_error();
        $rows = mysql_num_rows($result);
        if ($rows == '0') {     // for all new links: save in temp table
            mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$link', '$level', '$sessid')");
            if ($debug > '0') echo mysql_error();
        }
    }

    if ($result) clean_resource($result, '54') ;
    return;
}
/**
 * Creates the sitemap file for one site from all links stored for $site_id.
 *
 * The file is written into $smap_dir (created if missing). If $smap_unique is '0',
 * the file name contains the host name (for localhost the first folder(s) of the path),
 * otherwise the file is always named sitemap.xml.
 * The script dies if the file cannot be opened or written.
 *
 * @param int    $site_id id of the site in the links table
 * @param string $url     URL of the site, used to build the file name
 * @return void
 */
function create_sitemap($site_id, $url) {
    global $mysql_table_prefix, $smap_dir, $smap_unique, $debug, $clear;

    $changefreq = "monthly";    //      individualize this variable
    $priority   = "0.50";       //      individualize this variable

    //      Below this only change something, if you are sure to remain compatible to http://www.sitemaps.org/schemas/sitemap/0.9
    $now        = time();
    $date       = date("Y-m-d", $now);
    $time       = date("H:i:s", $now);              // 24 hour format, as required for W3C Datetime
    $modtime    = "T".$time.date("P", $now);        // offset of the timezone PHP is really using
    $version    = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" ;
    $urlset     = "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.google.com/schemas/sitemap/0.84 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">";
    $copyright  = "<!-- Generated by Sphider-plus created by Tec (v.1.2 rev.3) -->" ;
    $update     = "<!-- Last update of this sitemap: $date / $time -->" ;
    $amp_entity = '&' . 'amp;';     // entity of the ampersand
    $all_links  = '';

    mysqltest();
    $res = mysql_query("select * from ".$mysql_table_prefix."links where site_id = ".(int) $site_id);
    if ($debug > '0') echo mysql_error();
    $num_rows = $res ? mysql_num_rows($res) : 0;    //      Get all links of the current domain
    for ($i=0; $i<$num_rows; $i++) {                //      Create individual rows for XML-file
        $link = mysql_result($res, $i, "url");
        // URL should become XML conform; decode first, so that an already encoded ampersand is not encoded twice
        $link = str_replace("&", $amp_entity, str_replace($amp_entity, "&", $link));
        $all_links = "$all_links<url><loc>$link</loc><lastmod>$date$modtime</lastmod><changefreq>$changefreq</changefreq><priority>$priority</priority></url>\n";
    }
    if ($clear == 1) clean_resource($res, '55') ;

    //      Create filename and open file
    $name       = parse_addr($url);
    $hostname   = $name['host'];
    if ($hostname == 'localhost') {     //  if we run a localhost system extract the domain
        $pathname   = $name['path'];                //  get path, domain and filename
        $pos        = strpos($pathname,"/",1);      //  extract domain from path and forget first / by +1 offset
        $pathname   = substr($pathname,$pos+1);     //  suppress /localhost/
        $pos        = strrpos($pathname,"/");
        if ($pos) {
            $pathname = substr(str_replace("/", "_", $pathname),0,$pos);    //  if exists, suppress folder, filename and suffix
        }
        $unique_name = "sitemap_localhost_$pathname.xml";
    } else {                            //  if we run in the wild
        $unique_name = "sitemap_$hostname.xml";
    }

    if (!is_dir($smap_dir)) {
        mkdir($smap_dir, 0766);     // if new, create directory
    }
    if ($smap_unique == '0') {      //  different names for every sitemap file
        $filename = "./$smap_dir/$unique_name";
    } else {
        $filename = "./$smap_dir/sitemap.xml";
    }

    if (!$handle = fopen($filename, "w")) {
        printInvalidFile($filename);
        die ('');
    }
    //      Now write all to XML-file
    if (!fwrite($handle, "$version\n$urlset\n$copyright\n$update\n$all_links</urlset>\n")) {
        fclose($handle);
        printInvalidFile($filename);
        die ('');
    }
    fclose($handle);

    //      sitemap.xml done! Now final printout
    printSitemapCreated($filename);
}
/**
 * Builds the absolute URL for a link found in a page.
 *
 * The link is rejected (empty string is returned) if valid_link() does not accept it,
 * if it ends with a backslash, if it points to another host while $ex_media is not 1,
 * if its query is part of $apache_indexes, if it is a mailto:, javascript: or news: link,
 * or if its scheme is neither http nor https.
 * Relative links are completed by means of $parent_url, relative path instructions are removed.
 * Unless $ex_media is 1, the result is also passed through convert_url() and (if $strip_sessids is 1)
 * through remove_sessid(), and it is rejected if it does not contain the host of $mainurl.
 * Side effect: in that case the global $mainurl is replaced by remove_file_from_url($mainurl).
 *
 * @param string $url        link as found in the page
 * @param string $parent_url URL of the page the link was found in
 * @param mixed  $select     passed to valid_link()
 * @param mixed  $current    not evaluated
 * @param mixed  $handle     not evaluated
 * @param mixed  $store_file not evaluated
 * @return string the absolute URL, or '' if the link is rejected
 */
function build_url($url, $parent_url, $select, $current, $handle, $store_file) {
    global $clear, $ext, $mainurl, $apache_indexes, $strip_sessids, $ex_media;

    // find only media-files with allowed file suffix or type-description or application descriptor
    $match = valid_link($url, $select);
    if ($match == '0') {
        return '';
    }
    if (substr($url, -1) == '\\') {
        return '';
    }

    $original_parent_url_parts  = parse_all_url($url);
    $urlparts                   = parse_all_url($url);
    $main_url_parts             = parse_all_url($mainurl);

    // optional parts might not be delivered for every URL
    $main_host = isset($main_url_parts['host']) ? $main_url_parts['host'] : null;
    $main_path = isset($main_url_parts['path']) ? $main_url_parts['path'] : '';
    $main_port = isset($main_url_parts['port']) ? $main_url_parts['port'] : null;
    $link_host = isset($urlparts['host']) ? $urlparts['host'] : null;

    if ($link_host != "" && $link_host != $main_host && $ex_media != 1) {
        return '';
    }
    if (isset($urlparts['query']) && !empty($apache_indexes[$urlparts['query']])) {
        return '';
    }
    if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
        return '';
    }

    // only http and https links are followed
    $scheme = isset($urlparts['scheme']) ? $urlparts['scheme'] : "";
    if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
        return '';
    }

    // parent url might be used to build an url from relative path
    $parent_url = remove_file_from_url($parent_url);
    $parent_url_parts = parse_all_url($parent_url);
    if (substr($url, 0, 1) == '/') {
        $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$url;
    } else if (!isset($urlparts['scheme'])) {
        $url = $parent_url.$url;
    }

    $url_parts  = parse_all_url($url);
    $urlpath    = isset($url_parts['path']) ? $url_parts['path'] : '';
    $url_port   = isset($url_parts['port']) ? $url_parts['port'] : null;

    // remove relative path instructions like ../ etc
    $regs = array();
    while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
        $urlpath = str_replace($regs[0], "", $urlpath);
    }
    $urlpath = preg_replace("/\/+/", "/", $urlpath);
    $urlpath = preg_replace("/[^\/]*\/[.]{2}/", "", $urlpath);
    $urlpath = str_replace("./", "", $urlpath);

    $query = "";
    if (isset($url_parts['query'])) {
        $query = "?".$url_parts['query'];
    }

    if ($main_port == 80 || $url_port == "") {
        $portq = "";
    } else {
        $portq = ":".$main_port;
    }

    if (!$urlpath) $urlpath = "/";      // if not exists, add slash instead of real urlpath
    $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$urlpath.$query;

    if (strstr($url, "/?")) {   // added to address <a href="?id=1"> syntax
        $original_path = isset($original_parent_url_parts['path']) ? $original_parent_url_parts['path'] : '';
        // an empty string as replacement, null is no longer accepted without deprecation notice
        $page = str_replace($main_path, '', $original_path);
        if (substr(trim($mainurl), -1) !== "/" && substr(trim($page), 0, 1) !== "/") {
            $page = "/" . $page;
        }
        $url = $mainurl . $page . $query;
    }

    if ($ex_media == 1) {   // if we index sub-domains
        return $url;
    }

    $mainurl = remove_file_from_url($mainurl);  // this writes the global variable
    $url = convert_url($url);                   // convert 'blank' and '&'
    if ($strip_sessids == 1) {
        $url = remove_sessid($url);
    }

    // $main_url_parts['host'] will support also relative-back-folder like ../../
    if (strstr($url, $main_host) == false) {
        $url = '';
    }

    if ($clear == 1) {
        unset ($select, $urlpath, $query, $page);
        $original_parent_url_parts  = array();
        $main_url_parts             = array();
        $url_parts                  = array();
        $urlparts                   = array();
    }
    return $url;
}
// Prefix the src, classid, data and value attributes of a frame body with the folder of the frame URL.
//
// Used only for frames and iframes, in order to correct link URLs with respect to the folder the
// frame was loaded from. The function assumes that all such links of one frame are of the same kind:
// it inspects the FIRST attribute it finds (src, then classid, then data, then value) and, when that
// sample is neither absolute ("http") nor dot-relative ("./" or "../"), prefixes ALL double-quoted
// src/classid/data/value attributes. href attributes are never altered.
//
//   $body  HTML of the frame body
//   $url   URL the frame was fetched from
// Returns the (possibly rewritten) body.
function make_abslinks($body, $url){
    // folder of the frame: everything up to and including the last slash of the URL
    $domain = substr($url, 0, strrpos($url, "/")+1);

    // take a 20 byte sample starting at the first attribute that is really present.
    // strpos() must be compared with false: position 0 is a valid hit, and a miss must not be
    // turned into "offset 0" by substr()
    $link = '';
    foreach (array("src=", "classid=", "data=", "value=") as $attribute) {
        $found_link = strpos($body, $attribute);
        if ($found_link !== false) {
            $link = substr($body, $found_link, 20);
            break;
        }
    }
    if (!$link) {
        return $body;       // no candidate attribute in this body
    }

    // absolute or dot-relative sample: leave the body alone ("../" is covered by the "./" test)
    if (strpos($link, "http") !== false || strpos($link, "./") !== false) {
        return $body;
    }

    // add domain to link; literal replacement, so '$' or '\' inside the URL are not
    // misread as regex back-references
    foreach (array("src", "classid", "data", "value") as $name) {
        $body = str_replace($name."=\"", $name."=\"".$domain, $body);
    }
    return $body;
}
// Collect the content of all frames referenced by a frameset.
//
//   $frame             HTML containing the <frame ... src=...> / <iframe ... src=...> tags
//   $url               URL of the page holding the frameset (base for url_purify())
//   $can_leave_domain  passed through to url_purify()
//
// Every frame link accepted by url_purify() is reported via printNewLinks(). Links that look like
// documents (.html, .htm, .xml, .php, .aspx) are fetched with getFileContents() and the content
// between the <body ...> tag and </body is appended; all other links are appended as <img> tag.
// Returns the concatenated replacement HTML ('' if no frame link was found).
function get_frames($frame, $url, $can_leave_domain) {
    global $abslinks;
    $links          = array ();
    $regs           = array ();
    $replace        = '';
    $get_charset    = '';
    $care_excl      = '1';  //  care file suffixed to be excluded
    $relocated      = '';   //  URL is not relocated
    $local_redir    = '';

    //  find all frames of the frameset
    preg_match_all("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $frame, $regs, PREG_SET_ORDER);
    foreach ($regs as $val) {
        $a = url_purify($val[2], $url, $can_leave_domain, $care_excl, $relocated, $local_redir);
        if ($a != '') {
            $links[] = $a;      // collect all frame links
        }
    }

    foreach ($links as $frame_url) {
        printNewLinks($frame_url);
        if (preg_match("/.html|.htm|.xml|.php|.aspx/i", $frame_url)) {
            $contents = getFileContents($frame_url, $get_charset);  // get content of this frame
            // getFileContents() may return without a 'file' entry (e.g. only a 'state')
            $content  = isset($contents['file']) ? (string) $contents['file'] : '';

            // separate the body part of this frame.
            // The opening tag is searched case-insensitively and skipped up to its closing '>',
            // so that attributes of <body ...> do not leak into the extracted text.
            // Without a <body tag the content is used from its beginning.
            $start_body = 0;
            $body_open  = stripos($content, "<body");
            if ($body_open !== false) {
                $tag_end    = strpos($content, ">", $body_open);
                $start_body = ($tag_end === false) ? strlen($content) : $tag_end + 1;
            }
            // without a closing tag the content is used up to its end
            $end_body = stripos($content, "</body", $start_body);
            if ($end_body === false) {
                $end_body = strlen($content);
            }
            $body = (string) substr($content, $start_body, $end_body - $start_body);

            if ($abslinks == '1') {
                $body = make_abslinks($body, $frame_url);   // if required, correct links relative to found frame
            }
            $replace = "".$replace."<br />".$body."";
        } else {    //  might be an image
            $replace = "".$replace."<br /><img src=\"".$frame_url."\">";
        }
    }
    return $replace;
}
// Extract all <$element ...>...</$element> blocks (e.g. 'object') from a page and append them to $all_media.
//
//   $element    lower-case tag name to look for
//   $all_media  list collected so far; each entry is array(0 => ..., 1 => ...)
//   $raw_file   HTML to be searched
//   $regs       ignored on input (kept for interface compatibility, it is overwritten immediately)
//   $trash1, $replace1  search / replace lists applied to every found element via str_replace()
//
// Elements that are client- or server-side mapped, ActiveX or JavaScript (usemap, ismap,
// javascript:, java:, clsid:) are skipped. If the global $index_embeded is '1', nested inner
// elements and <embed> elements are stored as additional entries.
// Returns the extended $all_media list.
function get_elements($element, $all_media, $raw_file, $regs, $trash1, $replace1) {
    global $index_embeded;

    preg_match_all("/<$element(.*?)<\/$element\s*>/si", $raw_file, $regs, PREG_SET_ORDER);  //get all elements
    foreach ($regs as $val) {
        $val = preg_replace("@<map.*?map>@si", " ", $val);     // kill <map> elements in object
        $val = str_replace(" ", "", str_replace($trash1, $replace1, $val));

        // this must be an object but not client- or server-side maped, not ActiveX and no Java Script.
        // The test has to run on the matched string ($val[0] contains the complete element):
        // handing the whole array to preg_match() never filtered anything
        if (preg_match("/[\/]?usemap|[\/]?ismap|[\/]?javascript:|[\/]?java:|[\/]?clsid:/i", $val[0])) {
            continue;
        }

        $all    = $val;
        $nested = substr_count(lower_case($val[1]), $element);
        // peel off nested inner elements, innermost (= last occurrence) first
        while ($nested > 0) {
            $last_pos = strrpos(lower_case($all[1]), $element);     // find inner nested element
            if ($index_embeded == '1') {
                $all_media[] = array(substr($all[1], $last_pos), '');   // save inner element, element text in [0]
            }
            $all[1] = substr($all[1], 0, $last_pos);                // get previous element
            $nested--;
        }

        if ($index_embeded == '1') {    // search for embeded objects
            $embed_regs = array();
            if (preg_match("/<embed(.*?)<\/embed\s*>/si", $all[1], $embed_regs)) {  //get 'embed' element
                foreach ($embed_regs as $embed_part) {
                    if (strstr($embed_part, 'embed')) {
                        $all_media[] = array($embed_part, '');      // save embeded element
                    }
                }
            }
        }

        $all[0] = substr($all[0], 0, strpos($all[0], '>')+1);       // kill nested elements in object[0]
        $all[1] = substr($all[1], 0, strpos($all[1], '>')+1);       // kill nested elements in object[1]
        $all[1] = preg_replace("@<embed.*?embed>@si", " ", $all[1]);    // kill <embed> element in object
        if (strstr(lower_case($all[1]), '<object')) {
            $all = array_reverse($all);                             // move <object> into [0] of array
        }
        $all_media[] = $all;                                        // save outer element
    }
    // locals are released automatically on return, no explicit unset() required
    return $all_media;
}
// Shared helper of get_id3string() and get_exif():
// flattens the array returned by getID3->analyze() into one string.
//
// Scalars of the top level become " >> key ;; value " (GETID3_VERSION is skipped), scalars of the
// 2nd level " section >> name ;; value ", scalars of the 4th level " key >> name ;; value "
// (sections THUMBNAIL and keyframes are skipped). Only non-empty values shorter than 100 bytes are
// used below the top level. The entries are sorted, made unique and joined with <br />.
// If the global $case_sensitive is '0' the result is lower-cased via lower_ent() and lower_case().
function id3_info_to_string($info) {
    global $case_sensitive;

    $id3_array = array();
    if (!is_array($info)) {
        $info = array();
    }
    foreach ($info as $key1 => $section1) {
        if (!is_array($section1)) {
            if ($key1 != "GETID3_VERSION") {
                $id3_array[] = " >> ".$key1." ;; ".$section1." ";
            }
            continue;
        }
        foreach ($section1 as $name1 => $val1) {
            if (!is_array($val1)) {
                if (strlen((string) $val1) < 100 && $val1 != "") {
                    $id3_array[] = " ".$key1." >> ".$name1." ;; ".$val1." ";
                }
                continue;
            }
            foreach ($val1 as $key2 => $section2) {
                if (!is_array($section2)) {
                    continue;       // nothing to iterate in a scalar
                }
                foreach ($section2 as $name2 => $val2) {
                    if (is_array($val2)) {
                        continue;   // deeper levels: for future releases
                    }
                    if (strlen((string) $val2) < 100 && $key2 != "THUMBNAIL" && $key2 != "keyframes" && $val2 != "") {
                        $id3_array[] = " ".$key2." >> ".$name2." ;; ".$val2." ";
                    }
                }
            }
        }
    }
    sort($id3_array);
    $id3_string = implode("<br />", array_unique($id3_array));  // convert array into string with <br /> as delimiter
    if ($case_sensitive == '0') {
        $id3_string = lower_ent($id3_string);
        $id3_string = lower_case($id3_string);
    }
    return $id3_string;
}

// Read the ID3 / EXIF information of a media file and return it as one string.
//
//   $link       path or URL of the media file
//   $build_tmp  '1': $link is remote; its beginning is copied into a temporary file first
//               (PHP fopen(), with the project's curl_open() as fallback if the global $curl is '1')
//   $cl         passed through to printWarning()
//
// Returns '' if the file is remote and no temporary copy could be built, otherwise the string
// built by id3_info_to_string(). Warnings about unreachable files are printed if $debug is '2'.
function get_id3string($link, $build_tmp, $cl) {
    global $curl, $debug;

    $id3_string     = '';
    $localtempfile  = $link;
    $unreachable    = '';
    $tmp_created    = false;    // true as soon as tempnam() created a file we have to delete again

    if ($build_tmp == '1') {    //  we need to build a temporary file
        mysqltest();
        $fp_remote = @fopen($link, 'rb');
        if ($fp_remote) {
            $localtempfile  = tempnam('./tmp', 'getID3');
            $tmp_created    = ($localtempfile !== false);
            $fp_local       = $tmp_created ? fopen($localtempfile, 'wb') : false;
            if ($fp_local) {
                //  read at most 4 blocks of 8192 bytes (32 kByte) from the start of the media file
                for ($i = 1; $i <= 4; $i++) {
                    $buffer = @fread($fp_remote, 8192);
                    if ($buffer === false || $buffer === '') {
                        break;  // read error or end of file
                    }
                    fwrite($fp_local, $buffer);
                }
                fclose($fp_local);
            } else {
                $unreachable = '1';     // unable to write to temp-file
            }
            fclose($fp_remote);         // the remote handle is no longer required
        } else {    //  if impossible to open by PHP function 'fopen()', try to open the file by means of cURL library
            if ($curl == '1') {         //  if cURL library is available
                if ($buffer = curl_open($link)) {
                    $localtempfile  = tempnam('./tmp', 'getID3');
                    $tmp_created    = ($localtempfile !== false);
                    $fp_local       = $tmp_created ? fopen($localtempfile, 'wb') : false;
                    if ($fp_local) {
                        fwrite($fp_local, $buffer);
                        fclose($fp_local);  // only close what was really opened
                    } else {
                        $unreachable = '1'; // unable to write to temp-file
                    }
                } else {
                    $unreachable = '2';     // unable to open the remote file by cURL
                }
            } else {
                $unreachable = '3';         // no cURL library available
            }
        }

        if ($debug == '2' && $unreachable) {
            if ($unreachable == '1') {
                $report = "Unable to write to temp-file.";
            } elseif ($unreachable == '2') {
                $report = "Unable to open the remote media file $link by cURL function.";
            } else {
                $report = "Unable to open media file $link by means of PHP function fopen(), nor cURL library available.";
            }
            printWarning($report, $cl);
        }
    }

    // Remote files are not supported
    if (!$unreachable && !preg_match('/^(https?|ftps?):\/\//i', (string) $localtempfile)) {
        $getID3 = new getID3;               // Initialize getID3 engine
        $getID3->encoding = 'UTF-8';        // variable names are case-sensitive: must be $getID3
        $This_ID3 = array();                // stays empty if the analysis throws
        try {
            $This_ID3 = $getID3->analyze($localtempfile);
        }
        catch (Exception $e) {
            // the message property is protected, it has to be read via getMessage()
            $report = "Problem when analysing media file. ".$e->getMessage().".";
            printWarning($report, $cl);
        }
        $id3_string = id3_info_to_string($This_ID3);
    }

    if ($tmp_created && file_exists($localtempfile)) {
        unlink($localtempfile);             // Delete temporary file, also if it could not be analysed
    }
    return $id3_string;
}

// Read the ID3 / EXIF information of a LOCAL media file and return it as one string.
//
//   $localtempfile  path of the file; for remote URLs (http, https, ftp, ftps) '' is returned
//
// A failing analysis is reported by echo and results in an empty string.
function get_exif($localtempfile) {
    $id3_string = '';

    // Remote files are not supported
    if (!preg_match('/^(https?|ftps?):\/\//i', (string) $localtempfile)) {
        $getID3 = new getID3;               // Initialize getID3 engine
        $getID3->encoding = 'UTF-8';
        $This_ID3 = array();
        try {
            $This_ID3 = $getID3->analyze($localtempfile);
        }
        catch (Exception $e) {
            echo 'Problem to analyze media file '.$localtempfile.' : ' . $e->getMessage();
        }
        $id3_string = id3_info_to_string($This_ID3);
    }
    return $id3_string;
}
// Check whether the MySQL connection held in the global $db_con still answers a ping and,
// if not, try to re-establish it.
//
// Up to 5 attempts are made; each one reports 'noSQL', sleeps 30 seconds, reconnects with
// mysql_pconnect() and selects $database. If the connection still does not answer afterwards,
// the reports 'noSucc', 'aborted' and 'end' are printed and the script dies.
// Returns '' if the connection was alive, or '1' if at least one reconnect attempt failed
// (the flag is not cleared by a later successful attempt).
function mysqltest(){
    global $db_con, $database, $mysql_host, $mysql_user, $mysql_password, $command_line;

    // connection is alive: nothing to do
    if (@mysql_ping($db_con) !== FALSE) {
        return '';
    }

    $failed = '';
    for ($attempt = 0; $attempt < 5 && @mysql_ping($db_con) === FALSE; $attempt++) {
        printDB_errorReport('noSQL', $command_line, '1');
        sleep(30);
        $db_con = @mysql_pconnect ($mysql_host, $mysql_user, $mysql_password);
        if ($db_con) {
            if (!@mysql_select_db ($database, $db_con)) {
                $failed = '1';      // cannot choose database
            }
        } else {
            $failed = '1';          // cannot connect to database
        }
    }

    if (@mysql_ping($db_con) === FALSE) {
        printDB_errorReport('noSucc', $command_line, '1');  //  failed 5 times. End of index procedure
        printDB_errorReport('aborted', $command_line, '1');
        printDB_errorReport('end', $command_line, '1');
        die('');
    }

    printStandardReport('newSQL', $command_line, '1');      //  reconnected to db
    return $failed;
}
// Free a MySQL result and flush the query cache.
//
//   $result  result of a former mysql_query()
//   $event   identifier of the calling position, passed on to the print functions
//
// Nothing happens unless the global $clear is '1', $result is truthy and mysqltest()
// reports no failure. Problems when freeing the result are reported via printFreeMySQL().
function clean_resource($result, $event) {
    global $clear, $db_con, $debug, $cl;

    if ($clear != '1' || !$result) {
        return;
    }
    // do not touch the result while the database connection is in trouble
    if (mysqltest()) {
        return;
    }

    if ($result == '') {
        printFreeRes($event, $cl);
    }
    if (mysql_free_result($result) != '1') {
        printFreeMySQL($result, $event, $cl);
    }

    mysqltest();
    //  DO NOT USE THE NEXT ROW ON SHARED HOSTING SYSTEMS ! ! ! 'flush query cache' could be forbidden.
    @mysql_query("FLUSH QUERY CACHE");
    if ($debug > '0') {
        echo mysql_error();
    }
}
// Check whether the path of a URL ends with one of the suffixes in $select.
//
//   $url     URL to be checked; only its path (as delivered by parse_all_url()) is inspected
//   $select  list of suffixes without leading dot; each one is inserted into a regular expression
//
// Returns '1' if a suffix matches (case-insensitive), otherwise '0'.
function valid_link($url, $select) {
    $url_parts  = parse_all_url($url);
    $path       = isset($url_parts['path']) ? (string) $url_parts['path'] : ''; //  if exsists, remove domain and query

    // the suffix does not depend on the suffix list, so it is extracted only once.
    // Without any dot the complete path is used
    $last_dot   = strrpos($path, ".");                                          //  find last dot in URL string
    $suffix     = lower_case($last_dot === false ? $path : substr($path, $last_dot));

    foreach ($select as $value) {
        if (preg_match("/\.$value$/i", $suffix)) {
            return '1';     // first hit is sufficient
        }
    }
    return '0';
}
// Make smilies and BB codes of a text searchable.
//
// 1. Each known smiley is replaced by its replacement text (mostly the smiley followed by a description).
// 2. The most important BB codes ([url], [email], [color], [size], [font], [b], [u], [i], [indent],
//    [center], [left], [right], [quote], [code], [pre], [img]) are converted into HTML and
//    'about:' is defused.
// 3. All remaining square brackets get a surrounding blank.
//
//   $text  text to be converted
// Returns the converted text.
function bbcode($text) {
    //      encrypt Smilies
    $smiles = array(
        '<:)>'  => '<:)> beard',
        '>:)'   => '> Evil',
        ':)'    => ':) Smile',
        '|:('   => '|:( Headbanger',
        ':('    => ':( Angry',
        ':\'('  => ':\ Rears',
        ':o'    => ':o Amazed',
        ':D'    => ':D Big Smile',
        ':r'    => ':r Disgusted',
        ':9~'   => ':9~ Jummy!',
        ':9'    => ':9 Delicious',
        ';)'    => ';) Wink',
        ':7'    => ':7 Love It',
        ':+'    => ':+ Clown',
        'O+'    => 'O+ Heart',
        ':*'    => ':* Kiss',
        '}:O'   => '}: Stupid Cow',
        '^)'    => '^) Married',
        '_O_'   => '_O_ Worshippie',
        ':W'    => ':W Wave goodbye',
        '^O^'   => '^O^ Way To Go!',
        ':?'    => ':? Come Again?',
        '(8>'   => '(8> Spy vs. Spy',
        ':Y)'   => ':Y) Vork',
        ':Z'    => 'Sleeping',
        ';('    => 'cry',
        '}:|'   => '}:| Grmbl',
        ':z'    => ':z Sleepy',
        '}>'    => '}> Evil',
        ':X'    => ':X Hgnn',
        ':O'    => ':O Booooring',
        '*)'    => '*) Prodent',
        ':{'    => ':{ Uhuh',
        'O-)'   => 'O-) The Saint',
        '8-)'   => '8-) Sunchaser',
        '*;'    => '*;Liefde is',
        ':Y'    => ':Y Yes',
        ':N'    => ':N No',
        ':@'    => ':@ Ashamed',
        '8)7'   => '8)7 Twisted',
        ':P'    => ':P puh'
    );
    // strtr() replaces all smilies in ONE pass, longest smiley first, and never re-reads text it
    // has already replaced. One str_replace() per smiley would hit short smilies inside the
    // replacement of longer ones ('<:)> beard' => '<:) Smile> beard').
    $text = strtr($text, $smiles);

    $bb_search = array(             //  convert most important bbcodes
        "/(\[)(url)(=)(['\"]?)(www\.)([^\"']*)(\\4)(.*)(\[\/url\])/siU",
        "/(\[)(url)(=)(['\"]?)([^\"']*)(\\4])(.*)(\[\/url\])/siU",
        "/(\[)(url)(\])(www\.)([^\"]*)(\[\/url\])/siU",
        "/(\[)(url)(\])([^\"']*)(\[\/url\])/siU",
        "/(\[)(email)(\])([^\"']*)(\[\/email\])/siU",
        "/(\[)(email)(=)(['\"]?)([^\"']*)(\\4])(.*)(\[\/email\])/siU",
        "/(\[)(color=)([^\W]*)(\])(.*)(\[\/color\])/siU",
        "/(\[)(size=)([^\.]*)(\])(.*)(\[\/size\])/siU",
        "/(\[)(font=)([^\W]*)(\])(.*)(\[\/font\])/siU",
        "/(\[)(b)(\])(\r\n)*(.*)(\[\/b\])/siU",
        "/(\[)(u)(\])(\r\n)*(.*)(\[\/u\])/siU",
        "/(\[)(i)(\])(\r\n)*(.*)(\[\/i\])/siU",
        "/(\[)(indent)(\])(\r\n)*(.*)(\[\/indent\])/siU",
        "/(\[)(center)(\])(\r\n)*(.*)(\[\/center\])/siU",
        "/(\[)(left)(\])(\r\n)*(.*)(\[\/left\])/siU",
        "/(\[)(right)(\])(\r\n)*(.*)(\[\/right\])/siU",
        "/(\[)(quote)(\])(\r\n)*(.*)(\[\/quote\])/siU",
        "/(\[)(code)(\])(\r\n)*(.*)(\[\/code\])/siU",
        "/(\[)(pre)(\])(\r\n)*(.*)(\[\/pre\])/siU",
        "/(\[)(img)(\])(?!javascript:)(\r\n)*([^\"']*)(\[\/img\])/siU",
        "/about:/si");

    // one replacement per pattern above, in the same order.
    // The multi-line strings contain real line breaks, so their continuation lines are not indented
    $replace = array(
        "<a href=\"http://www.\\6\" target=\"_blank\">\\8</a>",
        "<a href=\"\\5\" target=\"_blank\">\\7</a>",
        "<a href=\"http://www.\\5\" target=\"_blank\">\\5</a>",
        "<a href=\"\\4\" target=\"_blank\">\\4</a>",
        "<a href=\"mailto:\\4\" target=\"_blank\">\\4</a>",
        "<a href=\"mailto:\\5\" target=\"_blank\">\\7</a>",
        "<span style=\"color:\\3;\">\\5</span>",
        "<span style=\"font-size:\\3;\">\\5</span>",
        "<span style=\"font-family:\\3;\">\\5</span>",
        "<b>\\5</b>",
        "<u>\\5</u>",
        "<i>\\5</i>",
        "<blockquote>\\5</blockquote>",
        "<center>\\5</center>",
        "<left>\\5</left>",
        "<right>\\5</right>",
        "<blockquote>Quote:
<hr>
\\5<hr></blockquote>",
        "<blockquote>Code:
<hr>
\\5<hr></blockquote>",
        "<pre>Code:
\\5</pre>",
        "<img src=\"\\5\" border=\"0\">",
        "about: ");
    $text = preg_replace($bb_search, $replace, $text);

    //      Create surrounding spaces for not yet encoded BB's
    $text = str_replace("[", " [", $text);
    $text = str_replace("]", "] ", $text);
    return ($text);
}
// Return the current Unix timestamp in seconds as float, including the microseconds part.
function microtime_float(){
    // microtime(true) delivers the float directly; no need to split and add the string form
    return microtime(true);
}
function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_prefcharset) {
global $entities, $min_delay, $link_check, $command_line, $min_words_per_page, $dup_content, $dup_url, $quotes, $plus_nr;
global $min_words_per_page, $supdomain, $smp, $follow_sitemap, $max_links, $realnum, $local, $tmp_dir, $auto_add, $admin_email;
global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr, $home_charset, $charSet, $url_status;
global $debug, $common, $use_white1, $use_white2, $use_black, $whitelist, $blacklist, $clear, $db_con, $abslinks;
global $index_media, $index_image, $suppress_suffix, $imagelist, $min_image_x, $min_image_y, $dup_media, $index_alt, $no_log, $index_rss;
global $index_audio, $audiolist, $index_video, $videolist, $index_embeded, $rss_template, $index_csv, $delim, $ext, $index_id3, $dba_act;
global $converter_dir, $dict_dir, $cn_seg, $jp_seg, $index_framesets, $index_iframes, $cdata, $dc, $preferred, $index_rar, $index_zip, $curl;
global $docs, $only_docs, $only_links, $case_sensitive, $vowels, $noacc_el, $include_dir, $thumb_folder, $js_reloc, $server_char;
$data = array();
$cn_data = array();
$topic = '';
$url_reloc = '';
$comment = mysql_real_escape_string("Automatically added during index procedure, as this domain is not yet available in 'Sites' menu.");
$admin_email = mysql_real_escape_string($admin_email);
if ($debug == '0'){
if (function_exists("ini_set")) {
ini_set("display_errors", "0");
}
error_reporting(0) ;
} else {
error_reporting (E_ERROR) ; // otherwise a non existing siemap.xml would always cause a warning message
}
$needsReindex = 1;
$deletable = 0;
$url_status = url_status($url);
$url_parts = parse_all_url($url);
$thislevel = $level - 1;
if ($url_status['relocate'] ){ // if relocated, print message and redirect to new URL
// remove the original URL from temp table. The relocated URL will be added later on.
mysqltest();
mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
if ($debug > '0') echo mysql_error();
$new_url = $url_status['path'] ;
$diff = strlen($url);
$redir = substr( $new_url, $diff); // extract diff. between original URL and relocated URL
//echo "\r\n\r\n<br /> relocated new_url: '$new_url'<br />\r\n";
if ($redir == "index.php" || $redir == "index.html" || $redir == "index.htm" || $redir == "home.html") {
$local_redir = '1'; // no output because diff. is only index.html etc.
} else {
printRedirected($url_status['relocate'], $url_status['path'], $cl);
}
if (strstr($url_status['path'], "//")) { // if redirected to absolute URL, use this for further usage
$url = $url_status['path'];
} else {
$relo_url = str_replace($url_parts['query'], "", $url); // url without query
$relo_url = substr($url, 0, strrpos($relo_url, "/")+1); // url without file name
if (strpos($url_status['path'], "./") === 0) { // if redirected relativ to same folder depth
$url_status['path'] = str_replace("./", "", $url_status['path']);
$url = "".$relo_url."".$url_status['path']."";
}
if (strpos($url_status['path'], "../") === 0) { // if redirected relativ and one folder up
$url_status['path'] = str_replace("./", "", $url_status['path']);
$relo_url = substr($url, 0, strpos($url_parts['path'])); // url without file name
$relo_url = substr($url, 0, strrpos($relo_url, "/")+1); // url without last folder
$url = "".$relo_url."".$url_status['path']."";
}
}
$url_reloc = $url; // remember the relocated url in order to redefine $mainurl
$url_status = url_status($url); // get the status of the relocated URL
$url_parts = parse_all_url($url); // rebuild the url parts from the relocated URL
}
if ($smp != 1 && $follow_sitemap == 1) { // enter here if we don't already know a valid sitemap and if admin settings allowed us to do so
$tmp_urls = get_temp_urls($sessid); // reload previous temp
$url2 = remove_sessid(convert_url($url));
// get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder
$host = parse_addr($url2);
$hostname = $host[host];
$more_sitemaps = array ();
if ($hostname == 'localhost') $host1 = str_replace($local,'',$url2);
$pos = strpos($host1, "/"); // on local server delete all behind the /
if ($pos) $host1 = substr($host1,0,$pos); // build full adress again, now only until host
if ($hostname == 'localhost') {
$url2 = ("".$local."".$host1."");
}else {
$url2 = ("$host[scheme]://$hostname");
}
$sitemap_name = "sitemap"; // standard name for sitemap file
$input_file = "$url2/$sitemap_name"; // create path to sitemap
$log_file = './sitemaps/current_sitemap.xml'; // destination for sitemap log-file
$smap_found = '';
$indexed_map = '';
$map_cont = '';
// try to fetch individual sitemap url from database
mysqltest();
$result = mysql_query("select smap_url from ".$mysql_table_prefix."sites where site_id='$site_id'");
if ($debug > '0') echo mysql_error();
$row = mysql_fetch_row($result);
if (preg_match("/http:\/\//", $row[0])) { // use the individual sitemap
$input_file = preg_replace("/.xml.gz|.xml/i", "", $row[0]);
}
$file = "".$input_file.".xml";
if ($fd = @fopen($file, "r")) { // uncompressed ?
//if ($zd = @gzopen("".$input_file.".xml", "r")) { // uncompressed ?
$map_cont = @stream_get_contents($fd);
if ($map_cont && strpos($map_cont, "schemas/sitemap")) { // if we were able to read it
$smap_found = '1';
}
fclose($fd);
}
$gz_file = "".$input_file.".xml.gz";
if (!$smap_found && $zd = @fopen("compress.zlib://$gz_file", "r")) { // compressed ?
//if (!$smap_found && $zd = @gzopen("".$input_file.".xml.gz", "r")) { // compressed ?
$map_cont = @gzread($zd, 10485760); // max. 10 MB (might be too large for some server)
gzclose($zd);
if ($map_cont && strpos($map_cont, "schemas/sitemap")) {
$smap_found = '1';
}
}
//echo "\r\n\r\n<br>map_cont Array:<br><pre>";print_r($map_cont);echo "</pre>\r\n";
if($smap_found) {
if ($debug != '0') { // create a log-file of current sitemap.xml
file_put_contents($log_file, $map_cont);
}
//$del = mysql_query("delete from ".$mysql_table_prefix."temp"); // function get_sitemap and store_links will build a new temp table
if (stristr($map_cont, "<sitemapindex")) { // if current sitemap file is an index file
printStandardReport('validSitemapInd',$command_line, $no_log);
$get_maps = simplexml_load_string ($map_cont);
if ($get_maps) {
reset($get_maps);
foreach($get_maps as $map_x) {
$new_links[] =($map_x->loc); // get all links to further sitemap files
}
if (is_array($new_links)) { // if we found more sitemap files
$new_links = explode(",",(implode(",",$new_links))); // destroy SimpleXMLElement Object and get the link array
$new_links = array_slice($new_links, 0, $max_links);
$indexed_map = '1';
$i = '0';
//echo "\r\n\r\n<br>new_links Array:<br><pre>";print_r($new_links);echo "</pre>\r\n";
foreach($new_links as $input_file) {
$these_links = get_sitemap($input_file, $indexed_map, $mysql_table_prefix); // now extract page links from this sitemap file
//echo "\r\n\r\n<br>these_links Array:<br><pre>";print_r($these_links);echo "</pre>\r\n";
if ($these_links){
reset($these_links);
store_newLinks($these_links, $level, $sessid);
$smp = '1'; // there were valid sitemap files and we stored the new links
$i++;
} else {
printStandardReport('invalidSecSitemap',$command_line, $no_log); // unable to extract links from secondary sitemap file
}
}
printValidSecSmap($i, $cl);
unset ($input_file, $map_cont, $new_links);
} else {
printStandardReport('invalidSecSitemap',$command_line, $no_log); // unable to extract links from secondary sitemap file
}
} else {
printStandardReport('invalidSitemapInd',$command_line, $no_log); // unable to extract links from sitemap INDEX file
}
} else {
$links = get_sitemap($map_cont, $indexed_map, $mysql_table_prefix); // extract links from sitemap.xml (there was only one sitemap file)
if ($links !='') {
reset ($links);
//echo "\r\n\r\n<br>sitemmap links Array:<br><pre>";print_r($links);echo "</pre>\r\n";
store_newLinks($links, $level, $sessid);
$smp = '1'; // there was one valid sitemap and we stored the new links
printStandardReport('validSitemap',$command_line, $no_log);
} else {
printStandardReport('invalidSitemap',$command_line, $no_log);
}
unset ($links);
}
}
}
if ($debug == '0'){
if (function_exists("ini_set")) {
ini_set("display_errors", "0");
}
error_reporting(0) ;
} else {
error_reporting (E_ALL ^ E_NOTICE ^ E_WARNING) ;
}
if (strstr($url_status['state'], "Relocation") || $url_status['relocate']) {
$care_excl = '1'; // care file suffixed to be excluded
$relocated = '1'; // URL is relocated
$url = preg_replace("/ /i", "", url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir));
// check for unsupported file suffix
if ($care_excl == '1') {
reset($ext);
while (list ($id, $excl) = each($ext)){
if (preg_match("/\.$excl$/i", $url_status['path'])) {
$url = 'excl';
}
}
}
if ($url <> '' && $url != "self" && $url != "excl") {
mysqltest();
$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows == 0) {
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
if ($debug > '0') echo mysql_error();
}
// check whether the redirected URL is already known and in database
mysqltest();
$query = "select indexdate from ".$mysql_table_prefix."links where url='$url'";
$result = mysql_query($query);
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows) {
$url_status['state'] = "Already in database";
} else {
$url_status['state'] = 'ok'; // okay to index the relocated URL
}
if ($clear == 1) clean_resource($result, '02') ;
} else {
if ($url == "self") {
$url_status['state'] = "Relocated to the caling URL. Blocked, because this could cause an infinite loop.";
}
if ($url == "excl") {
$url_status['state'] = "Relocated to currently unsupported file suffix.";
} else {
$url_status['state'] = "Redirected out of domain: $domain";
}
}
}
if ($url_status['state'] == 'ok') {
$OKtoIndex = 1;
$file_read_error = 0;
if (time() - $delay_time < $min_delay) {
sleep ($min_delay- (time() - $delay_time));
}
//echo "\r\n\r\n<br>url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n";
if ($url_status['body']) {
$file = $url_status['body'];
} else {
// fetch the file content
$delay_time = time();
$contents = array();
$chrSet = '';
$file = '';
$get_charset = '1';
$file = file_get_contents($url);
//echo "\r\n\r\n<br /> file0: '$file'<br />\r\n";
// try to get the contents with a slash at the end of the path
if (!$file) {
if (!isset($url_parts['query']) && $url_parts['path'] != "/" && $url_status['path1']) {
$url = 'http://'.$url_parts['host']."".$url_status['path1']."";
$file = file_get_contents($url);
}
}
//echo "\r\n\r\n<br /> file1: '$file'<br />\r\n";
// try alternate method no. 3 to get the file content
if (!$file) {
$get_charset = '1';
$contents = getFileContents($url, $get_charset);
$file = $contents['file'];
}
}
//echo "\r\n\r\n<br /> file2: '$file'<br />\r\n";
//die ('Bis hier');
// convert gzip coded content into plain text
if ($url_status['Content-Encoding'] == "gzip") {
$result = gz_decode($file, $url_status['Content-Encoding'], $url_status['Transfer-Encoding']);
if($result == "error_gz0") {
if ($debug == "2") {
$result = "Announced by the URL as gzip formatted content, it's not! We'll treat it as plain text";
printUrlStatus($result, $command_line, $no_log);
}
} else {
$file = $result;
}
}
// We've tried it with 3 different methods. File is not readable for Sphider-plus
if (!$file || preg_match("/<title>30\d Found<\/title>/i", $file)) {
$url_status['state'] = "Unable to read the content of the file.<br />$url does not respond,<br />or HTTP status 403: Forbidden.";
$realnum -- ;
}
}
// try to find a relocation in JavaScript (like at http://www.m-porechye.ru/)
if ($js_reloc && $file) {
$file_js = substr($file, 0, 1024);
if (preg_match("@javascript(.*?)window.location(.*?)=(.*?)[\'\"](.*?)[\'\"]@si", $file_js, $regs)) {
//echo "\r\n\r\n<br>regs Array:<br><pre>";print_r($regs);echo "</pre>\r\n";
if ($regs[4] != strstr($regs[4], "ttp")) {
if ($url_parts['path']) {
if ($url_status['path'] != "/") {
$url_status['path'] .= "/"; // ad a finalslash to the path
}
}
$reloc_url = 'http://'.$url_parts['host']."".$url_status['path']."".$regs[4].""; // build the complete URL
} else {
$reloc_url = $regs[4];
}
//echo "\r\n\r\n<br /> JavaScript reloc URL: '$reloc_url'<br />\r\n";
$url_status = url_status($reloc_url);
if (strstr($url_status['state'], "ok")) { // this is the status of $reloc
// remove the original URL from temp table. The relocated URL will be added immediately.
mysqltest();
mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
if ($debug > '0') echo mysql_error();
// if not relocated in it selves
if ($reloc_url != $url) {
mysqltest();
$result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$reloc_url' && id = '$sessid'");
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows == 0) {
// add the relocated URL to the temp table, so we may process it later on
mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$reloc_url', '$level', '$sessid')");
if ($debug > '0') echo mysql_error();
}
// check whether the redirected URL is already known and in database as a link
mysqltest();
$query = "select indexdate from ".$mysql_table_prefix."links where url='$reloc_url'";
$result = mysql_query($query);
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($result);
if ($rows) {
$url_status['state'] = "Already in database";
} else {
// this should be the normal exit for JavaScript relocations
$url_status['state'] = "Redirected by JavaScript to $reloc_url";
}
if ($clear == 1) clean_resource($result, '02a') ;
} else {
$url_status['state'] = "Redirected to the same URL. Blocked, because could cause an infinite loop.";
}
} else {
$message = $url_status['state'];
$url_status['state'] = "Unable to follow the JavaScript redirection: $reloc<br />$message";
}
//echo "\r\n\r\n<br>final JavaScript reloc url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n";
}
}
if ($url_status['state'] == 'ok') {
// if required, uncompress ZIP archives and make content of each file => text
if ($url_status['content'] == 'zip' && $index_zip == '1' && $file) {
file_put_contents("".$tmp_dir."/archiv.temp",$file);
$zip = zip_open("".$tmp_dir."/archiv.temp");
if ($zip) {
$url_status['content'] = "text"; // preventiv, if not another status will be detected for individual archiv files
$file = ''; // starting with a blank file for all archive files
$topic = 'zip';
if ($debug == '2') {
printStandardReport('archivFiles', $command_line, $no_log);
}
while ($zip_entry = zip_read($zip)) {
if (zip_entry_open($zip, $zip_entry, "r")) {
$buf = zip_entry_read($zip_entry, zip_entry_filesize($zip_entry)); //uncompress the content of recent archiv file
$name = zip_entry_name($zip_entry); // get filename of recent archive file
if ($debug == '2') { //
$report = "<strong> ".$name."</strong>";
printThis($report, $cl);
$size = (int)(zip_entry_filesize($zip_entry)/1024);
if ($size == 0) $size = '1';
$report = " - Unpacked size: ".$size." kByte<br />";
printThis($report, $cl);
}
$buf = get_arch_content($buf, $name, $url); // if necessary, convert PDF, extract feed etc. for the recent file
zip_entry_close($zip_entry); // done for this file in archiv
$file .= "".$buf."<br /><br />"; // add all uncompressed and converted files together
}
}
zip_close($zip);
}
unlink("".$tmp_dir."/archiv.temp");
}
// remove all useless parts of the content
if ($use_nofollow == '1') {
$file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
}
$file = preg_replace("@<!--.*?-->@si", " ",$file);
$file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);
$file = preg_replace("/<link rel[^<>]*>/i", " ", $file);
$file = str_replace ("encoding: ''", " ", $file); // yes, I've seen such nonsense !
$raw_file = $file;
/*
$file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
*/
// if required, uncompress RAR archives and make content of each file => text
if ($url_status['content'] == 'rar' && $index_rar == '1') {
file_put_contents("".$tmp_dir."/archiv.temp",$file);
$rar = rar_open("".$tmp_dir."/archiv.temp");
if ($rar) {
$url_status['content'] = "text"; // preventiv, all individual archiv files willl be converted to 'text'
$file = ''; // starting with a blank file for all archive files
$topic = 'rar';
$entries = rar_list($rar);
if ($rar) {
if ($debug == '2') {
printStandardReport('archivFiles', $command_line, $no_log);
}
foreach ($entries as $entry) {
$name = $entry->getName();
if ($debug == '2') {
$report = "<strong> ".$name."</strong>";
printThis($report, $cl);
$size = (int)($entry->getPackedSize()/1024);
if ($size == 0) $size = '1';
$report = " - Packed size: ".$size." kByte";
printThis($report, $cl);
$size = (int)($entry->getUnpackedSize()/1024);
if ($size == 0) $size = '1';
$report = " - Unpacked size: ".$size." kByte<br />";
printThis($report, $cl);
}
$entry->extract('', "./".$tmp_dir."/".$name.""); // extract single file of archiv into temporary folder
$buf = file_get_contents("./".$tmp_dir."/".$name.""); // read content of this intermediate file
unlink ("./".$tmp_dir."/".$name.""); // destroy this file
if ($buf) {
$buf = get_arch_content($buf, $name, $url); // if necessary, convert PDF, extract feed etc. for the recent file
$file .= "".$buf."<br /><br />"; // add all uncompressed and converted files together
}
}
}
rar_close($rar);
}
unlink("".$tmp_dir."/archiv.temp");
}
// kill eventually duplicate coding info in dynamic links
if (stristr(substr($file, '0', '4000'), "encoding") && strstr(substr($file, '0', '4000'), "charset")) {
$file = substr($file, strrpos($file, "<!DOCTYPE"));
}
$chrSet = '';
if ($use_prefcharset == '1') { // use preferred charset as defined in Admin settings
$chrSet = $home_charset;
} else {
if($server_char && $url_status['charset']) {
$chrSet = $url_status['charset']; // use charset as supplied by the remote server
} else { // try to extract the charset of this file
if (preg_match("'encoding=[\'\"](.*?)[\'\"]'si", substr($file, 0, 3000), $regs)) {
$chrSet = trim(strtoupper($regs[1])); // get encoding of current XML or XHTML file and use it furtheron
}
if (!$chrSet) {
if (preg_match("'charset=(.*?)[\'\"]'si", substr($file, 0, 3000), $regs)) {
$chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron
}
}
if (!$chrSet) {
if (preg_match("'charset=[\'\"](.*?)[\'\"]'si", substr($file, 0, 3000), $regs)) {
$chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron
}
}
if (!$chrSet) {
if (preg_match("'charset=(.*?)[\'\"]'si", substr($file, 0, 3000), $regs)) {
$chrSet = trim(strtoupper($regs[1])); // get charset of current HTML file and use it furtheron
}
}
if ($chrSet == '') {
$chrSet = $home_charset; // no charset found, we need to use default charset like for DOCs, PDFs, etc
}
}
}
if (strpos($chrSet, " ")) { // in the wild we have aloready seen a lot of variants
$chrSet = substr($chrSet, 0, strpos($chrSet, " "));
}
//echo "\r\n\r\n<br /> chrSet: '$chrSet'<br />\r\n";
$contents['charset'] = $chrSet;
if ($index_framesets == '1') {
if (preg_match("@<frameset[^>]*>(.*?)<\/frameset>@si",$file, $regs)) {
printStandardReport('newFrameset', $command_line, $no_log);
// separate the <frameset> ....</frameset> part of this file
$frame = $regs[1];
$replace = get_frames($frame, $url, $can_leave_domain);
$replace ="<body>".$replace."</body>"; // create the body tags for $file
$contents['charset'] = $chrSet; // rebuild charset
// include all replacements instead of the frameset tag into the actual file. This will become the body
$file = preg_replace("@<frameset.*?</frameset>@si", "$replace", $file);
}
}
if ($index_iframes == '1') {
$links = array ();
$regs = Array ();
$replace = '';
$get_charset = '';
$real_url = $url;
if (preg_match_all("/(iframe[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER)) {
printStandardReport('newIframe', $command_line, $no_log);
// find all frames of the iframe;
$care_excl = ''; // don't care file suffixed to be excluded
$relocated = ''; // URL is not relocated
foreach ($regs as $val) {
if (($a = url_purify($val[2], $url, $can_leave_domain, $care_exel, $relocated, $local_redir)) != '') {
$links[] = ($a); // collect all iframe links
}
}
if ($links) {
foreach ($links as $url) {
printNewLinks($url, $cl);
if (preg_match("/.html|.htm|.xhtml|.xml|.php/i", $url)) {
$frame = file_get_contents($url); // get content of this frame
// separate the body part of this frame
preg_match("@<body[^>]*>(.*?)<\/body>@si",$frame, $regs);
$body = $regs[1];
if ($abslinks == '1') {
$body = make_abslinks($body, $url); // if required, correct links relative to found iframe
}
$replace = "".$replace."<br />".$body."";
} else { // might be an image
$replace = "".$replace."<br /><img src=\"".$url."\">";
}
}
}
// include all replacements instead of the iframe tag into the actual file
$file = preg_replace("@<iframe.*?</iframe>@si", "$replace", $file);
$contents['charset'] = $chrSet; // rebuild charset
}
$url = $real_url;
}
// in order to index RDF, RSD, RSS and ATOM feeds enter here
if (($url_status['content'] == 'xml') && $index_rss =='1') {
if (!preg_match("/<rss|atom|<feed|<rdf|<rsd/si", substr($file,0,400))) {
printStandardReport('notRSS',$command_line, $no_log); // no valid feed detected
$OKtoIndex = 0;
$file_read_error = 1;
$realnum -- ;
} else {
$html = '';
$xml = XML_IsWellFormed($file); // check for well-formed XML
if ($xml != '1') {
if ($debug > 0 ) {
printNotWellFormedXML($xml, $cl);
}
$OKtoIndex = 0;
$file_read_error = 1;
$realnum -- ;
} else {
$rss = new feedParser;
// define options for feed parser
$rss->limit = $max_links; // save time by limiting the items/entries to be processed
$rss->in_cp = strtoupper($contents['charset']); // charset of actual file
$rss->out_cp = 'UTF-8'; // convert all into this charset
$rss->cache_dir = ''; // currently unused
$rss->dc = $dc; // treat Dublin Core tags in RDF feeds
$rss->pro = $preferred; // obey the PREFERRED directive in RSD feeds
$rss->file = '1'; // use $file as feed (as a string, not URL)
if ($cdata != 1) {
$rss->CDATA = 'content'; // get it all (naughty)
} else {
$rss->CDATA = 'nochange'; // well educated crawler
}
// get feed as array
if ($feed = $rss->get($url, $file)){
// if you want to see the feed during index procedure, uncomment the following row
// echo "<br>FEED array:<br><pre>";print_r($feed);echo "</pre>";
$link = '';
$textinput_link = '';
$image_url = '';
$image_link = '';
$docs = '';
$subjects = '';
$count = '';
$type = $feed[type];
$count = $feed[sub_count];
$cached = $feed[cached];
// kill all no longer required values
$feed[type] = '';
$feed[sub_count] = '';
$feed[encoding_in] = '';
$feed[encoding_out] = '';
$feed[items_count] = '';
$feed[cached] = '';
if (!$count) {
$count = '0';
}
if ($type == 'RSD') {
// prepare all RSD APIs
for($i=0;$i<$count;$i++){
$subjects .= ''.$feed['api'][$i]['name'].'<br />
'.$feed['api'][$i]['apiLink'].'<br />
'.$feed['api'][$i]['blogID'].'<br />
'.$feed['api'][$i]['settings_docs'].'<br />
'.$feed['api'][$i]['settings_notes'].'<br />';
}
}
if ($type == 'Atom') {
// prepare all Atom entries
for($i=0;$i<$count;$i++){
$subjects .= ''.$feed['entries'][$i]['link'].'<br />
'.$feed['entries'][$i]['title'].'<br />
'.$feed['entries'][$i]['id'].'<br />
'.$feed['entries'][$i]['published'].'<br />
'.$feed['entries'][$i]['updated'].'<br />
'.$feed['entries'][$i]['summary'].'<br />
'.$feed['entries'][$i]['rights'].'<br />
'.$feed['entries'][$i]['author_name'].' '.$feed['entries'][$i]['author_email'].' '.$feed['entries'][$i]['author_uri'].'<br />
'.$feed['entries'][$i]['category_term'].' '.$feed['entries'][$i]['category_label'].' '.$feed['entries'][$i]['category_scheme'].'<br />
'.$feed['entries'][$i]['contributor_name'].' '.$feed['entries'][$i]['contributor_email'].' '.$feed['entries'][$i]['contributor_uri'].'<br />
';
}
}
if ($type == 'RDF' | $type =='RSS v.0.91/0.92' | $type == 'RSS v.2.0'){ // For RDF and RSS feeds enter here
// prepare channel image
$image_url = $feed[image_url];
if($image_url){
$width = $feed[image_width];
if (!$width || $width > '144') {
$width = '88'; //set to default value
}
$height = $feed[image_height];
if (!$height || $height > '400') {
$height = '31'; //set to default value
}
$feed[image_url] = "<img id=\"rss_007\" src=\"".$image_url."\" alt=\"".$feed[image_title]."\" width=\"".$width."\" height=\"".$height."\">";
}
$image_link = $feed[image_link];
if($image_link){
$feed[image_link] = "<a href=\"".$image_link."\">".$image_link."</a>";
}
// prepare all RDF or RSS items
for($i=0;$i<$count;$i++){
$subjects .= ''.$feed['items'][$i]['link'].'<br />
'.$feed['items'][$i]['title'].'<br />
'.$feed['items'][$i]['description'].'<br />
'.$feed['items'][$i]['author'].'<br />
'.$feed['items'][$i]['category'].'<br />
'.$feed['items'][$i]['guid'].'<br />
'.$feed['items'][$i]['comments'].'<br />
'.$feed['items'][$i]['pubDate'].'<br />
'.$feed['items'][$i]['source'].'<br />
'.$feed['items'][$i]['enclosure'].'<br />
'.$feed['items'][$i]['country'].'<br />
'.$feed['items'][$i]['coverage'].'<br />
'.$feed['items'][$i]['contributor'].'<br />
'.$feed['items'][$i]['date'].'<br />
'.$feed['items'][$i]['industry'].'<br />
'.$feed['items'][$i]['language'].'<br />
'.$feed['items'][$i]['publisher'].'<br />
'.$feed['items'][$i]['state'].'<br />
'.$feed['items'][$i]['subject'].'<br />
';
}
}
// convert the channel/feed part into a string
$feed_common = implode(" ", $feed);
//echo "\r\n\r\n<br /> feed_common: '$feed_common'<br />\r\n";
//echo "\r\n\r\n<br /> subjects: '$subjects'<br />\r\n";
// build something that could be indexed
$html .= "<html>\r\n<head>\r\n<title>".$feed['title']."</title>\r\n<meta name=\"description\" content=\"".$feed['description']." \">\r\n</head>\r\n";
$html .= "<body>\r\n".$feed_common."\r\n".$subjects."\r\n</body>\r\n</html>\r\n";
}
if (strlen($html) < "130") { // can't be a valid feed
if ($type == "unknown") {
printInvalidFeedType($type, $cl);
} else {
printStandardReport('invalidRSS',$command_line, $no_log);
}
$OKtoIndex = 0;
$file_read_error = 1;
$realnum -- ;
} else {
$contents['charset'] = 'UTF-8'; // the feed reader converts all to utf-8
$file = $html; // use feed reader output
if ($debug > 0 ) {
printValidFeed($type, $count, $cl);
}
}
}
}
}
// prepare CSV files
if (($url_status['content'] == 'csv') && $index_csv =='1') {
$file = str_replace(",", " ", $file);
$file = str_replace(";", " ", $file);
}
// for DOCs, PDFs, etc we need special text converter
if ($url_status['content'] != 'text' && $url_status['content'] != 'xml' && $url_status['content'] != 'xhtml' && $url_status['content'] != 'csv') {
$file = extract_text($file, $url_status['content'], $url);
$contents['charset'] = 'UTF-8';
$home_charset = 'UTF-8';
$charSet = 'UTF-8'; // because the converter already transferred the documents to UTF-8
if ($file == 'ERROR') { // if error, suppress further indexing
$OKtoIndex = 0;
$file_read_error = 1;
$realnum -- ;
}
// reduce Pashtu and Urdu to the main Farsi letters
if (strtolower($home_charset) == 'windows-1256' && $url_status['content'] == 'pdf') {
$f_letter0= array("ﺎ","�");
$f_letter1= array("�","�","ïºâ","ïºâ");
$f_letter2= array("ïÂâ","ïÂâ","ïÂË","ïÂâ¢");
$f_letter3= array("ïºâ¢","ïºâ","ïºâ","ïºË");
$f_letter4= array("ïºâ¢","ﺚ","ïºâº","ïºÅ");
$f_letter5= array("�","ﺞ","ﺟ","ﺠ");
$f_letter6= array("ïº","ï»","ï¼","ï½");
$f_letter7= array("ﺡ","ﺢ","ﺣ","ﺤ");
$f_letter8= array("ï®â¹","ﮊ");
$f_letter9= array("ﺥ","ﺦ","ﺧ","ﺨ");
$f_letter10= array("ﺩ","ﺪ");
$f_letter11= array("ﺫ","ﺬ");
$f_letter12= array("ïºÂ","ﺮ");
$f_letter13= array("ﺯ","ﺰ");
$f_letter14= array("ﺱ","ﺲ","ﺳ","ﺴ");
$f_letter15= array("ﺵ","ﺶ","ﺷ","ﺸ");
$f_letter16= array("ﺹ","ﺺ","ﺻ","ﺼ");
$f_letter17= array("ﺽ","ﺾ","ﺿ","ï»â¬");
$f_letter18= array("�","ï»â","ï»Æ","ï»â");
$f_letter19= array("ï»â¦","ï»â ","ï»â¡","ï»Ë");
$f_letter20= array("ï»â°","ﻊ","ï»â¹","ï»Å");
$f_letter21= array("�","ﻎ","�","�");
$f_letter22= array("ï»â","ï»â","ï»â","ï»â");
$f_letter23= array("ï»â¢","ï»â","ï»â","ï»Ë");
$f_letter24= array("ï»â¢","ﻚ","ï»âº","ï»Å","ﮎ","�","�","ï®â");
$f_letter25= array("ï®â","ï®â","ï®â","ï®â¢");
$f_letter26= array("�","ﻞ","ﻟ","ﻠ");
$f_letter27= array("ﻡ","ﻢ","ﻣ","ﻤ");
$f_letter28 = array("ﻧ","ﻨ","ﻦ","ﻥ");
$f_letter29= array("ï»Â","ï»®");
$f_letter30= array("ﻩ","ﻪ","ﻫ","ﻬ");
$f_letter31= array("ﻯ","ﻰ","ﻱ","ﻲ","ﻳ","ﻴ");
$file=str_replace($f_letter0,"ç",$file);
$file=str_replace($f_letter1,"è",$file);
$file=str_replace($f_letter2,"þ",$file);
$file=str_replace($f_letter3,"ê",$file);
$file=str_replace($f_letter4,"ë",$file);
$file=str_replace($f_letter5,"ì",$file);
$file=str_replace($f_letter6,"Ãâ ",$file);
$file=str_replace($f_letter7,"ÃÂ",$file);
$file=str_replace($f_letter8,"ÃË",$file);
$file=str_replace($f_letter9,"î",$file);
$file=str_replace($f_letter10,"ï",$file);
$file=str_replace($f_letter11,"ð",$file);
$file=str_replace($f_letter12,"ñ",$file);
$file=str_replace($f_letter13,"ò",$file);
$file=str_replace($f_letter14,"ó",$file);
$file=str_replace($f_letter15,"ô",$file);
$file=str_replace($f_letter16,"õ",$file);
$file=str_replace($f_letter17,"ö",$file);
$file=str_replace($f_letter18,"÷",$file);
$file=str_replace($f_letter19,"ø",$file);
$file=str_replace($f_letter20,"ù",$file);
$file=str_replace($f_letter21,"ú",$file);
$file=str_replace($f_letter22,"�",$file);
$file=str_replace($f_letter23,"Ãâ",$file);
$file=str_replace($f_letter24,"é",$file);
$file=str_replace($f_letter25,"ï",$file);
$file=str_replace($f_letter26,"Ãâ",$file);
$file=str_replace($f_letter27,"Ãâ¦",$file);
$file=str_replace($f_letter28,"Ãâ ",$file);
$file=str_replace($f_letter29,"ÃË",$file);
$file=str_replace($f_letter30,"Ãâ¡",$file);
$file=str_replace($f_letter31,"ÃÅ ",$file);
}
}
if ($OKtoIndex == 1) {
$pageSize = number_format(strlen($file)/1024, 2, ".", "");
printPageSizeReport($pageSize, $topic);
}
$charSet = strtoupper(trim($contents['charset'])); // final charset for UTF-8 converter
//echo "\r\n\r\n<br /> charSet: '$charSet'<br />\r\n";
if (stristr($charSet, "encoding") || strlen($charSet) < '3') { // must be invalid encountered charset
$charSet = 'UTF-8';
}
$seg_data = '';
// if Chinese or Korean text should be segmented enter here
if ($cn_seg == '1' && $file) {
$dic = '';
if ($charSet == 'GB2312' || $charSet == 'GB18030' || $charSet == 'GBK') {
$dic = "".$dict_dir."/cn_gb18030.dic"; // simplified Chinese
}
if ($charSet == 'BIG5') {
$dic = "".$dict_dir."/cn_big5.dic"; // traditional Chinese
}
if ($charSet == 'ISO10646-1933') {
$dic = "".$dict_dir."/kr_iso10646-1933.dic"; // Korean
}
if ($charSet == 'EUC-KR') {
$dic = "".$dict_dir."/kr_euc-kr.dic"; // Korean
}
if ($charSet == 'UTF-8') {
$dic = "".$dict_dir."/cn_utf-8.dic"; // Unicode
}
//echo "<br />dic: $dic<br />";
if ($dic) { // if dictionary is available for page charset, perform a segmentation
$Segmentation = new Segmentation;
$Segmentation->load($dic);
$Segmentation->setLowercase(FALSE);
$cn_result = $Segmentation->segmentString($file);
if($cn_result && $charSet != 'UTF-8'){
$iconv_file = @iconv($charSet, "UTF-8//IGNORE", $cn_result);
if(trim($iconv_file) == ""){ // iconv is not installed or input charSet not available. We need to use class ConvertCharset
$NewEncoding = new ConvertCharset($charSet, "utf-8");
$NewFileOutput = $NewEncoding->Convert($cn_result);
$cn_result = $NewFileOutput;
}else{
$cn_result = $iconv_file;
}
unset ($iconv_file, $NewEncoding, $NewFileOutput);
}
$seg_data = clean_file($cn_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
} else {
printNoDictionary($charSet, $cl); // no dictionary found for this charset
}
}
// if Japanese text should be segmented enter here
if ($jp_seg == '1' && $file) {
$dic = '';
if ($charSet == 'UTF-8' ||$charSet == 'EUC-JP') {
$file = @iconv($charSet, "SHIFT_JIS//IGNORE", $file);
$charSet = "SHIFT_JIS";
}
if ($charSet == 'SHIFT_JIS') {
$dic = "".$dict_dir."/jp_shiftJIS.dic";
}
if ($dic) { // if dictionary is available for page charset, perform a segmentation
$Segmentation = new Segmentation;
$Segmentation->load($dic);
$Segmentation->setLowercase(FALSE);
$jp_result = $Segmentation->segmentString($file);
//echo "\r\n\r\n<br /> jp_result: $jp_result<br />\r\n";
if($jp_result && $charSet != 'UTF-8'){
$iconv_file = @iconv($charSet, "UTF-8//IGNORE" ,$jp_result);
if(trim($iconv_file) == ""){ // iconv is not installed or input charSet not available. We need to use class ConvertCharset
$NewEncoding = new ConvertCharset($charSet, "utf-8");
$NewFileOutput = $NewEncoding->Convert($jp_result);
$jp_result = $NewFileOutput;
}else{
$jp_result = $iconv_file;
}
unset ($iconv_file, $NewEncoding, $NewFileOutput);
}
$seg_data = clean_file($jp_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
} else {
printNoDictionary($charSet, $cl); // no dictionary found for this charset
}
}
//echo "\r\n\r\n<br /> charSet: '$charSet'<br />\r\n";
// enter here only if the site / file is not yet UTF-8 encoded, i.e. it has not already been converted to UTF-8
if($charSet != "UTF-8" && $file){
$file = convertToUTF8($file, $charSet, $char_Set, $converter_dir);
}
if ($index_media == '1') {
$raw_file = $file; // will be needed to find links to media files
$newmd5sum = md5($raw_file); // get md5 including links and title of media files
}
$data = clean_file($file, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
// index only links and their titles
if($only_links) {
$media_links = '0';
$my_links = get_link_details($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir);
$data['content'] = $my_links[0][0]; // define new content
$data['fulltext'] = $my_links[0][0]; // define new content also for 'full text';
}
// combine raw words plus segmented words
if ($cn_seg == 1 || $jp_seg == 1 && $dic) {
if ($debug != '0') {
$seg_add = $seg_data[count]-$data[count]; // calculate segmentation result
if ($seg_add > '0') {
if ($charSet == 'EUC-KR' || $charSet == 'ISO10646-1933'){
printSegKR($seg_add, $cl);
}
if ($charSet == 'SHIFT_JIS'){
printSegJA($seg_add, $cl);
} else {
printSegCN($seg_add, $cl);
}
}
/*
echo "<br /><pre>Results of word segmentation:</pre>";
echo "<br />Unsegmented title :<br><pre>";print_r($data[title]);echo "</pre>";
echo "<br />Segmented title :<br><pre>";print_r($seg_data[title]);echo "</pre>";
echo "<br />Unsegmented full text:<br />$data[fulltext]<br />";
echo "<br />Segmented full text:<br />$seg_data[fulltext]";
*/
}
$data[content] ="".$data[content]."".$seg_data[content]."";
$data[title] ="".$data[title]."".$seg_data[title]."";
$data[description] ="".$data[description]."".$seg_data[description]."";
$data[keywords] ="".$data[keywords]."".$seg_data[keywords]."";
}
// check if canonical redirection was found in page header
$cano_link = '0';
if ($data['cano_link']) {
$cano_link = $data['cano_link'];
$OKtoIndex = 0;
$deletable = 1;
$realnum -- ;
if ($cano_link =="1") {
printNoCanonical($cano_link, $cl); // if unable to extract redirection link
} else {
if ($data['refresh'] == '1') {
printRefreshed($cano_link, $data['wait'], $cl); // if refresh meta tag was found in header
} else {
printCanonical($cano_link, $cl); // if canonical link was found in header
}
// do we already know this link in link-table
$res = mysql_query("select url from ".$mysql_table_prefix."links where url like '$cano_link'");
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($res);
if ($rows == 0) { // if not known in link-table, check if already known in temp-table
$res = mysql_query("select link from ".$mysql_table_prefix."temp where link like '$cano_link'");
if ($debug > '0') echo mysql_error();
$rows = mysql_num_rows($res);
if ($rows == 0) { // not known in link-table, add new link
if ($numoflinks <= $max_links) mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$cano_link', '$level', '$sessid')");
if ($debug > '0') echo mysql_error();
}
}
}
} else {
if ($index_media == '0') {
$newmd5sum = md5($data['content']); // get md5 from cleaned full text only
}
if ($md5sum == $newmd5sum) {
printStandardReport('md5notChanged',$command_line, $no_log);
$OKtoIndex = 0;
$realnum -- ;
} else {
mysqltest();
// check for duplicate page content
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where md5sum='$newmd5sum'");
if ($debug > '0') echo mysql_error();
if (mysql_num_rows($result) > 0) { // display warning message and urls with duplicate content
printStandardReport('duplicate',$command_line, $no_log);
$num_rows = mysql_num_rows($result);
for ($i=0; $i<$num_rows; $i++) {
$link_id = mysql_result($result, $i, "link_id");
//$num = $i+1;
$res = mysql_query("select url from ".$mysql_table_prefix."links where link_id like '$link_id'");
if ($debug > '0') echo mysql_error();
$row = mysql_fetch_row($res);
$dup_url = $row[0];
if ($clear == 1) clean_resource($res, '03') ;
printDupReport($dup_url,$command_line);
}
if ($dup_content == '0') { // enter here, if pages with duplicate content should not be indexed/re-indexed
$OKtoIndex = 0;
$realnum -- ;
} else {
$OKtoIndex = 1;
}
}
}
}
if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
$urlparts = parse_addr($url);
$newdomain = $urlparts['host'];
$type = 0;
if ($data['noindex'] == 1) {
$OKtoIndex = 0;
$deletable = 1;
$realnum -- ;
printStandardReport('metaNoindex',$command_line, $no_log);
}
$content = explode(" ",addslashes($data['content']));
//echo "\r\n\r\n<br>content Array:<br><pre>";print_r($content);echo "</pre>\r\n";
$acc_words[] = array();
$type = '';
// if Greek accents should be removed from Greek vowels
if ($noacc_el) {
foreach ($content as &$thisword) {
$no_acc = remove_acc_el($thisword);
if($no_acc != $thisword) {
$acc_words[] = $no_acc;
}
}
}
// if the other (Latin) accents should be removed from their vowels
if ($vowels) {
foreach ($content as $thisword) {
$no_acc = remove_acc($thisword);
if($no_acc != $thisword) {
$acc_words[] = $no_acc;
}
}
}
// now add the words without accents to the total text content
$content = array_merge($content, $acc_words);
$wordarray = unique_array($content);
if ($smp != 1) {
if ($data['nofollow'] != 1 && $cano_link == '0') {
$media_links = '0';
$links = get_links($raw_file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir, $url_reloc);
$links = distinct_array($links);
$all_links = count($links);
if ($all_links > $max_links) $all_links = $max_links;
$links = array_slice($links,0,$max_links);
if ($realnum < $max_links) {
$numoflinks = 0;
// if there are any, add them to the temp table, but only if there isn't such a URL already
if (is_array($links)) {
reset ($links);
$tmp_urls = get_temp_urls($sessid); // reload previous temp
if ($debug == '2' ) { // if debug mode, show details
printStandardReport('newLinks', $command_line, $no_log);
}
while ($thislink = each($links)) {
// ignore self linking
if ($thislink[1] != "self"){
// find new domains for _addurl table
if ($auto_add && $can_leave_domain) {
$all_link = parse_all_url($thislink[1]);
$new_link = $all_link['host'];
mysqltest();
// check whether URL is already known in sites table
$res1 = mysql_query("select url from ".$mysql_table_prefix."sites where url like '%$new_link%'");
if ($debug > '0') echo mysql_error();
// check whether URL is already known in addurl table
$res2 = mysql_query("select url from ".$mysql_table_prefix."addurl where url like '%$new_link%'");
if ($debug > '0') echo mysql_error();
// check whether URL is banned
$res3 = mysql_query("select domain from ".$mysql_table_prefix."banned where domain like '%$new_link%'");
if ($debug > '0') echo mysql_error();
if (mysql_num_rows($res1) == 0 && mysql_num_rows($res2) == 0 && mysql_num_rows($res3) == 0) {
// add new domain into _addurl table
mysql_query ("insert into ".$mysql_table_prefix."addurl (url, description, account) values ('$thislink[1]', '$comment', '$admin_email')");
if ($debug > '0') echo mysql_error();
}
}
// check whether thislink is already known as a link ( might happen by means of relocated URLs)
$res4 = '';
$res5 = '';
$known_link = '';
$known_temp = '';
$res4 = mysql_query("select url from ".$mysql_table_prefix."links where url like '$thislink[1]'");
if ($debug > '0') echo mysql_error();
$known_link = mysql_num_rows($res4);
$res5 = mysql_query("select link from ".$mysql_table_prefix."temp where link like '$thislink[1]'");
if ($debug > '0') echo mysql_error();
$known_temp = mysql_num_rows($res5);
// if this is a new link not yet known, add this new link to the temp table
if ($tmp_urls[$thislink[1]] != 1 && !$known_link && !$known_temp) {
$tmp_urls[$thislink[1]] = 1;
$numoflinks++;
if ($debug == '2') {
$act_link = $thislink[1];
printNewLinks($act_link, $cl);
}
mysqltest();
if ($numoflinks <= $max_links) mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')");
if ($debug > '0') echo mysql_error();
}
}
}
}
}
} else {
printStandardReport('noFollow',$command_line, $no_log);
}
unset ($file);
}
// if we should index only the files as defined in docs list
if ($only_docs) {
$OKtoIndex = '';
foreach ($docs as $thisdoc){
if (strstr($urlparts['path'], $thisdoc)) {
$OKtoIndex = "1";
}
}
if (!$OKtoIndex) {
printStandardReport('noDoclist',$command_line, $no_log);
}
}
if ($OKtoIndex == 1) {
if ($link_check == 0) {
$title = $data['title'];
$host = $data['host'];
$path = $data['path'];
$fulltxt = $data['fulltext'];
$desc = substr($data['description'], 0,254);
// extract domain
$url_parts = parse_all_url($url);
$hostname = $url_parts[host];
// rebuild domain for localhost applications
if ($hostname == 'localhost') {
$host1 = str_replace($local,'',$url);
}
$pos = strpos($host1, "/"); // on local server delete all behind the /
// will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
// will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
if ($pos) {
$host1 = substr($host1,0,$pos); // build full adress again, now only local domain
}
if ($hostname == 'localhost') {
$domain_for_db = ("".$local."".$host1."/"); // complete URL
$domain_for_db = str_replace("http://", "", $domain_for_db);
//$domain_for_db = $host1;
}else {
//$domain_for_db = ("$url_parts[scheme]://".$hostname."/"); // complete URL
$domain_for_db = $hostname;
}
if (isset($domain_arr[$domain_for_db])) {
$dom_id = $domain_arr[$domain_for_db];
} else {
mysqltest();
mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
$dom_id = mysql_insert_id();
$domain_arr[$domain_for_db] = $dom_id;
}
reset($wordarray);
if ($case_sensitive == '0') {
foreach ($wordarray as &$value) {
$value[1] = lower_ent($value[1]);
$value[1] = lower_case($value[1]); // convert keywords to lower case
}
}
$wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords'], $url_parts);
//if there are words to index, add the link to the database, get its id, and add the word + their relation
if (is_array($wordarray) && count($wordarray) >= $min_words_per_page) {
$OKtoSave = 1;
if ($use_white1 == '1') { // check if content of page matches ANY word in whitelist
$found = '0';
foreach ($whitelist as $key => $val1) {
reset($wordarray);
while ($thisword = each($wordarray)) {
$word = trim($thisword[1][1]);
if (strcasecmp($val1, $word) == 0) {
$found = '1';
}
}
}
if ($found == '0') {
printStandardReport('noWhitelist',$command_line, $no_log);
$OKtoSave = 0;
$realnum -- ;
}
}
if ($use_white2 == '1') { // check if content of page matches ALL words in whitelist
$all = count($whitelist);
$found = '0';
$found_this = '0';
foreach ($whitelist as $key => $val2) {
reset($wordarray);
while ($thisword = each($wordarray)) {
$word = trim($thisword[1][1]);
if (strcasecmp($val2, $word) == 0) {
$found_this = '1';
}
}
if ($found_this != '0'){
$found++;
$found_this = '0';
}
}
if ($found != $all) {
printStandardReport('noWhitelist',$command_line, $no_log);
$OKtoSave = 0;
$realnum -- ;
}
}
if ($use_black == '1') {
$found = '0'; // check if content of page matches ANY string in blacklist
foreach ($blacklist as $key => $val3) {
$met = stripos($data[fulltext], $val3);
if($met) $found = '1';
}
if ($found == '1') {
printStandardReport('matchBlacklist',$command_line, $no_log);
$OKtoSave = 0;
$realnum -- ;
}
}
if ($md5sum == '' || ($md5sum == '' && $url_status['relocate'])) { // enter here for new page (unknown link) OR for new relocated URL(so it will become a new link)
mysqltest();
mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', '$thislevel')");
if ($debug > '0') echo mysql_error();
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
if ($debug > '0') echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
if ($clear == 1) clean_resource($result, '04');
if ($OKtoSave) {
// store link details, if not yet known (during reindex)
if ($only_links) {
// extract domain of current page delivering the new links
$url_parts = parse_all_url($url);
$hostname = $url_parts[host];
if ($hostname == 'localhost') { // rebuild domain for localhost applications
$host1 = str_replace($local,'',$url);
}
$pos = strpos($host1, "/"); // on local server delete all behind the /
// will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
// will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
if ($pos) {
$host1 = substr($host1,0,$pos); // build full adress again, now only local domain
}
if ($hostname == 'localhost') {
$domain_db = ("".$local."".$host1."/"); // complete URL
$domain_db = str_replace("http://", "", $domain_db);
//$domain_db = $host1;
}else {
//$domain_db = ("$url_parts[scheme]://".$hostname."/"); // complete URL
$domain_db = $hostname;
}
// now store all link details into db
foreach ($my_links as $found_link) {
// but only if we have found a title
if ($found_link[3]) {
mysqltest();
// check whether URL is already known in sites table
$res1 = mysql_query("select title from ".$mysql_table_prefix."link_details where link_id like '$link_id' and url like '%$found_link[2]%'");
if ($debug > '0') echo mysql_error();
if (mysql_num_rows($res1) == 0) { // must be new link
mysql_query ("insert into ".$mysql_table_prefix."link_details (link_id, url, title, indexdate, domain) values ('$link_id', '$found_link[2]', '$found_link[3]', now(), '$domain_db')");
if ($debug > '0') echo mysql_error();
}
}
}
}
if ($debug == '2') { // if debug mode, show details
printStandardReport('newKeywords', $command_line, $no_log);
}
save_keywords($wordarray, $link_id, $dom_id);
}
mysqltest();
if ($index_media == '1' && $OKtoSave) { // find media content only if there was no conflict with text (white and/or blacklist)
include "index_media.php"; // try to find media files
}
mysqltest();
if ($debug == '2') {
printStandardReport('indexed1', $command_line, $no_log);
} else {
printStandardReport('indexed', $command_line, $no_log);
}
} else if (($md5sum <> '') && ($md5sum <> $newmd5sum) && $OKtoSave) { //if page has changed, start updating
mysqltest();
$result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
if ($debug > '0') echo mysql_error();
$row = mysql_fetch_row($result);
$link_id = $row[0];
for ($i=0;$i<=15; $i++) {
$char = dechex($i);
mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
if ($debug > '0') echo mysql_error();
}
if ($clear == 1) clean_resource($result, '05');
if ($debug == '2') { // if debug mode, show details
printStandardReport('newKeywords', $command_line, $no_log);
}
save_keywords($wordarray, $link_id, $dom_id);
$query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level='$thislevel' where link_id='$link_id'";
mysqltest();
mysql_query($query);
if ($debug > '0') echo mysql_error();
if ($index_media == '1') {
include "index_media.php"; // try to find media files
}
if ($debug == '2') {
printStandardReport('re-indexed1', $command_line, $no_log);
}
}
}else {
printStandardReport('minWords', $command_line, $no_log);
$realnum -- ;
}
} else {
printStandardReport('link_okay', $command_line, $no_log);
}
unset ($wordarray, $title, $fulltxt, $desc, $data, $seg_data);
}
}
} else {
$deletable = 1;
printUrlStatus($url_status['state'], $command_line, $no_log);
}
mysqltest();
if ($url_status['relocate'] ){
// remove this relocated URL from temp table, because it is indexed now
mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
if ($debug > '0') echo mysql_error();
}
if ($reindex ==1 && $deletable == 1) {
check_for_removal($url);
} else if ($reindex == 1) {
}
if (!isset($all_links)) {
$all_links = 0;
}
if (!isset($numoflinks)) {
$numoflinks = 0;
}
if ($smp != 1 && $OKtoIndex == 1) { // if valid sitemap found,or canonical link, or something else, no LinkReport
printLinksReport($numoflinks, $all_links, $command_line);
}
}
/**
 * Index (or re-index) one complete site, starting at $url.
 *
 * Steps performed, in this order:
 *  - fill the global keyword cache $all_keywords (only if it is not set yet),
 *  - normalise $url and create a session id for the rows in the temp table,
 *  - for sites that carry an authentication value: verify the
 *    <meta name="Sphider-plus" content="..."> tag in the head of the start page,
 *  - create or update the row of this site in the sites table,
 *  - resume a suspended run from the pending table, or register a new run,
 *  - walk the temp table level by level and hand each link over to
 *    index_url() or link_check(),
 *  - remove the temp/pending rows, optionally create a sitemap and print the statistics.
 *
 * The function terminates the script (die) when the interrupt counter is reached and
 * returns early when more than $max_links links have been processed.
 *
 * @param string $url          start URL of the site
 * @param mixed  $reindex      1 if the site is to be re-indexed
 * @param mixed  $maxlevel     max. depth; only evaluated if $soption is 'level'
 * @param string $soption      'level' (obey $maxlevel) or 'full' (until no more links are found)
 * @param string $url_inc      stored as 'required' and passed to check_include()
 * @param string $url_not_inc  stored as 'disallowed' and passed to check_include()
 * @param mixed  $can_leave    if empty, the global $domaincb is used instead
 * @param mixed  $use_robot    '1' to evaluate robots.txt
 * @param mixed  $use_nofollow forwarded to index_url()
 * @param mixed  $cl           forwarded to the print functions and to index_url()
 * @param mixed  $all          not evaluated inside this function
 * @param mixed  $use_pref     stored as 'use_prefcharset' and forwarded to index_url()
 * @return void
 */
function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave, $use_robot, $use_nofollow, $cl, $all, $use_pref) {
    global $mysql_table_prefix, $command_line, $mainurl, $tmp_urls, $domain_arr, $all_keywords, $smp, $follow_sitemap;
    global $link_check, $smap_dir, $index_media, $db_con, $clear, $create_sitemap, $tmp_dir, $domaincb;
    global $max_links, $realnum, $debug, $no_log, $dba_act, $add_auth, $interrupt, $thumb_folder;

    if (!$can_leave) {
        $can_leave = $domaincb;
    }
    $can_leave_domain = $can_leave;
    $starttime = getmicrotime();    // start time to index this site (only used by the disabled 'consumed' report below)
    $site_id = '';
    $skip = '';
    $smp = '0';
    $omit = array();
    $num = 0;                       // defined here, because a fresh (not suspended) run never reads it from the pending table

    if (strstr($interrupt, "-")) {  // if indexer should not be interrupted periodically
        $interrupt = '999999';      // never
    }
    $int_count = $interrupt;        // $int_count will be decreased by each indexed link until $int_count = 1
    printStandardReport('starting',$command_line, $no_log);

    // build the keyword cache once per script run
    if (!isset($all_keywords)) {
        mysqltest();
        $result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
        if ($debug > '0') echo mysql_error();
        while($row=mysql_fetch_array($result)) {
            $all_keywords[addslashes($row[1])] = $row[0];
        }
        if ($clear == 1) clean_resource($result, '06') ;
    }

    $url = convert_url($url);
    $compurl = parse_addr($url);
    if ($compurl['path'] == '') {
        $url = $url . "/";
    }

    // session id that marks all rows of this run in the temp table
    $t = microtime();
    $a = getenv("REMOTE_ADDR");
    $sessid = md5 ($t.$a);

    if ($url != '/') {  // ignore dummies
        $urlparts = parse_addr($url);
        $domain = $urlparts['host'];
        if (isset($urlparts['port'])) {
            $port = (int)$urlparts['port'];
        }else {
            $port = 80;
        }

        mysqltest();
        $result = mysql_query("select site_id, authent from ".$mysql_table_prefix."sites where url='$url'");
        if ($debug > '0') echo mysql_error();
        $row = mysql_fetch_row($result);
        $site_id = $row[0];
        $authent = $row[1];     // second selected column (index 2 does not exist in this result)
        if ($clear == 1) clean_resource($result, '07') ;

        if ($add_auth && $authent) {    // for sites with authentication we need to verify the value
            $url_status = url_status($url);
            $url_parts = parse_all_url($url);
            if ($url_status['state'] == 'ok' && $url_status['content'] == 'text') {
                if ($url_status['relocate'] ){  // if relocated, print message and redirect to new URL
                    $new_url = $url_status['path'] ;
                    $diff = strlen($url);
                    $redir = substr( $new_url, $diff);  // extract diff. between original URL and relocated URL
                    if ($redir == "index.php" || $redir == "index.html" || $redir == "index.htm") {
                        $local_redir = '1';
                        // no output because diff. is only index.html etc.
                    } else {
                        printRedirected($url_status['relocate'], $url_status['path'], $cl);
                    }
                    if (strstr($url_status['path'], "//")) {    // if redirected to absolute URL, use this for further usage
                        $url = $url_status['path'];
                    } else {
                        $relo_url = str_replace($url_parts['query'], "", $url);     // url without query
                        $relo_url = substr($url, 0, strrpos($relo_url, "/")+1);     // url without file name
                        if (strpos($url_status['path'], "./") === 0) {  // if redirected relative to same folder depth
                            $url_status['path'] = str_replace("./", "", $url_status['path']);
                            $url = "".$relo_url."".$url_status['path']."";
                        } elseif (strpos($url_status['path'], "../") === 0) {   // if redirected relative and one folder up
                            $url_status['path'] = substr($url_status['path'], 3);   // drop the leading "../" only
                            $folder = rtrim($relo_url, "/");                        // url without file name and without trailing slash
                            $parent = substr($folder, 0, strrpos($folder, "/")+1);  // url without last folder
                            // never climb above the document root: the parent must still contain scheme://host/
                            if (preg_match("@^[a-z][a-z0-9+.\-]*://[^/]+/@i", $parent)) {
                                $relo_url = $parent;
                            }
                            $url = "".$relo_url."".$url_status['path']."";
                        }
                    }
                }
                // read file
                $contents = array();
                $file = '';
                $file = file_get_contents($url);
                if ($file === FALSE) {  // we know another way to get the content
                    $get_charset = '';
                    $contents = getFileContents($url, $get_charset);
                    $file = $contents['file'];
                }
                // parse header only
                preg_match("@<head[^>]*>(.*?)<\/head>@si",$file, $regs);
                $headdata = isset($regs[1]) ? $regs[1] : '';
                // fetch the tag value
                preg_match("/<meta +name *=[\"']?Sphider-plus[\"']? *content=[\"'](.*?)[\"']/i", $headdata, $res);
                if (isset ($res[1])) {  // preg_match always defines $res, so test the captured group
                    if ($authent != $res[1]) {  // invalid value in authentication tag
                        $skip = '1';
                        printHeader ($omit, $url, $command_line);
                        printStandardReport('Skipped_03', $command_line, $no_log);
                    }
                } else {    // no authentication tag found in header
                    $skip = '1';
                    printHeader ($omit, $url, $command_line);
                    printStandardReport('Skipped_02', $command_line, $no_log);
                }
            } else {
                $skip = '1';
                printHeader ($omit, $url, $command_line);
                printStandardReport('statError', $command_line, $no_log);
            }
        }

        if (!$skip) {
            if ($site_id != "" && $reindex == 1) {
                // known site, re-index: queue the start URL and all links already stored for this site
                mysqltest();
                mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
                if ($debug > '0') echo mysql_error();
                $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
                while ($row = mysql_fetch_array($result)) {
                    $site_link = $row['url'];
                    $link_level = $row['level'];
                    if ($site_link != $url) {
                        mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', '$link_level', '$sessid')");
                    }
                }
                if ($clear == 1) clean_resource($result, '08') ;
                $qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth ='$maxlevel', required = '$url_inc'," .
                       "disallowed = '$url_not_inc', can_leave_domain='$can_leave', use_prefcharset='$use_pref' where site_id='$site_id'";
                mysqltest();
                mysql_query ($qry);
                if ($debug > '0') echo mysql_error();
            } else if ($site_id == '') {
                // unknown site: create it and fetch the new site id
                mysqltest();
                mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset) " .
                             "values ('$url', now(), '$maxlevel', '$url_inc', '$url_not_inc', '$can_leave_domain', '$use_pref')");
                if ($debug > '0') echo mysql_error();
                $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
                $row = mysql_fetch_row($result);
                $site_id = $row[0];
                if ($clear == 1) clean_resource($result, '09') ;
            } else {
                // known site, but no re-index requested: only refresh the site settings
                mysqltest();
                mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth ='$maxlevel', required = '$url_inc'," .
                             "disallowed = '$url_not_inc', can_leave_domain='$can_leave_domain', use_prefcharset='$use_pref' where site_id='$site_id'");
                if ($debug > '0') echo mysql_error();
            }

            // is there a suspended index run for this site?
            $pending = array();
            mysqltest();
            $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
            if ($debug > '0') echo mysql_error();
            $row = mysql_fetch_row($result);
            $pending = $row[0];
            $level = '0';
            $count = '0';
            if ($clear == 1) clean_resource($result, '10') ;
            $domain_arr = get_domains();

            if ($pending == '') {
                mysqltest();
                mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
                if ($debug > '0') echo mysql_error();
            } else if ($pending != '') {
                printStandardReport('continueSuspended',$command_line, $no_log);
                mysqltest();
                $pend_count = '0';
                //$result = mysql_query("select temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'");
                $result = mysql_query("select * from ".$mysql_table_prefix."pending where site_id='$site_id'");
                if ($debug > '0') echo mysql_error();
                $row = mysql_fetch_row($result);
                if ($row) {
                    // continue with the session id, level and counters of the suspended run
                    $sessid = $row[1];
                    $level = $row[2];
                    $pend_count = $row[3] + 1;
                    $num = $row[4];
                    $pending = 1;
                    $tmp_urls = get_temp_urls($sessid);
                    if ($clear == 1) clean_resource($result, '11') ;
                }
            }
            if ($pending != 1) {
                mysqltest();
                mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
                if ($debug > '0') echo mysql_error();
            }

            $time = time();
            $robots = ("robots.txt");   // standard name of robots file
            if ($use_robot == '1') {
                $omit = check_robot_txt($url, $robots);
            }
            printHeader ($omit, $url, $command_line);
            if ($link_check == 1) printStandardReport('start_link_check', $command_line, $no_log);
            if ($link_check == 0 && $reindex == 1 ) printStandardReport('start_reindex', $command_line, $no_log);
            if ($link_check == 0 && $reindex == 0 ) printStandardReport('starting', $command_line, $no_log);
            $mainurl = $url;
            $realnum = $num;
            $num = 0;

            // one pass of the outer loop per link level
            while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
                if ($pending == 1) {
                    $count = $pend_count;   // resume behind the last link handled by the suspended run
                    $pending = 0;
                } else {
                    $count = 0;
                }
                $links = array();
                mysqltest();
                $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
                if ($debug > '0') echo mysql_error();
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    break;
                }
                while ($row = mysql_fetch_array($result)) {
                    $links[] = $row['link'];
                }
                if ($clear == 1) clean_resource($result, '12') ;
                reset ($links);

                // now loop through all available links(pages)
                while ($count < count($links)) {
                    $num++;
                    $realnum ++ ;
                    if ($realnum > $max_links ) {   // if max. links per page reached
                        mysqltest();
                        mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
                        if ($debug > '0') echo mysql_error();
                        mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
                        if ($debug > '0') echo mysql_error();
                        printMaxLinks($max_links, $cl);
                        printStandardReport('completed',$command_line, $no_log);
                        return;
                    }
                    $thislink = $links[$count];
                    $urlparts = parse_addr($thislink);
                    $forbidden = 0;

                    if (is_array($omit)) {  // if valid robots.txt was found
                        reset ($omit);
                        foreach ($omit as $omiturl) {
                            $omiturl = trim($omiturl);
                            $omiturl_parts = array();
                            $omiturl_parts = parse_addr($omiturl);
                            if (@$omiturl_parts['scheme'] == '') {
                                $check_omit = $urlparts['host'] . $omiturl;
                            } else {
                                $check_omit = $omiturl;
                            }
                            if (strpos($thislink, $check_omit)) {
                                printRobotsReport($num, $thislink, $command_line);
                                $realnum -- ;
                                check_for_removal($thislink);
                                $forbidden = 1;
                                break;
                            }
                        }
                    }

                    if (!check_include($thislink, $url_inc, $url_not_inc )) {
                        $realnum -- ;
                        printUrlStringReport($num, $thislink, $command_line);
                        //printUrlStringReport($realnum, $thislink, $command_line);
                        check_for_removal($thislink);
                        $forbidden = 1;
                    }

                    if ($forbidden == 0) {
                        printRetrieving($num, $thislink, $command_line);
                        //printRetrieving($realnum, $thislink, $command_line);
                        mysqltest();
                        $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
                        $result = mysql_query($query);
                        if ($debug > '0') echo mysql_error();
                        $rows = mysql_num_rows($result);
                        if ($rows == 0) {
                            // unknown link: index it and remember the position for a possible resume
                            index_url($thislink, $level+1, $site_id, '', $domain, '', $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref );
                            mysqltest();
                            mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                            if ($debug > '0') echo mysql_error();
                        } else if ($rows <> 0 && $reindex == 1) {
                            $row = mysql_fetch_array($result);
                            $md5sum = $row['md5sum'];
                            $indexdate = $row['indexdate'];
                            if ($link_check == 1 && $reindex == 1) link_check($thislink, $level+1, $sessid, $can_leave_domain, $reindex);
                            else {
                                mysqltest();
                                index_url($thislink, $level+1, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref);
                            }
                        }else {
                            printStandardReport('inDatabase',$command_line, $no_log);
                            $realnum -- ;
                            //$num--;
                        }
                        if ($rows <> 0) {
                            mysqltest();
                            mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                            if ($debug > '0') echo mysql_error();
                        }
                        if ($clear == 1) clean_resource($result, '13') ;
                    }

                    // check for interrupt counter
                    if ($int_count == '1') {    // interrupt the index procedure until interactive resume
                        mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                        if ($debug > '0') echo mysql_error();
                        printInterrupt($interrupt, $url, $cl) ;
                        die;
                    }
                    $count++;
                    $int_count--;
                }
                $level++;
            }
        }

        // run finished (or skipped): remove the work tables entries of this site
        mysqltest();
        mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
        if ($debug > '0') echo mysql_error();
        mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
        if ($debug > '0') echo mysql_error();
        if ($create_sitemap == 1) {
            create_sitemap($site_id, $url);
        }
        /*
        $consumed = round(getmicrotime() - $starttime, 3);
        printConsumedReport('consumed', $cl, '0', $consumed);   // time elapsed to index this URL
        */
        printStandardReport('completed',$command_line, $no_log);
        $stats = get_Stats();
        printDatabase($stats, $cl);
    }

    if ($index_media) {
        // delete all thumbnails in .../admin/tmp/thumbs/ folder
        clear_folder(".".$thumb_folder);
    }
}
/**
 * Re-index every site that is stored in the sites table.
 *
 * Reads the settings of each site, calls index_site() for it with reindex = 1,
 * then prints the 'ReindexFinish' report and the footer.
 * The URL of the site currently processed is written to the global $url.
 *
 * @return void
 */
function index_all() {
    global $mysql_table_prefix, $reindex, $command_line, $omit;
    global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow, $no_log;

    $all = '1';     // here only as a dummy; needed to display the back to admin button

    mysqltest();
    $sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
    if ($debug > '0') {
        echo mysql_error();
    }

    while ($site = mysql_fetch_row($sites)) {
        // columns arrive in the order of the select list above
        list($url, $depth, $must_include, $must_not_include, $leave_domain, $pref_charset) = $site;

        if ($leave_domain == '') {
            $leave_domain = 0;
        }
        // a depth of -1 means: follow all links, regardless of their level
        $soption = ($depth == -1) ? 'full' : 'level';

        index_site($url, 1, $depth, $soption, $must_include, $must_not_include, $leave_domain, $use_robot, $use_nofollow, $cl, $all, $pref_charset);
    }

    if ($clear == 1) {
        clean_resource($sites, '14') ;
    }
    printStandardReport('ReindexFinish', $command_line, $no_log);
    create_footer();
}
/**
 * Re-index only those sites whose ids are listed in "$tmp_dir/act_sites.txt"
 * (one id per line).
 *
 * For every id the site settings are read from the sites table and index_site()
 * is called with reindex = 1. If the file is missing or empty, the
 * 'NoSitesFound' report is printed instead. Finally the 'ReindexFinish' report
 * and the footer are printed.
 * The URL of the site currently processed is written to the global $url.
 *
 * @return void
 */
function index_these() {
    global $mysql_table_prefix, $reindex, $command_line, $omit, $tmp_dir;
    global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow, $no_log;

    $all = '1';         // here only as a dummy; needed to display the back to admin button
    $result = null;     // stays null if no query is executed at all
    $site_ids = @file("$tmp_dir/act_sites.txt");    // read the temp file that holds the actual site ids

    if (is_array($site_ids) && count($site_ids)) {
        mysqltest();
        foreach($site_ids as $this_id) {
            // file() keeps the line break at the end of each line; remove it before using the id
            $this_id = addslashes(trim($this_id));
            if ($this_id === '') {
                continue;   // blank line
            }
            $result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites where site_id='$this_id'");
            if ($debug > '0') echo mysql_error();
            $row = $result ? mysql_fetch_row($result) : false;
            if (!$row) {
                continue;   // id is not (or no longer) present in the sites table
            }
            $url = $row[0];
            $depth = $row[1];
            $include = $row[2];
            $not_include = $row[3];
            $can_leave_domain = $row[4];
            $use_prefcharset = $row[5];
            if ($can_leave_domain=='') {
                $can_leave_domain=0;
            }
            if ($depth == -1) {     // -1 means: follow all links, regardless of their level
                $soption = 'full';
            } else {
                $soption = 'level';
            }
            index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
        }
    } else {
        printStandardReport('NoSitesFound', $command_line, $no_log);    // print warning message
    }

    if ($clear == 1) {
        if ($result) {  // only a real query result can be cleaned
            clean_resource($result, '14') ;
        }
        $site_ids = array();
        $row = array();
    }
    printStandardReport('ReindexFinish', $command_line, $no_log);
    create_footer();
}
/**
 * Erase the index (only for command line option: -erase).
 *
 * If $clear_cache is '1', all files in the text cache and the media cache
 * folder are deleted first. Afterwards the domains, keywords, links,
 * link_keyword0..f and media tables are truncated, the 'ErasedFinished'
 * report and the footer are printed.
 *
 * @return void
 */
function erase() {
    global $mysql_table_prefix, $reindex, $command_line, $omit;
    global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow;
    global $no_log, $clear_cache, $textcache_dir, $mediacache_dir ;

    // if Admin selected, clear text and media cache
    if ($clear_cache == '1') {
        foreach (array($textcache_dir, $mediacache_dir) as $cache_dir) {
            if ($handle = opendir($cache_dir)) {
                while (false !== ($file = readdir($handle))) {
                    if ($file != "." && $file != "..") {
                        @unlink("".$cache_dir."/".$file."");
                    }
                }
                closedir($handle);  // release the directory handle again
            }
        }
    }

    // clear all data in database
    $erase =array ("domains","keywords","links","link_keyword0","link_keyword1","link_keyword2","link_keyword3","link_keyword4","link_keyword5","link_keyword6","link_keyword7","link_keyword8","link_keyword9","link_keyworda","link_keywordb","link_keywordc","link_keywordd","link_keyworde","link_keywordf","media");
    foreach ($erase as $allthis){
        mysql_query ("TRUNCATE `".$mysql_table_prefix."$allthis`");
        if ($debug > '0') echo mysql_error();
    }

    // NOTE(review): no query result is kept in this function, so clean_resource() has
    // always been called with an empty value here. The call is kept unchanged (now
    // with an explicitly defined null) - confirm whether it can be dropped.
    $result = null;
    if ($clear == 1) clean_resource($result, '14') ;
    printStandardReport('ErasedFinished', $command_line, $no_log);
    create_footer();
}
/**
 * Re-index all sites (only for command line option: -eall).
 *
 * Reads the settings of every site in the sites table, calls index_site() for
 * it with reindex = 1, then prints the 'ReindexFinish' report and the footer.
 * The URL of the site currently processed is written to the global $url.
 *
 * @return void
 */
function index() {
    global $mysql_table_prefix, $command_line, $no_log;
    global $url, $cl, $clear, $debug, $use_robot, $use_nofollow;

    $all = '1';     // dummy, same value as used by index_all()

    // now re-index all
    mysqltest();
    $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
    if ($debug > '0') echo mysql_error();
    while ($row=mysql_fetch_row($result)) {
        $url = $row[0];
        $depth = $row[1];
        $include = $row[2];
        $not_include = $row[3];
        $can_leave_domain = $row[4];
        $use_prefcharset = $row[5];
        if ($can_leave_domain=='') {
            $can_leave_domain=0;
        }
        if ($depth == -1) {     // -1 means: follow all links, regardless of their level
            $soption = 'full';
        } else {
            $soption = 'level';
        }
        // index_site() expects 12 arguments; $cl and $all must be passed in front of
        // the charset flag, otherwise the flag ends up in the $cl parameter
        index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
    }
    if ($clear == 1) clean_resource($result, '14') ;
    printStandardReport('ReindexFinish', $command_line, $no_log);
    create_footer();
}
/**
 * Fetch up to 100 links that are stored in the temp table for one session.
 *
 * @param string $sessid id of the index session
 * @return array         the links as array keys, each with the value 1
 */
function get_temp_urls($sessid) {
    global $mysql_table_prefix, $debug, $clear;

    $query = "select link from ".$mysql_table_prefix."temp where id='$sessid' limit 0,100";
    $res = mysql_query($query);
    if ($debug > '0') {
        echo mysql_error();
    }

    $known = Array();
    for (;;) {
        $entry = mysql_fetch_row($res);
        if (!$entry) {
            break;      // no more rows
        }
        $known[$entry[0]] = 1;
    }

    if ($clear == 1) {
        clean_resource($res, '15') ;
    }
    return $known;
}
/**
 * Read the complete domains table.
 *
 * @return array domain name => domain_id
 */
function get_domains() {
    global $mysql_table_prefix, $debug, $clear;

    mysqltest();
    $query = "select domain_id, domain from ".$mysql_table_prefix."domains";
    $res = mysql_query($query);
    if ($debug > '0') {
        echo mysql_error();
    }

    $by_name = Array();
    for (;;) {
        $entry = mysql_fetch_row($res);
        if (!$entry) {
            break;      // no more rows
        }
        // column 0 = domain_id, column 1 = domain
        $by_name[$entry[1]] = $entry[0];
    }

    if ($clear == 1) {
        clean_resource($res, '16') ;
    }
    return $by_name;
}
/**
 * Prepare the content of one file (named $name) for indexing.
 *
 * Depending on the file suffix the content is sent through extract_text().
 * If $index_framesets is '1', a <frameset> part is replaced by a <body> that is
 * built by get_frames(). If $index_rss is '1' and the first 400 characters
 * look like a feed, the content is converted by get_arch_feeds().
 *
 * @param string $buf  content of the file
 * @param string $name name of the file; only the part behind the last dot is evaluated
 * @param string $url  URL, passed on to get_frames() and get_arch_feeds()
 * @return string      the (possibly converted) content
 */
function get_arch_content($buf, $name, $url) {
    global $index_framesets, $command_line, $no_log, $can_leave_domain, $index_rss;

    $suffix = substr(strtolower($name), strrpos($name, ".")+1);

    // if special converter is required
    $convertible = array('pdf', 'doc', 'rtf', 'xls', 'ptt');
    if (in_array($suffix, $convertible, true)) {
        $buf = extract_text($buf, $suffix, 0);
    }

    // for extracting framesets of this file enter here. Iframes will be extracted later on for the complete $file
    if ($index_framesets == '1' && preg_match("@<frameset[^>]*>(.*?)<\/frameset>@si",$buf, $found)) {
        printStandardReport('newFrameset', $command_line, $no_log);
        // $found[1] holds the part between <frameset> and </frameset>;
        // the frames found in there will become the body of the actual file
        $body = "<body>".get_frames($found[1], $url, $can_leave_domain)."</body>";
        $buf = preg_replace("@<frameset.*?</frameset>@si", $body, $buf);
    }

    // for extracting archived feeds enter here
    $head_of_file = substr($buf,0,400);
    if (preg_match("/<rss|atom|<feed|<rdf|<rsd/si", $head_of_file) && $index_rss =='1') {
        $buf = get_arch_feeds($buf, $url);
    }

    return $buf;
}
/**
 * Convert a feed (RSD, Atom, RDF or RSS) that is available as a string into a
 * simple HTML document that could be indexed.
 *
 * The feed is only parsed if XML_IsWellFormed() reports '1'. If the resulting
 * HTML is shorter than 100 characters, the 'invalidRSS' report is printed.
 *
 * @param string $buf the feed
 * @param string $url URL of the feed, passed on to the feed parser
 * @return string     the HTML document; empty string if the XML is not well-formed
 *                    or if the parser did not deliver a feed
 */
function get_arch_feeds($buf, $url) {
    global $command_line, $no_log, $debug, $cl, $max_links, $dc, $preferred, $cdata;

    $html = '';
    $type = '';
    $count = '0';

    $xml = XML_IsWellFormed($buf);  // check for well-formed XML
    if ($xml != '1') {
        if ($debug > 0 ) {
            printNotWellFormedXML($xml, $cl);
        }
        return $html;
    }

    $rss = new feedParser;
    // define options for feed parser
    $rss->limit = $max_links;   // save time by limiting the items/entries to be processed
    // NOTE(review): $contents is neither a parameter nor a global of this function, so the
    // input charset has always been an empty string here - confirm where it should come from
    $rss->in_cp = isset($contents['charset']) ? strtoupper($contents['charset']) : '';
    $rss->out_cp = 'UTF-8';     // convert all into this charset
    $rss->cache_dir = '';       // currently unused
    $rss->dc = $dc;             // treat Dublin Core tags in RDF feeds
    $rss->pro = $preferred;     // obey the PREFERRED directive in RSD feeds
    $rss->file = '1';           // use $buf as feed (as a string, not URL)
    if ($cdata != 1) {
        $rss->CDATA = 'content';    // get it all (naughty)
    } else {
        $rss->CDATA = 'nochange';   // well educated crawler
    }

    // get feed as array
    if ($feed = $rss->get($url, $buf)){
        // if you want to see the feed, uncomment the following row
        //echo "<br>Feed array:<br><pre>";print_r($feed);echo "</pre>";
        $subjects = '';
        $type = $feed['type'];
        $count = $feed['sub_count'];
        // kill all no longer required values
        $feed['type'] = '';
        $feed['sub_count'] = '';
        $feed['encoding_in'] = '';
        $feed['encoding_out'] = '';
        $feed['items_count'] = '';
        $feed['cached'] = '';
        if (!$count) {
            $count = '0';
        }

        // Describe the output of one sub-element per feed type:
        // $container = key in $feed that holds the sub-elements,
        // $rows      = one array per output row; the values of one row are separated by a blank,
        // $tail      = string that closes the last row of each sub-element
        $container = '';
        $rows = array();
        $tail = "<br />\n";

        if ($type == 'RSD') {
            // prepare all RSD APIs
            $container = 'api';
            $rows = array(
                array('name'), array('apiLink'), array('blogID'),
                array('settings_docs'), array('settings_notes')
            );
            $tail = "<br />";
        }
        if ($type == 'Atom') {
            // prepare all Atom entries
            $container = 'entries';
            $rows = array(
                array('link'), array('title'), array('id'), array('published'),
                array('updated'), array('summary'), array('rights'),
                array('author_name', 'author_email', 'author_uri'),
                array('category_term', 'category_label', 'category_scheme'),
                array('contributor_name', 'contributor_email', 'contributor_uri')
            );
        }
        if ($type == 'RDF' || $type =='RSS v.0.91/0.92' || $type == 'RSS v.2.0'){   // For RDF and RSS feeds enter here
            // prepare channel image
            $image_url = $feed['image_url'];
            if($image_url){
                $width = $feed['image_width'];
                if (!$width || $width > '144') {
                    $width = '88';      //set to default value
                }
                $height = $feed['image_height'];
                if (!$height || $height > '400') {
                    $height = '31';     //set to default value
                }
                $feed['image_url'] = "<img id=\"rss_007\" src=\"".$image_url."\" alt=\"".$feed['image_title']."\" width=\"".$width."\" height=\"".$height."\">";
            }
            $image_link = $feed['image_link'];
            if($image_link){
                $feed['image_link'] = "<a href=\"".$image_link."\">".$image_link."</a>";
            }
            // prepare all RDF or RSS items
            $container = 'items';
            $rows = array(
                array('link'), array('title'), array('description'), array('author'),
                array('category'), array('guid'), array('comments'), array('pubDate'),
                array('source'), array('enclosure'), array('country'), array('coverage'),
                array('contributor'), array('date'), array('industry'), array('language'),
                array('publisher'), array('state'), array('subject')
            );
        }

        if ($container != '') {
            for($i=0;$i<$count;$i++){
                $element = isset($feed[$container][$i]) ? $feed[$container][$i] : array();
                $lines = array();
                foreach ($rows as $keys) {
                    $cells = array();
                    foreach ($keys as $key) {
                        $cells[] = isset($element[$key]) ? $element[$key] : '';
                    }
                    $lines[] = implode(' ', $cells);
                }
                $subjects .= implode("<br />\n", $lines).$tail;
            }
        }

        // convert the channel/feed part into a string
        $feed_common = implode(" ", $feed);
        // build something that could be indexed
        $html .= "<html>\r\n<head>\r\n<title>".$feed['title']."</title>\r\n<meta name=\"description\" content=\"".$feed['description']." \">\r\n</head>\r\n";
        $html .= "<body>\r\n".$feed_common."\r\n".$subjects."\r\n</body>\r\n</html>\r\n";
    }

    if (strlen($html) < '100') {    // can't be a valid feed
        printStandardReport('invalidRSS',$command_line, $no_log);
    } else {
        if ($debug > 0 ) {
            printValidFeed($type, $count, $cl);
        }
    }
    return $html;
}
/**
 * Print the usage information of the command line interface.
 *
 * @return void
 */
function commandline_help() {
    // one array element per output line; the first one is followed by an empty line
    $help = array(
        "Usage: php spider.php <options>\n",
        "Options:",
        " -all\t\t Re-index everything in the database",
        " -eall\t\t Erase and afterwards Re-index everything in the database",
        " -new\t\t Index only the new sites",
        " -erase\t\t Erase database",
        " -erased\t\t Index all meanwhile erased sites",
        " -preall\t\t Set 'Last indexed' to 0000",
        " -u <url>\t Set url to index",
        " -f\t\t Set indexing depth to full (unlimited depth)",
        " -d <num>\t Set indexing depth to <num>",
        " -l\t\t Allow spider to leave the initial domain",
        " -r\t\t Set spider to reindex a site",
        " -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)",
        " -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)"
    );
    echo implode("\n", $help)."\n";
}
/**
 * Check whether a link is still reachable (link check during re-index).
 *
 * URLs with 'localhost' at string position 7 (http://localhost...) are not
 * requested at all; only the 'link_local' report is printed for them.
 * For all other URLs the status is fetched by url_status():
 *  - a relocated URL is purified and, if not yet known for this session,
 *    stored in the temp table; its state becomes "redirected",
 *  - state 'ok' prints the 'link_okay' report,
 *  - any other state is printed and marks the URL as deletable.
 * During re-index a deletable URL is passed to check_for_removal().
 *
 * @param string $url              URL to be checked
 * @param mixed  $level            level that is stored together with a relocated URL
 * @param string $sessid           id of the index session (temp table)
 * @param mixed  $can_leave_domain passed on to url_purify()
 * @param mixed  $reindex          1 during re-index
 * @return void
 */
function link_check($url, $level, $sessid, $can_leave_domain, $reindex) {
    global $command_line, $mysql_table_prefix, $user_agent, $debug, $index_media, $no_log, $clear;

    $deletable = 0;
    $is_local = (strpos($url, 'localhost') === 7);

    if (!$is_local) {
        $url_status = url_status($url);
        if (strstr($url_status['state'], "Relocation")) {
            $care_excl = '1';       // care file suffixed to be excluded
            $relocated = '1';       // URL is relocated
            $local_redir = '';
            $result = null;
            $url = preg_replace("/ /i", "", url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir));
            if ($url <> '') {
                // remember the new URL for this session, if it is not yet known
                mysqltest();
                $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
                if ($debug > '0') echo mysql_error();
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
                    if ($debug > '0') echo mysql_error();
                }
            }
            $url_status['state'] = "redirected";    // assignment; a comparison at this place had no effect
            if ($clear == 1 && $result) clean_resource($result, '17') ;    // only if a query was executed
        }

        ini_set("user_agent", $user_agent);
        if ($url_status['state'] == 'ok') {
            printStandardReport('link_okay', $command_line, $no_log);
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line, $no_log);
        }
    } else {
        printStandardReport('link_local', $command_line, $no_log);
    }

    if ($reindex ==1 && $deletable == 1) {
        check_for_removal($url);
    }
}
/**
 * Collect the number of rows of the most important tables.
 *
 * @return array with the keys 'keywords', 'links', 'index' (sum of the rows of
 *               all 16 link_keyword0..f tables), 'sites', 'categories' and 'media'.
 *               Apart from 'index', a key is only present if its query delivered a row.
 */
function get_Stats() {
    global $mysql_table_prefix, $debug, $clear, $db_con;

    $stats = array();
    $keywordQuery = "select count(keyword_id) from ".$mysql_table_prefix."keywords";
    $linksQuery = "select count(url) from ".$mysql_table_prefix."links";
    $siteQuery = "select count(site_id) from ".$mysql_table_prefix."sites";
    $categoriesQuery = "select count(category_id) from ".$mysql_table_prefix."categories";
    $mediaQuery = "select count(media_id) from ".$mysql_table_prefix."media";

    mysqltest();
    $result = mysql_query($keywordQuery);
    if ($debug > '0') echo mysql_error();
    if ($result && $row=mysql_fetch_array($result)) {
        $stats['keywords']=$row[0];
    }
    $result = mysql_query($linksQuery);
    if ($debug > '0') echo mysql_error();
    if ($result && $row=mysql_fetch_array($result)) {
        $stats['links']=$row[0];
    }

    // the keyword relations are spread over 16 tables, named by the hex digits 0..f
    $stats['index'] = 0;    // must exist before the first addition
    for ($i=0;$i<=15; $i++) {
        $char = dechex($i);
        mysqltest();
        $result = mysql_query("select count(link_id) from ".$mysql_table_prefix."link_keyword$char");
        if ($debug > '0') echo mysql_error();
        if ($result && $row=mysql_fetch_array($result)) {
            $stats['index']+=$row[0];
        }
    }

    mysqltest();
    $result = mysql_query($siteQuery);
    if ($debug > '0') echo mysql_error();
    if ($result && $row=mysql_fetch_array($result)) {
        $stats['sites']=$row[0];
    }
    $result = mysql_query($categoriesQuery);
    if ($debug > '0') echo mysql_error();
    if ($result && $row=mysql_fetch_array($result)) {
        $stats['categories']=$row[0];
    }
    $result = mysql_query($mediaQuery);
    if ($debug > '0') echo mysql_error();
    if ($result && $row=mysql_fetch_array($result)) {
        $stats['media']=$row[0];
    }
    return $stats;
}
/**
 * Index all sites that have not been indexed so far (empty indexdate).
 *
 * The indexdate of each site is read again immediately before it is indexed,
 * because other threads may have indexed it in the meantime. A site that is
 * still new gets its indexdate set at once (as a marker for the other threads)
 * and is then indexed by index_site().
 * Finally the consumed time (based on the global $started), the 'NewFinish'
 * report and the footer are printed.
 *
 * @return void
 */
function index_new() {
    global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $cl, $started;

    printStandardReport('NewStart',$command_line, $no_log);
    mysqltest();
    $result=mysql_query("select url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
    if ($debug > '0') echo mysql_error();
    while ($row=mysql_fetch_row($result)) {
        $url = $row[0];
        // get actual status of indexdate, eventually other threads meanwhile indexed this URL
        $res=mysql_query("select indexdate from ".$mysql_table_prefix."sites where url='$url'");
        if ($debug > '0') echo mysql_error();
        $ind=mysql_fetch_row($res);
        if ($ind[0] == '') {
            // immediately info for all other threads: now indexed by this thread
            $qry = "update ".$mysql_table_prefix."sites set indexdate=now() where url='$url'";
            mysqltest();
            mysql_query ($qry);
            if ($debug > '0') echo mysql_error();
            $depth = $row[2];
            $include = $row[3];
            $not_include = $row[4];
            $can_leave_domain = $row[5];
            $use_prefcharset = $row[6];
            if ($can_leave_domain=='') {
                $can_leave_domain=0;
            }
            if ($depth == -1) {     // -1 means: follow all links, regardless of their level
                $soption = 'full';
            } else {
                $soption = 'level';
            }
            // now index this new site
            // index_site() expects 12 arguments; $cl and the $all dummy must be passed in
            // front of the charset flag, otherwise the flag ends up in the $cl parameter
            index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, 1, $use_prefcharset);
        }
    }
    if ($clear == 1) clean_resource($result, '18');
    $ended = time();
    $consumed = $ended - $started;
    printConsumedReport('consumed', $cl, '0', $consumed);
    printStandardReport('NewFinish',$command_line, '0');
    create_footer();
}
// Re-index every site whose indexdate contains '0000' (i.e. a zero date).
//
// Walks all rows of the sites table, re-reads each site's indexdate right before
// working on it (another thread may have claimed it in the meantime), stamps the
// site with now() and hands it to index_site().
// Takes no arguments and returns nothing; sets the global $started to the current
// time and reports the consumed time when finished.
function index_erased() {
    global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $started, $cl;

    $started = time();
    printStandardReport('ErasedStart',$command_line, $no_log);
    mysqltest();
    $result = mysql_query("select url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
    if ($debug > '0') echo mysql_error();

    while ($row = mysql_fetch_row($result)) {
        $url = $row[0];
        // Escaped copy used in SQL only, so a quote inside a URL cannot break the statement;
        // index_site() still receives the unmodified URL.
        $url_sql = mysql_real_escape_string($url);

        // get actual status of indexdate, eventually other threads meanwhile indexed this URL
        $res = mysql_query("select indexdate from ".$mysql_table_prefix."sites where url='$url_sql'");
        if ($debug > '0') echo mysql_error();
        $ind = mysql_fetch_row($res);

        if (strstr($ind[0], '0000')) {
            // immediately info for all other threads: now indexed by this thread
            $qry = "update ".$mysql_table_prefix."sites set indexdate=now() where url='$url_sql'";
            mysqltest();
            mysql_query ($qry);
            if ($debug > '0') echo mysql_error();

            $depth            = $row[2];
            $include          = $row[3];
            $not_include      = $row[4];
            $can_leave_domain = $row[5];
            $use_prefcharset  = $row[6];
            if ($can_leave_domain=='') {
                $can_leave_domain=0;
            }
            // a spider depth of -1 selects the 'full' option, anything else 'level'
            if ($depth == -1) {
                $soption = 'full';
            } else {
                $soption = 'level';
            }
            // now index this erased site
            index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, 1, $use_prefcharset);
        }
    }
    if ($clear == 1) clean_resource($result, '19');

    $ended = time();
    $consumed = $ended - $started;
    printConsumedReport('consumed', $cl, '0', $consumed);
    //printStandardReport('ErasedFinish',$command_line, '0');
    printStandardReport('ReindexFinish',$command_line, '0');
    create_footer();
}
// Continue indexing every site that still has an entry in the pending table.
//
// For each site (ordered by URL) the pending table is checked; when the site is
// pending, its indexing options are loaded and index_site() is called with
// $reindex = 0.
// Takes no arguments and returns nothing; sets the global $started to the current
// time and reports the consumed time when finished.
function index_suspended() {
    global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $started, $cl;

    $started = time();
    $reindex = 0;
    // The 'all' argument of index_site() was never defined in this function, so null was
    // passed implicitly; it is now an explicit null so the same value goes through
    // without an undefined-variable notice.
    $all = null;
    printStandardReport('SuspendedStart',$command_line, $no_log);

    // get ID and URL of all sites
    $result1 = mysql_query("SELECT site_id, url from ".$mysql_table_prefix."sites ORDER by url");
    if ($debug > '0') echo mysql_error();

    while ($row1 = mysql_fetch_row($result1)) {
        $url     = $row1[1];
        $site_id = $row1[0];

        // check whether this site is pending
        $result2 = mysql_query("SELECT site_id from ".$mysql_table_prefix."pending where site_id =".intval($site_id));
        if ($debug > '0') echo mysql_error();
        $row2 = mysql_fetch_array($result2);

        // if pending, continue indexing this URL
        // ($row2 is false when the site has no pending entry)
        if ($row2 && $row2['site_id'] == $site_id) {
            // fetch all important data of this site; the URL is escaped for SQL only
            $url_sql = mysql_real_escape_string($url);
            $result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites where url='$url_sql'");
            if ($debug > '0') echo mysql_error();
            if ($row = mysql_fetch_row($result)) {
                $maxlevel        = $row[1];
                $in              = $row[2];
                $out             = $row[3];
                $domaincb        = $row[4];
                $use_prefcharset = $row[5];
                if ($domaincb=='') {
                    $domaincb=0;
                }
                // a spider depth of -1 selects the 'full' option, anything else 'level'
                if ($maxlevel == -1) {
                    $soption = 'full';
                } else {
                    $soption = 'level';
                }
            }
            if ($clear == 1) clean_resource($result, '21') ;
            if (!isset($in)) {
                $in = "";
            }
            if (!isset($out)) {
                $out = "";
            }
            // now index the rest of this site
            index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
        }
    }
    // Clean the outer result set; the inner one has already been cleaned inside the loop
    // (or was never created when no site is pending).
    if ($clear == 1) clean_resource($result1, '20');

    $ended = time();
    $consumed = $ended - $started;
    printConsumedReport('consumed', $cl, '0', $consumed);
    printStandardReport('SuspendedFinish',$command_line, '0');
    create_footer();
}
// Append the closing "finished" paragraph to the current log via LogUpdate().
// Uses the global $log_handle as the target; returns nothing.
function create_footer() {
    global $plus_nr, $log_handle, $log_file;

    // the literal spans several lines on purpose: its line breaks are part of the output
    LogUpdate($log_handle, "<p class='bd'>
<span class='em'>
<br /><br />Indexing / Re-indexing finished.<br /><br />
</span></p>
");
}
// Create a new log file inside $log_dir and return its open handle.
//
// $id  is appended to the file name for every log format other than 'text'.
// Returns the writable file handle. Terminates the script with die() when the
// log folder cannot be created or the file cannot be opened.
function create_logFile($id) {
    global $log_format, $log_dir, $dba_act;

    // prepare current log file
    $stamp = Date("ymd-H.i.s");
    if ($log_format == 'text') {
        $log_file = $log_dir."/db".$dba_act."_".$stamp.".txt";
    } else {
        $log_file = $log_dir."/db".$dba_act."_".$stamp."_".$id.".html";
    }

    // Make sure the folder exists before opening the file, instead of using a failed
    // fopen() (and its warning) as the probe for a missing folder.
    if (!is_dir($log_dir)) {
        // the second is_dir() tolerates another process creating the folder concurrently
        if (!@mkdir($log_dir, 0777) && !is_dir($log_dir)) {
            die ("Logging option is set, but cannot create folder for logging files.");
        }
    }

    $log_handle = @fopen($log_file, 'w');   // create a new log file
    if (!$log_handle) {
        die ("Logging option is set, folder was created, but cannot open a file for logging.");
    }
    return $log_handle;
}
// Write $log_msg to the open log file $log_handle.
// Terminates the script with die() when the handle is empty or the write fails;
// returns nothing otherwise.
function LogUpdate($log_handle, $log_msg) {
    if (!$log_handle) {
        die ("Cannot open file for realtime logging. ");
    }

    $written = fwrite($log_handle, $log_msg);
    if ($written === FALSE) {
        die ("Cannot write to file for realtime logging. ");
    }
}
// Delete every entry found directly inside the text cache folder ($textcache_dir).
// Entries that cannot be unlinked are skipped silently. Returns nothing.
function clear_TextCache() {
    global $textcache_dir;

    $handle = opendir($textcache_dir);
    if (!$handle) {
        return;
    }
    while (false !== ($file = readdir($handle))) {
        if ($file != "." && $file != "..") {
            @unlink("".$textcache_dir."/".$file."");
        }
    }
    // release the directory handle (it was previously left open)
    closedir($handle);
}
// Delete every entry found directly inside the media cache folder ($mediacache_dir).
// Entries that cannot be unlinked are skipped silently. Returns nothing.
function clear_MediaCache() {
    global $mediacache_dir;

    $handle = opendir($mediacache_dir);
    if (!$handle) {
        return;
    }
    while (false !== ($file = readdir($handle))) {
        if ($file != "." && $file != "..") {
            @unlink("".$mediacache_dir."/".$file."");
        }
    }
    // release the directory handle (it was previously left open)
    closedir($handle);
}
// Decode a gzip (RFC 1952) member and return the uncompressed data.
//
// $data   the complete gzip stream
// $c, $t  not used by this function; kept for the existing callers
//
// Returns the inflated string, the string "error_gz0" when $data does not start
// with the gzip magic bytes, or false when the header is malformed or the
// payload cannot be inflated.
function gz_decode($data, $c, $t) {
    $data = (string) $data;

    // check, for really gzip coded data
    if (substr($data, 0, 2) !== "\x1f\x8b") {
        return "error_gz0";
    }

    $len    = strlen($data);
    $flags  = ($len > 3) ? ord($data[3]) : 0;
    $offset = 10;                       // fixed part of the gzip header

    // The header may carry optional fields, announced by the flag byte. They have to be
    // skipped, otherwise gzinflate() is fed header bytes and fails.
    if ($flags & 0x04) {                // FEXTRA: 2-byte little-endian length + payload
        if ($offset + 2 > $len) {
            return false;
        }
        $xlen = unpack('v', substr($data, $offset, 2));
        $offset += 2 + $xlen[1];
    }
    if ($flags & 0x08) {                // FNAME: zero-terminated original file name
        $end = ($offset < $len) ? strpos($data, "\x00", $offset) : false;
        if ($end === false) {
            return false;
        }
        $offset = $end + 1;
    }
    if ($flags & 0x10) {                // FCOMMENT: zero-terminated comment
        $end = ($offset < $len) ? strpos($data, "\x00", $offset) : false;
        if ($end === false) {
            return false;
        }
        $offset = $end + 1;
    }
    if ($flags & 0x02) {                // FHCRC: 2-byte header checksum
        $offset += 2;
    }

    // the last 8 bytes are the CRC32 and the uncompressed size, not deflate data
    return gzinflate((string) substr($data, $offset, -8));
}
// Overwrite the indexdate of every row in the sites table.
// Returns nothing; the MySQL error text is echoed when $debug is greater than 0.
function pre_all() {
    global $mysql_table_prefix, $debug;

    // NOTE(review): the value is the quoted string 'NULL', not SQL NULL. Presumably MySQL
    // stores it as a zero date, which index_erased() then matches via '0000' -- confirm
    // before changing it.
    mysql_query ("update ".$mysql_table_prefix."sites set indexdate='NULL'");

    if ($debug > '0') {
        echo mysql_error();
    }
}
// Collect the text passed to JavaScript document.write('...') / document.write("...") calls.
//
// $contents  page source to scan
//
// Returns the arguments of all matching calls joined with "\r\n", with HTML comments,
// <style> blocks, <link rel ...> tags and <script> blocks replaced by a blank.
// Returns null when $contents holds no matching document.write() call.
function extract_js($contents) {
    global $clear;

    // Defined up front: without a match nothing else assigns it, and the final return
    // would otherwise read an undefined variable.
    $content = null;
    $regs = array();

    if(preg_match_all("/document\.write\((\"|')(.*?)(\"|')\);/si", $contents, $regs)) {
        $content = implode("\r\n", $regs[2]);
        // remove unused parts of the content
        $content = preg_replace("@<!--.*?-->@si", " ",$content);
        $content = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $content);
        $content = preg_replace("/<link rel[^<>]*>/i", " ", $content);
        $content = str_replace ("encoding: ''", " ", $content); // yes, I've seen such nonsense !
        $content = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$content);
    }
    /*
    // if only links and their titles should be found in JavaScript
    // comment the above if preg_match_all loop completely and use this one here
    if(preg_match_all("/<a\s*href(.*?)<\/a>/si", $contents, $regs)) {
    $content = '';
    $content = implode("\r\n", $regs[0]);
    }
    */
    if ($clear == 1) {
        // drop the large temporaries early
        $regs = array ();
        unset ($contents);
    }
    return $content;
}
// Convert $file (a string holding the document) from $charSet to UTF-8.
//
// $file           text to convert
// $charSet        name of the source charset
// $char_Set       overwritten inside the function before it is read
// $converter_dir  folder that holds the "charsets/<name>.txt" conversion tables
//
// iconv() is tried first. Only when it yields nothing (blank result) the ConvertCharset
// class is used, falling back to the home charset if no table exists for $charSet.
// Returns the converted text, or $file unchanged when no conversion was done.
function convertToUTF8($file, $charSet, $char_Set, $converter_dir) {
    // $cl is handed to the report functions below; it was missing from this list, so
    // they always received an undefined variable.
    global $home_charset, $cl;

    $conv_file = $file; // pure code
    $iconv_file = @iconv($charSet,"UTF-8//IGNORE",$conv_file); // if installed, first try to use PHP function iconv()
    // IGNORE => ignore unknown characters
    // TRANSLIT=> replace unknown characters with something similar
    // Attention: TRANSLIT breaks converting, if no 'close to' character will be found
    //echo "\r\n\r\n<br /> iconv_file: $iconv_file<br />";
    if(trim($iconv_file) == ""){ // iconv is not installed or input charSet not available. We need to use class ConvertCharset
        $char_Set = str_ireplace ('iso-','',$charSet);
        //$charSet = str_ireplace ('iso','',$charSet);
        $converter = "".$converter_dir."/charsets/".$char_Set.".txt" ;
        if(!is_file($converter) ) { // if this charset table is not available
            $char_Set = str_ireplace ('iso-','',$home_charset); // try alternatively the home charset
            printConverterError($charSet, $cl);
            printTryHome($home_charset, $cl);
        }
        // $converter still names the table of the original charset here, so this reads:
        // "a table exists for $charSet, or the home charset fallback is not UTF-8"
        if (is_file($converter) || $home_charset != 'UTF-8') { // UTF-8 -> UTF-8 would not work
            $NewEncoding = new ConvertCharset($char_Set, "utf-8");
            $NewFileOutput = $NewEncoding->Convert($conv_file);
            //$NewEncoding = new ConvertCharset;
            //$NewFileOutput = $NewEncoding->Convert($conv_file, $chrSet, "utf-8",false);
            $file = $NewFileOutput;
        }
    }else{
        $file = $iconv_file;
    }
    unset ($conv_file, $iconv_file, $NewEncoding, $NewFileOutput);
    return $file;
}
// Check whether $str is structurally valid UTF-8.
//
// Every byte with the high bit set has to be a lead byte of a 2-, 3- or 4-byte sequence
// followed by the matching number of continuation bytes (0x80-0xBF). Overlong forms and
// surrogate code points are not examined.
// Returns true when all sequences are complete, false otherwise.
function check_utf8($str) {
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        // 0x80 itself is not ASCII: it is a continuation byte and invalid as a lead byte
        if ($c > 127) {
            if ($c > 247) {
                return false;
            } elseif ($c > 239) {
                $bytes = 4;
            } elseif ($c > 223) {
                $bytes = 3;
            } elseif ($c > 191) {
                $bytes = 2;
            } else {
                return false;           // stray continuation byte
            }
            // the whole sequence has to fit into the string
            if (($i + $bytes) > $len) {
                return false;
            }
            while ($bytes > 1) {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) {
                    return false;
                }
                $bytes--;
            }
        }
    }
    return true;
}
// The Unicode byte order mark is U+FEFF; these are its byte sequences in each encoding.
define ('UTF32_BIG_ENDIAN_BOM'   , "\x00\x00\xFE\xFF");
define ('UTF32_LITTLE_ENDIAN_BOM', "\xFF\xFE\x00\x00");
define ('UTF16_BIG_ENDIAN_BOM'   , "\xFE\xFF");
define ('UTF16_LITTLE_ENDIAN_BOM', "\xFF\xFE");
define ('UTF8_BOM'               , "\xEF\xBB\xBF");
// Detect the UTF encoding of a file from its byte order mark.
//
// $filename  path of the file to inspect
//
// Returns 'UTF-8', 'UTF-32BE', 'UTF-32LE', 'UTF-16BE' or 'UTF-16LE', or null when the
// file does not start with one of these BOMs.
function detect_utf_encoding($filename) {
    // the longest BOM has 4 bytes, so there is no need to load the whole file
    $text = file_get_contents($filename, false, null, 0, 4);

    $first2 = substr($text, 0, 2);
    $first3 = substr($text, 0, 3);
    $first4 = substr($text, 0, 4);   // 4 bytes: with only 3 the UTF-32 BOMs can never match

    // UTF-32LE has to be tested before UTF-16LE, because its BOM starts with the UTF-16LE one
    if ($first3 == UTF8_BOM) return 'UTF-8';
    elseif ($first4 == UTF32_BIG_ENDIAN_BOM) return 'UTF-32BE';
    elseif ($first4 == UTF32_LITTLE_ENDIAN_BOM) return 'UTF-32LE';
    elseif ($first2 == UTF16_BIG_ENDIAN_BOM) return 'UTF-16BE';
    elseif ($first2 == UTF16_LITTLE_ENDIAN_BOM) return 'UTF-16LE';

    return null;
}
// Convert a UTF-16 string that starts with a byte order mark into UTF-8.
//
// $str  input bytes; the first two bytes decide the byte order (FE FF = big endian,
//       FF FE = little endian)
//
// Returns the UTF-8 text without the BOM. Input without a UTF-16 BOM (including input
// shorter than two bytes) is returned unchanged. A valid surrogate pair becomes one
// 4-byte sequence; a trailing odd byte is dropped. U+0000 is written as the two bytes
// C0 80, as before.
function utf16_to_utf8($str) {
    if (strlen($str) < 2) {
        return $str;                    // too short to carry a BOM
    }
    $c0 = ord($str[0]);
    $c1 = ord($str[1]);
    if ($c0 == 0xFE && $c1 == 0xFF) {
        $be = true;
    } else if ($c0 == 0xFF && $c1 == 0xFE) {
        $be = false;
    } else {
        return $str;
    }

    $str = (string) substr($str, 2);
    $len = strlen($str);
    $dec = '';
    // "$i + 1 < $len" keeps the loop on complete 16-bit units only
    for ($i = 0; $i + 1 < $len; $i += 2) {
        $c = ($be) ? ord($str[$i]) << 8 | ord($str[$i + 1]) :
                     ord($str[$i + 1]) << 8 | ord($str[$i]);

        // high surrogate followed by a low surrogate: one code point above U+FFFF
        if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len) {
            $low = ($be) ? ord($str[$i + 2]) << 8 | ord($str[$i + 3]) :
                           ord($str[$i + 3]) << 8 | ord($str[$i + 2]);
            if ($low >= 0xDC00 && $low <= 0xDFFF) {
                $cp = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);
                $dec .= chr(0xF0 | (($cp >> 18) & 0x07));
                $dec .= chr(0x80 | (($cp >> 12) & 0x3F));
                $dec .= chr(0x80 | (($cp >> 6) & 0x3F));
                $dec .= chr(0x80 | (($cp >> 0) & 0x3F));
                $i += 2;                // the low surrogate has been consumed as well
                continue;
            }
        }

        if ($c >= 0x0001 && $c <= 0x007F) {
            $dec .= chr($c);
        } else if ($c > 0x07FF) {
            $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
            $dec .= chr(0x80 | (($c >> 6) & 0x3F));
            $dec .= chr(0x80 | (($c >> 0) & 0x3F));
        } else {
            $dec .= chr(0xC0 | (($c >> 6) & 0x1F));
            $dec .= chr(0x80 | (($c >> 0) & 0x3F));
        }
    }
    return $dec;
}
// Check whether $buf is well-formed XML.
//
// $buf  XML source as a string
//
// Returns true when libxml reports no fatal error (warnings and recoverable errors are
// tolerated). Otherwise returns an HTML message made of the first fatal error's text,
// its line number and the offending source line (HTML-escaped).
// libxml's internal error handling is switched on and left on.
function XML_IsWellFormed($buf) {
    libxml_use_internal_errors(true);
    libxml_clear_errors();              // takes no arguments

    $doc = new DOMDocument('1.0', 'utf-8');
    $doc->loadXML($buf);

    // look for the first fatal error; a harmless warning may precede it in the list
    $fatal = null;
    foreach (libxml_get_errors() as $error) {
        if ($error->level >= LIBXML_ERR_FATAL) {
            $fatal = $error;
            break;
        }
    }
    if ($fatal === null) {
        return true;
    }

    // libxml counts lines by "\n"; strip a "\r" left over from CRLF line ends
    $lines = explode("\n", $buf);
    $index = $fatal->line - 1;
    $line  = isset($lines[$index]) ? rtrim($lines[$index], "\r") : '';

    $message = $fatal->message . ' at line ' . $fatal->line . ':<br /><br /> ' . htmlentities($line);
    return $message;
}
?>