Location: PHPKode > projects > Sphider Plus > sphider-plus_v.2.9/admin/spiderfuncs.php
<?php

    /**
     * Download a document through a raw socket and return its body, plus
     * (optionally) the response header and the detected character set.
     *
     * The request is sent as HTTP/1.0 with Basic authorization. When the server
     * answers 401, the same request is repeated on a fresh connection with the
     * second and then the third configured login ($user2/$pwd2, $user3/$pwd3).
     * If the last login is rejected as well, the 401 response is read like any
     * other response.
     *
     * @param string $url          absolute http:// or https:// URL
     * @param mixed  $get_charset  1 = also detect the charset (only done when the
     *                             global $url_status['content'] is 'text', 'xml' or 'xhtml')
     * @return array  'state'   => "ok", "timeout", "NOHOST" or "Cannot send request"
     *                'file'    => response body (only for state "ok")
     *                'header'  => header lines following the status line (only when the charset is detected)
     *                'charset' => upper-cased charset, or $home_charset if none was found
     *                             (only when the charset is detected)
     */
    function getFileContents($url, $get_charset) {
        global $user_agent, $url_status, $home_charset;
        global $user1, $pwd1, $user2, $pwd2, $user3, $pwd3, $include_dir, $idna;

        $contents = array();

        $urlparts = parse_addr($url);
        $host     = isset($urlparts['host'])   ? $urlparts['host']   : '';
        $scheme   = isset($urlparts['scheme']) ? $urlparts['scheme'] : '';
        $path     = isset($urlparts['path'])   ? $urlparts['path']   : '';
        if ($path == '') {
            $path = '/';    //  an empty request target is not valid HTTP
        }

        if ($idna) {
            require_once "$include_dir/idna_converter.php";
            //  convert an internationalized host name into its ASCII (punycode) form
            $IDN  = new idna_convert(array('idn_version' => 2008));
            $host = $IDN->encode($host);
        }

        if (isset($urlparts['query']) && $urlparts['query'] != "") {
            $path .= "?".$urlparts['query'];
        }

        if (isset($urlparts['port'])) {
            $port = (int) $urlparts['port'];
        } elseif ($scheme == "https") {
            $port = 443;
        } else {
            $port = 80;     //  http, and fallback for any scheme we do not know
        }
        $portq = ($port == 80) ? "" : ":$port";

        if (substr($url, 0, 5) == "https") {
            $target = "ssl://".$host;
        } else {
            $target = $host;
        }

        $all             = "Accept-Encoding: 0";
        $fsocket_timeout = 60;

        //  logins are tried in this order; the next one is only used after a 401
        $credentials = array(
            array($user1, $pwd1),
            array($user2, $pwd2),
            array($user3, $pwd3)
        );
        $last = count($credentials) - 1;
        $fp   = false;

        foreach ($credentials as $attempt => $login) {
            $errno  = 0;
            $errstr = "";
            $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
            if ($attempt > 0) {
                print $errstr;  //  the retries always echoed the socket error text; kept as is
            }
            if (!$fp) {
                $contents['state'] = "NOHOST";
                return $contents;
            }
            stream_set_timeout($fp, $fsocket_timeout);

            //  HTTP/1.0 on purpose: the read loop below relies on the server closing
            //  the connection and does not decode chunked transfer encoding
            $auth    = sprintf("Authorization: Basic %s", base64_encode($login[0] . ":" . $login[1]));
            $request = "GET $path HTTP/1.0\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";

            if (!fputs($fp, $request)) {
                fclose($fp);
                $contents['state'] = "Cannot send request";
                return $contents;
            }

            //  the status line is consumed here, so $data below starts with the second header row
            $answer = (string) fgets($fp, 4096);

            if ($attempt == $last || !preg_match("{HTTP/[0-9.]+ 401}i", $answer)) {
                break;          //  accepted, or no further login left to try
            }
            fclose($fp);        //  401: retry with the next login on a new connection
        }

        $data   = '';
        $status = stream_get_meta_data($fp);

        //  read at most ~16000 kB; the stream state is refreshed after every read,
        //  otherwise a timeout would never be noticed and the loop could spin
        while (!feof($fp) && !$status['timed_out'] && (strlen($data) / 1024) < 16000) {
            $line   = fgets($fp, 8192);
            $status = stream_get_meta_data($fp);
            if ($line === false) {
                break;
            }
            $data .= $line;
        }
        fclose($fp);

        if ($status['timed_out']) {
            $contents['state'] = "timeout";
            return $contents;
        }

        //  locate the blank line that separates header and body
        $bodyStart = strpos($data, "\r\n\r\n");
        if ($bodyStart !== false) {
            $bodyStart += 4;
        } else {
            $bodyStart = strpos($data, "\n\n");     //  tolerate servers sending bare LF
            $bodyStart = ($bodyStart === false) ? strlen($data) : $bodyStart + 2;
        }

        $contents['state'] = "ok";
        $contents['file']  = (string) substr($data, $bodyStart);

        //  if the charset is already known, or the document is pdf, doc, rtf, xls, rss etc., skip the search
        $searchable = isset($url_status['content'])
            && ($url_status['content'] == 'text' || $url_status['content'] == 'xml' || $url_status['content'] == 'xhtml');

        if ($get_charset == 1 && $searchable) {
            $contents['header'] = (string) substr($data, 0, $bodyStart);

            //  look in the HTTP header first, then in the document itself;
            //  in each of them an XML 'encoding' declaration wins over 'charset'
            $chrSet = '';
            foreach (array($contents['header'], $contents['file']) as $haystack) {
                $regs = array();
                if (preg_match("@encoding=[\"']([^\"']*)[\"']@i", $haystack, $regs)
                    || preg_match("@charset\s*=\s*[\"']?([a-z0-9_:.\-]+)@i", $haystack, $regs)) {
                    $chrSet = strtoupper(trim($regs[1]));
                }
                if ($chrSet != '') {
                    break;
                }
            }

            //  nothing found: fall back to the configured default charset
            $contents['charset'] = ($chrSet != '') ? $chrSet : $home_charset;
        }

        return $contents;
    }

    //      try to connect without and with 'Basic Authorization'
    /**
     * Send one HTTP/1.1 GET request over an already opened socket and return
     * the first line of the answer (the HTTP status line).
     *
     * @param resource $fp     open stream, as returned by fsockopen()
     * @param string   $user   login name for Basic authorization
     * @param string   $pwd    password for Basic authorization
     * @param string   $path   request target (path plus optional query string)
     * @param string   $host   value for the Host header
     * @param string   $portq  port suffix for the Host header ("" or ":<port>")
     * @param string   $call   "1" = send the request WITHOUT the Authorization header;
     *                         any other value (default) sends it with $user/$pwd
     * @return string|false    first line of the response, or false if nothing could be read
     */
    function auth_connect($fp, $user, $pwd, $path, $host, $portq, $call = '') {
        global $user_agent;

        $all = "Accept-Encoding: 0";
        stream_set_timeout($fp, 60);

        $request = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n";

        //  this must be a comparison: an assignment here is always true and
        //  would drop the Authorization header from every request
        if ($call != "1") {
            $request .= sprintf("Authorization: Basic %s", base64_encode($user . ":" . $pwd)) . "\r\n";
        }
        $request .= "\r\n";     //  blank line terminates the header

        fputs($fp, $request);
        return (fgets($fp, 4096));
    }

    //      check if URL is accessible and try to connect
    function url_status($url) {
        global $user_agent, $index_pdf, $index_doc, $index_rtf, $index_xls, $index_ppt, $index_ods, $index_odt, $realnum, $index_rss;
        global $plus_nr, $user1, $pwd1, $user2, $pwd2, $user3, $pwd3, $clear, $index_rar, $index_zip, $index_csv, $browser_string;
        global $include_dir, $idna, $ext, $strip_sessids, $debug;

        $url0       = $url;
        $state      = array();
        $status     = array();
        if ($idna) {
            $urlparts   = parse_all_url($url);  //  currently only working for port 80
        } else {
            $urlparts   = parse_url($url);
        }
//echo "\r\n\r\n<br>urlparts Array:<br><pre>";print_r($urlparts);echo "</pre>\r\n";
        $path       = $urlparts['path'];
        $host       = $urlparts['host'];

        if ($idna) {
            require_once "$include_dir/idna_converter.php";
            // Initialize the converter class
            $IDN = new idna_convert(array('idn_version' => 2008));
            // The input string, if input is not UTF-8 or UCS-4, it must be converted before
            //$input = utf8_encode($url);
            // Encode it to its readable presentation
            $host = $IDN->encode($host);
        }

        if (isset($urlparts['query'])) {
            $path .= "?".$urlparts['query'];
        }

        if (!isset($urlparts['path'])  && !isset($urlparts['query'])) {
            $path = "/";
        }

        //  prepare alll for socket open
        if (isset ($urlparts['port'])) {
            $port = (int) $urlparts['port'];
        } else
        if ($urlparts['scheme'] == "http") {
            $port = 80;
        } else
        if ($urlparts['scheme'] == "https") {
            $port = 443;
        }

        if ($port == 80) {
            $portq = "";
        } else {
            $portq = ":$port";
        }

        if (substr($url, 0, 5) == "https") {
            $target = "ssl://".$host;
        } else {
            $target = $host;
        }

        $all                = "*/*";
        $auth               = sprintf("Authorization: Basic %s", base64_encode($user1 . ":" . $pwd1));
        $fsocket_timeout    = 60;
        $errno              = 0;
        $errstr             = "";

        //  request with first authorization
        $request1           = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";
        $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
        socket_set_timeout($fp, 60);
        fputs($fp, $request1);
        $answer = fgets($fp, 4096);    //  get the first row of the HTTP header
        fclose($fp);
//echo "\r\n\r\n<br /> answer 00: '$answer'<br />\r\n";

        if (!$answer) {
            $status['state'] = "NOHOST";
        }
        //  some server do noit accept $all = "*/*"
        if (strstr($answer, "400")) {
            $status['state'] = "HTTP/1.1 400 Bad Request";
        }

        if ($status['state'] != "NOHOST") {

            if (!preg_match("/301|302|303|307|400/i", $answer)) {

                require_once( 'http.php' );
                header('Content-Type: text/xml');

                $http_client = new http( HTTP_V11, false);

                $http_client    ->host          = $host;
                $http_client    ->user_agent    = $user_agent;
                $http_client    ->_auth_login   = $user1;
                $http_client    ->_auth_pwd     = $pwd1;
                $http_client    ->_debug        = '';

                //  now connect to the remote URL (host was already defined above)
                $answer = $http_client->get($path);
//echo "\r\n\r\n<br /> answer01 von http.php: '$answer'<br />\r\n";
                if ($answer == "200") {
                    $linkstate                      = "ok";
                    $status['state']                = $http_client->get_response_header( 'Status' ) ;
                    $status['Content-Encoding']     = $http_client->get_response_header( 'Content-Encoding' ) ;
                    $status['Transfer-Encoding']    = $http_client->get_response_header( 'Transfer-Encoding' ) ;
                    $status['Content-Type']         = "Content-Type: ".$http_client->get_response_header( 'Content-Type' ) ;
                    $content                        = $status['Content-Type'] ;

                    if(strstr ($status['Content-Type'], "text" )) {
                        $status['content']          = "text" ;
                    }

                    //  get charset
                    if (preg_match("@charset=([a-z0-9,\- ]+)@i", $status['Content-Type'], $charreg)) {
                        $status['charset'] = strtoupper(trim($charreg[1]));
                    }

                    $status['path']                 = $http_client->get_response_header( 'Location' ) ;
                    if ($status['path']) {
                        $path                       = $status['path'];
                        $status['relocate']         = "Relocated by HTTP to $path";
                    }

                    $status['date']                 = $http_client->get_response_header( 'date' ) ;
                    if (!$status['date']) {
                        $status['date']             = $http_client->get_response_header( 'Last-Modified' ) ;
                    }

                    $status['body']                 = $http_client->get_response_body() ;
                }
                unset($http_client);
            }
//echo "\r\n\r\n<br>status Array after http class:<br><pre>";print_r($status);echo "</pre>\r\n";

            if ($status['state'] == "200" ) {
                $status['state'] = "ok" ;
            } else {
                if (isset ($urlparts['port'])) {
                    $port = (int) $urlparts['port'];
                } else
                if ($urlparts['scheme'] == "http") {
                    $port = 80;
                } else
                if ($urlparts['scheme'] == "https") {
                    $port = 443;
                }

                if ($port == 80) {
                    $portq = "";
                } else {
                    $portq = ":$port";
                }

                if (substr($url, 0, 5) == "https") {
                    $target = "ssl://".$host;
                } else {
                    $target = $host;
                }

                //$accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
                //$lang   = "*/*";
                //$encode = "gzip, deflate";
                //  request with first authorization
                //$request1 = "GET $path HTTP/1.1\r\nHost: $host$portq\r\nUser-Agent: $user_agent\r\nAccept: $accept\r\n\Accept-Language: $lang\r\nAccept-Encoding: $encode\r\n$auth\r\n\r\n";

                $all                = "Accept-Encoding: 0";
                $auth               = sprintf("Authorization: Basic %s", base64_encode($user1 . ":" . $pwd1));
                $fsocket_timeout    = 60;
                $errno              = 0;
                $errstr             = "";
                //  request with first authorization
                $request1           = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";

                ini_set("user_agent", $user_agent);

                $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                socket_set_timeout($fp, 60);

                //  we wil try to something from all  the header rows
                fputs($fp, $request1);
                $answer = @fgets($fp, 4096);    //  get the first row of the HTTP header
                $answer0 = $answer;             //  remember this first answer
//echo "\r\n\r\n<br /> answer0 in alter Version: '$answer'<br />\r\n";

                if (strpos($answer, "503")) {   //  temporary unreachable
                    $retry      = '';
                    $license    = '';

                    if ($debug == "2") {
                        while ($answer) {
                            $answer = fgets($fp, 4096);
                            if (preg_match("/Retry-after: *([^\n\r ]+)/i", $answer, $regs)) {
                                $retry = $regs[0];
                            }
                            if (preg_match("/License status: *([^\n\r ]+)/i", $answer, $regs)) {
                                $license = $regs[0];
                                break;
                            }
                        }
                    }
                    //  prepare status message for HTTP 503
                    $status['state'] = "Unreachable: HTTP 503 Service temporary unavailable<br />$retry<br />$license";
                    $linkstate = "Unreachable";
                }

                if (strpos($answer, "500") && $browser_string) {  // try with standard browser http_user_agent (some servers do not like crawler)
                    fclose($fp);    // close existing connection
                    sleep(1);       //  might not be necessary to wait, but . . .

                    $browser_agent      = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
                    $browser_request    = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $browser_agent\r\n$auth\r\n\r\n";

                    $fsocket_timeout = 60;
                    $errno = 0;
                    $errstr = "";

                    //try to re-connect
                    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                    print $errstr;
                    $linkstate = "ok";
                    if (!$fp) {
                        $status['state'] = "NOHOST";
                    } else {
                        fputs($fp, $browser_request);
                        $answer = fgets($fp, 4096);
                        ini_set("user_agent", $browser_agent);  //      overwrite $user_agent with $browser_agent
                    }
                }

                //  some servers obligatory need a slash at the end of the path. We'll try it here
                //  some other server do not like the slash as last charachter of the path, lets follow also this quirk
                if ((strpos($answer, "301") || strpos($answer, "400") || strpos($answer, "404")) && !isset($urlparts['query'])) {  // try with slash at the end of host or path
                    fclose($fp);    // close existing connection
                    sleep(1);       //  might not be necessary to wait, but . . .

                    if ($path != "/" && !strstr($path, ".")) {

                        //  if last charachter of $path isn't already a slash, add a slash at the end of the path
                        if (strrpos($path, "/") != strlen($path)-1) {
                            $path .="/";
                        }
                    }

                    $browser_agent  = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
                    $request        = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";

                    $fsocket_timeout = 60;
                    $errno = 0;
                    $errstr = "";

                    //try to re-connect
                    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                    print $errstr;
                    $linkstate = "ok";
                    if (!$fp) {
                        $status['state'] = "NOHOST";
                    } else {
                        fputs($fp, $request);
                        $answer = fgets($fp, 4096);

                    }
                    $status['path1'] = $path;   //  remember the corrected path, if we will try to get the file contents
//echo "\r\n\r\n<br /> answer01: '$answer'<br />\r\n";
                    //  some other server do not like the slash as last charachter of the path, lets follow also this quirk
                    if (strpos($answer, "404")) {
                        fclose($fp);    // close existing connection
                        sleep(1);       //  might not be necessary to wait, but . . .

                        //  if last charachter of $path isn't already a slash, add a slash at the end of the path
                        if (strrpos($path, "/") == strlen($path)-1) {
                            $path = substr($path, 0, strlen($path)-1);

                        }

                        $browser_agent  = "Mozilla/5.0 (Windows NT 6.1; rv:5.0) Gecko/20100101 Firefox/5.0";
                        $request        = "GET $path HTTP/1.1\r\nHost: $host$portq\r\n$all\r\nUser-Agent: $user_agent\r\n$auth\r\n\r\n";

                        $fsocket_timeout = 60;
                        $errno = 0;
                        $errstr = "";

                        //try to re-connect
                        $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                        print $errstr;
                        $linkstate = "ok";
                        if (!$fp) {
                            $status['state'] = "NOHOST";
                        } else {
                            fputs($fp, $request);
                            $answer = fgets($fp, 4096);

                        }
                        $status['path1'] = $path;   //  remember the corrected path, if we will try to get the file contents
                    }
                }
//echo "\r\n\r\n<br /> answer02: '$answer'<br />\r\n";
                if (strpos($answer, "401")) {    //  try without authorization (some servers do not like the $auth  annex)
                    fclose($fp);
                    $errno = 0;
                    $errstr = "";
                    $call = '1';
                    $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                    print $errstr;
                    $linkstate = "ok";
                    if (!$fp) {
                        $status['state'] = "NOHOST";
                    } else {
                        $user   = $user1;
                        $pwd    = $pwd1;
                        $answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
                    }
//echo "\r\n\r\n<br /> answer1 in url_status: '$answer'<br />\r\n";
                    if (strpos($answer, "401")) {    //  try with second authorization
                        fclose($fp);
                        $errno = 0;
                        $errstr = "";
                        $call = '2';
                        $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                        print $errstr;
                        $linkstate = "ok";
                        if (!$fp) {
                            $status['state'] = "NOHOST";
                        } else {
                            $user   = $user2;
                            $pwd    = $pwd2;
                            $answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
                        }
                    }

                    if (strpos($answer, "401")) {    //  try with third authorization
                        fclose($fp);
                        $errno = 0;
                        $errstr = "";
                        $call = '3';
                        $fp = fsockopen($target, $port, $errno, $errstr, $fsocket_timeout);
                        print $errstr;
                        $linkstate = "ok";
                        if (!$fp) {
                            $status['state'] = "NOHOST";
                        } else {
                            $user   = $user3;
                            $pwd    = $pwd3;
                            $answer = auth_connect($fp, $user, $pwd, $path, $host, $portq, $call);
                        }
                    }
                }

                $regs = Array ();
                if (preg_match("{HTTP/[0-9.]+ (([0-9])[0-9]{2})}i", $answer, $regs)) {
                    $httpcode = $regs[2];
                    $full_httpcode = $regs[1];

                    if ($httpcode <> 2 && $httpcode <> 3) {
                        $status['state'] = "Unreachable: HTTP $full_httpcode";
                        $linkstate = "Unreachable";
                        $realnum -- ;
                    }
                }

                $answer1 = $answer;

                //      this is the entry for usual response
                if ($linkstate <> "Unreachable" ) {
                    $content = '';

                    while ($answer && strlen($answer) > 2) {
                        $answer = fgets($fp, 4096);

                        //  get any relocation/redirection
                        if (preg_match("/location: *([^\n\r ]+)/i", $answer, $regs)) {
                            $status['path'] = $regs[1];     //      URL redirected
                            $status['relocate'] = "Relocated by HTTP $full_httpcode to ";
                        }

                        //  get Last-Modified date
                        if (preg_match("/(Date|Last-Modified): *([a-z0-9,: ]+)/i", $answer, $regs)) {
                            $status['date'] = $regs[2];
                        }

                        //  get Content-Encoding like 'gzip'
                        if (preg_match("/Content-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
                            $status['Content-Encoding'] = strtolower(trim($regs[1]));
                        }

                        //  get Transfer-Encoding like 'chunked'
                        if (preg_match("/Transfer-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
                            $status['Transfer-Encoding'] = strtolower(trim($regs[1]));
                        }

                        //  get Content-Type and if available Charset
                        if (preg_match("/Content-Type:/i", $answer)) {
                            $content = $answer;
                            if (preg_match("@charset=([a-z0-9,\- ]+)@i", $answer, $regs)) {
                            $status['charset'] = strtoupper(trim($regs[1]));
                            }
                        }
                    }
//echo "\r\n\r\n<br /> content: '$content'<br />\r\n";
//echo "\r\n\r\n<br /> linkstate: '$linkstate'<br />\r\n";
//echo "\r\n\r\n<br /> answer02: '$answer1'<br />\r\n";
                    if (preg_match("/200/i", $answer1)) {
                        $status['state']    = 'ok';
                    }
                    //      relocated URL? So we need to overwrite the $status array and define the type of content
                    if ($linkstate <> "Unreachable" && preg_match("/301|302|303|307/i", $answer0) && preg_match("/200/i", $answer1)) {
                        while ($answer1 && strlen($answer1) > 2) {
                            $answer1 = fgets($fp, 4096);

                            //  get any relocation/redirection
                            if (preg_match("/Location: *([^\n\r ]+)/i", $answer1, $regs)) {
                                $status['path'] = $regs[1];     //      URL redirected
                                $status['relocate'] = "Relocated by HTTP $full_httpcode to ";
                                $status['state'] = '';
                            }

                            //  get Last-Modified date
                            if (preg_match("/(Date|Last-Modified): *([a-z0-9,: ]+)/i", $answer1, $regs)) {
                                $status['date'] = $regs[2];
                            }

                            //  get Content-Encoding like 'gzip'
                            if (preg_match("/Content-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
                                $status['encoding'] = $regs[1];
                            }

                            //  get Transfer-Encoding like 'chunked'
                            if (preg_match("/Transfer-Encoding: *([a-z0-9,: ]+)/i", $answer, $regs)) {
                                $status['Transfer-Encoding'] = $regs[1];
                            }

                            //  get Content-Type and if available Charset
                            if (preg_match("/Content-Type:/i", $answer1)) {
                                $status['content'] = $answer1;
                                $content = $answer1;
                                if (preg_match("@charset=([a-z0-9,\- ]+)@i", $answer1, $regs)) {
                                    $status['charset'] = strtoupper(trim($regs[1]));
                                }
                            }

                            if ($content && $status['path']) {  //  these 2 conditions would be enough to index thelocated  reURL
                                $status['state']    = "ok";
                            }
                        }

                        //  if the relocated URL or the Content-Type could not be detected, we need to GET the complete header info from the remote server
                        if ($status['state'] != "ok") {
                            $header = array();
                            $header = get_headers($url);

                            foreach ($header as $value) {
                                if (preg_match("/location: *([^\n\r ]+)/i", $value, $regs)) {
                                    $status['path'] = $regs[1];     //      URL redirected
                                    $status['relocate'] = "Relocated by HTTP $full_httpcode to ";

                                }
                            }
                        }

                        //  if the relocated path is relative, add the calling URL
                        if (!stristr($status['path'], "ttp")) {
                        $url = substr($url, 0, strrpos($url, "/")+1);
                            $status['path'] = $url.$status['path'];
                        }

                        //  analyze the header
                        if ($header) {
                            //  check for multiple redirection
                            $i = '0';
                            foreach ($header as $value) {
                                if (preg_match("/HTTP\/(.*?)301|HTTP\/(.*?)302|HTTP\/(.*?)303|HTTP\/(.*?)307/i", $value)) {
                                    $i++;
                                }
                            }

                            if ($i > "1") {
                                //      Example for requested cookie:     http://www.fogelplast.ru/
                                $status['state'] = "Multiple redirections, which is not supported by Sphider-plus version $plus_nr";
                            } else {
                                //  try to find the content type of the relocated URL
                                krsort ($header);
                                foreach ($header as $value) {
                                    if (preg_match("/Content-Type: *([^\n\r ]+)/i", $value, $regs)) {
                                        $status['content']  = $regs[1]; //     content type
                                        $content            = $value;   //     content type
                                        //  get charset
                                        if (preg_match("@charset=([a-z0-9,\- ]+)@i", $regs[1], $charreg)) {
                                            $status['charset'] = strtoupper(trim($charreg[1]));
                                        }
                                        break;
                                    }
                                }
                                //  check for valid file type in order to become indexed
                                foreach ($ext as $this_suffix) {
                                    if (stristr($status['content'], $this_suffix)) {
                                        $status['state'] = "Not text or html";
                                    }
                                }
                            }
                        }
                    }
                }
            }   //  end row by row analyzing the header

            // if Admin selected, remove session from relocated URL
            if ($status['state'] == "ok" && $strip_sessids == 1) {
                $status['path'] = remove_sessid($status['path']);
            }

            if ($status['state'] == "ok") {
                $socket_status = socket_get_status($fp);
                @fclose($fp);

                if (preg_match("{Content-Type: *([a-z/.-]*)}i", $content, $regs)) {

                    if ($regs[1] == 'text/html' || $regs[1] == 'text/' || $regs[1] == 'text/plain') {
                        $status['content'] = 'text';
                        $status['state'] = 'ok';

                    } else if ($regs[1] == 'application/pdf' && $index_pdf == 1) {
                        $status['content'] = 'pdf';
                        $status['state'] = 'ok';
                    } else if ($regs[1] == 'application/pdf' && $index_pdf == 0) {
                        $status['content'] = 'pdf';
                        $status['state'] = 'Indexing of PDF files is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 1) {
                        $status['content'] = 'doc';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/msword' || $regs[1] == 'application/vnd.ms-word') && $index_doc == 0) {
                        $status['content'] = 'doc';
                        $status['state'] = 'Indexing of DOC files is not activated in Admin Settings';

                    } else if (($regs[1] == 'text/rtf') && $index_rtf == 1) {
                        $status['content'] = 'rtf';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'text/rtf') && $index_rtf == 0) {
                        $status['content'] = 'rtf';
                        $status['state'] = 'Indexing of RTF files is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 1) {
                        $status['content'] = 'xls';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/excel' || $regs[1] == 'application/vnd.ms-excel') && $index_xls == 0) {
                        $status['content'] = 'xls';
                        $status['state'] = 'Indexing of XLS files is not activated in Admin Settings';

                    } else if (($regs[1] == 'text/csv') && $index_csv == 1) {
                        $status['content'] = 'csv';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'text/csv') && $index_csv == 0) {
                        $status['content'] = 'csv';
                        $status['state'] = 'Indexing of CSV files is not activated in Admin Settings';
/*      //  Currently unsupported, because a failure was detected while converting ppt files > 7 MByte
        //  see also    ../include/common/suffix.txt
        //  see also    .../admin/configset.php
                    } else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 1) {
                        $status['content'] = 'ppt';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/mspowerpoint' || $regs[1] == 'application/vnd.ms-powerpoint') && $index_ppt == 0) {
                        $status['content'] = 'ppt';
                        $status['state'] = 'Indexing of PPT files is not activated in Admin Settings';
*/
/*
                    } else if (($regs[1] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation') && $index_ppt == 1) {
                        $status['content'] = 'ppt';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/vnd.openxmlformats-officedocument.presentationml.presentation') && $index_ppt == 0) {
                        $status['content'] = 'ppt';
                        $status['state'] = 'Indexing of PPT files is not activated in Admin Settings';
*/
                    } else if (($regs[1] == 'application/xml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xml') && $index_rss == 1) {
                        $status['content'] = 'xml';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/xhtml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xhtml' || $regs[1] == 'application/xhtml') && $index_rss == 1) {
                        $status['content'] = 'xhtml';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/xml' || $regs[1] == 'application/rss' || $regs[1] == 'text/xml' || $regs[1] == 'text/xhtml' || $regs[1] == 'application/xhtml') && $index_rss == 0) {
                        $status['content'] = 'xml';
                        $status['state'] = '<br />Indexing of RDF, RSD, RSS and Atom feeds is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/zip' || $regs[1] == 'zip') && $index_zip == 1) {
                        $status['content'] = 'zip';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/zip' || $regs[1] == 'zip') && $index_zip == 0) {
                        $status['content'] = 'zip';
                        $status['state'] = '<br />Indexing of ZIP archives is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/rar' || $regs[1] == 'application/x-rar-compressed') && $index_rar == 1) {
                        $status['content'] = 'rar';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/rar' || $regs[1] == 'application/x-rar-compressed') && $index_rar == 0) {
                        $status['content'] = 'rar';
                        $status['state'] = '<br />Indexing of RAR archives is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/vnd.oasis.opendocument.spreadsheet') && $index_ods == 1) {
                        $status['content'] = 'ods';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/vnd.oasis.opendocument.spreadsheet') && $index_ods == 0) {
                        $status['content'] = 'ods';
                        $status['state'] = '<br />Indexing of OpenDocument<strong>Spreadsheet</strong> is not activated in Admin Settings';

                    } else if (($regs[1] == 'application/vnd.oasis.opendocument.text') && $index_odt == 1) {
                        $status['content'] = 'odt';
                        $status['state'] = 'ok';
                    } else if (($regs[1] == 'application/vnd.oasis.opendocument.text') && $index_odt == 0) {
                        $status['content'] = 'odt';
                        $status['state'] = '<br />Indexing of OpenDocument<strong>Text</strong> is not activated in Admin Settings';
                    } else if (stripos ($urlparts['path'], ".js") || $regs[1] == 'application/javascript') {

                        $status['content'] = 'js';
                        $status['state'] = 'ok';

                    } else {
                        $status['state'] = "<br />For Sphider-plus v.$plus_nr not executable Text or Media.<br /> $content&nbsp;&nbsp;&nbsp;=>&nbsp;&nbsp;&nbsp;UFO file<br />";
                        $realnum -- ;
                    }

                } else {
                    if ($socket_status['timed_out'] == 1) {
                        $status['state'] = "Timed out. URL: $url0 <br />No reply from server within $fsocket_timeout seconds.";
                        $realnum -- ;
                    } else {
                        $status['state'] = "Not text or html";
                    }
                }
            }
        }

        if ($clear == 1) {
            unset ($urlparts, $answer);
            $socket_status = array();
        }
//echo "\r\n\r\n<br>status Array final:<br><pre>";print_r($status);echo "</pre>\r\n";
        return $status;
    }

    /**
     * Read the robots file of the host serving $url and collect the paths that
     * are disallowed for this crawler.
     *
     * The robots file is requested as http://<host>/<$robots>. For the host
     * 'localhost' the directory part of $url's path is inserted before the file
     * name and is also prepended to every returned path.
     *
     * Only "Disallow:" lines that follow a "User-agent:" line naming '*' or the
     * configured $user_agent are taken into account.
     *
     * @param string $url     A URL on the host to be checked.
     * @param string $robots  Name of the robots file (appended to the host part).
     * @return array|null     Disallowed paths (empty if the file is not reachable,
     *                        is rejected as invalid, or forbids nothing); null as
     *                        soon as an applicable "Disallow:" with an empty value
     *                        is met.
     */
    function check_robot_txt($url, $robots) {
        global $user_agent, $clear, $cl;

        $urlparts = parse_addr($url);
        $is_local = ($urlparts['host'] == 'localhost');
        $loc_path = '';

        if ($is_local) {    //  for 'localhost' applications add the path until last slash
            $loc_path = substr($urlparts['path'], 0, strrpos($urlparts['path'], '/'));
            $url = 'http://'.$urlparts['host']."".$loc_path."/$robots";
        } else {            //  www application
            $url = 'http://'.$urlparts['host']."/$robots";
        }

        $url_status = url_status($url);
        $omit = array ();

        if ($url_status['state'] == "ok") {
            $file = @file_get_contents($url);

            //  explode() never yields an empty array, so the fallback has to be
            //  decided on the read result itself, not on the exploded lines
            if ($file === false) {
                $get_charset = '';
                $contents = getFileContents($url, $get_charset);    //  read the robots.txt file
                $file = isset($contents['file']) ? $contents['file'] : '';
            }
            $file  = (string) $file;
            $robot = explode("\n", $file);

            //  check for invalid content in robots.txt
            if (stristr($file, "Disallow:<!--") || stristr($file, "<script") ) {
                $domain = str_replace($robots, "", $url);
                printBadRobots($domain, $cl);

            } else {
                //  robots.txt seems okay, now parse it
                $regs       = array ();
                $this_agent = "";
                $check      = 0;    //  no applicable User-agent section seen yet

                foreach ($robot as $line) {
                    if (preg_match("/^user-agent: *([^#]+) */i", $line, $regs)) {
                        $this_agent = trim($regs[1]);
                        $check = ($this_agent == '*' || $this_agent == $user_agent) ? 1 : 0;
                    }

                    if (preg_match("/disallow: *([^#]+)/i", $line, $regs) && $check == 1) {
                        //  make readable the %BO%D1 coded URLs; also drop the "\r" left over from CRLF line ends
                        $disallow_str = urldecode(preg_replace("/[\r\n ]+/i", "", $regs[1]));

                        if (trim($disallow_str) != "") {
                            if ($is_local) {    //  for 'localhost' applications add the path until last slash
                                $omit[] = "".$loc_path."".$disallow_str."";
                            } else {            //  www application
                                $omit[] = $disallow_str;
                            }
                        } else {
                            //  empty Disallow for an applicable agent
                            if ($this_agent == '*' || $this_agent == $user_agent) {
                                if ($clear == 1) unset ($urlparts, $contents, $file, $robot, $regs);
                                return null;
                            }
                        }
                    }
                }
            }
        }

        if ($clear == 1) unset ($urlparts, $contents, $file, $robot, $regs);
        return $omit;       //     array that holds all forbidden links from robots.txt
    }

    /**
     * Remove the file part from a URL, so that a relative path can be appended
     * to the result.
     *
     * Everything behind the last slash of the path is cut off; a path without
     * any slash becomes empty. The result is rebuilt as
     * scheme://host[:port]path, where the port is omitted when it is 80 or
     * not present.
     *
     * The cut is done with plain string functions. The former approach put the
     * file name unescaped into a regular expression, which failed on names with
     * regex meta characters and required stripping every '+' from the path.
     *
     * @param string $url  URL to be shortened.
     * @return string      URL ending with the directory part of the path.
     */
    function remove_file_from_url($url) {
        $url_parts = parse_addr($url);
        $path = isset($url_parts['path']) ? (string) $url_parts['path'] : '';

        //  keep everything up to and including the last slash
        $last_slash = strrpos($path, '/');
        if ($last_slash === false) {
            $path = '';
        } else {
            $path = substr($path, 0, $last_slash + 1);
        }

        //  the port may be absent from the parsed parts
        $port = isset($url_parts['port']) ? $url_parts['port'] : '';
        if ($port == 80 || $port == "") {
            $portq = "";
        } else {
            $portq = ":".$port;
        }

        return $url_parts['scheme']."://".$url_parts['host'].$portq.$path;
    }

    /**
     * Extract links from an HTML document (or, if $index_rss is set, a feed).
     *
     * Collected are: the src of <script> tags found from "<body" onwards, all
     * href targets, and the src of frames. Every candidate is passed through
     * url_purify() and only non-empty results are kept.
     *
     * @param string $file              Document content.
     * @param string $url               URL of the document; replaced by $base if that is not empty.
     * @param mixed  $can_leave_domain  Passed on to url_purify(); also disables the localhost filter when truthy.
     * @param string $base              Base URL taken from the document, may be empty.
     * @param string $media_links       '0' = return only non-media links, '1' = return only media links (href links only).
     * @param string $use_nofollow      '1' = skip links carrying "nofollow", '0' = ignore that flag.
     * @param mixed  $local_redir       Passed on to url_purify().
     * @param mixed  $url_reloc         Not used inside this function.
     * @return mixed                    The list of links; run through remove_sessid() when $strip_sessids == 1.
     */
    function get_links($file, $url, $can_leave_domain, $base, $media_links, $use_nofollow, $local_redir, $url_reloc) {
        global $strip_sessids, $imagelist, $audiolist, $videolist, $command_line, $no_log, $index_media;
        global $mainurl, $include_dir, $idna, $local, $index_rss, $index_alt;
        //  These are read below but were never imported, so the media checks and
        //  the final cleanup could never become active.
        global $index_image, $index_audio, $index_video, $clear;

        // The base URL comes from either the meta tag or the current URL.
        if (!empty($base)) {
            $url = $base;
        }

        $links          = array ();
        $regs           = array ();
        $checked_urls   = array ();
        $care_excl      = '1';  //  care file suffix to be excluded
        $relocated      = '';   //  URL is not relocated

        //  only the part from "<body" onwards is searched for script sources;
        //  without a body tag the complete content is used
        $body_pos = stripos($file, "<body");
        $body     = ($body_pos === false) ? $file : substr($file, $body_pos);

        //  try to find links to JavaScript src=. . .
        if (preg_match_all("@<script(.*?)src(.*?)=(.*?)[\'\"](.*?)[\'\"]@si", $body, $regs)) {
            foreach ($regs[4] as $val) {
                if (($a = url_purify($val, $url, $can_leave_domain, 1, $relocated, $local_redir)) != '') {
                    $links[] = $a;    //  add this new link
                }
            }
        }

        $file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file); //  delete all scripts from the content

        if ($index_rss) {
            $file = preg_replace("@<link>|<url>@si", "<href=\"", $file);     //  convert all links to href=
            $file = preg_replace("@</link>|</url>@si", "\">", $file);
        }

        //  $val[1] = link target, $val[2] = rest of the tag (searched for 'nofollow')
        preg_match_all("/href\s*=\s*[\'\"](.*?)[\'\" ](.*?)>/si", $file, $regs, PREG_SET_ORDER);

        //  report all links that carry the nofollow flag (unless the flag is to be ignored)
        foreach ($regs as $val) {
            if ($use_nofollow != '0' && strstr($val[2], "nofollow")) {
                $report = "<br /><br />Found ".$val[1].", but <strong>nofollow</strong> flag is set.";
                printNofollowLink($report, $command_line, $no_log);
            }
        }

        foreach ($regs as $val) {
            if (!$val[1]) {     //  reject empty links, which would cause invalid url_purify()
                continue;
            }

            $ignore = ($use_nofollow == '1' && strstr($val[2], "nofollow"));
            if ($ignore || !empty($checked_urls[$val[1]])) {
                continue;       //  nofollow link, or already handled
            }

            //  create a link, which points back to the domain
            if ($val[1] == "/") {
                $main_url_parts = parse_all_url($mainurl);
                $val[1] = $main_url_parts['scheme']."://".$main_url_parts['host']."/";
            }

            if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                $match_i = '0';
                $match_a = '0';
                $match_v = '0';

                //  prevent self-linking for link pathes ending with and/or without final slash
                //  and for relocated on it selves as detected in url_purify
                if ($mainurl == $a || $a == "self") {
                    $a = '';
                }

                $a = str_replace( " ", "%20", $a);    //  in order to find also links containing blanks.

                //  a suppressed self-link must not end up as an empty entry in the result
                if ($a != '') {
                    if ($index_media > 0 && $a) {
                        if ($index_image == '1') {
                            $match_i = valid_link($a, $imagelist);
                        }
                        if ($index_audio == '1') {
                            $match_a = valid_link($a, $audiolist);
                        }
                        if ($index_video == '1') {
                            $match_v = valid_link($a, $videolist);
                        }
                    }

                    if ($media_links == '0' && $match_i == '0' && $match_a == '0' && $match_v == '0') {
                        $links[] = $a;    //  find only non-media links
                    }
                    if ($media_links == '1' && ($match_i == '1' || $match_a == '1' || $match_v == '1')) {
                        $links[] = $a;    //  find only media links
                    }
                }
            }
            $checked_urls[$val[1]] = 1;
        }

        //  frame sources: $val[1] is the matched 'frame ... src' prefix, $val[2] the URL.
        //  The URL is the key for the duplicate check; keying on the prefix dropped
        //  every further frame that was written the same way.
        preg_match_all("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            if (empty($checked_urls[$val[2]])) {
                if (($a = url_purify($val[2], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                    $links[] = $a;    //  add frame link
                }
                $checked_urls[$val[2]] = 1;
            }
        }

        //  NOTE(review): the following three loops are kept unchanged in behaviour.
        //  They test !isset($val[4]) and hand $val[1] (the matched prefix, not the URL)
        //  to url_purify(). Group 4 appears to be always present in these matches, so
        //  the loops presumably never add a link - confirm the intention before enabling.
        preg_match_all("/(window[.]location)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            if (empty($checked_urls[$val[1]]) && !isset ($val[4])) {
                if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                    $links[] = $a;    //  add links
                }
                $checked_urls[$val[1]] = 1;
            }
        }

        preg_match_all("/(http-equiv=['\"]refresh['\"] *content=['\"][0-9]+;url)[[:blank:]]*=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            if (empty($checked_urls[$val[1]]) && !isset ($val[4])) {
                if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                    $links[] = $a;    //  add links
                }
                $checked_urls[$val[1]] = 1;
            }
        }

        preg_match_all("/(window[.]open[[:blank:]]*[(])[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            if (empty($checked_urls[$val[1]]) && !isset ($val[4])) {
                if (($a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                    $links[] = urldecode($a);    //  add links
                }
                $checked_urls[$val[1]] = 1;
            }
        }

        //  find invalid links for localhost application
        if (strstr($url, "localhost") && !$can_leave_domain) {
            $local_links    = array();
            $pre            = strlen($local);   //  path length to the localhost URLs
            $url_len        = 0;                //  defined up front, as the else-branch below reads it

            foreach ($links as $thislink) {
                //  if $url contains another slash behind $pre, there must be a subfolder
                //  NOTE(review): the third argument of strstr() is the boolean
                //  'before_needle', not an offset - presumably strpos() was meant. Left as is.
                if (strstr($url, "/", $pre)) {
                    //  extract the path (folder name) of parent URL
                    $url_len = strpos($url, "/", $pre);  //  find first slash behind $pre
                    $dom = substr($url, $pre);
                    $dom = substr($dom, 0, strpos($dom, "/"));

                    if (strlen($thislink) > $url_len && strstr($thislink, $dom)) {
                        $local_links[] = $thislink;
                    }
                } else {    //  direct link at $local
                    if (strlen($thislink) > $url_len) {
                        $local_links[] = $thislink;
                    }
                }
            }
            $links = $local_links;
        }

        if ($clear == 1) unset ($regs, $checked_urls, $a);

        if ($strip_sessids == 1) {
            return remove_sessid($links);
        }
        return $links;
    }

    /**
     * Build a unique word list from the words of a webpage, together with the
     * count of each word.
     *
     * The words are stemmed first (unless $stem_words is 'none'), sorted, and
     * runs of equal neighbours are counted. A word is kept only if it is at
     * least $min_word_length bytes long, does not match the exclusion pattern
     * (digits when $index_numbers == 0, otherwise blanks) and is not flagged
     * with 1 in $common. With $case_sensitive == "1" the $common lookup uses the
     * word as it is, otherwise its lower-case form. The count of a word never
     * exceeds $word_upper_bound. A leading or trailing '-' or "\'" is stripped
     * from the stored word.
     *
     * @param array $arr  Words of the page.
     * @return array      List of entries: index 1 holds the word, index 2 its count.
     */
    function unique_array($arr) {
        global $min_word_length, $common, $word_upper_bound;
        global $index_numbers, $stem_words, $clear, $case_sensitive;

        if ($stem_words != 'none') {
            $stemmed = array ();
            foreach ($arr as $val) {
                $stemmed[] = stem_word($val, '0');
            }
            $arr = $stemmed;
        }
        sort($arr);     //  equal words become neighbours; also re-indexes from 0

        if ($index_numbers == 0) {
            $pattern = "/[0-9]+/";
        } else {
            $pattern = "/[ ]+/";
        }

        $newarr  = array ();
        $regs    = array ();
        $total   = sizeof($arr);
        $counter = 1;
        $element = ($total > 0) ? $arr[0] : false;

        for ($n = 0; $n < $total; $n ++) {
            //  behind the last word 'false' acts as end marker and flushes the final run
            $next_in_arr = ($n + 1 < $total) ? $arr[$n + 1] : false;

            if ($next_in_arr == $element) {
                //  same word again: count it, but never beyond the upper bound
                if ($counter < $word_upper_bound) {
                    $counter ++;
                }
                continue;
            }

            //  end of a run: check if word is long enough, does not contain characters
            //  as defined in $pattern and is not a common word
            $accepted = false;
            if (strlen($element) >= $min_word_length && !preg_match($pattern, $element)) {
                //  compare by upper and lower case characters only if requested (e.g. for Chinese language)
                $lookup = ($case_sensitive == "1") ? $element : strtolower($element);
                //  isset() guard: words that are absent from $common are simply not common
                $accepted = !(isset($common[$lookup]) && $common[$lookup] == 1);
            }

            if ($accepted) {
                if (preg_match("/^(-|\\\')(.*)/", $element, $regs)) {
                    $element = $regs[2];
                }
                if (preg_match("/(.*)(\\\'|-)$/", $element, $regs)) {
                    $element = $regs[1];
                }
                $newarr[] = array (1 => $element, 2 => $counter);
            }

            //  start the next run; resetting the counter here prevents that it
            //  holds the amount of skipped words
            $element = $next_in_arr;
            $counter = 1;
        }

        if ($clear == 1) unset ($element, $arr);
        return $newarr;
    }

    /**
     * Check whether a link found on a page may be followed and return it as an absolute URL.
     *
     * Relative links are resolved against $parent_url. The global $mainurl (start URL of the
     * site being indexed) decides what counts as "the same host".
     * NOTE: with $idna enabled the global $mainurl is overwritten with its decoded form.
     * Known limitation (kept from the original): no port is appended when a relative link is
     * resolved as long as the link itself carries no port, so presumably only sites on the
     * default port are handled correctly.
     *
     * @param string $url              link as found in the page
     * @param string $parent_url       URL of the page that contains the link
     * @param mixed  $can_leave_domain 1 = links to foreign hosts are accepted
     * @param mixed  $care_excl        '1' = reject links whose suffix is listed in global $ext
     * @param mixed  $relocated        1 = the link is the target of a redirect
     * @param mixed  $local_redir      1 = the redirect stays on the local host. Optional; callers
     *                                 already pass it as 6th argument, but it used to be dropped.
     * @return string '' if the link is rejected, 'self' if it equals $mainurl, else the cleaned URL
     */
    function url_purify($url, $parent_url, $can_leave_domain, $care_excl, $relocated, $local_redir = '') {
        global $ext, $mainurl, $apache_indexes, $strip_sessids, $dup_path;
        global $other_host, $redir_host, $sldlist, $only_links, $include_dir, $idna;

        //  a link with 'www' inside its first five characters cannot carry a scheme there
        //  ('www' and '://' do not both fit into five characters), so treat it as a http link
        if (strstr(substr($url, 0, 5), "www")) {
            $url = "http://$url";
        }

        $orig_parent_url    = $parent_url;  //  in order to remember, also after several modifications

        if ($idna) {    //  parse IDN coded URLs and make punycode readable
            //  with respect to the different codings of our dear webmasters (and their special CMS)
            $url        = urldecode($url);
            $parent_url = urldecode($parent_url);
            $mainurl    = urldecode($mainurl);

            require_once "$include_dir/idna_converter.php";
            // Initialize the converter class
            $IDN = new idna_convert(array('idn_version' => 2008));

            if (strstr($url, "xn--")) {
                $url = $IDN->decode($url);
            }

            $url_parts = parse_all_url($url);

            if (strstr($mainurl, "xn--")) {
                $mainurl = $IDN->decode($mainurl);
            }

            $main_url_parts = parse_all_url($mainurl);
            if (strstr($mainurl, "xn--")) {
                $main_url_parts['host'] = $IDN->decode($main_url_parts['host']);
            }
        } else {
            $main_url_parts = parse_all_url($mainurl);
            $url_parts      = parse_all_url($url);
        }

        //  note: $url_parts was parsed before the session id is removed
        if ($strip_sessids == 1) {
            $url = remove_sessid($url);
        }

        //   if activated in Admin settings, allow other hosts in same domain, and also ignore www. and TLD and SLD
        if (($local_redir != 1 && $relocated ==1 && $redir_host == 1 || $other_host == 1)
        && $url_parts['host'] != "" && $url_parts['host'] != $main_url_parts['host']){

            //  remove 'www'
            $new_host = str_replace('www.', '', $url_parts['host']) ;
            $main_host = str_replace('www.', '', $main_url_parts['host']);

            //  remove TLD
            if(strstr($new_host, '.')) {
                $new_host = substr($new_host , 0, strrpos($new_host, '.')) ;
            }
            if(strstr($main_host, '.')) {
                $main_host = substr($main_host , 0, strrpos($main_host, '.')) ;
            }

            //  If exist, remove SLD (iterate by value: nothing in the list is modified)
            foreach ((array) $sldlist as $value) {
                if (preg_match("/$value$/", $new_host)){
                    $new_host = substr($new_host , 0, strpos($new_host, $value)) ;
                }
            }
            foreach ((array) $sldlist as $value) {
                if (preg_match("/$value$/", $main_host)){
                    $main_host = substr($main_host , 0, strpos($main_host, $value)) ;
                }
            }

            //  if exist, remove sub-domains
            if(strstr($new_host, '.')) {
                $new_host = substr($new_host , strrpos($new_host, '.')+1) ;
            }

            if(strstr($main_host, '.')) {
                $main_host = substr($main_host , strrpos($main_host, '.')+1) ;
            }

            //  follow only host with same domain-name
            if ($new_host == $main_host) {
                if ($care_excl == '1') {    //  care about non-accepted suffixes
                    foreach ((array) $ext as $excl) {
                        if (preg_match("/\.$excl($|\?)/i", $url)){  //  suffix at the end of the link, or followed by a question mark
                            return '';
                        }
                    }
                }

                if (substr($url, -1) == '\\') {
                    return '';
                }

                if (isset($url_parts['query'])) {
                    if (!empty($apache_indexes[$url_parts['query']])) {
                        return '';
                    }
                }

                if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
                    return '';
                }

                //only http and https links are followed
                if (isset($url_parts['scheme'])) {
                    $scheme = $url_parts['scheme'];
                } else {
                    $scheme ="";
                }
                if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
                    return '';
                }

                // if missing, add slash to URL
                if(!$url_parts['path'] && !preg_match("/\/$/", $url)) {
                    $url = $url."/";
                }
                return convert_url($url);
            }

        }   //  end of finding new URLs for 'follow other host with same domain-name'

        //  now purify links only for known domains, but independent from containing www or not www
        $url_host      = str_replace("www.", "", $url_parts['host']);
        $main_url_host = str_replace("www.", "", $main_url_parts['host']);

        //  foreign domains are rejected unless leaving the domain is allowed
        //  (the former inner '$only_links && $can_leave_domain == 1' test could never be true here)
        if ($url_host != "" && $url_host != $main_url_host  && $can_leave_domain != 1) {
            return '';
        }

        if ($care_excl == '1') {    //  care about non-accepted suffixes
            foreach ((array) $ext as $excl) {
                if (preg_match("/\.$excl($|\?)/i", $url)){  //  suffix at the end of the link, or followed by a question mark
                    return '';
                }
            }
        }

        if (substr($url, -1) == '\\') {
            return '';
        }

        if (isset($url_parts['query'])) {
            if (!empty($apache_indexes[$url_parts['query']])) {
                return '';
            }
        }

        if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
            return '';
        }

        if (isset($url_parts['scheme'])) {
            $scheme = $url_parts['scheme'];
        } else {
            $scheme ="";
        }

        //  only http and https links are followed
        if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
            return '';
        }

        //  now special processing for relative links
        if (!strpos(substr($url, 0, 5), "ttp")) {

            $parent_url_parts = parse_all_url($parent_url);

            if ($idna) {    //  make punycode readable
                require_once "$include_dir/idna_converter.php";
                // Initialize the converter class
                $IDN = new idna_convert(array('idn_version' => 2008));

                if (strstr($parent_url, "xn--")) {
                    $parent_url = $IDN->decode($parent_url);
                }
                if (strstr($parent_url_parts['host'], "xn--")) {
                    $parent_url_parts['host']   = $IDN->decode($parent_url_parts['host']);
                }
            }

            //  if only a query is added to the  current page URL
            if (preg_match("/^\?/", $url)) {
                $parent_end = substr($parent_url, strrpos($parent_url, "/")+1);         //  parse the end of the parent url behind the last slash

                //  if the link is only a new query
                if (substr($parent_end, 0, 1) == "?" ) {
                    $parent_url = substr($parent_url, 0, strrpos($parent_url, "/")+1) ;
                }

                //  unfortunately some webmasters repeat the file name (and/or query) as part of the new link
                if (strstr($url, $parent_end) || strstr($parent_end, $url)) {  //  so we need the name (and/or query) from the parent url
                    $parent_url = substr($parent_url, 0, strrpos($parent_url, "/")+1);
                }

                //  in case that $parent_end consists of a file name plus a query, we need to kill the query from the parent_url
                if (strstr($parent_end, "?")) {
                    $parent_url = substr($parent_url, 0, strpos($parent_url, "?"));
                }

                $url = $parent_url.$url;    //  build the complete link

                if (!strpos($url, "ttp")) {
                    if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
                        $portq = "";
                    } else {
                        $portq = ":".$main_url_parts['port'];
                    }
                    $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$portq.$parent_url_parts['path'].$url;
                }
                return convert_url($url);
            } else {
                //  kill eventually existing arguments from the parent url
                if (strpos($parent_url, "?")) {
                    $parent_url = substr($parent_url, 0, strpos($parent_url, "?"));
                }

                //  parent url might be used to build the URL from relative path
                // don't remove filename if it is a bare query or fragment
                if (substr($url, 0, 1) != '?' && substr($url, 0, 1) != '#') {
                    $parent_url = remove_file_from_url($parent_url);
                }

                //  now try to find self linking in real links (to be ignored)
                if ($url == '#') {
                    return '';
                }

                //  now try to find anchor-links (anchor is to be ignored)
                if (strstr($url, "#")) {
                    $url = substr($url, 0, strpos($url, "#"));  //  remove the anchor part of the link
                    if (!$url) {    //  this link was only an anchor, forget it
                        return '';
                    }
                }

                //  another kind of self linking
                if (urlencode($orig_parent_url) == urlencode($url)) {
                    return '';
                }

                //  another kind of self linking in real links
                //  'urlencode' added for IDN domains
                $par_length = strlen(urlencode($parent_url));
                $url_length = strlen(urlencode($url));
                $pos = strpos($parent_url, $url);

                if ($pos) {
                    $rel = $par_length-$pos;
                    if ($rel == $url_length+1) {    //  the new link is just the end of $parent_url, this is self linking
                        return '';
                    }
                }

                $urlpath = $url_parts['path'];      //  simplified for string functions

                //      if ../ should cause one folder up (even several times)
                $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, strrpos($parent_url_parts['path'], "/"));

                while (preg_match("/^[.]{2}\//", $urlpath)) {
                    //  remove ../ from link path
                    $urlpath = substr($urlpath, 3);
                    //  remove last folder from parent url path
                    $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, strrpos($parent_url_parts['path'], "/" ));
                }

                //  in case we need to add a slash at the end of the path
                if (substr($parent_url_parts['path'],  strlen($parent_url_parts['path'])-1, 1)  != "/") {
                    $parent_url_parts['path'] .= "/";
                }

                $urlpath = preg_replace("/\/+/", "/", $urlpath);
                $urlpath = str_replace("//", "/", $urlpath);    //  we've seen so much nonsense, even double slashes at the beginning of the urlpath)

                $query = "";

                if (isset($url_parts['query'])) {
                    $query = "?".$url_parts['query'];
                }
                if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
                    $portq = "";
                } else {
                    $portq = ":".$main_url_parts['port'];
                }

                if ($parent_url_parts['host'] != "localhost") {
                    //  if the link URL contains the complete path like the calling URL(root folder) remove the path from the parent_url_path
                    if ($parent_url_parts['path'] != "/" && substr($urlpath, 0, 1) == "/") {
                        $parent_url_parts['path'] = "/";
                    }

                    //  remove the eventually existing leading ./ from the link
                    $urlpath = str_replace("./", "/", $urlpath);

                    //  if there is no filename in urlpath, add a final slash to the urlpath
                    if ($url_parts['path'] != "/") {
                        $last = substr($urlpath, strrpos($urlpath, "/"));
                        if ($last != "/" && !strstr($last, ".")) {
                            $urlpath .= "/" ;
                        }
                    }

                    //  if activated in Admin settings, and parts of the parent_url_path are equal to the url_path,
                    //  delete the duplicate part from the parent_url_path
                    if ($dup_path && strstr($urlpath, "/")) {
                        $path = substr($urlpath, 0, strrpos($urlpath, "/")+1);

                        if ( $parent_url_parts['path'] != "/" && strstr($parent_url_parts['path'], $path)) {
                            $dup = stripos($parent_url_parts['path'], $path);
                            $parent_url_parts['path'] = substr($parent_url_parts['path'], 0, $dup);

                            if (substr($parent_url_parts['path'], 0, 1) != '/'){
                                $parent_url_parts['path'] = "/".$parent_url_parts['path'];
                            }

                            //  in case that we killed the complete path from the parent_url, we use / as path
                            if (!$parent_url_parts['path']) {
                                $parent_url_parts['path'] = "/";
                            }
                        }
                    }
                }

                //  remove a leading slash, which is already supplied by $parent_url_parts['path']
                if (substr($urlpath, 0, 1) == "/") {
                    $urlpath = substr($urlpath, 1);
                }

                //  finally build the complete URL for relative links
                $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$portq.$parent_url_parts['path'].$urlpath.$query;

                //  in case that someone has forgotten to fix the backslashes (Windows like)  in the URL
                $url = str_replace("\\", "/", $url);
            }
        }

        if ($mainurl == $url) {
            return 'self';
        }

        // convert 'blank' and '&amp;'
        $url = convert_url($url);

        if ($can_leave_domain == 1 || $other_host == 1) {
            return $url;
        }

        //  only urls staying in the starting domain/directory are followed
        //  (matching on the host name also accepts results of relative-back-folder links like ../../)
        if (strstr($url, $main_url_host) == false && $only_links != '1') {
            return '';
        }
        return $url;
    }

    /**
     * Store the keywords of one indexed page and link them to that page.
     *
     * Each entry of $wordarray is read as: [1] = the word, [2] = hit count, [3] = weight.
     * Unknown words are inserted into the keywords table; the (link, keyword) relations are
     * collected per first hex digit of md5(word) and written with one multi-row insert into
     * the matching link_keyword0 .. link_keywordf table.
     *
     * NOTE(review): $word is interpolated into the SQL text as is. The debug output below
     * un-escapes \' which suggests the caller delivers already escaped words - confirm before
     * changing the escaping here.
     *
     * @param array $wordarray list of word entries as described above
     * @param mixed $link_id   id of the page the words belong to
     * @param mixed $domain    domain id stored with every relation
     */
    function save_keywords($wordarray, $link_id, $domain) {
        global $mysql_table_prefix, $all_keywords, $debug, $clear;

        sort($wordarray);   //  get alphabetic order

        $inserts = array();     //  value lists, keyed by first hex digit of md5(word)

        //  foreach instead of each(), which no longer exists in PHP 8
        foreach ($wordarray as $entry) {
            $word = trim($entry[1]);
            $word = str_replace("&nbsp;", "", $word);   //  was "/&nbsp;/": a regex delimiter form that str_replace never matches
            $word = str_replace("<", "&lt;", $word);  //make it visible
            $word = str_replace(">", "&gt;", $word);  //make it visible

            $wordmd5 = substr(md5($word), 0, 1);
            $hits = $entry[2];
            $weight = $entry[3];

            if (strlen($word) > 255) {
                continue;   //  too long to be stored as keyword
            }

            $keyword_id = isset($all_keywords[$word]) ? $all_keywords[$word] : "";

            if ($keyword_id  == "") {
                if ($debug == '2') {
                    printActKeyword(str_replace("\'", "'", $word));  //make it readable for all
                }

                mysqltest();
                mysql_query("insert into ".$mysql_table_prefix."keywords (keyword) values ('$word')");
                if (mysql_errno() == 1062) {    //  duplicate entry: the keyword exists already, fetch its id
                    $result = mysql_query("select keyword_ID from ".$mysql_table_prefix."keywords where keyword='$word'");
                    if ($debug > '0') echo mysql_error();
                    $row = mysql_fetch_row($result);
                    $keyword_id = $row[0];
                    if ($clear == 1) clean_resource($result, '50');
                } else{
                    $keyword_id = mysql_insert_id();
                    $all_keywords[$word] = $keyword_id;
                    if ($debug > '0') echo mysql_error();
                }
            }

            if (!isset($inserts[$wordmd5])) {
                $inserts[$wordmd5] = "";
            }
            $inserts[$wordmd5] .= ",($link_id, $keyword_id, $weight, $domain, $hits, now())";
        }

        mysqltest();
        for ($i=0;$i<=15; $i++) {
            $char = dechex($i);

            if (empty($inserts[$char])) {
                continue;   //  no keyword of this page belongs to that table
            }
            $values = substr($inserts[$char], 1);   //  drop the leading comma

            mysqltest();
            $query = "insert into ".$mysql_table_prefix."link_keyword$char (link_id, keyword_id, weight, domain, hits,indexdate) values $values";
            mysql_query($query);
            if ($debug > '0') echo mysql_error();
        }
    }

    /**
     * Extract the meta information from the <head> section of a HTML page.
     *
     * Returns an empty array if the page has no (or an empty) <head> section. Otherwise the
     * result holds:
     *   description, keywords  content of the matching meta tags, passed through addslashes()
     *   nofollow, noindex      1 if requested by the robots meta tag and enabled via
     *                          $use_nofollow / $use_robot, else 0
     *   base                   href of the <base> tag; $url if that href is just "/";
     *                          empty if there is no <base> tag
     *   cano_link              target of a meta refresh or canonical link, '1' if a canonical
     *                          link exists but was rejected by url_purify(), '' if none
     *   refresh                '1' if a meta refresh was found, else ''
     *   wait                   delay of the meta refresh as found in the page, "0" by default
     *
     * Side effect: a meta refresh with a delay makes this function sleep for that time.
     * NOTE(review): the delay is taken from the page and is not capped.
     *
     * @param string $file             HTML source of the page
     * @param string $url              URL of the page
     * @param mixed  $use_nofollow     '1' = obey 'nofollow' in the robots meta tag
     * @param mixed  $use_robot        '1' = obey 'noindex' in the robots meta tag
     * @param mixed  $can_leave_domain handed over to url_purify() for canonical links
     * @return array
     */
    function get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain) {
        global $cano_leave;

        $data = array();

        $regs = array();
        if (!preg_match("@<head[^>]*>(.*?)<\/head>@si",$file, $regs) || $regs[1] == "") {
            return $data;   //  nothing to analyse
        }
        $headdata = $regs[1];

        $description    = "";
        $robots         = "";
        $keywords       = "";
        $base           = "";
        $cano_link      = "";
        $refresh        = "";
        $wait           = "0";

        //      check for robots in meta tags
        $res = array();
        if (preg_match("/<meta +name *=[\"']?robots[\"']? *content=[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res)) {
            $robots = $res[1];
        }

        //      check for description tag in header
        $res = array();
        if (preg_match("/<meta +name *=[\"']?description[\"']? *content=[\"']?([^<>\"]+)[\"']?/i", $headdata, $res)) {
            $description = $res[1];
        }

        //      check for keywords tag in header
        $res = array();
        if (preg_match("/<meta +name *=[\"']?keywords[\"']? *content=[\"']?([^<>\"]+)[\"']?/i", $headdata, $res)) {
            $keywords = $res[1];
        }

        // e.g. <base href="http://www.consil.co.uk/index.php" />
        $res = array();
        if (preg_match("/<base +href *= *[\"']?([^<>'\"]+)[\"']?/i", $headdata, $res)) {
            //  a base of "/" is replaced by the page URL
            //  (eventually this needs to be reduced to the URL of the domain. Not sure about this)
            $base = ($res[1] != "/") ? $res[1] : $url;
        }

        $keywords = preg_replace("/[, ]+/", " ", $keywords);
        $nofollow = 0;
        $noindex = 0;

        foreach (explode(",", strtolower($robots)) as $directive) {
            $directive = trim($directive);
            if ($directive == "noindex" && $use_robot == '1') {
                $noindex = 1;
            }
            if ($directive == "nofollow" && $use_nofollow == '1') {
                $nofollow = 1;
            }
        }

        //      check for refresh link in meta tags
        $res = array();
        if (preg_match("/http-equiv=[\"']refresh[\"'] *content=[\"'](.*?); *url= *(.*?)[\"']/i", $headdata, $res)) {
            if ($res[1] != "0") {
                $wait = $res[1];
                //  wait until continuing to load the real URL; the delay comes from the page,
                //  so only hand a positive integer to sleep()
                $seconds = (int) $wait;
                if ($seconds > 0) {
                    sleep($seconds);
                }
            }

            if (strpos($res[2], "//")) {
                $cano_link = $res[2];       //  refresh contains an absolute URL
            } else {
                $length = strlen(trim($url));
                if (strrpos(trim($url), "/")+1 == $length) {    //  URL ends with a slash: add new file to URL
                    $url .= $res[2];
                    $cano_link = $url;
                } else {
                    //  replace only the file name at the end of the URL; a plain str_replace()
                    //  would also rewrite the same text inside the host or a folder name
                    $filename = basename($url);
                    if ($filename != "") {
                        $cano_link = substr($url, 0, strlen($url) - strlen($filename)).$res[2];
                    } else {
                        $cano_link = $url;
                    }
                }
            }
            $refresh = '1';
        }

        //      check for canonical link info in meta tags
        $res = array();
        if (preg_match("/<link +rel *=[\"']canonical[\"'] *href=[\"'](.*?)[\"']/i", $headdata, $res)) {

            $cano_link      = '1';   //  stays '1' if the canonical link is rejected by url_purify()
            $care_excl      = '1';   //  care file suffix to be excluded
            $relocated      = '';    //  URL is not relocated
            $local_redir    = '';

            if ($cano_leave == '1') {   //  if activated in Admin backend, allow to leave the domain for canonical links
                $can_leave_domain = '1';
            }

            if (($a = url_purify($res[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                if (strcmp($url, $a)) {
                    $cano_link = $a;    //  if cano_link != url
                } else {
                    $cano_link = '';    // if cano-link = url
                }
            }
            if (urldecode($url) == urldecode($res[1])) {
                $cano_link = '';  //  another kind of self-linking
            }
        }

        $data['description']    = addslashes($description);
        $data['keywords']       = addslashes($keywords);
        $data['nofollow']       = $nofollow;
        $data['noindex']        = $noindex;
        $data['base']           = $base;
        $data['cano_link']      = $cano_link;
        $data['refresh']        = $refresh;
        $data['wait']           = $wait;

        return $data;
    }

    /**
     * Collect the link texts (titles) of all followable <a href=...> links of a page.
     *
     * Result layout:
     *   $data[0][0]  all accepted link texts joined into one cleaned and escaped string
     *   $data[$i][1] complete <a ...>...</a> element of the link with match index $i (escaped)
     *   $data[$i][2] the purified absolute URL of that link (escaped)
     *   $data[$i][3] the cleaned link text (escaped)
     * Index 0 therefore holds both the joined string and the details of the first matched link.
     * An empty array is returned if no link was accepted (and word splitting is switched off).
     *
     * A link text is taken from a title= attribute inside the link content, else from an alt=
     * attribute, else the content itself; it is then filtered by the white- and blacklists
     * according to the Admin settings.
     *
     * NOTE(review): $media_links is not used in this function.
     *
     * @param string $file             HTML source of the page
     * @param string $url              URL of the page
     * @param mixed  $can_leave_domain handed over to url_purify()
     * @param string $base             base URL from the page header; replaces $url if not empty
     * @param mixed  $media_links      unused here
     * @param mixed  $use_nofollow     '1' = skip links flagged as nofollow, '0' = ignore the flag
     * @param mixed  $local_redir      handed over to url_purify()
     * @return array
     */
    function get_link_details($file, $url, $can_leave_domain, $base, $media_links, $use_nofollow, $local_redir) {
        global $command_line, $no_log, $div_all, $div_hyphen, $debug, $cl;
        global $use_white1, $use_white2, $use_black, $whitelist, $blacklist;

        // The base URL comes from either the meta tag or the current URL.
        if (!empty($base)) {
            $url = $base;
        }

        $regs           = array();
        $checked_urls   = array();
        $data           = array();

        //  strings to be removed from the link texts (each replaced by a blank)
        $trash   = array("  ", "&nbsp;&nbsp;", " &nbsp;", "<br />", "\r\n", "\n", "\r", "\\r\\n", "\\n", "\\r", "\\", "\\\\", "<strong>", "</strong>", "\"");
        $replace = ' ';

        //  first clean unused parts of the file
        $file = preg_replace("@<!--.*?-->@si", " ",$file);
        $file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
        $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);

        //  get all links
        preg_match_all("/<a href=[\'\"](.*?)[\'\" ](.*?)>(.*?)<\/a>/si", $file, $regs, PREG_SET_ORDER);

        //  report the links carrying a nofollow flag, unless the flag is to be ignored
        if (!($use_nofollow == '0')) {
            foreach ($regs as $val) {
                if (stristr($val[2], "nofollow")){
                    $report = "<br /><br />Found ".$val[1].", but <strong>nofollow</strong> flag is set.";
                    printNofollowLink($report, $command_line, $no_log);
                }
            }
        }

        foreach ($regs as $i => $val) {
            //  reject empty links, which would cause invalid url_purify()  and ignore style links
            if (!$val[1] || stristr($val[0], ".css")) {
                continue;
            }

            //      for all servers  that deliver ' / ' instead of ' ./ ' as relative links on localhost
            if (strpos($val[1], "/") === 0 && strpos($url, "localhost")) {
                $val[1] = ".".$val[1]."";
            }

            //  skip links flagged as nofollow and links already accepted before
            if ($use_nofollow == '1' && stristr($val[2], "nofollow")) {
                continue;
            }
            if (isset($checked_urls[$val[1]])) {
                continue;
            }

            $care_excl = '1';   //  care file suffix to be excluded
            $relocated = '';    //  URL is not relocated

            $a = url_purify($val[1], $url, $can_leave_domain, $care_excl, $relocated, $local_redir);
            if ($a == '') {
                continue;       //  link is not to be followed
            }

            //  get title from images: title attribute first, else the alt attribute
            $title = '';
            $regtlt = array();
            if (stripos($val[3], "title=")) {
                if (preg_match("/title=\"(.*?)\"/si", $val[3], $regtlt)) {
                    $title = $regtlt[1];
                }
            } else if (stripos($val[3], "alt=")) {
                if (preg_match("/alt=\"(.*?)\"/si", $val[3], $regtlt)) {
                    $title = $regtlt[1];
                }
            }

            if (!$title){
                $title = $val[3];   //  no usable attribute, take the link content itself
            }

            if ($use_white1 == '1') {       //  check, whether this title matches ANY word in whitelist
                $found = '0';
                foreach ($whitelist as $value) {
                    if (stristr($title, $value)) {
                        $found = '1';
                    }
                }

                if ($found == '0') {
                    if ($debug == '2') {
                        printWhiteLink($url, $title, $cl);
                    }
                    $title = '';
                }
            }

            if ($use_white2 == '1') {       //  check whether this  title matches ALL words in whitelist
                $matched = 0;
                foreach ($whitelist as $value) {
                    if (stristr($title, $value)) {
                        $matched++;
                    }
                }

                if ($matched != count($whitelist)) {
                    if ($debug == '2') {
                        printWhiteLink($url, $title, $cl);
                    }
                    $title = '';
                }
            }

            if ($use_black == '1') {        //  check whether this title matches ANY string in blacklist
                $found = '0';
                foreach ($blacklist as $value) {
                    if (stristr($title, $value)) {
                        $found = '1';
                    }
                }
                if ($found == '1') {
                    if ($debug == '2') {
                        printBlackLink($a, $title, $cl);
                    }
                    $title = '';
                }
            }

            if (!$title) {
                continue;       //  title was rejected by the lists
            }

            //  add current link text as part of the complete title string
            if (!isset($data[0][0])) {
                $data[0][0] = '';
            }
            $data[0][0] .= " $title";

            //  clean title from stuff
            $title      = str_replace($trash, $replace, $title);
            $data[0][0] = str_replace($trash, $replace, $data[0][0]);
            $data[0][0] = del_secchars($data[0][0]);

            //  the joined string is escaped again on every pass; backslashes of the former
            //  pass were removed by the $trash replacement above
            $data[0][0]     = mysql_real_escape_string($data[0][0]);
            $data[$i][1]    = mysql_real_escape_string($val[0]);
            $data[$i][2]    = mysql_real_escape_string($a);
            $data[$i][3]    = mysql_real_escape_string($title);

            $checked_urls[$val[1]] = 1;
        }

        //  split words at hyphen, single quote, dot and comma into their basics
        if (($div_all || $div_hyphen)) {
            $data[0][0] = split_words(isset($data[0][0]) ? $data[0][0] : null);
        }

        return $data;
    }

    /**
     * Reduce a fetched document to indexable plain text plus its header meta data.
     *
     * The function strips comments, scripts, styles and tags from $file, optionally
     * restricts the text to (or removes) configured divs / elements, normalises
     * blanks, quotes and entities and finally returns everything the indexer needs.
     * Most processing steps are switched on or off by the global Admin settings
     * imported below.
     *
     * @param string $file              raw document content (HTML or already converted text)
     * @param string $url               URL of the document; used for host, path and the fallback title
     * @param string $type              document type; only compared against 'pdf', 'doc', 'ppt', 'rtf', 'xls'
     * @param string $charSet           not referenced inside this function
     * @param string $use_nofollow      when '1' the <!--sphider_noindex--> sections are removed; also forwarded to get_head_data()
     * @param mixed  $use_robot         only forwarded to get_head_data()
     * @param mixed  $can_leave_domain  only forwarded to get_head_data()
     * @return array associative array with the keys 'fulltext', 'content', 'title', 'description',
     *               'keywords', 'host', 'path', 'nofollow', 'noindex', 'base', 'cano_link',
     *               'count', 'refresh' and 'wait'
     */
    function clean_file($file, $url, $type, $charSet, $use_nofollow, $use_robot, $can_leave_domain) {
        global $entities, $index_host, $index_meta_keywords, $index_meta_description, $case_sensitive, $utf_16;
        global $home_charset, $chrSet, $del_secchars, $index_rss, $converter_dir, $div_all, $div_hyphen;
        global $bb_decode, $ent_decode, $cn_seg, $quotes, $dup_quotes, $clear, $only_links, $text_length;
        global $use_divs, $not_divs, $not_divlist, $use_divlist, $ignore_fulltxt, $index_meta_title;
        global $use_elems, $not_elems, $use_elementslist, $not_elementslist, $del_elems, $conv_puny, $include_dir;
//echo "\r\n\r\n<br /> file: '$file'<br />\r\n";

        $new            = array();
        $data           = array();
        $string         = '';
        //  NOTE: this writes the upper-cased value back into the global $home_charset
        $home_charset   = strtoupper($home_charset);

        if ($utf_16) {
            //$file = mb_ereg_replace("\\0", "", $file);
            $file = utf16_to_utf8($file);
        }

        //      kill useless blanks and line feeds
        $file       = preg_replace("/[  |\r\n]+/i", " ", $file);
        $urlparts   = parse_addr($url);
        $host       = $urlparts['host'];
        //  remove the file name from the path, so that only the directory part remains
        $path = preg_replace('/([^\/]+)$/i', "", $urlparts['path']);

        if ($use_nofollow == '1') {
            $file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
        }

        //  parse HTML header
        $headdata = get_head_data($file, $url, $use_nofollow, $use_robot, $can_leave_domain);

        //  if activated in Admin settings, ignore the full text
        if ($ignore_fulltxt == '1') {
            $file = '';
        }

        //  remove HTML comments, scripts, style sheets and <link rel ...> tags
        $file = preg_replace("@<!--.*?-->@si", " ",$file);
        $file = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
        $file = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $file);
        $file = preg_replace("/<link rel[^<>]*>/i", " ", $file);

        $title  = '';
        if ($only_links != '1') {
            $regs   = Array ();
            if (preg_match("@<title *>(.*?)<\/title*>@si", $file, $regs)) {
                $title = trim($regs[1]);
                $title = "".$title." ";
            //  NOTE(review): $title is still '' here, so this condition is always true and
            //  every document without a <title> gets its file name as title - confirm intended
            } else if ($type == 'pdf' || $type == 'doc' || $type == 'ppt' || $type == 'rtf' || $type == 'xls' || $title == '') {
                //create title for a non-html files
                $offset = strrpos ($url, '/');      //      get document name
                $title = substr ($url, $offset+1);
            }
        }

        // if activated in Admin settings, remove all div contents as defined in common 'divs_not' list
        if ($not_divs == '1') {
            foreach ($not_divlist as $thisid) {  //    try to find divs with id as specified in common 'divs' list

                //  regexp ?
                if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid)-1) {
                    $thisid = substr($thisid, 2, strlen($thisid)-3);    //  remove the regex capsules
                } else {    //  for string input only
                    if (strrpos($thisid, "*") == strlen($thisid)-1) {
                        $thisid = str_replace("*", "(.*?)", $thisid);   //  replace wildcards at the end of string input
                    }
                }

                if (preg_match_all("@(<div class|<div id)=(\"|')".$thisid."(\"|').*?</div>@si", $file, $found_div, PREG_OFFSET_CAPTURE )) {

                    //  only the first match is processed in this section
                    $this_divstart  = $found_div[0][0][1];      //  get actual startpos from div-array
                    $i = "end";                                 //  if required $i will become the loop counter for nested divs
                    $nextstart  = strpos($file, "<div", $this_divstart+4);      //  find start pos of next div
                    $nextend    = strpos($file, "</div", $this_divstart+4);     //  find end pos of next div

                    //check for nested divs
                    $start1  = strpos($file, "<div", $nextstart+4); // find start pos of next div
                    if ($start1 && $start1 < $nextend) {
                        $i = "0";   //  yes, nested
                    }

                    //  walk forward through the '<div' / '</div' positions until the end tag
                    //  belonging to the matched div is assumed to be found
                    while ($i != "end") {   //  loop for (multiple) 'nested divs'
                        $i = '0';
                        while ($nextstart && $nextstart < $nextend) {   // next div is a nested div?

                            $nextend1    = strpos($file, "</div", $nextstart+4);    //  this is only the endpos of current div
                            $nextend    = strpos($file, "</div", $nextend1+6);      //  find end pos of next div
                            $nextstart  = strpos($file, "<div", $nextstart+4);      // find start pos of next div

                            if ($nextstart && $nextstart < $nextend1) {   //  again nested in next layer?
                                $i++ ;                      //  counter for next level nested divs
                            }
                        }
                        //  if nested divs were found, correct end pos of div to be deleted
                        while($i > '1') {
                            $nextend    = strpos($file, "</div", $nextend+6);
                            $i--;
                        }

                        $nextend1 = strpos($file, "</div", $nextend+6);     //  $nextend from former div (might have been nested)
                        if ($nextend1) {
                            $nextend = $nextend1;   //  defines next endpos
                        }
                        if (!$nextstart || $nextend < $nextstart) {
                            $i = 'end'; //  no longer nested divs
                        }
                    }       //  end of 'nested divs' loop

                    //  delete this div content from $file
                    //  (str_replace removes every occurrence of the identical substring)
                    $kill_thisdiv = substr($file, $this_divstart, ($nextend+6)-$this_divstart);
                    $file = str_replace($kill_thisdiv, " ", $file);
                }
            }
        }

        // if activated in Admin settings, fetch all div contents as defined in common 'divs_use' list
        if ($use_divs == '1') {
            foreach ($use_divlist as $thisid) {  //    try to find divs with id as specified in common 'divs' list

                //  regexp ?
                if (strpos($thisid, "/") == "1" && strrpos($thisid, "/") == strlen($thisid)-1) {
                    $thisid = substr($thisid, 2, strlen($thisid)-3);    //  remove the regex capsules
                } else {    //  for string input only
                    if (strrpos($thisid, "*") == strlen($thisid)-1) {
                        $thisid = str_replace("*", "(.*?)", $thisid);   //  replace wildcards at the end of string input
                    }
                }

                if (preg_match_all("@(<div class|<div id)=(\"|')".$thisid."(\"|').*?(</div>)@si", $file, $found_divs, PREG_OFFSET_CAPTURE )) {

                    foreach ($found_divs[0] as $another_div) {  //  walk through all found divs. Usually W3C does not allow more than one div with this id. But who knows . . . .

                        $this_divstart  = $another_div[1];      //  get actual startpos from div-array
                        $i = "end";                             //  if required $i will become the loop counter for nested divs
                        $nextstart  = strpos($file, "<div", $this_divstart+4);      //  find start pos of next div
                        $nextend    = strpos($file, "</div", $this_divstart+4);     //  find end pos of next div

                        //check for nested divs
                        $start1  = strpos($file, "<div", $nextstart+4); // find start pos of next div
                        if ($start1 && $start1 < $nextend) {
                            $i = "0";   //  yes, nested
                        }

                        //  same nested-div walk as in the 'divs_not' section above
                        while ($i != "end") {   //  loop for (multiple) 'nested divs'
                            $i = '0';
                            while ($nextstart && $nextstart < $nextend) {   // next div is a nested div?

                                $nextend1    = strpos($file, "</div", $nextstart+4);    //  this is only the endpos of current div
                                $nextend    = strpos($file, "</div", $nextend1+6);      //  find end pos of next div
                                $nextstart  = strpos($file, "<div", $nextstart+4);      // find start pos of next div

                                if ($nextstart && $nextstart < $nextend1) {   //  again nested in next layer?
                                    $i++ ;                      //  counter for next level nested divs
                                }
                            }
                            //  if nested divs were found, correct end pos of div to be deleted
                            while($i > '1') {
                                $nextend    = strpos($file, "</div", $nextend+6);
                                $i--;
                            }

                            $nextend1 = strpos($file, "</div", $nextend+6);     //  $nextend from former div (might have been nested)
                            if ($nextend1) {
                                $nextend = $nextend1;   //  defines next endpos
                            }
                            if (!$nextstart || $nextend < $nextstart) {
                                $i = 'end'; //  no longer nested divs
                            }
                        }
                        //  collect all divs to be indexed
                        //  NOTE(review): $all_divs is never reset, so divs collected for a former
                        //  list entry are appended to $divfile again for every later entry - confirm
                        $all_divs[] = substr($file, $this_divstart, ($nextend+6)-$this_divstart);

                    }
                    //  add content of all found divs to full text
                    foreach($all_divs as $use_thisdiv) {
                        //  NOTE(review): $divfile is not initialised before the first '.='
                        $divfile .= " ".$use_thisdiv;
                    }
                }
            }
            //  NOTE(review): if no div matched at all, $divfile is undefined here
            $file = $divfile;  //  now this will be used as the body part of the page content
        }

        // if activated in Admin settings, fetch the content of all elements as defined in common 'elements_use' list and use the content of these elements as page content
        if ($use_elems == '1') {
            foreach ($use_elementslist as $this_element) {  //    try to find elements with id as specified in common 'elements_use' list
                //  regexp ?
                if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element)-1) {
                    $this_element = substr($this_element, 2, strlen($this_element)-3);    //  remove the regex capsules
                }

                if (preg_match_all("@<$this_element.*?>.*?<\/$this_element>@si", $file, $found_elements, PREG_OFFSET_CAPTURE )) {

                    foreach ($found_elements as $new_element) {  //  walk through all found elements.
                        foreach ($new_element as $new) {
                            //  build substring without content tags
                            $string = $new[0];
                            $string = substr($string, strpos($string, ">")+1);
                            $string = substr($string, 0, strrpos($string, "<"));
                            //  collect all elements to be indexed
                            $all_elements[] = $string;
                        }
                    }
                }
            }
            $file = '';
            //  add content of all found elements to full text
            //  NOTE(review): $all_elements is undefined if no element was found
            foreach($all_elements as $use_thiselem) {
                $file .= " ".$use_thiselem;  //  now all this will be used as the body part of the page content
            }
        }

        // if activated in Admin settings, fetch the content of all elements as defined in common 'elements_not' list and delete that part of the page
        if ($not_elems == '1') {
            foreach ($not_elementslist as $this_element) {  //    try to find elements with id as specified in common 'elements_not' list
                //  regexp ?
                if (strpos($this_element, "/") == "1" && strrpos($this_element, "/") == strlen($this_element)-1) {
                    $this_element = substr($this_element, 2, strlen($this_element)-3);    //  remove the regex capsules
                }

                if (preg_match_all("@<$this_element.*?>.*?<\/$this_element>@si", $file, $found_elements, PREG_OFFSET_CAPTURE )) {

                    foreach ($found_elements as $new_element) {  //  walk through all found elements.
                        foreach ($new_element as $new) {
                            //  collect all elements to be ignored
                            //  NOTE(review): same array as filled by the 'elements_use' section above;
                            //  with both options active the kept contents are removed again - confirm
                            $all_elements[] = $new[0];
                        }
                    }
                }
            }
            //  remove the content of all found elements from full text
            foreach($all_elements as $use_thiselem) {
                $file = str_replace($use_thiselem, " ", $file);
            }
        }

        //  parse bbcode
        if ($bb_decode == '1' ){
            $file = bbcode($file);
        }

        //create spaces between tags, so that removing tags doesnt concatenate strings
        $file = preg_replace("/<[\w ]+>/", "\\0 ", $file);
        $file = preg_replace("/<\/[\w ]+>/", "\\0 ", $file);
        $file = preg_replace("@<head>.*?</head>@si", " ",$file);    //  remove HTML header from file
        $file = preg_replace("@<\/a>@si", " ",$file);               //  remove lost end tag

        //$file = strip_tags($file);  //  remove the content of HTML tags from $file (does not work for invalid written and unclosed tags)
        //  replaced since Sphider-plus version 2.7
        //  remove the content of HTML tags from $file
        $found_tags     = array();
        $another_tag    = array();
        if (preg_match_all("@<.*?>@s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {
            foreach ($found_tags[0] as $another_tag) {       //  walk through all found tags.
                if (strlen($another_tag[0]) < "500") {      //  delete this tag from full text if not too long (unclosed)
                    $file = str_replace($another_tag[0], " ", $file);
                }
            }
        }

        if ($del_elems) {   //  if activated in Admin backend, delete  &lt; element /&gt; from full text
            $found_tags     = array();
            $another_tag    = array();
            if (preg_match_all("@\&lt;.*?\&gt;@s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {
                foreach ($found_tags[0] as $another_tag) {       //  walk through all found tags.
                    $file = str_replace($another_tag[0], " ", $file);
                }
            }
        }

        if ($conv_puny) {   //  make punycode readable
            require_once "$include_dir/idna_converter.php";
            // Initialize the converter class
            $IDN            = new idna_convert(array('idn_version' => 2008));
            $found_tags     = array();
            $another_tag    = array();
            $this_tag       = '';

            $file = str_replace("http", " http", $file);    //place a blank in front of all http's
            //  every 'http...' run up to the next blank is treated as one URL
            if (preg_match_all("@http.*? @s", $file, $found_tags, PREG_OFFSET_CAPTURE )) {

                foreach ($found_tags[0] as $another_tag) {       //  walk through all found tags.
                    // Decode the URL to readable format
                    $this_tag = $IDN->decode(urldecode($another_tag[0]));
                    $this_tag = urldecode($this_tag);
                    $file = str_replace($another_tag[0], $this_tag, $file);
                }
            }
        }

        //  NOTE(review): several literals in this function look mis-encoded and no longer
        //  match their comments - verify them against the original file encoding
        $file   = str_replace(" ", " ", $file);       //  replace special (long) blanks with standard blank
        $file   = str_replace("—", "'", $file);       //  replace  invalid coded slash
        $file   = str_replace("©", "&#151;", $file);   //  replace  invalid coded long dash with correct long dash
        $file   = preg_replace("/   +/", " ", $file);   //  replace TABs with a standard blank
        $file   = preg_replace("/  +/", " ", $file);    //  kill duplicate blanks

        $file   = str_replace(" &nbsp;", " ", $file);
        $file   = str_replace("&nbsp;&nbsp;", " ", $file);  //  kill duplicate &nbsp; blanks
        $file   = str_replace ("&shy;", "", $file);         //  kill  break character

        if ($text_length != "0") {
            //  build substring of full text until last space in front of $text_length
            $file = substr($file, 0, strrpos(substr($file, 0, $text_length), " "));

        }

        if ($index_host == 1) {
            //  separate words in host and path
            $host_sep =preg_replace("/\.|\/|\\\/", " ", $host);
            $path_sep =preg_replace("/\.|\/|\\\/", " ", $path);

            //  append host and path, complete and split into words, to the text
            $file = $file." ".$host." ".$host_sep;
            $file = $file." ".$path." ".$path_sep;
        }


        if ($title && $index_meta_title) {
            $file = $file." ".$title;
        }

        if ($index_meta_description == 1) {
            $file = $file." ".$headdata['description'];
        }

        if ($index_meta_keywords == 1) {
            $file = $file." ".$headdata['keywords'];
        }

        if ($ent_decode == '1') {
            //  as it seems, the PHP function html_entity_decode() has some problems.
            //  In case that 2 entities are placed directly together like: &mdash;&nbsp;
            //  we are obliged to be helpful by eliminating one of them
            $file = str_replace("&nbsp;", " ", $file);
            //  now PHP does not get confused
            $file = html_entity_decode($file, ENT_QUOTES, 'UTF-8');

            $title = str_replace("&nbsp;", " ", $title);
            $title = html_entity_decode($title, ENT_QUOTES, 'UTF-8');

        }

        //  correct some other trash found on the Internet
        $file = str_replace("�", "fi", $file);
        $file = str_replace("fl", "fl", $file);

        //  for URLs use entities, so that links become readable in full text
        $file = str_replace("<a href=\"http://www.","&lt;a href=&quot;http://www.",$file);

        //  from here on $fulltext and $file are processed separately:
        //  $fulltext is returned as 'fulltext', $file as 'content'
        $fulltext = $file;              //  required for result listing as extract around the keywords and for PHRASE search

        //  convert all single quotes into standard quote
        if ($quotes == '1') {
            $all_quotes = array
            (
                    "&#8216;"   => "'",
                    "&lsquo;"   => "'",
                    "&#8217;"   => "'",
                    "&rsquo;"   => "'",
                    "&#8242;"   => "'",
                    "&prime;"   => "'",
                    "‘"         => "'",
                    "‘"         => "'",
                    "´"         => "'",
                    "`"         => "'",
                    "’"         => "'",
                    "’"         => "'"
                    );

                    //  NOTE(review): each() is deprecated since PHP 7.2 and removed in PHP 8.0
                    reset($all_quotes);
                    while ($char = each($all_quotes)) {
                        $file = preg_replace("/".$char[0]."/i", $char[1], $file);
                    }
        }

        //  convert all double quotes into standard quotations
        if ($dup_quotes == '1') {
            $all_quotes = array
            (
                    "“"   => "\"",
                    "�"   => "\"",
                    "„"   => "\""
                    );

                    reset($all_quotes);
                    while ($char = each($all_quotes)) {
                        $file = preg_replace("/".$char[0]."/i", $char[1], $file);
                    }
        }

        //  split words at hyphen, single quote, dot and comma into their basics
        if (($div_all || $div_hyphen)) {
            $file   = split_words($file);
        }

        //  replace every key of the global $entities list (used as a regex) by its value
        reset($entities);
        while ($char = each($entities)) {
            $file = preg_replace("/".$char[0]."/i", $char[1], $file);
        }

        //replace codes with ascii chars
        //$file = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $file);
        //  NOTE(review): the /e modifier is deprecated since PHP 5.5 and removed in PHP 7.0
        $file = preg_replace('~&#([0-9]+);~e', 'chr("\\1")', $file);

        if ($case_sensitive == '0' ) {
            $file = lower_ent($file);
        }

        //  already done before, but who knows . . .
        $fulltext   = str_replace(" ", " ", $fulltext);         //  replace special (long) blanks  for result listing  (description)
        $fulltext   = preg_replace("/   +/", " ", $fulltext);   //  replace TABs with a standard blank
        $fulltext   = preg_replace("/  +/", " ", $fulltext);    //  kill duplicate blanks

        $title      = str_replace(" ", " ", $title);    //  replace special (long) blanks in title

        if ($index_rss == '1') {
            $file = preg_replace('/0b/si', '.', $file);     // try to correct bad charset interpretation
            //  NOTE(review): the pattern below is empty as it stands (presumably a lost special
            //  character); an empty pattern matches at every position - verify against the original
            $file = preg_replace('//si', '\'', $file);
            $trash   = array("\r\n", "\n", "\r", "0E", "0C", "0I");     // kill 'LF' and the others
        } else {
            $trash   = array("\r\n", "\n", "\r");
        }
        $replace = ' ';
        $file       = str_replace($trash, $replace, $file);
        $fulltext   = str_replace($trash, $replace, $fulltext);

        //  also remove line feeds which are present as literal backslash sequences
        $trash   = array("\\r\\n", "\\n", "\\r");       // kill 'LF' and the others
        $replace = ' ';
        $file = str_replace($trash, $replace, $file);
        $search = '';

        $file = del_secchars($file);

        //  assemble the result; only 'fulltext' and 'title' are passed through addslashes()
        $count  = str_word_count($fulltext, 0);
        $data['fulltext']       = addslashes($fulltext);
        $data['content']        = $file;
        $data['title']          = addslashes($title);
        $data['description']    = $headdata['description'];
        $data['keywords']       = $headdata['keywords'];
        $data['host']           = $host;
        $data['path']           = $path;
        $data['nofollow']       = $headdata['nofollow'];
        $data['noindex']        = $headdata['noindex'];
        $data['base']           = $headdata['base'];
        $data['cano_link']      = $headdata['cano_link'];
        $data['count']          = $count;
        $data['refresh']        = $headdata['refresh'];
        $data['wait']           = $headdata['wait'];

        if ($clear == 1) unset ($char, $file, $fulltext, $path_sep, $headdata, $regs, $urlparts, $host);
        return $data;
    }

    /**
     * Calculate the ranking weight for every word of a page.
     *
     * Each entry of $wordarray is read as [1] => word and [2] => the value handed
     * to calc_weight() as $words_in_page; the resulting integer weight is stored
     * in [3].  A word is flagged when it also occurs in the title, the host name,
     * the path (both only if $index_host == 1) or the meta keywords (only if
     * $index_meta_keywords == 1).
     *
     * The lists are walked with foreach instead of each(): each() is deprecated
     * since PHP 7.2 and removed in PHP 8.0, it depended on the internal array
     * pointer of $wordarray as left behind by the caller, and the former loops
     * overwrote the $host and $path parameters.
     *
     * @param array  $wordarray  word list of the page (see above)
     * @param string $title      page title
     * @param string $host       host name of the page
     * @param string $path       path of the page; its number of "/" defines the path depth
     * @param string $keywords   meta keywords of the page
     * @param array  $url_parts  parsed URL; 'path' and 'query' are read if $sort_results == '2'
     * @return array             $wordarray with the weight added at index [3] of every entry
     */
    function calc_weights($wordarray, $title, $host, $path, $keywords, $url_parts) {
        global $index_host, $index_meta_keywords, $sort_results, $domain_mul, $cn_seg, $clear, $dompromo, $keypromo;

        $hostarray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($host))));
        $patharray = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($path))));

        if ($cn_seg == '1') {   //      we need all characters for Chinese language
            $titlearray     = unique_array(explode(" ", strtolower($title)));
            $keywordsarray  = unique_array(explode(" ", strtolower($keywords)));
        } else {
            $titlearray     = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($title))));
            $keywordsarray  = unique_array(explode(" ", preg_replace("/[^[:alnum:]-]+/i", " ", strtolower($keywords))));
        }

        $path_depth = countSubstrs($path, "/");
        $main_url_factor = '1';

        if ($sort_results == '2') {         //      enter here if 'Main URLs (domains) on top'  is selected
            $act_host = $host;
            $act_path = $url_parts['path'];

            //      try to find main URL for localhost systems
            if ($act_host == 'localhost' && substr_count($act_path, ".") == '0' && substr_count($act_path, "/") <= '3') {
                $main_url_factor = $domain_mul;     //      if localhost: increase weight for domains in path
            }

            //      only these files are accepted as valid part of a main URL path
            $act_path = str_replace ('index.php', '', $act_path);
            $act_path = str_replace ('index.html', '', $act_path);
            $act_path = str_replace ('index.htm', '', $act_path);
            $act_path = str_replace ('index.shtml', '', $act_path);

            //      try to find main URL in the wild
            if ($act_host != 'localhost'  && substr_count($act_host, ".") == '2' && strlen($act_path) <= '1' && !$url_parts['query']) {
                $main_url_factor = $domain_mul;     //      increase weight for main URLs (domains)
            }
        }

        //  is the promoted catchword part of the page text?
        $catch_found = '';
        foreach ($wordarray as $word) {
            if ($keypromo == $word[1]) {
                $catch_found = '1';
                break;
            }
        }

        //  $dompromo / $keypromo are tested first, so that strstr() never sees an empty needle
        $domain_hit = ($dompromo && strstr($host, $dompromo) !== false);
        $catch_hit  = ($keypromo && $catch_found);

        //  promote if the only configured criterion matches, or if both are configured and both match
        $promo = '';
        if ((!$keypromo && $domain_hit) || (!$dompromo && $catch_hit) || ($catch_hit && $domain_hit)) {
            $promo = '1';
        }

        foreach ($wordarray as $wid => $word) {
            $word_in_path   = 0;
            $word_in_domain = 0;
            $word_in_title  = 0;
            $meta_keyword   = 0;

            if ($index_host == 1) {
                foreach ($patharray as $path_word) {
                    if ($path_word[1] == $word[1]) {
                        $word_in_path = 1;
                        break;
                    }
                }

                foreach ($hostarray as $host_word) {
                    if ($host_word[1] == $word[1]) {
                        $word_in_domain = 1;
                        break;
                    }
                }
            }

            if ($index_meta_keywords == 1) {
                foreach ($keywordsarray as $keyword) {
                    if ($keyword[1] == $word[1]) {
                        $meta_keyword = 1;
                        break;
                    }
                }
            }

            foreach ($titlearray as $title_word) {
                if ($title_word[1] == $word[1]) {
                    $word_in_title = 1;
                    break;
                }
            }

            $wordarray[$wid][3] = (int) (calc_weight($word[2], $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword, $main_url_factor, $host, $promo));
        }

        if ($clear == 1) unset ($titlearray, $keywordsarray, $hostarray, $patharray, $act_path, $act_host);
        reset($wordarray);  //  hand the array back with its internal pointer at the first word
        return $wordarray;
    }

    /**
     * Calculate the weight of a single word.
     *
     * The occurrence value of the word plus the configured bonuses (global
     * $title_weight, $domain_weight, $path_weight, $meta_weight) is multiplied
     * by 10, divided by a divisor growing with the path depth and multiplied by
     * the main URL factor.  Promoted pages get eight times that weight.
     *
     * @param mixed $words_in_page    occurrence value of the word in the page
     * @param int   $word_in_title    1 if the word is part of the title, else 0
     * @param int   $word_in_domain   1 if the word is part of the host name, else 0
     * @param int   $word_in_path     1 if the word is part of the path, else 0
     * @param int   $path_depth       depth of the page path
     * @param int   $meta_keyword     1 if the word is a meta keyword, else 0
     * @param mixed $main_url_factor  multiplier for main URLs
     * @param mixed $host             not used in the calculation
     * @param mixed $promo            any true value multiplies the weight by 8
     * @return float                  the calculated weight
     */
    function calc_weight($words_in_page, $word_in_title, $word_in_domain, $word_in_path, $path_depth, $meta_keyword, $main_url_factor, $host, $promo) {
        global $title_weight, $domain_weight, $path_weight, $meta_weight;

        //  occurrences of the word plus all bonuses which apply
        $raw_score = $words_in_page
                   + $word_in_title * $title_weight
                   + $word_in_domain * $domain_weight
                   + $word_in_path * $path_weight
                   + $meta_keyword * $meta_weight;

        //  the deeper the page is placed in the path, the lower the weight
        $depth_divisor = 0.2 + 0.8*$path_depth;

        $weight = ($raw_score * 10 / $depth_divisor) * $main_url_factor;

        //  for promoted domains and/or promoted catchwords, correct the weighting
        return $promo ? $weight*8 : $weight;
    }

    /**
     * Check whether a page with the given MD5 checksum is already stored.
     *
     * Looks up the checksum in the 'links' table.  The result set is handed to
     * clean_resource() on every path when $clear == 1 (formerly it was skipped
     * whenever a duplicate was found), and a failed query is treated as
     * "no duplicate" without calling mysql_num_rows() on an invalid result.
     *
     * @param string $md5sum  checksum to look up
     * @return bool           true if at least one link with this checksum exists
     */
    function isDuplicateMD5($md5sum) {
        global $mysql_table_prefix, $debug, $clear;

        mysqltest();
        $result = mysql_query("select link_id from ".$mysql_table_prefix."links where md5sum='$md5sum'");
        if ($debug > '0') echo mysql_error();

        //  mysql_query() returns false on error; count rows only for a valid result
        $is_duplicate = ($result !== false && mysql_num_rows($result) > 0);

        if ($clear == 1) clean_resource($result, '51') ;
        return $is_duplicate;
    }

    /**
     * Decide whether a link passes the 'URL must include' / 'URL must not include' filters.
     *
     * Both filters are newline separated lists.  Every trimmed, non-empty entry
     * is either a plain substring of the link, or - if it starts with '*' - a
     * complete PCRE pattern (delimiters included) following the asterisk.
     * The 'must not include' list is evaluated first; a single hit rejects the
     * link.  If a 'must include' list is given, at least one entry has to match.
     *
     * @param string $link     URL to be checked
     * @param string $inc      'must include' list, "" to disable this filter
     * @param string $not_inc  'must not include' list, "" to disable this filter
     * @return bool            true if the link is accepted
     */
    function check_include($link, $inc, $not_inc) {
        global $clear;

        $url_inc = Array ();
        $url_not_inc = Array ();
        if ($inc != "") {
            $url_inc = explode("\n", $inc);
        }
        if ($not_inc != "") {
            $url_not_inc = explode("\n", $not_inc);
        }

        $include = true;
        foreach ($url_not_inc as $str) {
            $str = trim($str);
            if ($str == "") {
                continue;
            }
            if (substr($str, 0, 1) == '*') {
                $hit = preg_match(substr($str, 1), $link);      //  regex entry
            } else {
                $hit = (strpos($link, $str) !== false);         //  plain substring entry
            }
            if ($hit) {
                $include = false;
                break;
            }
        }

        if ($include && $inc != "") {
            $include = false;   //  now at least one entry of the 'must include' list has to match
            foreach ($url_inc as $str) {
                $str = trim($str);
                if ($str == "") {
                    continue;
                }
                if (substr($str, 0, 1) == '*') {
                    $hit = preg_match(substr($str, 1), $link);
                } else {
                    $hit = (strpos($link, $str) !== false);
                }
                if ($hit) {
                    $include = true;
                    //  leave this single loop only; the former 'break 2' had no second
                    //  loop to leave and raised a fatal error
                    break;
                }
            }
        }

        if ($clear == 1) unset ($str, $link, $url_not_inc, $url_inc);
        return $include;
    }

    /**
     * Remove a no longer wanted URL from the index.
     *
     * Nothing happens if the global $not_erase is set.  Otherwise the link is
     * looked up by its URL: a 'visible' counter above zero is only decremented,
     * else the link and its rows in the 16 link_keyword0 ... link_keywordf
     * tables are deleted and 'pageRemoved' is reported.
     *
     * NOTE(review): $url is placed into the SQL statement as it is - presumably
     * the caller delivers it already escaped; confirm.
     *
     * @param string $url  URL of the link to be checked
     * @return void
     */
    function check_for_removal($url) {
        global $mysql_table_prefix, $debug, $no_log, $command_line, $clear, $not_erase;

        //  delete links only if "URL Must Not include" is not activated for erasing function
        if ($not_erase) {
            return;
        }

        mysqltest();
        $links_table = $mysql_table_prefix."links";
        $result = mysql_query("select link_id, visible from ".$links_table." where url='$url'");
        if ($debug > '0') echo mysql_error();

        if (mysql_num_rows($result) > 0) {
            $found      = mysql_fetch_row($result);
            $id         = $found[0];
            $remaining  = $found[1];

            if ($remaining > 0) {
                //  still referenced: only count down
                $remaining --;
                mysql_query("update ".$links_table." set visible='$remaining' where link_id='$id'");
                if ($debug > '0') echo mysql_error();
            } else {
                mysql_query("delete from ".$links_table." where link_id=$id");
                if ($debug > '0') echo mysql_error();

                //  the keywords are spread over 16 tables, named by one hex digit
                foreach (str_split('0123456789abcdef') as $suffix) {
                    mysql_query("delete from ".$mysql_table_prefix."link_keyword$suffix where link_id=$id");
                    if ($debug > '0') echo mysql_error();
                }
                printStandardReport('pageRemoved',$command_line, '0');
            }
        }

        if ($clear == 1) clean_resource($result, '52') ;
    }

    /**
     * Extract indexable text from a non-HTML document.
     *
     * The raw content is written to a temporary file below $tmp_dir, because the
     * external converters and the reader classes work on files. Depending on
     * $source_type the file is handed to pdftotext (pdf), catdoc (doc/rtf, Windows
     * only), the bundled XLS/ODS/ODT readers, or extract_js() (js). The temporary
     * file is removed before returning.
     *
     * @param string $contents      raw content of the document
     * @param string $source_type   'pdf', 'doc', 'rtf', 'ppt', 'xls', 'ods', 'odt' or 'js'
     * @param string $url           URL of the document (not used for the conversion itself)
     * @return string               the extracted text, or the literal string 'ERROR' when
     *                              the conversion failed or did not deliver any text
     */
    function extract_text($contents, $source_type, $url) {
        global $tmp_dir, $pdftotext_path, $catdoc_path, $xls2csv_path, $op_system, $mb;
        global $catppt_path, $home_charset, $command_line, $no_log, $clear, $converter_dir;

        //  reduce the charset name to the form handed to catdoc, e.g. 'ISO-8859-1' => '8859-1'
        $home_charset1 = str_ireplace ('iso-','',$home_charset);
        $charset_int = str_ireplace ('iso','',$home_charset1);
        $temp_file = "tmp_file";
        $result = array();
        $filename = $tmp_dir."/".$temp_file ;
        if ($source_type == 'ods'){
            $filename .= ".".$source_type."";   //  the temp file gets its suffix only for ODS
        }
        if (!$handle = fopen($filename, 'w')) {
            die ("Cannot open file $filename in temp folder");
        }

        mysqltest();
        if (fwrite($handle, $contents) === FALSE) {
            die ("Cannot write to file $filename in temp folder");
        }
        fclose($handle);
        mysqltest();

        //      for PDF documents enter here
        if ($source_type == 'pdf') {
            //  the converter binary is opened only to verify that it is readable
            $converter = fopen($pdftotext_path, 'rb');

            if (!$converter) {
                printStandardReport('errorNoPDFConv',$command_line);
                $result = 'ERROR';
            } else {
                fclose($converter);     //  do not leak the probe handle

                //   prepare command line for PDF converter;
                //   the trailing '-' makes pdftotext write to stdout so that exec() can capture the text
                if ($op_system != 'win') {
                    $command = "".$pdftotext_path." -enc UTF-8 ".$filename." -";
                } else {
                    $command = "".$pdftotext_path." -cfg xpdfrc ".$filename." -";
                }
                $a = exec($command, $result, $retval);  //  convert the PDF document

                if ($retval != '0') {                   //   error handler for PDF file converter
                    if ($retval == '1') {
                        printStandardReport('errorOpenPDF',$command_line);
                    } else if ($retval == '3') {
                        printStandardReport('permissionError',$command_line);
                    } else if ($retval == '127') {
                        printStandardReport('noConverter',$command_line);
                    } else {
                        printStandardReport('ufoError',$command_line);
                    }
                    //  a failed conversion is reported as plain 'ERROR', never mixed with partial output
                    $result = 'ERROR';
                } else {
                    $result = implode(' ', $result);
                }
            }

        //      for DOC and RTF files enter here
        } else if ($source_type == 'doc' || $source_type == 'rtf') {
            //  NOTE(review): catdoc is started on Windows only; on other systems nothing is
            //  converted and the document ends up as 'nothingFound' - confirm this is intended
            if ($op_system == 'win') {
                $command = $catdoc_path." -s $charset_int -d utf-8 -x $filename";
                $a = exec($command, $result, $retval);
            }

        //      for PPT files enter here
        } else if ($source_type == 'ppt') {
            //  currently unsupported, as a failure was encountered for large PowerPoint presentations
            $a = '';

        //      for XLS spreadsheets enter here
        } else if ($source_type == 'xls') {
            require_once "".$converter_dir."/xls_reader.php";
            $data = new Spreadsheet_Excel_Reader();

            if ($mb == '1') {
                //  if extension exists, change 'iconv' to mb_convert_encoding:
                $data->setUTFEncoder('mb');
            }

            // set output encoding.
            $data->setOutputEncoding('UTF-8');

            //  read this document
            $data->read($filename);
            $error = $data->_ole->error;
            if ($error == '1'){
                printStandardReport('xlsError',$command_line, $no_log);
                $result = 'ERROR';
            } else {
                $result = '';
                $boundsheets    = $data->boundsheets;   // get all tables in this file
                $sheets         = $data->sheets;        // get content of all sheets in all tables

                if($boundsheets) {
                    foreach ($boundsheets as $bs) {
                        $result .= "".$bs['name'].", "; //  collect all table names in this file
                    }

                    if ($sheets) {
                        foreach ($sheets as $sheet) {
                            $cells = $sheet['cells'];

                            if ($cells) {    //  ignore all empty cells
                                foreach ($cells as $cell) {
                                    foreach ($cell as $content) {
                                        $result .= "".$content.", ";     //  collect content of all cells
                                    }
                                }
                            }
                        }
                    }
                    if (strtoupper($home_charset) == 'ISO-8859-1') {
                        $result = utf8_encode($result);
                    }
                }
            }

        //      for ODS spreadsheets enter here
        } else if ($source_type == 'ods') {
            require_once "".$converter_dir."/ods_reader.php";
            $reader = ods_reader::reader($filename);
            $sheets = $reader->read($filename);

            if($sheets) {
                $result = '';
                foreach ($sheets as $sheet) {
                    if($sheet) {
                        foreach ($sheet as $cell) {
                            if($cell) {    //  ignore all empty cells
                                foreach ($cell as $content) {
                                    $result .= "".$content." ";     //  collect content of all cells
                                }
                            }
                        }
                    }
                }

            } else {
                $result = 'ERROR';
            }

        //      for ODT documents enter here
        } else if ($source_type == 'odt') {

            require_once "".$converter_dir."/odt_reader.php";
            $x = new odt_reader();
            // Unzip the document
            $u = $x->odt_unzip($filename, false);
            // read the document
            $result = $x->odt_read($u[0], 2);
            //  create some blanks around the <div> tags
            $result = str_replace("<", " <", $result);
            $result = str_replace(">", "> ", $result);

        //  for JavaScript enter here
        } else if ($source_type == 'js') {
            $result = extract_js($contents);
        }

        //  converters started via exec() deliver an array of output lines
        if (is_array($result)) {
            $result = implode(" ", $result);
        }

        if ($result != 'ERROR') {
            $count = strlen($result);
            if ($count == '0'){         //      if there was not one word found, print warning message
                if ($source_type == 'js') {
                    printStandardReport('jsEmpty',$command_line, $no_log);
                } else {
                    printStandardReport('nothingFound',$command_line, $no_log);
                }
                $result = 'ERROR';
            }
        }

        unlink ($filename);
        mysqltest();
        if ($clear == 1) unset ($command, $retval, $a, $contents, $count);
        return $result;
    }

    /**
     * Cut a session ID parameter, and everything that follows it, off a URL.
     *
     * When the global $strip_s_sessids is set, the short parameter name "s" is
     * treated as a session ID as well.
     *
     * @param string $url   URL to be cleaned
     * @return string       URL without the session ID part
     */
    function  remove_sessid($url) {
        global $strip_s_sessids;

        //  select the parameter list first, then run one single replacement
        $pattern = $strip_s_sessids
            ? "/(\?|;|&|&amp;)(PHPSESSID|JSESSIONID|session_id|ASPSESSIONID|sid|zenid|s)=(.)+$/i"
            : "/(\?|;|&|&amp;)(PHPSESSID|JSESSIONID|session_id|ASPSESSIONID|sid|zenid)=(.)+$/i";

        return preg_replace($pattern, "", $url);
    }

    /**
     * Read a sitemap and return the URLs that should be (re-)indexed.
     *
     * A URL is returned when it is not yet known in the links table, when its
     * <lastmod> date is newer than the stored index date, or when the sitemap
     * does not offer a usable <lastmod> at all. At most $max_links entries of
     * the sitemap are evaluated.
     *
     * @param string $input_file            if $indexed_map is set: name/URL of a (optionally gzip
     *                                      compressed) sitemap file; otherwise the XML content itself
     * @param mixed  $indexed_map           true if $input_file has to be read from a file
     * @param string $mysql_table_prefix    kept for compatibility; it is overridden by the
     *                                      global of the same name (see below)
     * @return array                        list of URL strings, empty if the sitemap is not valid XML
     */
    function get_sitemap($input_file, $indexed_map, $mysql_table_prefix) {
        //  the global declaration rebinds $mysql_table_prefix, so the global value is used
        global $mysql_table_prefix, $command_line, $debug, $no_log, $max_links, $clear;

        $links = array ();

        if ($indexed_map) {
            $map_cont = '';
            //      read content of uncompressed secondary sitemap file
            if (!strstr($input_file, "gz") && $fd = @fopen($input_file, "r")) {
                $map_cont = @stream_get_contents($fd);
                fclose($fd);
            }
            //      nothing read so far: try to read it as compressed secondary sitemap
            if (!$map_cont && $zd = @fopen("compress.zlib://$input_file", "r")) {
                $map_cont = @gzread($zd, 10485760);      //  max. 10 MB (might be too large for some server)
                gzclose($zd);
            }
        } else {
            $map_cont = $input_file;
        }
        $s_map = simplexml_load_string ($map_cont);

        if (!$s_map) {      //  sitemap is not conform to XML version 1.0 (or contains no entries)
            return $links;
        }

        mysqltest();
        $count  = 0;
        $res    = false;

        foreach($s_map as $url) {
            if ($count >= $max_links) {     //  save time, we don't need more
                break;
            }
            $count++;

            $loc        = (string) $url->loc;   //  plain string instead of SimpleXMLElement object
            $the_url    = str_replace("&amp;","&",$loc);
            if (!$the_url) {                    //  entry without <loc>
                continue;
            }

            if (!strstr($the_url, "ttp")) {     //  hopefully this is a URL, otherwise we need to add the scheme
                $the_url    = "http://".$the_url;
                $loc        = "http://".$loc;
            }

            $lastmod = strtotime((string) $url->lastmod);   // get lastmod date only for this page from sitemap

            //  the URL originates from a remote file, so it must be escaped before it is used in SQL
            $sql_url = mysql_real_escape_string($the_url);
            $res = mysql_query("select indexdate from ".$mysql_table_prefix."links where url like '%$sql_url%'");

            $new = true;
            if ($res && mysql_num_rows($res) > 0) {     // do we already know this link?
                if ($lastmod) {
                    // add known link only if date from sitemap.xml is newer than date of last index
                    $indexdate = strtotime(mysql_result($res, 0, "indexdate"));
                    $new = ($lastmod - $indexdate) > 0;
                }
                //  without <lastmod> (the webmaster was lazy) we are obliged to index this link
            }

            if ($new) {
                $links[] = $loc;
            }
        }

        if ($clear == 1 && $res) clean_resource($res, '53') ;

        return($links);
    }

    /**
     * Store links in the temp table, unless they are already known as a site URL.
     *
     * A link counts as known when a URL in the sites table starts with it.
     *
     * @param array  $links     list of URL strings
     * @param mixed  $level     value stored in the 'level' column of the temp table
     * @param string $sessid    value stored in the 'id' column of the temp table
     * @return void
     */
    function store_newLinks($links, $level, $sessid) {
        global $mysql_table_prefix, $debug;

        mysqltest();
        $result = false;

        //  foreach does not depend on the internal array pointer, as each() did
        foreach ((array) $links as $thislink) {
            //  check if we already know this link as a site url
            $thislink = mysql_real_escape_string($thislink);
            $result = mysql_query("select url from ".$mysql_table_prefix."sites where url like '$thislink%'");
            if ($debug > '0') echo mysql_error();
            //  as before, a failed query is treated like 'no matching site'
            $rows = $result ? mysql_num_rows($result) : 0;

            if ($rows == '0') {     // for all new links: save in temp table
                mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink', '$level', '$sessid')");
                if ($debug > '0') echo mysql_error();
            }
        }
        if ($result) clean_resource($result, '54') ;
        return;
    }

    /**
     * Create a sitemap XML file containing all indexed links of one site.
     *
     * The file is written into $smap_dir. If $smap_unique is '0' the file name
     * contains the host name (for localhost also the first folders of the
     * path), otherwise it is always 'sitemap.xml'. The script dies if the file
     * cannot be opened or written.
     *
     * @param int    $site_id   id of the site in the links table
     * @param string $url       URL of the site, used to build the file name
     * @return void
     */
    function create_sitemap($site_id, $url) {
        global $mysql_table_prefix, $smap_dir, $smap_unique, $debug, $clear;

        $changefreq = "monthly";   //      individualize this variable
        $priority   = "0.50";      //      individualize this variable

        //      Below this only change something, if you are sure to remain compatible to http://www.sitemaps.org/schemas/sitemap/0.9
        $now        = time();                   //  one timestamp for all date parts
        $date       = date("Y-m-d", $now);
        $time       = date("H:i:s", $now);      //  24 hour clock, as required for a W3C datetime
        $modtime    = "T".$time.date("P", $now);    //  offset of the timezone PHP is working with, e.g. +01:00
        $version    = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" ;
        //  schemaLocation has to pair the sitemap 0.9 namespace with its schema file
        $urlset     = "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">";
        $copyright  = "<!-- Generated by Sphider-plus created by Tec (v.1.2 rev.3) -->" ;
        $update     = "<!-- Last update of this sitemap: $date / $time -->" ;

        //  the five characters that must be entity escaped inside a sitemap
        $xml_chars      = array("&", "<", ">", "\"", "'");
        $xml_entities   = array("&amp;", "&lt;", "&gt;", "&quot;", "&apos;");

        $all_links  = '';
        mysqltest();

        //      Get all links of the current domain
        $res = mysql_query("select url from ".$mysql_table_prefix."links where site_id = ".(int) $site_id);
        if ($debug > '0') echo mysql_error();

        if ($res) {
            while ($row = mysql_fetch_assoc($res)) {    //      Create individual rows for XML-file
                $link = str_replace($xml_chars, $xml_entities, $row['url']);    // URL should become XML conform
                $all_links .= "<url><loc>$link</loc><lastmod>$date$modtime</lastmod><changefreq>$changefreq</changefreq><priority>$priority</priority></url>\n";
            }
            if ($clear == 1) clean_resource($res, '55') ;
        }

        $name = parse_addr($url);                   //      Create filename and open file
        $hostname = $name['host'];
        $map_name = $hostname;

        if ($hostname == 'localhost'){              //  if we run a localhost system extract the domain
            $pathname = $name['path'];              //  get path, domain and filename
            //  extract domain from path and forget first / by +1 offset
            $pos = (strlen($pathname) > 1) ? strpos($pathname,"/",1) : false;
            $pathname = (string) substr($pathname,$pos+1);  // suppress /localhost/
            $pos = strrpos($pathname,"/");

            if ($pos) {
                $pathname = substr(str_replace("/", "_", $pathname),0,$pos);   // if exists, suppress folder, filename and suffix
            }
            $map_name = "localhost_".$pathname;
        }

        if (!is_dir($smap_dir)) {
            //  NOTE(review): mode 0766 offers no 'x' bit for group and others - confirm this is intended
            mkdir($smap_dir, 0766);     // if new, create directory
        }
        if ($smap_unique == '0') {      // different names for every sitemap file
            $filename   = "./$smap_dir/sitemap_$map_name.xml";
        } else {
            $filename   = "./$smap_dir/sitemap.xml";
        }

        if (!$handle = fopen($filename, "w")) {
            printInvalidFile($filename);
            die ('');
        }

        //      Now write all to XML-file
        if (!fwrite($handle, "$version\n$urlset\n$copyright\n$update\n$all_links</urlset>\n")) {
            printInvalidFile($filename);
            die ('');
        }
        fclose($handle);

        //      sitemap.xml done! Now final printout
        printSitemapCreated($filename);

    }

    /**
     * Build an absolute URL for a (media) link found in a page.
     *
     * Returns an empty string whenever the link is rejected: not accepted by
     * valid_link(), ending with a backslash, pointing to a foreign host while
     * $ex_media is not 1, listed in $apache_indexes, using a scheme other than
     * http/https, or (finally) not containing the host of $mainurl.
     *
     * $current, $handle and $store_file are not used inside this function.
     *
     * @param string $url           link as found in the page
     * @param string $parent_url    URL of the page that contains the link
     * @param mixed  $select        passed through to valid_link()
     * @return string               the absolute URL, or '' if the link is rejected
     */
    function build_url($url, $parent_url, $select, $current, $handle, $store_file) {
        global $clear, $ext, $mainurl, $apache_indexes, $strip_sessids, $ex_media, $clear;

        // find only media-files with allowed file suffix  or type-description  or application descriptor
        $match = valid_link($url, $select);
        if ($match == '0') {
            return '';
        }

        //  links ending with a backslash are rejected
        if (substr($url, -1) == '\\') {
            return '';
        }

        //  NOTE(review): despite its name, $original_parent_url_parts is built from $url,
        //  not from $parent_url - confirm this is intended
        $original_parent_url_parts  = parse_all_url($url);
        $urlparts                   = parse_all_url($url);
        $main_url_parts             = parse_all_url($mainurl);

        //  foreign hosts are accepted only if $ex_media is 1
        if ($urlparts['host'] != "" && $urlparts['host'] != $main_url_parts['host']  && $ex_media != 1) {
            return '';
        }

        //  reject links whose query string is flagged in $apache_indexes
        if (isset($urlparts['query'])) {
            if ($apache_indexes[$urlparts['query']]) {
                return '';
            }
        }

        if (preg_match("/[\/]?mailto:|[\/]?javascript:|[\/]?news:/i", $url)) {
            return '';
        }
        if (isset($urlparts['scheme'])) {
            $scheme = $urlparts['scheme'];
        } else {
            $scheme ="";
        }

        //only http and https links are followed
        if (!($scheme == 'http' || $scheme == '' || $scheme == 'https')) {
            return '';
        }

        //parent url might be used to build an url from relative path
        $parent_url = remove_file_from_url($parent_url);
        $parent_url_parts = parse_all_url($parent_url);


        //  links starting with a slash are attached to scheme and host of the parent,
        //  links without scheme are appended to the parent URL
        if (substr($url, 0, 1) == '/') {
            $url = $parent_url_parts['scheme']."://".$parent_url_parts['host'].$url;
        } else
        if (!isset($urlparts['scheme'])) {
            $url = $parent_url.$url;
        }

        $url_parts = parse_all_url($url);
        $urlpath    = $url_parts['path'];
        $regs       = Array ();

        //  resolve 'folder/../' sequences, one after the other
        while (preg_match("/[^\/]*\/[.]{2}\//", $urlpath, $regs)) {
            $urlpath = str_replace($regs[0], "", $urlpath);
        }

        //remove relative path instructions like ../ etc
        $urlpath = preg_replace("/\/+/", "/", $urlpath);
        $urlpath = preg_replace("/[^\/]*\/[.]{2}/", "",  $urlpath);
        $urlpath = str_replace("./", "", $urlpath);
        $query = "";
        if (isset($url_parts['query'])) {
            $query = "?".$url_parts['query'];
        }
        //  NOTE(review): the port written into the URL is the one of $mainurl,
        //  while the test for an empty port looks at the link itself - confirm
        if ($main_url_parts['port'] == 80 || $url_parts['port'] == "") {
            $portq = "";
        } else {
            $portq = ":".$main_url_parts['port'];
        }

        if (!$urlpath) $urlpath = "/";      //     if not exists, add slash instead of real urlpath
        $url = $url_parts['scheme']."://".$url_parts['host'].$portq.$urlpath.$query;

        if (strstr($url, "/?")) {           //added to address <a href="?id=1"> syntax
            $page = str_replace($main_url_parts['path'], null, $original_parent_url_parts['path']);
            if (substr(trim($mainurl), -1) !== "/" and substr(trim($page), 0, 1) !== "/") {
                $page = "/" . $page;
            }
            $url = $mainurl . $page . $query;

        }

        if ($ex_media == 1) {    	//  if we index sub-domains
            return $url;
        }

        //  $mainurl is declared global above, so this assignment changes the global variable
        $mainurl = remove_file_from_url($mainurl);
        $url = convert_url($url);           // convert 'blank' and '&amp;'

        if ($strip_sessids == 1) {
            $url = remove_sessid($url);
        }

        //  both branches below run the same clean-up; they differ only in the value returned.
        //  unset() on $mainurl removes the local binding only, the global variable is kept
        if (strstr($url, $main_url_parts['host']) == false) {   //  $main_url_parts['host'] will support also relative-back-folder like ../../
            if ($clear == 1) {
                unset ($select, $mainurl, $urlpath, $query, $page);
                $original_parent_url_parts  = array();
                $main_url_parts             = array();
                $url_parts                  = array();
                $urlparts                   = array();
            }
            return '';
        } else {
            if ($clear == 1) {
                unset ($select, $mainurl, $urlpath, $query, $page);
                $original_parent_url_parts  = array();
                $main_url_parts             = array();
                $url_parts                  = array();
                $urlparts                   = array();
            }
            return $url;
        }
    }

    /**
     * Prefix the relative src, classid, data and value links of a frame body
     * with the folder of the frame URL.
     *
     * This function is used only for frames and iframes in order to correct the
     * link URL with respect to the found frame-folder. Only the first link
     * attribute found in $body is inspected (the first 20 characters, starting
     * at the attribute name): it is assumed that all links of a page are of the
     * same kind. If that link contains 'http' or './' (which also covers '../'),
     * nothing is altered. href attributes are never altered.
     *
     * @param string $body  HTML body of the frame
     * @param string $url   URL of the frame
     * @return string       body with corrected links
     */
    function make_abslinks($body, $url){
        //  everything up to and including the last slash of the frame URL
        $domain = substr($url, 0, strrpos($url, "/")+1);

        //  take the first attribute that is really present in the body;
        //  a 'false' delivered by strpos() must never be used as an offset
        $link = '';
        foreach (array("src=", "classid=", "data=", "value=") as $attribute) {
            $found_link = strpos($body, $attribute);
            if ($found_link !== false) {
                $link = substr($body, $found_link, 20);
                break;
            }
        }

        if ($link === '') {     //  no link attribute at all, nothing to correct
            return $body;
        }

        $abs = strpos($link, "http") !== false;
        $rel = strpos($link, "./") !== false;   //  matches './' as well as '../'

        if (!$abs && !$rel) {      //  add domain to link, href is not altered
            //  plain string replacement: '$' or '\' in the URL must not be read as back-reference
            $body = str_replace(
                array("src=\"", "classid=\"", "data=\"", "value=\""),
                array("src=\"".$domain, "classid=\"".$domain, "data=\"".$domain, "value=\"".$domain),
                $body
            );
        }

        return $body;
    }

    /**
     * Collect the content of all frames of a frameset.
     *
     * Every frame source accepted by url_purify() is reported by
     * printNewLinks(). Sources looking like a document (.html, .htm, .xml,
     * .php, .aspx) are fetched and the inner part of their <body> element is
     * collected; all other sources are assumed to be images and are collected
     * as <img> tag.
     *
     * @param string $frame             HTML code containing the frame definitions
     * @param string $url               URL of the page containing the frames
     * @param mixed  $can_leave_domain  passed through to url_purify()
     * @return string                   collected content, separated by <br />; '' if nothing was found
     */
    function get_frames($frame, $url, $can_leave_domain) {
        global $abslinks;

        $links          = array ();
        $regs           = array ();
        $replace        = '';
        $get_charset    = '';
        $care_excl      = '1';   //  care file suffixed to be excluded
        $relocated      = '';    //  URL is not relocated
        $local_redir    = '';
        //  find all frames of the frameset
        preg_match_all("/(frame[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $frame, $regs, PREG_SET_ORDER);
        foreach ($regs as $val) {
            if (($a = url_purify($val[2], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                $links[] = ($a);    // collect  all frame links
            }
        }

        foreach ($links as $frame_url) {
            printNewLinks($frame_url);

            //  the dots are escaped, so that only real file suffixes are accepted
            if (!preg_match("/\.html?|\.xml|\.php|\.aspx/i", $frame_url)) {
                //  might be an image
                $replace = "".$replace."<br /><img src=\"".$frame_url."\">";
                continue;
            }

            $contents = getFileContents($frame_url, $get_charset);      //      get content of this frame
            //  getFileContents() does not deliver a 'file' entry if the host is not reachable
            $frame_html = isset($contents['file']) ? (string) $contents['file'] : '';
            if ($frame_html == '') {
                continue;
            }

            //  separate the body part of this frame: everything between the end of the
            //  opening <body ...> tag (with or without attributes) and the closing </body tag.
            //  Without <body> tag the complete content is used.
            $body       = $frame_html;
            $body_tag   = stripos($frame_html, "<body");
            if ($body_tag !== false) {
                $start_body = strpos($frame_html, ">", $body_tag);
                if ($start_body !== false) {
                    $start_body++;
                    $end_body = stripos($frame_html, "</body", $start_body);
                    if ($end_body === false) {
                        $body = (string) substr($frame_html, $start_body);
                    } else {
                        $body = (string) substr($frame_html, $start_body, $end_body - $start_body);
                    }
                }
            }

            if ($abslinks == '1') {
                $body = make_abslinks($body, $frame_url);     //  if required, correct links relative to found frame
            }
            $replace = "".$replace."<br />".$body."";
        }
        return $replace;
    }

    /**
     * Collect all <$element> ... </$element> sections of a page and append them
     * to $all_media.
     *
     * Every entry appended to $all_media is an array with two items. Nested
     * elements of the same name and <embed> elements are appended as entries
     * of their own, but only if the global $index_embeded is '1'.
     *
     * @param string $element       tag name to search for, e.g. 'object'
     * @param array  $all_media     media elements collected so far
     * @param string $raw_file      HTML code to be searched
     * @param array  $regs          overwritten by preg_match_all(); the value passed in is not used
     * @param mixed  $trash1        search argument for str_replace(), applied to each element found
     * @param mixed  $replace1      replacement for $trash1
     * @return array                $all_media, extended by the elements found
     */
    function get_elements($element, $all_media, $raw_file, $regs, $trash1, $replace1) {
        global $clear, $index_embeded;

        //  PREG_SET_ORDER: each $val is an array, [0] = complete element, [1] = part captured behind '<$element'
        preg_match_all("/<$element(.*?)<\/$element\s*>/si", $raw_file, $regs, PREG_SET_ORDER);    //get 'object' elements
        foreach ($regs as $val) {
            //  preg_replace() and str_replace() are applied to both items of the $val array
            $val = preg_replace("@<map.*?map>@si", " ",$val);           //  kill <map> elements in object

            $val = str_replace("  ","", str_replace($trash1, $replace1, $val));
            //      this must be an object but not client- or server-side maped, not ActiveX and no Java Script
            //  NOTE(review): $val is an array at this point, while preg_match() expects a string
            //  as subject - verify that this filter really rejects anything
            if (!preg_match("/[\/]?usemap|[\/]?ismap|[\/]?javascript:|[\/]?java:|[\/]?clsid:/i", $val)) {
                $all = $val;
                //  number of times the element name shows up inside the element itself
                $nested = substr_count(lower_case($val[1]), $element);
                if ($nested) {
                    //  cut off the nested elements one by one, starting with the last one
                    while ($nested > '0') {
                        $inner = array();
                        $inner[0] = '';
                        $last_pos = strrpos (lower_case($all[1]), $element);    // find inner nested  element
                        $inner[1] = substr($all[1], $last_pos);                 // separate inner nested element
                        if ($index_embeded == '1') {
                            $inner = array_reverse($inner);         //  move <object> into [0] of array
                            $all_media[] = $inner;                  // save actual element
                        }
                        $all[1] = substr($all[1], 0, $last_pos);    // get previous element

                        $nested--;
                    }
                }

                if ($index_embeded == '1') {            //  search for embeded objects
                    //  preg_match() delivers the first <embed> element only; $regs (and $val) are
                    //  overwritten here while the outer foreach is still running
                    if (preg_match("/<embed(.*?)<\/embed\s*>/si", $all[1], $regs)) {;    //get 'embed' elements
                    foreach ($regs as $val) {
                        $embed[0] = $val;
                        $embed[1] = '';
                        //  the captured inner part usually does not contain the word 'embed' and is skipped
                        if (strstr($embed[0], 'embed')) {
                            $all_media[] = $embed;  // save embeded element
                        }
                    }
                    }
                }
                //  reduce both items to their leading part, up to and including the first '>'
                $all[0] = substr($all[0], '0', strpos($all[0], '>')+1);         //  kill nested elements in object[0]
                $all[1] = substr($all[1], '0', strpos($all[1], '>')+1);         //  kill nested elements in object[1]
                $all[1] = preg_replace("@<embed.*?embed>@si", " ",$all[1]);     //  kill <embed> element in object

                if (strstr(lower_case($all[1]), '<object')) {
                    $all = array_reverse($all);     //  move <object> into [0] of array
                }
                $all_media[] = $all;                //  save outer element
            }
        }
        if ($clear == '1') unset ($all, $val, $regs, $embed, $inner, $element);
        return $all_media;
    }

    /**
     * Read the ID3 / EXIF information of a media file by means of getID3 and
     * return it as one string.
     *
     * @param string $link          path or URL of the media file
     * @param mixed  $build_tmp     '1' if $link is remote and its first part has to be copied
     *                              into a temporary local file before the analysis
     * @param mixed  $cl            passed through to printWarning()
     * @return string               all entries found, sorted, without duplicates and separated by
     *                              '<br />'; '' if the file is remote, unreachable or offers no information
     */
    function get_id3string($link, $build_tmp, $cl) {
        global $clear, $case_sensitive, $curl, $debug;

        $id3_string     = '';
        $localtempfile  = $link;
        $unreachable    = '';
        $tmp_created    = false;    //  do we have to delete a temporary file at the end?

        if ($build_tmp == '1') {        //  we need to build a temporary file
            mysqltest();
            if ($fp_remote = @fopen($link, 'rb')) {
                $localtempfile = tempnam('./tmp', 'getID3');
                $tmp_created = true;
                if ($fp_local = fopen($localtempfile, 'wb')) {
                    //  this will read up to 4 chunks of 8192 byte (max. 32 kByte) of the media file
                    for ($i = 1; $i <= 4; $i++) {
                        $buffer = @fread($fp_remote, 8192);
                        fwrite($fp_local, $buffer);
                    }
                    fclose($fp_local);
                } else {
                    $unreachable = '1';    //   unable to write to temp-file
                }
                fclose($fp_remote);     //  the remote file is no longer required
            } else {    //  if impossible to open by PHP function 'fopen()', try to open this image by means of cURL library
                if ($curl == '1') {    //  if cURL library is available
                    if($buffer = curl_open($link)) {
                        $localtempfile = tempnam('./tmp', 'getID3');
                        $tmp_created = true;
                        if ($fp_local = fopen($localtempfile, 'wb')) {
                            fwrite($fp_local, $buffer);
                            fclose($fp_local);  //  close only what was really opened
                        } else {
                            $unreachable = '1';    //   unable to write to temp-file
                        }
                    } else {
                        $unreachable = '2'; //  unable to open the remote file by cURL
                    }
                } else {
                    $unreachable = '3'; //  no cURL library available
                }
            }
            if ($debug == '2' && $unreachable) {
                if ($unreachable == '1') $report = "Unable to write to temp-file.";
                if ($unreachable == '2') $report = "Unable to open the remote media file $link by cURL function.";
                if ($unreachable == '3') $report = "Unable to open media file $link by means of PHP function fopen(), nor cURL library available.";
                printWarning($report, $cl);
            }
        }

        // Remote files (http, https and ftp) are not supported
        if (!preg_match('/^(https?|ftp):\/\//i', $localtempfile) && !$unreachable) {
            $getID3 = new getID3;   // Initialize getID3 engine
            $getID3->encoding = 'UTF-8';

            $This_ID3 = array();    //  remains empty if the analysis fails
            try {
                $This_ID3 = $getID3->analyze($localtempfile);
            }
            catch (Exception $e) {
                //  Exception::$message is protected and only readable via getMessage()
                $report = "Problem when analysing media file. ".$e->getMessage().".";
                printWarning($report, $cl);
            }
            if (!is_array($This_ID3)) {
                $This_ID3 = array();
            }

            //  prepare all relevant ID3 and EXIF information into array
            //  (three levels are evaluated, deeper levels are reserved for future releases)
            $id3_array = array();
            foreach ($This_ID3 as $key1 => $section1) {
                if (!is_array($section1)) {
                    if ($key1 != "GETID3_VERSION") {
                        $id3_array[] = " >> ".$key1." ;; ".$section1." ";
                    }
                    continue;
                }

                foreach ($section1 as $name1 => $val1) {
                    if (!is_array($val1)) {
                        if (strlen($val1) < 100   && $val1 != "") {
                            $id3_array[] = " ".$key1." >> ".$name1." ;; ".$val1." ";
                        }
                        continue;
                    }

                    foreach ($val1 as $key2 => $section2) {
                        if (!is_array($section2)) {
                            continue;       //  scalar values at this level are not evaluated
                        }
                        foreach ($section2 as $name2 => $val2) {
                            if (is_array($val2)) {
                                continue;   //  for future releases
                            }
                            if (strlen($val2) < 100 && $key2 != "THUMBNAIL"  && $key2 != "keyframes"  && $val2 != "") {
                                $id3_array[] = " ".$key2." >> ".$name2." ;; ".$val2." ";
                            }
                        }
                    }
                }
            }

            sort($id3_array);
            $id3_string = implode("<br />",array_unique($id3_array));  //  convert array into string with <br /> as delimiter

            if ($case_sensitive == '0') {
                $id3_string = lower_ent($id3_string);
                $id3_string = lower_case($id3_string);
            }
            if ($clear == '1') {
                unset ($key1, $key2, $name1, $name2, $val1, $val2);
                unset ($section1, $section2, $This_ID3, $getID3);
                $id3_array = array();
            }
        }

        //  Delete temporary file, also if it could not be filled or analysed
        if ($tmp_created && $localtempfile && is_file($localtempfile)) {
            unlink($localtempfile);
        }
        return $id3_string;
    }

    /**
     * Collect the ID3/EXIF meta information of a local media file as one string.
     *
     * The file is analysed with the getID3 engine. Scalar values found at the
     * top level of the result and in the first two nesting levels below it are
     * turned into entries of the form " section >> name ;; value ", which are
     * sorted, de-duplicated and joined with "<br />".
     *
     * @param string $localtempfile Path of the file to analyse. Locations
     *                              starting with http://, https:// or ftp://
     *                              are not analysed.
     * @return string The joined meta information. Empty string for remote
     *                locations, when the analysis throws an exception, or when
     *                it does not deliver an array.
     */
    function get_exif($localtempfile) {
        global $clear, $case_sensitive;

        // Remote files are not supported
        if (preg_match('/^(ht|f)tp:\/\//', $localtempfile)) {
            return '';
        }

        $getID3 = new getID3;   // Initialize getID3 engine
        $getID3->encoding = 'UTF-8';

        try {
            $This_ID3 = $getID3->analyze($localtempfile);
        }
        catch (Exception $e) {
            echo 'Problem to analyze media file '.$localtempfile.' : ' .  $e->getMessage();
            return '';      //  nothing to extract without an analysis result
        }

        if (!is_array($This_ID3)) {
            return '';
        }

        //  prepare all relevant ID3 and EXIF information into array
        $id3_array = array();
        foreach ($This_ID3 as $key1 => $section1) {

            if (!is_array($section1)) {
                //  scalar value on top level
                if ($key1 != "GETID3_VERSION") {
                    $id3_array[] = " >> ".$key1." ;; ".$section1." ";
                }
                continue;
            }

            foreach ($section1 as $name1 => $val1) {

                if (!is_array($val1)) {
                    //  scalar value on first level: only short, non-empty values
                    if (strlen((string) $val1) < 100   && $val1 != "") {
                        $id3_array[] = " ".$key1." >> ".$name1." ;; ".$val1." ";
                    }
                    continue;
                }

                foreach ($val1 as $key2 => $section2) {
                    //  thumbnails and keyframe tables are never collected
                    if (!is_array($section2) || $key2 == "THUMBNAIL" || $key2 == "keyframes") {
                        continue;
                    }

                    foreach ($section2 as $name2 => $val2) {
                        if (is_array($val2)) {
                            continue;   //  deeper levels: for future releases
                        }
                        if (strlen((string) $val2) < 100 && $val2 != "") {
                            $id3_array[] = " ".$key2." >> ".$name2." ;; ".$val2." ";
                        }
                    }
                }
            }
        }

        sort($id3_array);
        $id3_string = implode("<br />",array_unique($id3_array));  //  convert array into string with <br /> as delimiter

        if ($case_sensitive == '0') {
            $id3_string = lower_ent($id3_string);
            $id3_string = lower_case($id3_string);
        }
        if ($clear == '1') {
            //  release the (possibly large) analysis data as early as possible
            unset ($This_ID3, $getID3, $id3_array);
        }

        return $id3_string;
    }

    /**
     * Check the database connection and try to re-establish it if it was lost.
     *
     * As long as mysql_ping() reports FALSE, up to five reconnect attempts are
     * made. Each attempt prints the 'noSQL' report, waits 30 seconds, opens a
     * persistent connection and selects the database. If the ping still
     * reports FALSE after the attempts, the final error reports are printed
     * and the script is terminated. Otherwise the 'newSQL' report is printed.
     *
     * @return string '' if the initial ping succeeded or the last reconnect
     *                attempt succeeded; '1' if the last attempt could not
     *                connect or could not select the database.
     */
    function mysqltest(){
        global $db_con, $database, $mysql_host, $mysql_user, $mysql_password, $command_line;

        $mysql_fail = '';
        if (@mysql_ping($db_con) === FALSE){
            $dbtries = 0;
            while ($dbtries < 5 && @mysql_ping($db_con) === FALSE){
                $dbtries++;
                //  every attempt starts clean, so that a failure of an earlier
                //  attempt is not reported after a later attempt succeeded
                $mysql_fail = '';
                printDB_errorReport('noSQL',$command_line, '1');
                sleep(30);
                $db_con = @mysql_pconnect ($mysql_host, $mysql_user, $mysql_password);
                if (!$db_con) {
                    $mysql_fail = '1';      //  cannot connect to database
                } else {
                    $success = @mysql_select_db ($database, $db_con);
                    if (!$success) {
                        $mysql_fail = '1';  //  cannot choose database
                    }
                }
            }
            if (@mysql_ping($db_con) === FALSE){
                printDB_errorReport('noSucc',$command_line, '1');   //  failed 5 times. End of index procedure
                printDB_errorReport('aborted',$command_line, '1');
                printDB_errorReport('end',$command_line, '1');

                die('');
            }
            printStandardReport('newSQL',$command_line, '1');   //  reconnected to db
        }
        return $mysql_fail;
    }

    /**
     * Free a MySQL result and flush the query cache, if cleaning is enabled.
     *
     * Nothing happens unless the global $clear equals '1' and $result is
     * truthy, or if mysqltest() reports a failure.
     *
     * @param mixed  $result Result of a previous mysql_query() call.
     * @param string $event  Identifier of the calling position; passed on to
     *                       the report functions.
     * @return void
     */
    function clean_resource($result, $event) {
        global $clear, $db_con, $debug, $cl;

        //  cleaning switched off, or nothing to be cleaned
        if ($clear != '1' || !$result) {
            return;
        }

        //  no usable database connection
        $db_failed = mysqltest();
        if ($db_failed) {
            return;
        }

        if ($result == '') {
            printFreeRes($event, $cl);
        }

        $released = mysql_free_result($result);
        if ($released != '1') {
            printFreeMySQL($result, $event, $cl);
        }

        mysqltest();
        //  DO NOT USE THE NEXT ROW ON SHARED HOSTING SYSTEMS ! ! !   'flush query cache' could be forbidden.
        @mysql_query("FLUSH QUERY CACHE");
        if ($debug > '0') {
            echo mysql_error();
        }
    }

    /**
     * Check whether the path of a URL ends with one of the given suffixes.
     *
     * @param string $url    The URL to be checked; only its 'path' part as
     *                       delivered by parse_all_url() is inspected.
     * @param array  $select List of suffixes without the leading dot. Each
     *                       entry is inserted into a regular expression as is.
     * @return string '1' if any suffix matches (case-insensitive), else '0'.
     */
    function valid_link($url, $select) {

        $parts      = parse_all_url($url);
        $file_path  = $parts['path'];   //  domain and query are not part of it

        //  everything from the last dot of the path up to its end
        $dot_pos    = strrpos($file_path, ".");
        $ending     = lower_case(substr($file_path, $dot_pos));

        $found = '0';
        foreach ($select as $wanted) {
            if (preg_match("/\.$wanted$/i", $ending)) {
                $found = '1';
            }
        }

        return $found;
    }

    /**
     * Convert smilies and the most important BBCode tags of a text.
     *
     * Processing is done in three steps:
     *  1. Every known smiley gets its textual description appended or is
     *     replaced by it.
     *  2. The BBCode tags listed in $bb_search are replaced by HTML markup and
     *     "about:" is defused by a trailing blank.
     *  3. Brackets of all remaining (not converted) tags are surrounded by
     *     blanks.
     *
     * @param string $text Text that may contain smilies and BBCode.
     * @return string The converted text.
     */
    function bbcode($text) {
        //      encrypt Smilies
        //      The list is processed from top to bottom, so the longer smilies
        //      have to stay in front of the shorter ones they contain.
        $smiles = array(
            '&lt;:)&gt;'    => '&lt;:)&gt; beard',
            '&gt;:)'        => '&gt; Evil',
            ':)'            => ':) Smile',
            '|:('           => '|:( Headbanger',
            ':('            => ':( Angry',
            ':\'('          => ':\ Rears',
            ':o'            => ':o Amazed',
            ':D'            => ':D Big Smile',
            ':r'            => ':r Disgusted',
            ':9~'           => ':9~ Jummy!',
            ':9'            => ':9 Delicious',
            ';)'            => ';) Wink',
            ':7'            => ':7 Love It',
            ':+'            => ':+ Clown',
            'O+'            => 'O+ Heart',
            ':*'            => ':* Kiss',
            '}:O'           => '}: Stupid Cow',
            '^)'            => '^) Married',
            '_O_'           => '_O_ Worshippie',
            ':W'            => ':W Wave goodbye',
            '^O^'           => '^O^ Way To Go!',
            ':?'            => ':? Come Again?',
            '(8&gt;'        => '(8&gt; Spy vs. Spy',
            ':Y)'           => ':Y) Vork',
            ':Z'            => 'Sleeping',
            ';('            => 'cry',
            '}:|'           => '}:| Grmbl',
            ':z'            => ':z Sleepy',
            '}&gt;'         => '}&gt; Evil',
            ':X'            => ':X Hgnn',
            ':O'            => ':O Booooring',
            '*)'            => '*) Prodent',
            ':{'            => ':{ Uhuh',
            'O-)'           => 'O-) The Saint',
            '8-)'           => '8-) Sunchaser',
            '*;'            => '*;Liefde is',
            ':Y'            => ':Y Yes',
            ':N'            => ':N No',
            ':@'            => ':@ Ashamed',
            '8)7'           => '8)7 Twisted',
            ':P'            => ':P puh'
        );

        //      str_replace() handles the pairs one after the other in array order
        $text = str_replace(array_keys($smiles), array_values($smiles), $text);

        //      convert most important bbcodes
        //      All patterns are ungreedy (modifier U). Therefore the first url
        //      pattern needs the closing bracket behind the back reference, like
        //      the second one; otherwise the host name is not captured.
        $bb_search = array(
            "/(\[)(url)(=)(['\"]?)(www\.)([^\"']*)(\\4\])(.*)(\[\/url\])/siU",
            "/(\[)(url)(=)(['\"]?)([^\"']*)(\\4])(.*)(\[\/url\])/siU",
            "/(\[)(url)(\])(www\.)([^\"]*)(\[\/url\])/siU",
            "/(\[)(url)(\])([^\"']*)(\[\/url\])/siU",
            "/(\[)(email)(\])([^\"']*)(\[\/email\])/siU",
            "/(\[)(email)(=)(['\"]?)([^\"']*)(\\4])(.*)(\[\/email\])/siU",
            "/(\[)(color=)([^\W]*)(\])(.*)(\[\/color\])/siU",
            "/(\[)(size=)([^\.]*)(\])(.*)(\[\/size\])/siU",
            "/(\[)(font=)([^\W]*)(\])(.*)(\[\/font\])/siU",
            "/(\[)(b)(\])(\r\n)*(.*)(\[\/b\])/siU",
            "/(\[)(u)(\])(\r\n)*(.*)(\[\/u\])/siU",
            "/(\[)(i)(\])(\r\n)*(.*)(\[\/i\])/siU",
            "/(\[)(indent)(\])(\r\n)*(.*)(\[\/indent\])/siU",
            "/(\[)(center)(\])(\r\n)*(.*)(\[\/center\])/siU",
            "/(\[)(left)(\])(\r\n)*(.*)(\[\/left\])/siU",
            "/(\[)(right)(\])(\r\n)*(.*)(\[\/right\])/siU",
            "/(\[)(quote)(\])(\r\n)*(.*)(\[\/quote\])/siU",
            "/(\[)(code)(\])(\r\n)*(.*)(\[\/code\])/siU",
            "/(\[)(pre)(\])(\r\n)*(.*)(\[\/pre\])/siU",
            "/(\[)(img)(\])(?!javascript:)(\r\n)*([^\"']*)(\[\/img\])/siU",
            "/about:/si");

        //      replacements, in the same order as the patterns above
        $replace = array(
            "<a href=\"http://www.\\6\" target=\"_blank\">\\8</a>",
            "<a href=\"\\5\" target=\"_blank\">\\7</a>",
            "<a href=\"http://www.\\5\" target=\"_blank\">\\5</a>",
            "<a href=\"\\4\" target=\"_blank\">\\4</a>",
            "<a href=\"mailto:\\4\" target=\"_blank\">\\4</a>",
            "<a href=\"mailto:\\5\" target=\"_blank\">\\7</a>",
            "<span style=\"color:\\3;\">\\5</span>",
            "<span style=\"font-size:\\3;\">\\5</span>",
            "<span style=\"font-family:\\3;\">\\5</span>",
            "<b>\\5</b>",
            "<u>\\5</u>",
            "<i>\\5</i>",
            "<blockquote>\\5</blockquote>",
            "<center>\\5</center>",
            "<left>\\5</left>",
            "<right>\\5</right>",
            "<blockquote>Quote:
<hr>
\\5<hr></blockquote>",
            "<blockquote>Code:
<hr>
\\5<hr></blockquote>",
            "<pre>Code:
\\5</pre>",
            "<img src=\"\\5\" border=\"0\">",
            "about: ");
        $text= preg_replace($bb_search, $replace, $text);

        //      Create surrounding spaces for not yet encoded BB's
        $text = str_replace("[", " [", $text);
        $text = str_replace("]", "] ", $text);

        return ($text);
    }

    /**
     * Deliver the current Unix timestamp including microseconds.
     *
     * @return float Seconds since the Unix epoch, with the fraction of the
     *               current second.
     */
    function microtime_float(){
        //  microtime(true) delivers the float directly; no need to split and
        //  add up the "usec sec" string
        return microtime(true);
    }

    function index_url($url, $level, $site_id, $md5sum, $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_prefcharset) {
        global $entities, $min_delay, $link_check, $command_line, $min_words_per_page, $dup_content, $dup_url, $quotes, $plus_nr;
        global $min_words_per_page,  $supdomain, $smp, $follow_sitemap,  $max_links, $realnum, $local, $tmp_dir, $auto_add, $admin_email;
        global $mysql_table_prefix, $user_agent, $tmp_urls, $delay_time, $domain_arr, $home_charset, $charSet, $url_status;
        global $debug, $common, $use_white1, $use_white2, $use_black, $whitelist, $blacklist, $clear, $db_con, $abslinks;
        global $index_media, $index_image, $suppress_suffix, $imagelist, $min_image_x, $min_image_y, $dup_media, $index_alt, $no_log, $index_rss;
        global $index_audio, $audiolist, $index_video, $videolist, $index_embeded, $rss_template, $index_csv, $delim, $ext, $index_id3, $dba_act;
        global $converter_dir, $dict_dir, $cn_seg, $jp_seg, $index_framesets, $index_iframes, $cdata, $dc, $preferred, $index_rar, $index_zip, $curl;
        global $docs, $only_docs, $only_links, $case_sensitive, $vowels, $noacc_el, $include_dir, $thumb_folder, $js_reloc, $server_char;

        $data       = array();
        $cn_data    = array();
        $topic      = '';
        $url_reloc  = '';
        $comment    = mysql_real_escape_string("Automatically added during index procedure, as this domain is not yet available in 'Sites' menu.");
        $admin_email =  mysql_real_escape_string($admin_email);

        if ($debug == '0'){
            if (function_exists("ini_set")) {
                ini_set("display_errors", "0");
            }
            error_reporting(0) ;
        } else {
            error_reporting (E_ERROR) ;     //  otherwise  a non existing siemap.xml  would always cause a warning message
        }

        $needsReindex = 1;
        $deletable = 0;
        $url_status = url_status($url);

        $url_parts  = parse_all_url($url);
        $thislevel = $level - 1;

        if ($url_status['relocate'] ){          //  if relocated,  print message and redirect to new URL

            //  remove the original URL from temp table. The relocated URL will be added later on.
            mysqltest();
            mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
            if ($debug > '0') echo mysql_error();

            $new_url = $url_status['path'] ;
            $diff = strlen($url);
            $redir = substr( $new_url, $diff);      //      extract diff. between original URL and relocated URL
//echo "\r\n\r\n<br /> relocated new_url: '$new_url'<br />\r\n";
            if ($redir == "index.php" || $redir == "index.html" || $redir == "index.htm" || $redir == "home.html") {
                $local_redir = '1';     //  no output because diff.  is only index.html etc.
            } else {
                printRedirected($url_status['relocate'], $url_status['path'], $cl);
            }

            if (strstr($url_status['path'], "//")) {                            //  if redirected to absolute URL, use this for further usage
                $url = $url_status['path'];
            } else {
                $relo_url = str_replace($url_parts['query'], "", $url);         //  url without query
                $relo_url = substr($url, 0, strrpos($relo_url, "/")+1);         //  url without file name
                if (strpos($url_status['path'], "./") === 0) {                  //  if redirected relativ to same folder depth
                    $url_status['path'] = str_replace("./", "", $url_status['path']);
                    $url = "".$relo_url."".$url_status['path']."";
                }
                if (strpos($url_status['path'], "../") === 0) {                 //  if redirected relativ and one folder up
                    $url_status['path'] = str_replace("./", "", $url_status['path']);
                    $relo_url = substr($url, 0, strpos($url_parts['path']));    //  url without file name
                    $relo_url = substr($url, 0, strrpos($relo_url, "/")+1);     //  url without last folder
                    $url = "".$relo_url."".$url_status['path']."";
                }
            }
            $url_reloc  = $url;                 //  remember the relocated url in order to redefine $mainurl
            $url_status = url_status($url);     //  get the status of the relocated URL
            $url_parts  = parse_all_url($url);  //  rebuild the url parts from the relocated URL
        }

        if ($smp != 1 && $follow_sitemap == 1) {        //  enter here if we don't already know a valid sitemap and if admin settings allowed us to do so
            $tmp_urls = get_temp_urls($sessid);         //  reload previous temp
            $url2 = remove_sessid(convert_url($url));

            // get folder where sitemap should be and if exists, cut existing filename, suffix and subfolder
            $host = parse_addr($url2);
            $hostname = $host[host];
            $more_sitemaps = array ();

            if ($hostname == 'localhost') $host1 = str_replace($local,'',$url2);
            $pos = strpos($host1, "/");                //      on local server delete all behind the /

            if ($pos) $host1 = substr($host1,0,$pos);   //      build full adress again, now only until host
            if ($hostname == 'localhost') {
                $url2 = ("".$local."".$host1."");
            }else {
                $url2 = ("$host[scheme]://$hostname");
            }

            $sitemap_name   = "sitemap";                        //      standard name for sitemap file
            $input_file     = "$url2/$sitemap_name";            //      create path to sitemap
            $log_file       = './sitemaps/current_sitemap.xml'; //      destination for sitemap log-file
            $smap_found     = '';
            $indexed_map    = '';
            $map_cont       = '';

            //  try to fetch individual sitemap url from database
            mysqltest();
            $result = mysql_query("select smap_url from ".$mysql_table_prefix."sites where site_id='$site_id'");
            if ($debug > '0') echo mysql_error();
            $row = mysql_fetch_row($result);
            if (preg_match("/http:\/\//", $row[0])) {  //   use the individual sitemap
                $input_file = preg_replace("/.xml.gz|.xml/i", "", $row[0]);
            }

            $file = "".$input_file.".xml";
            if ($fd = @fopen($file, "r")) {    //  uncompressed ?
                //if ($zd = @gzopen("".$input_file.".xml", "r")) {    //  uncompressed ?

                $map_cont = @stream_get_contents($fd);
                if ($map_cont && strpos($map_cont, "schemas/sitemap")) {        //  if we were able to read it
                    $smap_found = '1';
                }
                fclose($fd);
            }

            $gz_file = "".$input_file.".xml.gz";
            if (!$smap_found && $zd = @fopen("compress.zlib://$gz_file", "r")) {  // compressed  ?
                //if (!$smap_found && $zd = @gzopen("".$input_file.".xml.gz", "r")) {  // compressed  ?
                $map_cont = @gzread($zd, 10485760);      //  max. 10 MB (might be too large for some server)
                gzclose($zd);
                if ($map_cont && strpos($map_cont, "schemas/sitemap")) {
                    $smap_found = '1';
                }
            }
            //echo "\r\n\r\n<br>map_cont Array:<br><pre>";print_r($map_cont);echo "</pre>\r\n";
            if($smap_found) {
                if ($debug != '0') {    //      create a log-file of current sitemap.xml
                    file_put_contents($log_file, $map_cont);
                }

                //$del = mysql_query("delete from ".$mysql_table_prefix."temp"); // function get_sitemap and store_links will build a new temp table
                if (stristr($map_cont, "<sitemapindex")) {   //      if current sitemap file is an index file
                    printStandardReport('validSitemapInd',$command_line, $no_log);
                    $get_maps = simplexml_load_string ($map_cont);
                    if ($get_maps) {
                        reset($get_maps);
                        foreach($get_maps as $map_x) {
                            $new_links[] =($map_x->loc); //   get all links to further sitemap files
                        }
                        if (is_array($new_links)) {     //      if we found more sitemap files
                            $new_links = explode(",",(implode(",",$new_links))); // destroy SimpleXMLElement Object and get the link array
                            $new_links = array_slice($new_links, 0, $max_links);
                            $indexed_map = '1';
                            $i = '0';
                            //echo "\r\n\r\n<br>new_links Array:<br><pre>";print_r($new_links);echo "</pre>\r\n";
                            foreach($new_links as $input_file) {
                                $these_links = get_sitemap($input_file, $indexed_map, $mysql_table_prefix); // now extract page links from this sitemap file
                                //echo "\r\n\r\n<br>these_links Array:<br><pre>";print_r($these_links);echo "</pre>\r\n";
                                if ($these_links){
                                    reset($these_links);
                                    store_newLinks($these_links, $level, $sessid);
                                    $smp = '1';  //     there were valid sitemap files and we stored the new links
                                    $i++;
                                } else {
                                    printStandardReport('invalidSecSitemap',$command_line, $no_log);    //  unable to extract links from secondary sitemap file
                                }
                            }
                            printValidSecSmap($i, $cl);
                            unset ($input_file, $map_cont, $new_links);
                        } else {
                            printStandardReport('invalidSecSitemap',$command_line, $no_log);    //  unable to extract links from secondary sitemap file
                        }
                    } else {
                        printStandardReport('invalidSitemapInd',$command_line, $no_log);        //  unable to extract links from sitemap INDEX  file
                    }
                } else {
                    $links = get_sitemap($map_cont, $indexed_map, $mysql_table_prefix);         // extract links from sitemap.xml  (there was only one sitemap file)
                    if ($links !='') {
                        reset ($links);
                        //echo "\r\n\r\n<br>sitemmap links Array:<br><pre>";print_r($links);echo "</pre>\r\n";
                        store_newLinks($links, $level, $sessid);
                        $smp = '1';  //     there was one valid sitemap and we stored the new links
                        printStandardReport('validSitemap',$command_line, $no_log);

                    } else {
                        printStandardReport('invalidSitemap',$command_line, $no_log);
                    }
                    unset ($links);
                }
            }
        }

        if ($debug == '0'){
            if (function_exists("ini_set")) {
                ini_set("display_errors", "0");
            }
            error_reporting(0) ;
        } else {
            error_reporting (E_ALL ^ E_NOTICE ^ E_WARNING) ;
        }

        if (strstr($url_status['state'], "Relocation") || $url_status['relocate'])  {

            $care_excl = '1';   //  care file suffixed to be excluded
            $relocated = '1';   //  URL is relocated

            $url = preg_replace("/ /i", "", url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir));

            //  check for unsupported file suffix
            if ($care_excl == '1') {
                reset($ext);
                while (list ($id, $excl) = each($ext)){
                    if (preg_match("/\.$excl$/i", $url_status['path'])) {
                        $url = 'excl';
                    }
                }
            }

            if ($url <> '' && $url != "self" && $url != "excl") {
                mysqltest();
                $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
                if ($debug > '0') echo mysql_error();
                $rows = mysql_num_rows($result);
                if ($rows == 0) {
                    mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
                    if ($debug > '0') echo mysql_error();
                }

                //  check whether the redirected URL is already known and in database
                mysqltest();
                $query = "select indexdate from ".$mysql_table_prefix."links where url='$url'";
                $result = mysql_query($query);
                if ($debug > '0') echo mysql_error();
                $rows = mysql_num_rows($result);
                if ($rows) {
                    $url_status['state'] = "Already in database";
                } else {
                    $url_status['state'] = 'ok';   //  okay to index the relocated URL
                }

                if ($clear == 1) clean_resource($result, '02') ;
            } else {
                if ($url == "self") {
                    $url_status['state'] = "Relocated to the caling URL. Blocked, because this could cause an infinite loop.";
                }
                if  ($url == "excl") {
                    $url_status['state'] = "Relocated to currently unsupported file suffix.";
                } else {
                    $url_status['state'] = "Redirected out of domain: $domain";
                }
            }

        }

        if ($url_status['state'] == 'ok') {

            $OKtoIndex = 1;
            $file_read_error = 0;

            if (time() - $delay_time < $min_delay) {
                sleep ($min_delay- (time() - $delay_time));
            }
//echo "\r\n\r\n<br>url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n";

            if ($url_status['body']) {
                $file = $url_status['body'];
            } else {
                //  fetch the file content
                $delay_time     = time();
                $contents       = array();
                $chrSet         = '';
                $file           = '';
                $get_charset    = '1';

                $file = file_get_contents($url);
//echo "\r\n\r\n<br /> file0: '$file'<br />\r\n";
                //  try to get the contents with a slash at the end of the path
                if (!$file) {
                    if (!isset($url_parts['query']) && $url_parts['path'] != "/" && $url_status['path1']) {
                        $url = 'http://'.$url_parts['host']."".$url_status['path1']."";
                        $file = file_get_contents($url);
                    }
                }
//echo "\r\n\r\n<br /> file1: '$file'<br />\r\n";
                // try alternate method no. 3 to get the file content
                if (!$file) {
                    $get_charset    = '1';
                    $contents = getFileContents($url, $get_charset);
                    $file = $contents['file'];
                }
            }
//echo "\r\n\r\n<br /> file2: '$file'<br />\r\n";
//die ('Bis hier');
            //  convert gzip coded content into plain text
            if ($url_status['Content-Encoding'] == "gzip") {

                $result = gz_decode($file, $url_status['Content-Encoding'], $url_status['Transfer-Encoding']);
                if($result == "error_gz0") {
                    if ($debug == "2") {
                        $result = "Announced by the URL as gzip formatted content, it's not! We'll treat it as plain text";
                        printUrlStatus($result, $command_line, $no_log);
                    }
                } else {
                    $file = $result;
                }

            }

            //  We've tried it with 3 different methods. File is not readable for Sphider-plus
            if (!$file || preg_match("/<title>30\d Found<\/title>/i", $file)) {
                $url_status['state'] = "Unable to read the content of the file.<br />$url does not respond,<br />or HTTP status 403: Forbidden.";
                $realnum -- ;
            }
        }

        //  try to find a relocation in JavaScript (like at http://www.m-porechye.ru/)
        if ($js_reloc && $file) {
            $file_js = substr($file, 0, 1024);
            if (preg_match("@javascript(.*?)window.location(.*?)=(.*?)[\'\"](.*?)[\'\"]@si", $file_js, $regs)) {
//echo "\r\n\r\n<br>regs Array:<br><pre>";print_r($regs);echo "</pre>\r\n";
                if ($regs[4] != strstr($regs[4], "ttp")) {
                    if ($url_parts['path']) {
                        if ($url_status['path'] != "/") {
                            $url_status['path'] .= "/"; //  ad a finalslash to the path
                        }
                    }
                    $reloc_url = 'http://'.$url_parts['host']."".$url_status['path']."".$regs[4]."";   //  build the complete URL
                } else {
                    $reloc_url = $regs[4];
                }
//echo "\r\n\r\n<br /> JavaScript reloc URL: '$reloc_url'<br />\r\n";
                $url_status = url_status($reloc_url);

                if (strstr($url_status['state'], "ok"))  {  //  this is the status of $reloc

                    //  remove the original URL from temp table. The relocated URL will be added immediately.
                    mysqltest();
                    mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
                    if ($debug > '0') echo mysql_error();

                    //  if not relocated in it selves
                    if ($reloc_url != $url) {
                        mysqltest();
                        $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$reloc_url' && id = '$sessid'");
                        if ($debug > '0') echo mysql_error();
                        $rows = mysql_num_rows($result);
                        if ($rows == 0) {
                            //  add the relocated URL to the temp table, so we may process it later on
                            mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$reloc_url', '$level', '$sessid')");
                            if ($debug > '0') echo mysql_error();
                        }

                        //  check whether the redirected URL is already known and in database as a link
                        mysqltest();
                        $query = "select indexdate from ".$mysql_table_prefix."links where url='$reloc_url'";
                        $result = mysql_query($query);
                        if ($debug > '0') echo mysql_error();
                        $rows = mysql_num_rows($result);
                        if ($rows) {
                            $url_status['state'] = "Already in database";
                        } else {
                            //  this should be the normal exit for JavaScript relocations
                            $url_status['state'] = "Redirected by JavaScript to $reloc_url";
                        }

                        if ($clear == 1) clean_resource($result, '02a') ;
                    } else {
                        $url_status['state'] = "Redirected to the same URL. Blocked, because could cause an infinite loop.";
                    }
                } else {
                    $message = $url_status['state'];
                    $url_status['state'] = "Unable to follow the JavaScript redirection: $reloc<br />$message";
                }
//echo "\r\n\r\n<br>final JavaScript reloc url_status Array:<br><pre>";print_r($url_status);echo "</pre>\r\n";
            }
        }

        if ($url_status['state'] == 'ok') {

            //  If ZIP indexing is enabled, unpack the archive and merge the content of all its files into $file.
            //  The downloaded bytes are written to a scratch file first, because zip_open() needs a path.
            if ($url_status['content'] == 'zip' && $index_zip == '1' && $file) {
                $archive_path = "".$tmp_dir."/archiv.temp";
                file_put_contents($archive_path, $file);
                $zip = zip_open($archive_path);

                //  zip_open() signals failure with FALSE *or* with an integer error code, which is truthy.
                //  Only a genuine resource may be passed on to zip_read() and zip_close().
                if (is_resource($zip)) {
                    $url_status['content'] = "text";    //  preventive, unless another status is detected for individual archive files
                    $file   = '';                       //  start with a blank file for all archive files
                    $topic  = 'zip';

                    if ($debug == '2') {
                        printStandardReport('archivFiles', $command_line, $no_log);
                    }

                    //  zip_read() also returns an integer error code on failure, so test for a resource here as well
                    while (is_resource($zip_entry = zip_read($zip))) {
                        if (zip_entry_open($zip, $zip_entry, "r")) {
                            $unpacked_size = zip_entry_filesize($zip_entry);
                            $buf  = zip_entry_read($zip_entry, $unpacked_size);     //  uncompress the content of the current archive file
                            $name = zip_entry_name($zip_entry);                     //  filename of the current archive file
                            if ($debug == '2') {
                                $report = "<strong>&nbsp;&nbsp;".$name."</strong>";
                                printThis($report, $cl);
                                $size = (int)($unpacked_size/1024);
                                if ($size == 0) $size = '1';                        //  never report less than 1 kByte
                                $report =  "&nbsp;&nbsp;&nbsp;-&nbsp;Unpacked size:&nbsp;".$size." kByte<br />";
                                printThis($report, $cl);
                            }
                            $buf = get_arch_content($buf, $name, $url);     //  if necessary, convert PDF, extract feed etc. for the current file
                            zip_entry_close($zip_entry);                    //  done for this file of the archive
                            $file .= "".$buf."<br /><br />";                //  add all uncompressed and converted files together
                        }
                    }
                    zip_close($zip);
                }
                unlink($archive_path);      //  the scratch copy is no longer required
            }

            //  strip everything from the content that is of no use for indexing
            if ($use_nofollow == '1') {
                //  drop the sections which are fenced by the sphider_noindex markers
                $file = preg_replace("@<!--sphider_noindex-->.*?<!--\/sphider_noindex-->@si", " ",$file);
            }

            //  HTML comments, <style> blocks and <link rel ...> tags are replaced by a blank,
            //  one pattern after the other in the order listed here
            $useless_parts = array(
                "@<!--.*?-->@si",
                "@<style[^>]*>.*?<\/style>@si",
                "/<link rel[^<>]*>/i"
            );
            $file       = preg_replace($useless_parts, " ", $file);
            $file       = str_replace ("encoding: ''", " ", $file);        //  yes, I've seen such nonsense !

            //  keep a copy that still contains the <script> parts
            $raw_file   = $file;
/*
            $file       = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$file);
*/

            //  If RAR indexing is enabled, unpack the archive and merge the content of all its files into $file.
            //  NOTE(review): unlike the ZIP branch, this runs after HTML comments, <style> blocks etc. have
            //  been stripped from $file, which may damage binary archive data - confirm the intended order.
            if ($url_status['content'] == 'rar' && $index_rar == '1') {
                $archive_path = "".$tmp_dir."/archiv.temp";
                file_put_contents($archive_path, $file);
                $rar = rar_open($archive_path);

                if ($rar) {
                    $url_status['content'] = "text";    //  preventive, all individual archive files will be converted to 'text'
                    $file       = '';                   //  start with a blank file for all archive files
                    $topic      = 'rar';
                    $entries    = rar_list($rar);

                    //  rar_list() returns FALSE on failure, so its result is tested before iterating
                    if (is_array($entries)) {
                        if ($debug == '2') {
                            printStandardReport('archivFiles', $command_line, $no_log);
                        }

                        //  Every entry is extracted to the same fixed scratch file. The entry name comes from
                        //  the (untrusted) archive and must not be used as a path: it may contain '../'
                        //  or sub folders that do not exist in the temporary folder.
                        $entry_path = "./".$tmp_dir."/rar_entry.temp";

                        foreach ($entries as $entry) {
                            $name =  $entry->getName();
                            if ($debug == '2') {
                                $report = "<strong>&nbsp;&nbsp;".$name."</strong>";
                                printThis($report, $cl);
                                $size = (int)($entry->getPackedSize()/1024);
                                if ($size == 0) $size = '1';
                                $report = "&nbsp;&nbsp;&nbsp;-&nbsp;Packed size:&nbsp;&nbsp;".$size." kByte";
                                printThis($report, $cl);
                                $size = (int)($entry->getUnpackedSize()/1024);
                                if ($size == 0) $size = '1';
                                $report =  "&nbsp;&nbsp;&nbsp;-&nbsp;Unpacked size:&nbsp;".$size." kByte<br />";
                                printThis($report, $cl);
                            }

                            if ($entry->isDirectory()) {
                                continue;                                   //  folders carry no content
                            }

                            if (!$entry->extract('', $entry_path)) {        //  extract single file of the archive into the temporary folder
                                continue;                                   //  nothing was written, so there is nothing to read or delete
                            }
                            $buf = file_get_contents($entry_path);          //  read content of this intermediate file
                            unlink($entry_path);                            //  destroy this file

                            if ($buf) {
                                $buf = get_arch_content($buf, $name, $url); //  if necessary, convert PDF, extract feed etc. for the current file
                                $file .= "".$buf."<br /><br />";            //  add all uncompressed and converted files together
                            }
                        }
                    }
                    rar_close($rar);
                }
                unlink($archive_path);      //  the scratch copy is no longer required
            }

            //  kill possibly duplicated coding info in dynamic links:
            //  if the first 4000 bytes mention "encoding" as well as "charset", keep only the part
            //  of the file that starts at the last <!DOCTYPE
            $file_head = substr($file, 0, 4000);
            if (stristr($file_head, "encoding") && strstr($file_head, "charset")) {
                $doctype_pos = strrpos($file, "<!DOCTYPE");
                if ($doctype_pos !== false) {       //  without any DOCTYPE the file remains untouched
                    $file = substr($file, $doctype_pos);
                }
            }

            //  Define the charset of this file. Priority:
            //  1. preferred charset as defined in Admin settings
            //  2. charset as supplied by the remote server
            //  3. charset as announced in the first 3000 bytes of the file
            //  4. the home charset
            $chrSet = '';
            if ($use_prefcharset == '1') {
                $chrSet = $home_charset;

            } elseif ($server_char && $url_status['charset']) {
                $chrSet = $url_status['charset'];

            } else {
                $charset_zone = substr($file, 0, 3000);

                //  the patterns are tried in this order, the first one that delivers a non-empty value wins
                $charset_patterns = array(
                    "'encoding=[\'\"](.*?)[\'\"]'si",   //  encoding of XML or XHTML files
                    "'charset=(.*?)[\'\"]'si",          //  charset of HTML files, value not quoted
                    "'charset=[\'\"](.*?)[\'\"]'si"     //  charset of HTML files, value quoted
                );

                foreach ($charset_patterns as $charset_pattern) {
                    if (preg_match($charset_pattern, $charset_zone, $regs)) {
                        $chrSet = trim(strtoupper($regs[1]));
                    }
                    if ($chrSet) {
                        break;
                    }
                }

                if ($chrSet == '') {
                    $chrSet = $home_charset;    //  no charset found, we need to use default charset like for DOCs, PDFs, etc
                }
            }

            //  in the wild we have already seen a lot of variants; use only the part in front of the first blank
            $blank_pos = strpos($chrSet, " ");
            if ($blank_pos) {
                $chrSet = substr($chrSet, 0, $blank_pos);
            }
//echo "\r\n\r\n<br /> chrSet: '$chrSet'<br />\r\n";
            $contents['charset'] = $chrSet;

            //  if frameset indexing is enabled, replace the <frameset> part of the file by the content of its frames
            if ($index_framesets == '1') {
                if (preg_match("@<frameset[^>]*>(.*?)<\/frameset>@si",$file, $regs)) {
                    printStandardReport('newFrameset', $command_line, $no_log);
                    //  separate the <frameset> ....</frameset> part of this file
                    $frame = $regs[1];
                    $replace = get_frames($frame, $url, $can_leave_domain);
                    $replace ="<body>".$replace."</body>";  //  create the body tags for $file
                    $contents['charset'] = $chrSet;         // rebuild charset
                    //  Include all replacements instead of the frameset tag into the actual file. This will become the body.
                    //  The frame content is foreign text: '$' and '\' are escaped, so that preg_replace() does not
                    //  interpret sequences like $1 or \1 in it as back-references.
                    $file = preg_replace("@<frameset.*?</frameset>@si", addcslashes($replace, '\\$'), $file);
                }
            }

            //  if iframe indexing is enabled, replace the <iframe> tags of the file by the content they point to
            if ($index_iframes == '1') {
                $links          = array ();
                $regs           = Array ();
                $replace        = '';
                $get_charset    = '';
                $real_url       = $url;
                if (preg_match_all("/(iframe[^>]*src[[:blank:]]*)=[[:blank:]]*[\'\"]?(([[a-z]{3,5}:\/\/(([.a-zA-Z0-9-])+(:[0-9]+)*))*([+:%\/?=&;\\\(\),._ a-zA-Z0-9-]*))(#[.a-zA-Z0-9-]*)?[\'\" ]?/i", $file, $regs, PREG_SET_ORDER)) {

                    printStandardReport('newIframe', $command_line, $no_log);
                    //  find all frames of the iframe;
                    $care_excl = '';   //  don't care file suffixed to be excluded
                    $relocated = '';   //  URL is not relocated

                    foreach ($regs as $val) {
                        if (($a = url_purify($val[2], $url, $can_leave_domain, $care_excl, $relocated, $local_redir)) != '') {
                            $links[] = ($a);    // collect  all iframe links
                        }
                    }

                    //  a separate loop variable is used, so the URL of the current page is never overwritten
                    foreach ($links as $frame_url) {
                        printNewLinks($frame_url, $cl);

                        //  dots are escaped, so that only real file suffixes are treated as documents
                        if (preg_match("/\.(html|htm|xhtml|xml|php)/i", $frame_url)) {
                            $frame = file_get_contents($frame_url);     //      get content of this frame
                            //  separate the body part of this frame; empty if the frame is not readable or has no body
                            $body = '';
                            if ($frame !== false && preg_match("@<body[^>]*>(.*?)<\/body>@si", $frame, $body_regs)) {
                                $body = $body_regs[1];
                            }
                            if ($abslinks == '1') {
                                $body = make_abslinks($body, $frame_url);   //  if required, correct links relative to found iframe
                            }
                            $replace = "".$replace."<br />".$body."";
                        } else {    //  might be an image
                            $replace = "".$replace."<br /><img src=\"".$frame_url."\">";
                        }
                    }

                    //  Include all replacements instead of the iframe tag into the actual file.
                    //  The iframe content is foreign text: '$' and '\' are escaped, so that preg_replace() does not
                    //  interpret sequences like $1 or \1 in it as back-references.
                    $file = preg_replace("@<iframe.*?</iframe>@si", addcslashes($replace, '\\$'), $file);
                    $contents['charset'] = $chrSet;     // rebuild charset
                }
                $url = $real_url;
            }

            //  In order to index RDF, RSD, RSS and ATOM feeds enter here.
            //  The feed is parsed into an array and rebuilt as a small HTML page, which replaces $file.
            //  On any failure $OKtoIndex is reset, $file_read_error is set and $realnum is decremented.
            if (($url_status['content'] == 'xml') && $index_rss =='1') {

                if (!preg_match("/<rss|atom|<feed|<rdf|<rsd/si", substr($file,0,400))) {
                    printStandardReport('notRSS',$command_line, $no_log);   //  no valid feed detected
                    $OKtoIndex = 0;
                    $file_read_error = 1;
                    $realnum -- ;
                } else {
                    $html = '';
                    $type = '';     //  remains blank, if the feed parser does not deliver any feed

                    $xml = XML_IsWellFormed($file);     //      check for well-formed XML
                    if ($xml != '1') {
                        if ($debug > 0 ) {
                            printNotWellFormedXML($xml, $cl);
                        }

                        $OKtoIndex = 0;
                        $file_read_error = 1;
                        $realnum -- ;

                    } else {

                        $rss = new feedParser;
                        // define options for feed parser
                        $rss->limit     = $max_links;   //   save time by limiting the items/entries to be processed
                        $rss->in_cp     = strtoupper($contents['charset']); //  charset of actual file
                        $rss->out_cp    = 'UTF-8';      //  convert all into this charset
                        $rss->cache_dir = '';           //  currently unused
                        $rss->dc        = $dc;          //  treat Dublin Core tags in RDF feeds
                        $rss->pro       = $preferred;   //  obey the PREFERRED directive in RSD feeds
                        $rss->file      = '1';          //  use $file as feed (as a string, not URL)

                        if ($cdata != 1) {
                            $rss->CDATA = 'content';    //  get it all  (naughty)
                        } else {
                            $rss->CDATA = 'nochange';   //  well educated crawler
                        }

                        //  get feed as array
                        if ($feed = $rss->get($url, $file)){
                            //  if you want to see the feed during index procedure, uncomment the following row
                            //  echo "<br>FEED array:<br><pre>";print_r($feed);echo "</pre>";
                            $link           = '';
                            $textinput_link = '';
                            $image_url      = '';
                            $image_link     = '';
                            $docs           = '';
                            $subjects       = '';
                            $count          = '';
                            //  array keys are quoted strings; bare words would be looked up as constants
                            $type           = $feed['type'];
                            $count          = $feed['sub_count'];
                            $cached         = $feed['cached'];

                            //  kill all no longer required values
                            $feed['type']         = '';
                            $feed['sub_count']    = '';
                            $feed['encoding_in']  = '';
                            $feed['encoding_out'] = '';
                            $feed['items_count']  = '';
                            $feed['cached']       = '';

                            if (!$count) {
                                $count = '0';
                            }

                            if ($type == 'RSD') {
                                //      prepare all RSD APIs
                                for($i=0;$i<$count;$i++){
                                    $subjects .= ''.$feed['api'][$i]['name'].'<br />
                                            '.$feed['api'][$i]['apiLink'].'<br />
                                            '.$feed['api'][$i]['blogID'].'<br />
                                            '.$feed['api'][$i]['settings_docs'].'<br />
                                            '.$feed['api'][$i]['settings_notes'].'<br />';
                                }
                            }

                            if ($type == 'Atom') {
                                //      prepare all Atom entries
                                for($i=0;$i<$count;$i++){
                                    $subjects .= ''.$feed['entries'][$i]['link'].'<br />
                                            '.$feed['entries'][$i]['title'].'<br />
                                            '.$feed['entries'][$i]['id'].'<br />
                                            '.$feed['entries'][$i]['published'].'<br />
                                            '.$feed['entries'][$i]['updated'].'<br />
                                            '.$feed['entries'][$i]['summary'].'<br />
                                            '.$feed['entries'][$i]['rights'].'<br />
                                            '.$feed['entries'][$i]['author_name'].' '.$feed['entries'][$i]['author_email'].' '.$feed['entries'][$i]['author_uri'].'<br />
                                            '.$feed['entries'][$i]['category_term'].' '.$feed['entries'][$i]['category_label'].' '.$feed['entries'][$i]['category_scheme'].'<br />
                                            '.$feed['entries'][$i]['contributor_name'].' '.$feed['entries'][$i]['contributor_email'].' '.$feed['entries'][$i]['contributor_uri'].'<br />
                                        ';
                                }

                            }
                            if ($type == 'RDF' || $type =='RSS v.0.91/0.92' || $type == 'RSS v.2.0'){    //  For RDF and RSS feeds enter here
                                //  prepare channel image
                                $image_url = $feed['image_url'];
                                if($image_url){
                                    $width = $feed['image_width'];
                                    if (!$width || $width > '144') {
                                        $width = '88';  //set to default value
                                    }
                                    $height = $feed['image_height'];
                                    if (!$height || $height > '400') {
                                        $height = '31';  //set to default value
                                    }

                                    $feed['image_url'] = "<img id=\"rss_007\" src=\"".$image_url."\" alt=\"".$feed['image_title']."\" width=\"".$width."\" height=\"".$height."\">";
                                }
                                $image_link = $feed['image_link'];
                                if($image_link){
                                    $feed['image_link'] = "<a href=\"".$image_link."\">".$image_link."</a>";
                                }

                                //      prepare all RDF or RSS items
                                for($i=0;$i<$count;$i++){
                                    $subjects .= ''.$feed['items'][$i]['link'].'<br />
                                            '.$feed['items'][$i]['title'].'<br />
                                            '.$feed['items'][$i]['description'].'<br />
                                            '.$feed['items'][$i]['author'].'<br />
                                            '.$feed['items'][$i]['category'].'<br />
                                            '.$feed['items'][$i]['guid'].'<br />
                                            '.$feed['items'][$i]['comments'].'<br />
                                            '.$feed['items'][$i]['pubDate'].'<br />
                                            '.$feed['items'][$i]['source'].'<br />
                                            '.$feed['items'][$i]['enclosure'].'<br />
                                            '.$feed['items'][$i]['country'].'<br />
                                            '.$feed['items'][$i]['coverage'].'<br />
                                            '.$feed['items'][$i]['contributor'].'<br />
                                            '.$feed['items'][$i]['date'].'<br />
                                            '.$feed['items'][$i]['industry'].'<br />
                                            '.$feed['items'][$i]['language'].'<br />
                                            '.$feed['items'][$i]['publisher'].'<br />
                                            '.$feed['items'][$i]['state'].'<br />
                                            '.$feed['items'][$i]['subject'].'<br />
                                        ';
                                }
                            }

                            //  Convert the channel/feed part into a string. The nested item/entry/api lists are
                            //  already collected in $subjects; only scalar values are joined, because implode()
                            //  would render every nested array as the literal word "Array".
                            $feed_common = implode(" ", array_filter($feed, 'is_scalar'));
//echo "\r\n\r\n<br /> feed_common: '$feed_common'<br />\r\n";
//echo "\r\n\r\n<br /> subjects: '$subjects'<br />\r\n";
                            //  build something that could be indexed
                            $html .= "<html>\r\n<head>\r\n<title>".$feed['title']."</title>\r\n<meta name=\"description\" content=\"".$feed['description']." \">\r\n</head>\r\n";
                            $html .= "<body>\r\n".$feed_common."\r\n".$subjects."\r\n</body>\r\n</html>\r\n";
                        }

                        if (strlen($html) < 130) {    //  can't be a valid feed
                            if ($type == "unknown") {
                                printInvalidFeedType($type, $cl);
                            } else {
                                printStandardReport('invalidRSS',$command_line, $no_log);
                            }
                            $OKtoIndex = 0;
                            $file_read_error = 1;
                            $realnum -- ;
                        } else {
                            $contents['charset'] = 'UTF-8';     //      the feed reader converts all to utf-8
                            $file = $html;                      //     use feed reader output

                            if ($debug > 0 ) {
                                printValidFeed($type, $count, $cl);
                            }
                        }
                    }
                }
            }

            //  prepare CSV files: both field separators (comma and semicolon) become a blank
            if (($url_status['content'] == 'csv') && $index_csv =='1') {
                $csv_separators = array(",", ";");
                $file = str_replace($csv_separators, " ", $file);
            }

            // for DOCs, PDFs, etc we need special text converter
            // (every content type except 'text', 'xml', 'xhtml' and 'csv' is passed to extract_text())
            if ($url_status['content'] != 'text' && $url_status['content'] != 'xml' && $url_status['content'] != 'xhtml' && $url_status['content'] != 'csv') {

                $file = extract_text($file, $url_status['content'], $url);

                $contents['charset']    = 'UTF-8';
                $home_charset           = 'UTF-8';
                $charSet                = 'UTF-8';  //  because the converter already transferred the documents to UTF-8

                if ($file == 'ERROR') {     //      if error, suppress further indexing
                    $OKtoIndex = 0;
                    $file_read_error = 1;
                    $realnum -- ;
                }

                //  reduce Pashtu and Urdu to the main Farsi letters
                //  NOTE(review): $home_charset has just been set to 'UTF-8' a few lines above, so the
                //  following test for 'windows-1256' can never be true and the letter reduction never runs.
                //  Presumably the test should use the charset as it was before the overwrite - TODO confirm.
                if (strtolower($home_charset) == 'windows-1256' && $url_status['content'] == 'pdf') {
                    $f_letter0= array("ﺎ","�");
                    $f_letter1= array("�","�","ﺑ","ﺒ");
                    $f_letter2= array("ï­–","ï­—","ï­˜","ï­™");
                    $f_letter3= array("ﺕ","ﺖ","ﺗ","ﺘ");
                    $f_letter4= array("ﺙ","ﺚ","ﺛ","ﺜ");
                    $f_letter5= array("�","ﺞ","ﺟ","ﺠ");
                    $f_letter6= array("ï­º","ï­»","ï­¼","ï­½");
                    $f_letter7= array("ﺡ","ﺢ","ﺣ","ﺤ");
                    $f_letter8= array("ﮋ","ﮊ");
                    $f_letter9= array("ﺥ","ﺦ","ﺧ","ﺨ");
                    $f_letter10= array("ﺩ","ﺪ");
                    $f_letter11= array("ﺫ","ﺬ");
                    $f_letter12= array("ﺭ","ﺮ");
                    $f_letter13= array("ﺯ","ﺰ");
                    $f_letter14= array("ﺱ","ﺲ","ﺳ","ﺴ");
                    $f_letter15= array("ﺵ","ﺶ","ﺷ","ﺸ");
                    $f_letter16= array("ﺹ","ﺺ","ﺻ","ﺼ");
                    $f_letter17= array("ﺽ","ﺾ","ﺿ","ﻀ");
                    $f_letter18= array("�","ﻂ","ﻃ","ﻄ");
                    $f_letter19= array("ﻅ","ﻆ","ﻇ","ﻈ");
                    $f_letter20= array("ﻉ","ﻊ","ﻋ","ﻌ");
                    $f_letter21= array("�","ﻎ","�","�");
                    $f_letter22= array("ﻑ","ﻒ","ﻓ","ﻔ");
                    $f_letter23= array("ﻕ","ﻖ","ﻗ","ﻘ");
                    $f_letter24= array("ﻙ","ﻚ","ﻛ","ﻜ","ﮎ","�","�","ﮑ");
                    $f_letter25= array("ﮒ","ﮓ","ﮔ","ﮕ");
                    $f_letter26= array("�","ﻞ","ﻟ","ﻠ");
                    $f_letter27= array("ﻡ","ﻢ","ﻣ","ﻤ");
                    $f_letter28 = array("ﻧ","ﻨ","ﻦ","ﻥ");
                    $f_letter29= array("ï»­","ï»®");
                    $f_letter30= array("ﻩ","ﻪ","ﻫ","ﻬ");
                    $f_letter31= array("ﻯ","ﻰ","ﻱ","ﻲ","ﻳ","ﻴ");

                    $file=str_replace($f_letter0,"ا",$file);
                    $file=str_replace($f_letter1,"ب",$file);
                    $file=str_replace($f_letter2,"Ù¾",$file);
                    $file=str_replace($f_letter3,"ت",$file);
                    $file=str_replace($f_letter4,"Ø«",$file);
                    $file=str_replace($f_letter5,"ج",$file);
                    $file=str_replace($f_letter6,"Ú†",$file);
                    $file=str_replace($f_letter7,"Ø­",$file);
                    $file=str_replace($f_letter8,"Ú˜",$file);
                    $file=str_replace($f_letter9,"Ø®",$file);
                    $file=str_replace($f_letter10,"د",$file);
                    $file=str_replace($f_letter11,"Ø°",$file);
                    $file=str_replace($f_letter12,"ر",$file);
                    $file=str_replace($f_letter13,"ز",$file);
                    $file=str_replace($f_letter14,"س",$file);
                    $file=str_replace($f_letter15,"Ø´",$file);
                    $file=str_replace($f_letter16,"ص",$file);
                    $file=str_replace($f_letter17,"ض",$file);
                    $file=str_replace($f_letter18,"Ø·",$file);
                    $file=str_replace($f_letter19,"ظ",$file);
                    $file=str_replace($f_letter20,"ع",$file);
                    $file=str_replace($f_letter21,"غ",$file);
                    $file=str_replace($f_letter22,"Ù�",$file);
                    $file=str_replace($f_letter23,"Ù‚",$file);
                    $file=str_replace($f_letter24,"Ú©",$file);
                    $file=str_replace($f_letter25,"Ú¯",$file);
                    $file=str_replace($f_letter26,"Ù„",$file);
                    $file=str_replace($f_letter27,"Ù…",$file);
                    $file=str_replace($f_letter28,"Ù†",$file);
                    $file=str_replace($f_letter29,"Ùˆ",$file);
                    $file=str_replace($f_letter30,"Ù‡",$file);
                    $file=str_replace($f_letter31,"ÙŠ",$file);
                }
            }

            //  report the page size in kByte (two decimals), but only for documents that are still to be indexed
            if ($OKtoIndex == 1) {
                $kbytes   = strlen($file)/1024;
                $pageSize = number_format($kbytes, 2, ".", "");
                printPageSizeReport($pageSize, $topic);
            }

            //  final charset for UTF-8 converter: trimmed and in upper case
            $charSet = trim($contents['charset']);
            $charSet = strtoupper($charSet);

            //echo "\r\n\r\n<br /> charSet: '$charSet'<br />\r\n";
            //  a charset shorter than 3 characters, or one containing "encoding", must be an invalid one
            $too_short = strlen($charSet) < '3';
            if ($too_short || stristr($charSet, "encoding")) {
                $charSet = 'UTF-8';
            }

            $seg_data = '';
            //  If Chinese or Korean text should be segmented enter here.
            //  The segmented text is converted to UTF-8 and cleaned; the result is stored in $seg_data.
            if ($cn_seg == '1' && $file) {
                $dic = '';
                if ($charSet == 'GB2312' || $charSet == 'GB18030' || $charSet == 'GBK') {
                    $dic = "".$dict_dir."/cn_gb18030.dic";          //  simplified Chinese
                }
                if ($charSet == 'BIG5') {
                    $dic = "".$dict_dir."/cn_big5.dic";             //  traditional Chinese
                }
                if ($charSet == 'ISO10646-1933') {
                    $dic = "".$dict_dir."/kr_iso10646-1933.dic";    // Korean
                }
                if ($charSet == 'EUC-KR') {
                    $dic = "".$dict_dir."/kr_euc-kr.dic";           //  Korean
                }
                if ($charSet == 'UTF-8') {
                    $dic = "".$dict_dir."/cn_utf-8.dic";            //  Unicode
                }
                //echo "<br />dic: $dic<br />";

                if ($dic) {      //  if dictionary is available for page charset, perform a segmentation
                    $Segmentation = new Segmentation;
                    $Segmentation->load($dic);
                    $Segmentation->setLowercase(FALSE);
                    $cn_result = $Segmentation->segmentString($file);

                    if($cn_result  && $charSet != 'UTF-8'){
                        //  The '@' operator does not help against a missing iconv extension (calling an
                        //  undefined function is fatal), so its availability is tested explicitly.
                        $iconv_file = function_exists('iconv') ? @iconv($charSet, "UTF-8//IGNORE", $cn_result) : '';
                        if(trim($iconv_file) == ""){            // iconv is not installed or input charSet not available. We need to use class ConvertCharset
                            $NewEncoding = new ConvertCharset($charSet, "utf-8");
                            $NewFileOutput = $NewEncoding->Convert($cn_result);
                            $cn_result = $NewFileOutput;
                        }else{
                            $cn_result = $iconv_file;
                        }
                        unset ($iconv_file, $NewEncoding, $NewFileOutput);
                    }

                    $seg_data = clean_file($cn_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
                } else {
                    printNoDictionary($charSet, $cl);   //  no dictionary found for this charset
                }
            }

            //  If Japanese text should be segmented enter here.
            //  Only a Shift_JIS dictionary is used, so UTF-8 and EUC-JP content is converted to Shift_JIS first.
            //  The segmented text is converted to UTF-8 and cleaned; the result is stored in $seg_data.
            if ($jp_seg == '1' && $file) {
                $dic = '';
                if ($charSet == 'UTF-8' ||$charSet == 'EUC-JP') {
                    $sjis_file = function_exists('iconv') ? @iconv($charSet, "SHIFT_JIS//IGNORE", $file) : false;
                    //  Switch over only if the conversion really delivered something. Otherwise $file would be
                    //  overwritten with FALSE and the document would be lost for the further indexing.
                    if ($sjis_file !== false && $sjis_file !== '') {
                        $file = $sjis_file;
                        $charSet = "SHIFT_JIS";
                    }
                    unset ($sjis_file);
                }

                if ($charSet == 'SHIFT_JIS') {
                    $dic = "".$dict_dir."/jp_shiftJIS.dic";
                }

                if ($dic) {      //  if dictionary is available for page charset, perform a segmentation
                    $Segmentation = new Segmentation;
                    $Segmentation->load($dic);
                    $Segmentation->setLowercase(FALSE);
                    $jp_result = $Segmentation->segmentString($file);
                    //echo "\r\n\r\n<br /> jp_result: $jp_result<br />\r\n";
                    if($jp_result  && $charSet != 'UTF-8'){
                        //  The '@' operator does not help against a missing iconv extension (calling an
                        //  undefined function is fatal), so its availability is tested explicitly.
                        $iconv_file = function_exists('iconv') ? @iconv($charSet, "UTF-8//IGNORE" ,$jp_result) : '';
                        if(trim($iconv_file) == ""){            // iconv is not installed or input charSet not available. We need to use class ConvertCharset
                            $NewEncoding = new ConvertCharset($charSet, "utf-8");
                            $NewFileOutput = $NewEncoding->Convert($jp_result);
                            $jp_result = $NewFileOutput;
                        }else{
                            $jp_result = $iconv_file;
                        }
                        unset ($iconv_file, $NewEncoding, $NewFileOutput);
                    }
                    $seg_data = clean_file($jp_result, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);
                } else {
                    printNoDictionary($charSet, $cl);   //  no dictionary found for this charset
                }
            }
//echo "\r\n\r\n<br /> charSet: '$charSet'<br />\r\n";
            //  enter here only if the content is not UTF-8 coded; it gets converted to UTF-8 now
            if($charSet != "UTF-8" && $file){
                $file = convertToUTF8($file, $charSet, $char_Set, $converter_dir);
            }

            if ($index_media == '1') {
                $raw_file = $file;              // will be needed to find links to media files
                $newmd5sum = md5($raw_file);    //  get md5 including links and title of media files
            }

            $data = clean_file($file, $url, $url_status['content'], $charSet, $use_nofollow, $use_robot, $can_leave_domain);

            //  index only links and their titles
            if($only_links) {
                $media_links = '0';
                $my_links = get_link_details($file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir);
                $data['content'] = $my_links[0][0];   //  define new content
                $data['fulltext'] = $my_links[0][0];   //  define new content also for 'full text';
            }

            //  combine raw words plus segmented  words
            if ($cn_seg == 1 || $jp_seg == 1 && $dic) {
                if ($debug != '0') {
                    $seg_add = $seg_data[count]-$data[count];  //      calculate segmentation result

                    if ($seg_add > '0') {
                        if ($charSet == 'EUC-KR' || $charSet == 'ISO10646-1933'){
                            printSegKR($seg_add, $cl);
                        }
                        if ($charSet == 'SHIFT_JIS'){
                            printSegJA($seg_add, $cl);
                        } else {
                            printSegCN($seg_add, $cl);
                        }
                    }
/*
 echo "<br /><pre>Results of word segmentation:</pre>";
 echo "<br />Unsegmented title :<br><pre>";print_r($data[title]);echo "</pre>";
 echo "<br />Segmented title :<br><pre>";print_r($seg_data[title]);echo "</pre>";
 echo "<br />Unsegmented full text:<br />$data[fulltext]<br />";
 echo "<br />Segmented full text:<br />$seg_data[fulltext]";
 */
                }
                $data[content]      ="".$data[content]."".$seg_data[content]."";
                $data[title]        ="".$data[title]."".$seg_data[title]."";
                $data[description]  ="".$data[description]."".$seg_data[description]."";
                $data[keywords]     ="".$data[keywords]."".$seg_data[keywords]."";
            }

            //      check if canonical redirection was found in page header
            $cano_link = '0';
            if ($data['cano_link']) {
                $cano_link = $data['cano_link'];
                $OKtoIndex = 0;
                $deletable = 1;
                $realnum -- ;

                if ($cano_link =="1") {
                    printNoCanonical($cano_link, $cl);                  //  if unable to extract redirection link
                } else {
                    if ($data['refresh'] == '1') {
                        printRefreshed($cano_link, $data['wait'], $cl);  //  if refresh meta tag was found in header
                    } else {
                        printCanonical($cano_link, $cl);                //  if canonical link was found in header
                    }
                    //      do we already know this link in link-table
                    $res = mysql_query("select url from ".$mysql_table_prefix."links where url like '$cano_link'");
                    if ($debug > '0') echo mysql_error();
                    $rows = mysql_num_rows($res);

                    if ($rows == 0) {    // if not known in link-table, check if already known in temp-table
                        $res = mysql_query("select link from ".$mysql_table_prefix."temp where link like '$cano_link'");
                        if ($debug > '0') echo mysql_error();
                        $rows = mysql_num_rows($res);

                        if ($rows == 0) {    // not known in link-table, add new link
                            if ($numoflinks <= $max_links) mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$cano_link', '$level', '$sessid')");
                            if ($debug > '0') echo mysql_error();
                        }
                    }
                }
            } else {

                if ($index_media == '0') {
                    $newmd5sum = md5($data['content']); // get md5 from cleaned full text only
                }

                if ($md5sum == $newmd5sum) {

                    printStandardReport('md5notChanged',$command_line, $no_log);
                    $OKtoIndex = 0;
                    $realnum -- ;
                } else {
                    mysqltest();
                    //     check for duplicate page content
                    $result = mysql_query("select link_id from ".$mysql_table_prefix."links where md5sum='$newmd5sum'");
                    if ($debug > '0') echo mysql_error();

                    if (mysql_num_rows($result) > 0) {  //  display warning message and urls with duplicate content
                        printStandardReport('duplicate',$command_line, $no_log);
                        $num_rows = mysql_num_rows($result);
                        for ($i=0; $i<$num_rows; $i++) {
                            $link_id = mysql_result($result, $i, "link_id");
                            //$num = $i+1;
                            $res = mysql_query("select url from ".$mysql_table_prefix."links where link_id like '$link_id'");
                            if ($debug > '0') echo mysql_error();
                            $row = mysql_fetch_row($res);
                            $dup_url = $row[0];
                            if ($clear == 1) clean_resource($res, '03') ;
                            printDupReport($dup_url,$command_line);
                        }
                        if ($dup_content == '0') {    //  enter here, if pages with duplicate content should not be indexed/re-indexed
                            $OKtoIndex = 0;
                            $realnum -- ;
                        } else {
                            $OKtoIndex = 1;
                        }
                    }
                }
            }

            if (($md5sum != $newmd5sum || $reindex ==1) && $OKtoIndex == 1) {
                $urlparts = parse_addr($url);
                $newdomain = $urlparts['host'];
                $type = 0;

                if ($data['noindex'] == 1) {
                    $OKtoIndex = 0;
                    $deletable = 1;
                    $realnum -- ;
                    printStandardReport('metaNoindex',$command_line, $no_log);
                }

                $content    = explode(" ",addslashes($data['content']));
//echo "\r\n\r\n<br>content Array:<br><pre>";print_r($content);echo "</pre>\r\n";
                $acc_words[] = array();
                $type = '';
                //  if Greek accents should be removed from Greek vowels
                if ($noacc_el) {
                    foreach ($content as &$thisword) {
                        $no_acc = remove_acc_el($thisword);
                        if($no_acc != $thisword) {
                            $acc_words[] = $no_acc;
                        }
                    }
                }
                //  if the other (Latin)  accents should be removed from their vowels
                if ($vowels) {
                    foreach ($content as $thisword) {
                        $no_acc = remove_acc($thisword);
                        if($no_acc != $thisword) {
                            $acc_words[] = $no_acc;
                        }
                    }
                }

                //  now add the words without accents to the total text content
                $content    = array_merge($content, $acc_words);
                $wordarray  = unique_array($content);

                if ($smp != 1) {
                    if ($data['nofollow'] != 1 && $cano_link == '0') {
                        $media_links = '0';

                        $links      = get_links($raw_file, $url, $can_leave_domain, $data['base'], $media_links, $use_nofollow, $local_redir, $url_reloc);
                        $links      = distinct_array($links);
                        $all_links  = count($links);
                        if ($all_links > $max_links) $all_links = $max_links;
                        $links = array_slice($links,0,$max_links);

                        if ($realnum < $max_links) {
                            $numoflinks = 0;
                            //  if there are any links, add them to the temp table, but only if there isn't such a URL already
                            if (is_array($links)) {
                                reset ($links);
                                $tmp_urls = get_temp_urls($sessid);         //  reload previous temp

                                if ($debug == '2' ) {    //  if debug mode, show details
                                    printStandardReport('newLinks', $command_line, $no_log);
                                }

                                while ($thislink = each($links)) {
                                    //  ignore self linking
                                    if ($thislink[1] != "self"){
                                        //  find new domains for _addurl table
                                        if ($auto_add && $can_leave_domain) {
                                            $all_link = parse_all_url($thislink[1]);
                                            $new_link = $all_link['host'];

                                            mysqltest();
                                            //     check whether URL is already known in sites table
                                            $res1 = mysql_query("select url from ".$mysql_table_prefix."sites where url like '%$new_link%'");
                                            if ($debug > '0') echo mysql_error();
                                            //     check whether URL is already known in addurl table
                                            $res2 = mysql_query("select url from ".$mysql_table_prefix."addurl where url like '%$new_link%'");
                                            if ($debug > '0') echo mysql_error();
                                            //     check whether URL is banned
                                            $res3 = mysql_query("select domain from ".$mysql_table_prefix."banned where domain like '%$new_link%'");
                                            if ($debug > '0') echo mysql_error();

                                            if (mysql_num_rows($res1) == 0 && mysql_num_rows($res2) == 0 && mysql_num_rows($res3) == 0) {
                                                //  add new domain into _addurl table
                                                mysql_query ("insert into ".$mysql_table_prefix."addurl (url, description, account) values ('$thislink[1]', '$comment', '$admin_email')");
                                                if ($debug > '0') echo mysql_error();
                                            }
                                        }

                                        //      check whether thislink is already known as a link ( might happen by means of relocated URLs)
                                        $res4       = '';
                                        $res5       = '';
                                        $known_link = '';
                                        $known_temp = '';

                                        $res4 = mysql_query("select url from ".$mysql_table_prefix."links where url like '$thislink[1]'");
                                        if ($debug > '0') echo mysql_error();
                                        $known_link = mysql_num_rows($res4);

                                        $res5 = mysql_query("select link from ".$mysql_table_prefix."temp where link like '$thislink[1]'");
                                        if ($debug > '0') echo mysql_error();
                                        $known_temp = mysql_num_rows($res5);

                                        //      if this is a new link not yet known, add this new link to the temp table
                                        if ($tmp_urls[$thislink[1]] != 1 && !$known_link && !$known_temp) {
                                            $tmp_urls[$thislink[1]] = 1;
                                            $numoflinks++;

                                            if ($debug == '2') {
                                                $act_link = $thislink[1];
                                                printNewLinks($act_link, $cl);
                                            }
                                            mysqltest();
                                            if ($numoflinks <= $max_links) mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$thislink[1]', '$level', '$sessid')");
                                            if ($debug > '0') echo mysql_error();
                                        }
                                    }
                                }
                            }
                        }
                    } else {
                        printStandardReport('noFollow',$command_line, $no_log);
                    }
                    unset ($file);
                }

                //  if we should index only the files as defined in docs list
                if ($only_docs) {
                    $OKtoIndex = '';
                    foreach ($docs as $thisdoc){
                        if (strstr($urlparts['path'], $thisdoc)) {
                            $OKtoIndex = "1";
                        }
                    }
                    if (!$OKtoIndex) {
                        printStandardReport('noDoclist',$command_line, $no_log);
                    }
                }

                if ($OKtoIndex == 1) {
                    if ($link_check == 0) {
                        $title = $data['title'];
                        $host = $data['host'];
                        $path = $data['path'];
                        $fulltxt = $data['fulltext'];
                        $desc = substr($data['description'], 0,254);

                        //  extract domain
                        $url_parts  = parse_all_url($url);
                        $hostname   = $url_parts[host];

                        //  rebuild domain for localhost applications
                        if ($hostname == 'localhost') {
                            $host1 = str_replace($local,'',$url);
                        }

                        $pos = strpos($host1, "/");         //      on local server delete all behind the /
                        //      will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
                        //       will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
                        if ($pos) {
                            $host1 = substr($host1,0,$pos); //      build full adress again, now only local domain
                        }

                        if ($hostname == 'localhost') {
                            $domain_for_db = ("".$local."".$host1."/");   // complete URL
                            $domain_for_db = str_replace("http://", "", $domain_for_db);
                            //$domain_for_db = $host1;
                        }else {
                            //$domain_for_db = ("$url_parts[scheme]://".$hostname."/");  // complete URL
                            $domain_for_db = $hostname;
                        }

                        if (isset($domain_arr[$domain_for_db])) {
                            $dom_id = $domain_arr[$domain_for_db];
                        } else {
                            mysqltest();
                            mysql_query("insert into ".$mysql_table_prefix."domains (domain) values ('$domain_for_db')");
                            $dom_id = mysql_insert_id();
                            $domain_arr[$domain_for_db] = $dom_id;
                        }

                        reset($wordarray);
                        if ($case_sensitive == '0') {
                            foreach ($wordarray as &$value) {
                                $value[1] = lower_ent($value[1]);
                                $value[1] = lower_case($value[1]);  //  convert keywords to lower case
                            }
                        }

                        $wordarray = calc_weights ($wordarray, $title, $host, $path, $data['keywords'], $url_parts);

                        //if there are words to index, add the link to the database, get its id, and add the word + their relation
                        if (is_array($wordarray) && count($wordarray) >= $min_words_per_page) {

                            $OKtoSave = 1;
                            if ($use_white1 == '1') {       //  check if content of page matches ANY word in whitelist
                                $found = '0';
                                foreach ($whitelist as $key => $val1) {
                                    reset($wordarray);
                                    while ($thisword = each($wordarray)) {
                                        $word = trim($thisword[1][1]);
                                        if (strcasecmp($val1, $word) == 0) {
                                            $found = '1';
                                        }
                                    }
                                }

                                if ($found == '0') {
                                    printStandardReport('noWhitelist',$command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum -- ;
                                }
                            }

                            if ($use_white2 == '1') {       //  check if content of page matches ALL words in whitelist
                                $all  = count($whitelist);
                                $found = '0';
                                $found_this = '0';
                                foreach ($whitelist as $key => $val2) {
                                    reset($wordarray);
                                    while ($thisword = each($wordarray)) {
                                        $word = trim($thisword[1][1]);
                                        if (strcasecmp($val2, $word) == 0) {
                                            $found_this = '1';
                                        }
                                    }
                                    if ($found_this != '0'){
                                        $found++;
                                        $found_this = '0';
                                    }
                                }

                                if ($found != $all) {
                                    printStandardReport('noWhitelist',$command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum -- ;
                                }
                            }

                            if ($use_black == '1') {
                                $found = '0';           //  check if content of page matches ANY string in blacklist
                                foreach ($blacklist as $key => $val3) {
                                    $met = stripos($data[fulltext], $val3);
                                    if($met) $found = '1';
                                }
                                if ($found == '1') {
                                    printStandardReport('matchBlacklist',$command_line, $no_log);
                                    $OKtoSave = 0;
                                    $realnum -- ;
                                }
                            }

                            if ($md5sum == '' || ($md5sum == '' && $url_status['relocate'])) {    //  enter here for new page (unknown link) OR for new relocated URL(so it will become a new link)
                                mysqltest();
                                mysql_query ("insert into ".$mysql_table_prefix."links (site_id, url, title, description, fulltxt, indexdate, size, md5sum, level) values ('$site_id', '$url', '$title', '$desc', '$fulltxt', curdate(), '$pageSize', '$newmd5sum', '$thislevel')");
                                if ($debug > '0') echo mysql_error();
                                $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
                                if ($debug > '0') echo mysql_error();
                                $row = mysql_fetch_row($result);
                                $link_id = $row[0];
                                if ($clear == 1) clean_resource($result, '04');

                                if ($OKtoSave) {

                                    //  store link details, if not yet known (during reindex)
                                    if ($only_links) {
                                        //  extract domain of current page delivering the new links
                                        $url_parts  = parse_all_url($url);
                                        $hostname   = $url_parts[host];

                                        if ($hostname == 'localhost') {     //  rebuild domain for localhost applications
                                            $host1 = str_replace($local,'',$url);
                                        }

                                        $pos = strpos($host1, "/");         //      on local server delete all behind the /
                                        //      will work for localhost URLs like http://localhost/publizieren/japan1/index.htm
                                        //       will fail for localhost URLs like http://localhost/publizieren/externe/japan2/index.htm
                                        if ($pos) {
                                            $host1 = substr($host1,0,$pos); //      build full adress again, now only local domain
                                        }

                                        if ($hostname == 'localhost') {
                                            $domain_db = ("".$local."".$host1."/");   // complete URL
                                            $domain_db = str_replace("http://", "", $domain_db);
                                            //$domain_db = $host1;
                                        }else {
                                            //$domain_db = ("$url_parts[scheme]://".$hostname."/");  // complete URL
                                            $domain_db = $hostname;
                                        }

                                        //    now store all link details into db
                                        foreach ($my_links as $found_link) {
                                            //  but only if we have found a title
                                            if ($found_link[3]) {
                                                mysqltest();
                                                //     check whether this link is already stored in the link_details table for the current page
                                                $res1 = mysql_query("select title from ".$mysql_table_prefix."link_details where link_id like '$link_id' and url like '%$found_link[2]%'");
                                                if ($debug > '0') echo mysql_error();

                                                if (mysql_num_rows($res1) == 0) {   //  must be new link
                                                    mysql_query ("insert into ".$mysql_table_prefix."link_details (link_id, url, title, indexdate, domain) values ('$link_id', '$found_link[2]', '$found_link[3]', now(), '$domain_db')");
                                                    if ($debug > '0') echo mysql_error();
                                                }
                                            }
                                        }
                                    }


                                    if ($debug == '2') {    //  if debug mode, show details
                                        printStandardReport('newKeywords', $command_line, $no_log);
                                    }

                                    save_keywords($wordarray, $link_id, $dom_id);

                                }

                                mysqltest();
                                if ($index_media == '1' && $OKtoSave) {     //   find media content only if there was no conflict with text (white and/or blacklist)
                                    include "index_media.php";              //  try to find media files
                                }
                                mysqltest();

                                if ($debug == '2') {
                                    printStandardReport('indexed1', $command_line, $no_log);
                                } else {
                                    printStandardReport('indexed', $command_line, $no_log);
                                }
                            } else if (($md5sum <> '') && ($md5sum <> $newmd5sum) && $OKtoSave) { //if page has changed, start updating
                                mysqltest();

                                $result = mysql_query("select link_id from ".$mysql_table_prefix."links where url='$url'");
                                if ($debug > '0') echo mysql_error();
                                $row = mysql_fetch_row($result);
                                $link_id = $row[0];

                                for ($i=0;$i<=15; $i++) {
                                    $char = dechex($i);
                                    mysql_query ("delete from ".$mysql_table_prefix."link_keyword$char where link_id=$link_id");
                                    if ($debug > '0') echo mysql_error();
                                }
                                if ($clear == 1) clean_resource($result, '05');

                                if ($debug == '2') {    //  if debug mode, show details
                                    printStandardReport('newKeywords', $command_line, $no_log);
                                }
                                save_keywords($wordarray, $link_id, $dom_id);

                                $query = "update ".$mysql_table_prefix."links set title='$title', description ='$desc', fulltxt = '$fulltxt', indexdate=now(), size = '$pageSize', md5sum='$newmd5sum', level='$thislevel' where link_id='$link_id'";
                                mysqltest();
                                mysql_query($query);
                                if ($debug > '0') echo mysql_error();

                                if ($index_media == '1') {
                                    include "index_media.php";      //  try to find media files
                                }

                                if ($debug == '2') {
                                    printStandardReport('re-indexed1', $command_line, $no_log);
                                }
                            }
                        }else {
                            printStandardReport('minWords', $command_line, $no_log);
                            $realnum -- ;
                        }
                    } else {
                        printStandardReport('link_okay', $command_line, $no_log);
                    }
                    unset ($wordarray, $title, $fulltxt, $desc, $data, $seg_data);
                }
            }
        } else {
            $deletable = 1;
            printUrlStatus($url_status['state'], $command_line, $no_log);
        }
        mysqltest();
        if ($url_status['relocate'] ){
            //  remove this relocated URL from temp table, because it is indexed now
            mysql_query ("delete from ".$mysql_table_prefix."temp where link = '$url' AND id = '$sessid'");
            if ($debug > '0') echo mysql_error();
        }

		if ($reindex ==1 && $deletable == 1) {
			check_for_removal($url);
		} else if ($reindex == 1) {

		}
		if (!isset($all_links)) {
			$all_links = 0;
		}
		if (!isset($numoflinks)) {
			$numoflinks = 0;
		}
        if ($smp != 1 && $OKtoIndex == 1) {   //      if valid sitemap found,or canonical link, or something else, no LinkReport
            printLinksReport($numoflinks, $all_links, $command_line);
        }
	}

    function index_site($url, $reindex, $maxlevel, $soption, $url_inc, $url_not_inc, $can_leave, $use_robot, $use_nofollow, $cl, $all, $use_pref) {
        global $mysql_table_prefix, $command_line, $mainurl,  $tmp_urls, $domain_arr, $all_keywords, $smp, $follow_sitemap;
        global $link_check, $smap_dir, $index_media, $db_con, $clear, $create_sitemap, $tmp_dir, $domaincb;
        global $max_links, $realnum, $debug, $no_log, $dba_act, $add_auth, $interrupt, $index_media, $thumb_folder;

        if (!$can_leave) {
            $can_leave = $domaincb;
        }
        $can_leave_domain = $can_leave;

        $starttime  = getmicrotime();   //  start time to index this site
        $site_id    = '';
        $skip       = '';
        $smp        = '0';
        $omit       = array();

        if (strstr($interrupt, "-")) {  //  if indexer should not be interrupted periodically
            $interrupt = '999999';      //  never
        }
        $int_count = $interrupt;        //  $int_count will be decreased by each indexed link until $int_count = 1

        printStandardReport('starting',$command_line, $no_log);

        if (!isset($all_keywords)) {
            mysqltest();
            $result = mysql_query("select keyword_ID, keyword from ".$mysql_table_prefix."keywords");
            if ($debug > '0') echo mysql_error();
            while($row=mysql_fetch_array($result)) {
                $all_keywords[addslashes($row[1])] = $row[0];
            }
            if ($clear == 1) clean_resource($result, '06') ;
        }

        $url = convert_url($url);
        $compurl = parse_addr($url);

        if ($compurl['path'] == '') {
            $url = $url . "/";
        }

        $t = microtime();
        $a =  getenv("REMOTE_ADDR");
        $sessid = md5 ($t.$a);

        if ($url != '/') {      //      ignore dummies
            $urlparts = parse_addr($url);

            $domain = $urlparts['host'];
            if (isset($urlparts['port'])) {
                $port = (int)$urlparts['port'];
            }else {
                $port = 80;
            }

            mysqltest();
            $result = mysql_query("select site_id, authent from ".$mysql_table_prefix."sites where url='$url'");
            if ($debug > '0') echo mysql_error();
            $row = mysql_fetch_row($result);
            $site_id = $row[0];
            $authent = $row[2];
            if ($clear == 1) clean_resource($result, '07') ;

            if ($add_auth && $authent) {        //  for sites with authentication we need to verify the value
                $url_status = url_status($url);
                $url_parts  = parse_all_url($url);

                if ($url_status['state'] == 'ok' && $url_status['content'] == 'text') {

                    if ($url_status['relocate'] ){          //  if relocated,  print message and redirect to new URL
                        $new_url = $url_status['path'] ;
                        $diff = strlen($url);
                        $redir = substr( $new_url, $diff);      //      extract diff. between original URL and relocated URL

                        if ($redir == "index.php" || $redir == "index.html" || $redir == "index.htm") {
                            $local_redir = '1';

                            //      no output because diff.  is only index.html etc.
                        } else {
                            printRedirected($url_status['relocate'], $url_status['path'], $cl);
                        }
                        if (strstr($url_status['path'], "//")) {                            //  if redirected to absolute URL, use this for further usage
                            $url = $url_status['path'];
                        } else {
                            $relo_url = str_replace($url_parts['query'], "", $url);         //  url without query
                            $relo_url = substr($url, 0, strrpos($relo_url, "/")+1);         //  url without file name
                            if (strpos($url_status['path'], "./") === 0) {                  //  if redirected relativ to same folder depth
                                $url_status['path'] = str_replace("./", "", $url_status['path']);
                                $url = "".$relo_url."".$url_status['path']."";
                            }
                            if (strpos($url_status['path'], "../") === 0) {                 //  if redirected relativ and one folder up
                                $url_status['path'] = str_replace("./", "", $url_status['path']);
                                $relo_url = substr($url, 0, strpos($url_parts['path']));    //  url without file name
                                $relo_url = substr($url, 0, strrpos($relo_url, "/")+1);     //  url without last folder
                                $url = "".$relo_url."".$url_status['path']."";
                            }
                        }
                    }

                    //  read file
                    $contents   = array();
                    $file       = '';
                    $file = file_get_contents($url);

                    if ($file === FALSE) {  //  we know another way to get the content
                        $get_charset    = '';
                        $contents = getFileContents($url, $get_charset);
                        $file = $contents['file'];
                    }

                    //  parse header only
                    preg_match("@<head[^>]*>(.*?)<\/head>@si",$file, $regs);
                    $headdata = $regs[1];
                    //  fetch the tag value
                    preg_match("/<meta +name *=[\"']?Sphider-plus[\"']? *content=[\"'](.*?)[\"']/i", $headdata, $res);
                    if (isset ($res)) {
                        if ($authent != $res[1]) {      //  invalid value in authentication tag
                            $skip = '1';
                            printHeader ($omit, $url, $command_line);
                            printStandardReport('Skipped_03', $command_line, $no_log);
                        }
                    } else {                            //  no authentication tag found in header
                        $skip = '1';
                        printHeader ($omit, $url, $command_line);
                        printStandardReport('Skipped_02', $command_line, $no_log);
                    }

                } else {
                    $skip = '1';
                    printHeader ($omit, $url, $command_line);
                    printStandardReport('statError', $command_line, $no_log);
                }
            }

            if (!$skip) {
                if ($site_id != "" && $reindex == 1) {
                    mysqltest();
                    mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
                    if ($debug > '0') echo mysql_error();
                    $result = mysql_query("select url, level from ".$mysql_table_prefix."links where site_id = $site_id");
                    while ($row = mysql_fetch_array($result)) {
                        $site_link = $row['url'];
                        $link_level = $row['level'];
                        if ($site_link != $url) {
                            mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$site_link', '$link_level', '$sessid')");
                        }
                    }
                    if ($clear == 1) clean_resource($result, '08') ;

                    $qry = "update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth ='$maxlevel', required = '$url_inc'," .
                        "disallowed = '$url_not_inc', can_leave_domain='$can_leave', use_prefcharset='$use_pref' where site_id='$site_id'";
                    mysqltest();
                    mysql_query ($qry);
                    if ($debug > '0') echo mysql_error();
                } else if ($site_id == '') {
                    mysqltest();
                    mysql_query ("insert into ".$mysql_table_prefix."sites (url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset) " .
                        "values ('$url', now(), '$maxlevel', '$url_inc', '$url_not_inc', '$can_leave_domain', '$use_pref')");
                    if ($debug > '0') echo mysql_error();
                    $result = mysql_query("select site_ID from ".$mysql_table_prefix."sites where url='$url'");
                    $row = mysql_fetch_row($result);
                    $site_id = $row[0];
                    if ($clear == 1) clean_resource($result, '09') ;
                } else {
                    mysqltest();
                    mysql_query ("update ".$mysql_table_prefix."sites set indexdate=now(), spider_depth ='$maxlevel', required = '$url_inc'," .
                        "disallowed = '$url_not_inc', can_leave_domain='$can_leave_domain', use_prefcharset='$use_pref' where site_id='$site_id'");
                    if ($debug > '0') echo mysql_error();
                }

                $pending = array();
                mysqltest();
                $result = mysql_query("select site_id, temp_id, level, count, num from ".$mysql_table_prefix."pending where site_id='$site_id'");
                if ($debug > '0') echo mysql_error();
                $row        = mysql_fetch_row($result);
                $pending    = $row[0];
                $level      = '0';
                $count      = '0';
                if ($clear == 1) clean_resource($result, '10') ;

                $domain_arr = get_domains();
                if ($pending == '') {
                    mysqltest();
                    mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', 0, '$sessid')");
                    if ($debug > '0') echo mysql_error();
                } else if ($pending != '') {
                    printStandardReport('continueSuspended',$command_line, $no_log);
                    mysqltest();
                    $pend_count = '0';
                    //$result = mysql_query("select temp_id, level, count from ".$mysql_table_prefix."pending where site_id='$site_id'");
                    $result = mysql_query("select * from ".$mysql_table_prefix."pending where site_id='$site_id'");
                    if ($debug > '0') echo mysql_error();
                    $row = mysql_fetch_row($result);
                    if ($row) {
                        $sessid = $row[1];
                        $level = $row[2];
                        $pend_count = $row[3] + 1;
                        $num = $row[4];
                        $pending = 1;
                        $tmp_urls = get_temp_urls($sessid);
                        if ($clear == 1) clean_resource($result, '11') ;
                    }
                }

                if ($pending != 1) {
                    mysqltest();
                    mysql_query ("insert into ".$mysql_table_prefix."pending (site_id, temp_id, level, count) values ('$site_id', '$sessid', '0', '0')");
                    if ($debug > '0') echo mysql_error();
                }

                $time   = time();
                $robots = ("robots.txt"); // standardname of robots file
                if ($use_robot == '1') {
                    $omit = check_robot_txt($url, $robots);
                }

                printHeader ($omit, $url, $command_line);

                if ($link_check == 1) printStandardReport('start_link_check', $command_line, $no_log);
                if ($link_check == 0 && $reindex == 1 ) printStandardReport('start_reindex', $command_line, $no_log);
                if ($link_check == 0 && $reindex == 0 ) printStandardReport('starting', $command_line, $no_log);

                $mainurl    = $url;
                $realnum    = $num;
                $num        = 0;

                while (($level <= $maxlevel && $soption == 'level') || ($soption == 'full')) {
                    if ($pending == 1) {
                        $count = $pend_count;
                        $pending = 0;
                    } else {
                        $count = 0;
                    }

                    $links = array();
                    mysqltest();
                    $result = mysql_query("select distinct link from ".$mysql_table_prefix."temp where level=$level && id='$sessid' order by link");
                    if ($debug > '0') echo mysql_error();
                    $rows = mysql_num_rows($result);

                    if ($rows == 0) {
                        break;
                    }

                    while ($row = mysql_fetch_array($result)) {
                        $links[] = $row['link'];
                    }

                    if ($clear == 1) clean_resource($result, '12') ;
                    reset ($links);

                    //  now loop through all available links(pages)
                    while ($count < count($links)) {
                        $num++;
                        $realnum ++ ;
                        if ($realnum > $max_links ) {    //  if max. links per page reached
                            mysqltest();
                            mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
                            if ($debug > '0') echo mysql_error();
                            mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
                            if ($debug > '0') echo mysql_error();
                            printMaxLinks($max_links, $cl);
                            printStandardReport('completed',$command_line, $no_log);

                            return;
                        }

                        $thislink   = $links[$count];
                        $urlparts   = parse_addr($thislink);
                        $forbidden  = 0;

                        if (is_array($omit)) {   //      if valid robots.txt  was found
                            reset ($omit);
                            foreach ($omit as $omiturl) {
                                $omiturl = trim($omiturl);

                                $omiturl_parts = array();
                                $omiturl_parts = parse_addr($omiturl);
                                if (@$omiturl_parts['scheme'] == '') {
                                    $check_omit = $urlparts['host'] . $omiturl;
                                } else {
                                    $check_omit = $omiturl;
                                }

                                if (strpos($thislink, $check_omit)) {
                                    printRobotsReport($num, $thislink, $command_line);
                                    $realnum -- ;
                                    check_for_removal($thislink);
                                    $forbidden = 1;
                                    break;
                                }
                            }
                        }

                        if (!check_include($thislink, $url_inc, $url_not_inc )) {
                            $realnum -- ;
                            printUrlStringReport($num, $thislink, $command_line);
                            //printUrlStringReport($realnum, $thislink, $command_line);
                            check_for_removal($thislink);
                            $forbidden = 1;
                        }

                        if ($forbidden == 0) {
                            printRetrieving($num, $thislink, $command_line);
                            //printRetrieving($realnum, $thislink, $command_line);
                            mysqltest();
                            $query = "select md5sum, indexdate from ".$mysql_table_prefix."links where url='$thislink'";
                            $result = mysql_query($query);
                            if ($debug > '0') echo mysql_error();
                            $rows = mysql_num_rows($result);
                            if ($rows == 0) {
                                index_url($thislink, $level+1, $site_id, '',  $domain, '', $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref );
                                mysqltest();
                                mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                                if ($debug > '0') echo mysql_error();

                            } else if ($rows <> 0 && $reindex == 1) {
                                $row = mysql_fetch_array($result);
                                $md5sum = $row['md5sum'];
                                $indexdate = $row['indexdate'];

                                if ($link_check == 1 && $reindex == 1) link_check($thislink, $level+1, $sessid, $can_leave_domain, $reindex);
                                else {
                                    mysqltest();
                                    index_url($thislink, $level+1, $site_id, $md5sum,  $domain, $indexdate, $sessid, $can_leave_domain, $reindex, $use_nofollow, $cl, $use_robot, $use_pref);
                                }
                            }else {
                                printStandardReport('inDatabase',$command_line, $no_log);
                                $realnum -- ;
                                //$num--;
                            }
                            if ($rows <> 0) {
                                mysqltest();
                                mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                                if ($debug > '0') echo mysql_error();
                            }
                            if ($clear == 1) clean_resource($result, '13') ;
                        }

                        //  check for interrupt counter
                        if ($int_count == '1') {   //  interrupt the index procedure until interactive resume
                            mysql_query("update ".$mysql_table_prefix."pending set level ='$level', count='$count', num='$realnum' where site_id='$site_id'");
                            if ($debug > '0') echo mysql_error();
                            printInterrupt($interrupt, $url, $cl) ;
                            die;
                        }
                        $count++;
                        $int_count--;
                    }
                    $level++;
                }
            }

            mysqltest();
            mysql_query ("delete from ".$mysql_table_prefix."temp where id = '$sessid'");
            if ($debug > '0') echo mysql_error();
            mysql_query ("delete from ".$mysql_table_prefix."pending where site_id = '$site_id'");
            if ($debug > '0') echo mysql_error();

            if ($create_sitemap == 1) {
                create_sitemap($site_id, $url);
            }
            /*
             $consumed   = round(getmicrotime() - $starttime, 3);
             printConsumedReport('consumed', $cl, '0', $consumed);   //  time elapsed to index this URL
             */
            printStandardReport('completed',$command_line, $no_log);
            $stats = get_Stats();
            printDatabase($stats, $cl);
        }

        if ($index_media) {
            //  delete all thumbnails in .../admin/tmp/thumbs/ folder
            clear_folder(".".$thumb_folder);
        }

    }

    /**
     * Walk through every row of the sites table and hand it to index_site().
     *
     * For each site the stored depth decides the spider option ('full' for a
     * depth of -1, otherwise 'level'); an empty can_leave_domain value is
     * passed on as 0. The global $url is overwritten with each site's URL.
     * Afterwards the 'ReindexFinish' report is printed and the footer created.
     */
    function index_all() {
        global $mysql_table_prefix, $reindex, $command_line, $omit;
        global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow, $no_log;

        //  dummy value only; needed to display the back to admin button
        $all = '1';

        mysqltest();
        $sites = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
        if ($debug > '0') {
            echo mysql_error();
        }

        while ($site = mysql_fetch_row($sites)) {
            //  column order follows the select statement above
            list($url, $depth, $include, $not_include, $can_leave_domain, $use_prefcharset) = $site;

            $can_leave_domain   = ($can_leave_domain == '') ? 0 : $can_leave_domain;
            $soption            = ($depth == -1) ? 'full' : 'level';

            index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
        }

        if ($clear == 1) {
            clean_resource($sites, '14');
        }
        printStandardReport('ReindexFinish', $command_line, $no_log);

        create_footer();
    }

    /**
     * Re-index only the sites whose ids are listed in "$tmp_dir/act_sites.txt".
     *
     * The file is read line by line; every line is expected to hold one site id.
     * Lines are trimmed (file() keeps the trailing newline), blank lines are
     * skipped and the id is cast to integer before it is placed into the query.
     * Ids that no longer exist in the sites table are skipped instead of
     * calling index_site() with empty values.
     * If the file is missing or empty the 'NoSitesFound' report is printed.
     * The global $url is overwritten with each site's URL.
     */
    function index_these() {
        global $mysql_table_prefix, $reindex, $command_line, $omit, $tmp_dir;
        global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow, $no_log;

        $all        = '1';                              //  here only as a dummy; needed to display the back to admin  button
        $result     = null;                             //  stays null if no query was ever sent
        $row        = array();
        $site_ids   = @file("$tmp_dir/act_sites.txt");  //  read the temp file that holds the actual site ids

        if (is_array($site_ids) && count($site_ids)) {
            mysqltest();
            foreach ($site_ids as $this_id) {
                $this_id = trim($this_id);              //  remove the line break delivered by file()
                if ($this_id === '') {
                    continue;                           //  ignore blank lines
                }
                //  NOTE(review): site ids are treated as integers here (they are used unquoted in other queries of this file) - confirm
                $this_id = (int) $this_id;

                $result = mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites where site_id='$this_id'");
                if ($debug > '0') echo mysql_error();

                $row = $result ? mysql_fetch_row($result) : false;
                if (!$row) {
                    continue;                           //  query failed or unknown site id; nothing to index
                }

                $url                = $row[0];
                $depth              = $row[1];
                $include            = $row[2];
                $not_include        = $row[3];
                $can_leave_domain   = $row[4];
                $use_prefcharset    = $row[5];

                if ($can_leave_domain=='') {
                    $can_leave_domain=0;
                }

                if ($depth == -1) {
                    $soption = 'full';
                } else {
                    $soption = 'level';
                }

                index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
            }
        } else {
            printStandardReport('NoSitesFound', $command_line, $no_log);    //  print warning message
        }

        if ($clear == 1) {
            if ($result) {
                clean_resource($result, '14') ;         //  only if there is a result set to be released
            }
            $site_ids   = array();
            $row        = array();
        }
        printStandardReport('ReindexFinish', $command_line, $no_log);

        create_footer();
    }

    /**
     * Erase the index (only for command line option:  -erase).
     *
     * If $clear_cache is '1', all entries in the text cache and the media cache
     * folder are deleted first (unlink errors are suppressed). Afterwards the
     * domains, keywords, links, link_keyword0..f and media tables are truncated,
     * the 'ErasedFinished' report is printed and the footer is created.
     */
    function erase() {
        global $mysql_table_prefix, $reindex, $command_line, $omit;
        global $url, $cl, $clear, $real_log, $debug, $use_robot, $use_nofollow;
        global $no_log, $clear_cache, $textcache_dir, $mediacache_dir ;

        //  if Admin selected, clear text and media cache
        if ($clear_cache == '1') {
            foreach (array($textcache_dir, $mediacache_dir) as $cache_dir) {
                if ($handle = opendir($cache_dir)) {
                    while (false !== ($file = readdir($handle))) {
                        if ($file != "." && $file != "..") {
                            @unlink("".$cache_dir."/".$file."");
                        }
                    }
                    closedir($handle);  //  release the directory handle again
                }
            }
        }

        //  clear all data in database
        $erase =array ("domains","keywords","links","link_keyword0","link_keyword1","link_keyword2","link_keyword3","link_keyword4","link_keyword5","link_keyword6","link_keyword7","link_keyword8","link_keyword9","link_keyworda","link_keywordb","link_keywordc","link_keywordd","link_keyworde","link_keywordf","media");
        foreach ($erase as $allthis){
            mysql_query ("TRUNCATE `".$mysql_table_prefix."$allthis`");
            if ($debug > '0') echo mysql_error();
        }

        //  NOTE(review): no result set is created in this function (TRUNCATE does not deliver one),
        //  so clean_resource() receives null, exactly as before, but without the undefined variable notice.
        //  Confirm whether this call is required at all.
        $result = null;
        if ($clear == 1) clean_resource($result, '14') ;
        printStandardReport('ErasedFinished', $command_line, $no_log);
        create_footer();
    }



    /**
     * Re-index all sites of the sites table (only for command line option:  -eall).
     *
     * Every row is handed to index_site() with the same twelve arguments that
     * index_all() and index_these() pass. Previously only ten arguments were
     * handed over, so $use_prefcharset ended up in the position where the
     * sibling functions pass $cl.
     * The global $url is overwritten with each site's URL.
     */
    function index() {
        global $mysql_table_prefix, $command_line, $no_log;
        global $url, $cl, $clear, $debug, $use_robot, $use_nofollow;

        //  dummy value, passed the same way as index_all() and index_these() do
        //  NOTE(review): '1' mirrors the sibling functions - confirm this is also wanted for -eall
        $all = '1';

        //  now re-index all
        mysqltest();
        $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
        if ($debug > '0') echo mysql_error();
        while ($row=mysql_fetch_row($result)) {
            $url                = $row[0];
            $depth              = $row[1];
            $include            = $row[2];
            $not_include        = $row[3];
            $can_leave_domain   = $row[4];
            $use_prefcharset    = $row[5];

            if ($can_leave_domain=='') {
                $can_leave_domain=0;
            }
            if ($depth == -1) {
                $soption = 'full';
            } else {
                $soption = 'level';
            }

            index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
        }
        if ($clear == 1) clean_resource($result, '14') ;
        printStandardReport('ReindexFinish', $command_line, $no_log);
        create_footer();
    }

    /**
     * Fetch up to 100 links stored in the temp table for the given session id.
     *
     * @param string $sessid    value compared against the id column of the temp table
     * @return array            the links as array keys, each with the value 1
     */
    function get_temp_urls($sessid) {
        global $mysql_table_prefix, $debug, $clear;

        $query      = "select link from ".$mysql_table_prefix."temp where id='$sessid' limit 0,100";
        $links      = mysql_query($query);
        if ($debug > '0') {
            echo mysql_error();
        }

        //  use the link itself as key, so duplicates collapse into one entry
        $tmp_urls = Array();
        for ($entry = mysql_fetch_row($links); $entry; $entry = mysql_fetch_row($links)) {
            $tmp_urls[$entry[0]] = 1;
        }

        if ($clear == 1) {
            clean_resource($links, '15');
        }

        return $tmp_urls;
    }

    /**
     * Read the complete domains table.
     *
     * @return array    domain name => domain_id
     */
    function get_domains() {
        global $mysql_table_prefix, $debug, $clear;

        mysqltest();
        $query      = "select domain_id, domain from ".$mysql_table_prefix."domains";
        $found      = mysql_query($query);
        if ($debug > '0') {
            echo mysql_error();
        }

        $domains = Array();
        for ($entry = mysql_fetch_row($found); $entry; $entry = mysql_fetch_row($found)) {
            //  $entry[0] = domain_id, $entry[1] = domain
            $domains[$entry[1]] = $entry[0];
        }

        if ($clear == 1) {
            clean_resource($found, '16');
        }

        return $domains;
    }

    /**
     * Prepare the content of one file that was taken out of an archive.
     *
     * Depending on the file suffix the content is sent through extract_text().
     * If $index_framesets is '1', a <frameset> part is replaced by the text
     * delivered by get_frames(), wrapped into <body> tags. If $index_rss is '1'
     * and the first 400 bytes look like a feed, the content is converted by
     * get_arch_feeds().
     *
     * @param string $buf   content of the file
     * @param string $name  name of the file; only its suffix is evaluated
     * @param string $url   URL handed over to get_frames() and get_arch_feeds()
     * @return string       the (eventually converted) content
     */
    function get_arch_content($buf, $name, $url) {
        global $index_framesets, $command_line, $no_log, $can_leave_domain, $index_rss;

        //  a name without any dot has no suffix at all
        //  (strrpos() delivers false then, which formerly became offset 1, so that e.g. 'xpdf' was treated as PDF)
        $dot    = strrpos($name, ".");
        $suffix = ($dot === false) ? '' : strtolower(substr($name, $dot + 1));

        //  if special converter is required
        //  NOTE(review): 'ptt' might be meant as 'ppt' - verify against extract_text() before changing
        if (in_array($suffix, array('pdf', 'doc', 'rtf', 'xls', 'ptt'), true)) {
            $buf = extract_text($buf, $suffix, 0);
        }

        //  for extracting framesets of this file enter here. Iframes will be extracted later on for the complete $file
        if ($index_framesets == '1') {
            if (preg_match("@<frameset[^>]*>(.*?)<\/frameset>@si",$buf, $regs)) {
                printStandardReport('newFrameset', $command_line, $no_log);
                //  separate the <frameset> ....</frameset> part of this file
                $frame = $regs[1];
                $replace = get_frames($frame, $url, $can_leave_domain);
                $replace ="<body>".$replace."</body>";  //  create the body tags for $buf
                //  The frame text is foreign content. Escape backslash and dollar, so that
                //  something like '$1' is not treated as a back-reference by preg_replace()
                $replace = addcslashes($replace, '\\$');
                //  include all replacements instead of the frameset tag into the actual file. This will become the body
                $buf = preg_replace("@<frameset.*?</frameset>@si", $replace, $buf);
            }
        }

        // for extracting archived feeds enter here
        if ($index_rss == '1' && preg_match("/<rss|atom|<feed|<rdf|<rsd/si", substr($buf,0,400))) {
            $buf = get_arch_feeds($buf, $url);
        }

        return $buf;
    }

    /**
     * Convert a feed (RSD, Atom, RDF or RSS) that was found in an archive into
     * a simple HTML document, which afterwards could be indexed like a page.
     *
     * The feed is checked by XML_IsWellFormed() and then parsed by the
     * feedParser class. The channel/feed values and all items/entries/APIs are
     * written into the body; title and description become <title> and the
     * description meta tag.
     *
     * @param string $buf   the feed as a string
     * @param string $url   URL handed over to the feed parser together with $buf
     * @return string       the created HTML; empty string if the XML is not
     *                      well-formed or the parser did not deliver a feed
     */
    function get_arch_feeds($buf, $url) {
        global $command_line, $no_log, $debug, $cl, $max_links, $dc, $preferred, $cdata;

        $html   = '';
        $type   = '';
        $count  = '0';

        $xml = XML_IsWellFormed($buf);     //      check for well-formed XML
        if ($xml != '1') {
            if ($debug > 0 ) {
                printNotWellFormedXML($xml, $cl);
            }
            return $html;
        }

        $rss = new feedParser;
        // define options for feed parser
        $rss->limit     = $max_links;   //   save time by limiting the items/entries to be processed
        //  NOTE(review): formerly strtoupper($contents['charset']) was used here, but $contents is
        //  not defined in this function, so the value always was an empty string. Confirm where
        //  the charset of the actual file should come from.
        $rss->in_cp     = '';
        $rss->out_cp    = 'UTF-8';      //  convert all into this charset
        $rss->cache_dir = '';           //  currently unused
        $rss->dc        = $dc;          //  treat Dublin Core tags in RDF feeds
        $rss->pro       = $preferred;   //  obey the PREFERRED directive in RSD feeds
        $rss->file      = '1';          //  use $buf as feed (as a string, not URL)

        if ($cdata != 1) {
            $rss->CDATA = 'content';    //  get it all  (naughty)
        } else {
            $rss->CDATA = 'nochange';   //  well educated crawler
        }

        //  get feed as array
        if ($feed = $rss->get($url, $buf)){
            //  if you want to see the feed, uncomment the following row
            //echo "<br>Feed array:<br><pre>";print_r($feed);echo "</pre>";
            $subjects       = '';
            //  array keys are quoted; bare words would be treated as (undefined) constants
            $type           = $feed['type'];
            $count          = $feed['sub_count'];

            //  kill all no longer required values
            $feed['type']           = '';
            $feed['sub_count']      = '';
            $feed['encoding_in']    = '';
            $feed['encoding_out']   = '';
            $feed['items_count']    = '';
            $feed['cached']         = '';

            if (!$count) {
                $count = '0';
            }

            if ($type == 'RSD') {
                //      prepare all RSD APIs
                for($i=0;$i<$count;$i++){
                    $subjects .= ''.$feed['api'][$i]['name'].'<br />
                                '.$feed['api'][$i]['apiLink'].'<br />
                                '.$feed['api'][$i]['blogID'].'<br />
                                '.$feed['api'][$i]['settings_docs'].'<br />
                                '.$feed['api'][$i]['settings_notes'].'<br />';
                }
            }

            if ($type == 'Atom') {
                //      prepare all Atom entries
                for($i=0;$i<$count;$i++){
                    $subjects .= ''.$feed['entries'][$i]['link'].'<br />
                                '.$feed['entries'][$i]['title'].'<br />
                                '.$feed['entries'][$i]['id'].'<br />
                                '.$feed['entries'][$i]['published'].'<br />
                                '.$feed['entries'][$i]['updated'].'<br />
                                '.$feed['entries'][$i]['summary'].'<br />
                                '.$feed['entries'][$i]['rights'].'<br />
                                '.$feed['entries'][$i]['author_name'].' '.$feed['entries'][$i]['author_email'].' '.$feed['entries'][$i]['author_uri'].'<br />
                                '.$feed['entries'][$i]['category_term'].' '.$feed['entries'][$i]['category_label'].' '.$feed['entries'][$i]['category_scheme'].'<br />
                                '.$feed['entries'][$i]['contributor_name'].' '.$feed['entries'][$i]['contributor_email'].' '.$feed['entries'][$i]['contributor_uri'].'<br />
                            ';
                }
            }

            //  For RDF and RSS feeds enter here (logical instead of bitwise OR)
            if ($type == 'RDF' || $type == 'RSS v.0.91/0.92' || $type == 'RSS v.2.0') {
                //  prepare channel image
                $image_url = $feed['image_url'];
                if($image_url){
                    $width = $feed['image_width'];
                    if (!$width || $width > '144') {
                        $width = '88';  //set to default value
                    }
                    $height = $feed['image_height'];
                    if (!$height || $height > '400') {
                        $height = '31';  //set to default value
                    }

                    $feed['image_url'] = "<img id=\"rss_007\" src=\"".$image_url."\" alt=\"".$feed['image_title']."\" width=\"".$width."\" height=\"".$height."\">";
                }
                $image_link = $feed['image_link'];
                if($image_link){
                    $feed['image_link'] = "<a href=\"".$image_link."\">".$image_link."</a>";
                }

                //      prepare all RDF or RSS items
                for($i=0;$i<$count;$i++){
                    $subjects .= ''.$feed['items'][$i]['link'].'<br />
                                '.$feed['items'][$i]['title'].'<br />
                                '.$feed['items'][$i]['description'].'<br />
                                '.$feed['items'][$i]['author'].'<br />
                                '.$feed['items'][$i]['category'].'<br />
                                '.$feed['items'][$i]['guid'].'<br />
                                '.$feed['items'][$i]['comments'].'<br />
                                '.$feed['items'][$i]['pubDate'].'<br />
                                '.$feed['items'][$i]['source'].'<br />
                                '.$feed['items'][$i]['enclosure'].'<br />
                                '.$feed['items'][$i]['country'].'<br />
                                '.$feed['items'][$i]['coverage'].'<br />
                                '.$feed['items'][$i]['contributor'].'<br />
                                '.$feed['items'][$i]['date'].'<br />
                                '.$feed['items'][$i]['industry'].'<br />
                                '.$feed['items'][$i]['language'].'<br />
                                '.$feed['items'][$i]['publisher'].'<br />
                                '.$feed['items'][$i]['state'].'<br />
                                '.$feed['items'][$i]['subject'].'<br />
                            ';
                }
            }

            //  convert  the channel/feed part  into a string
            //  nested arrays (items, entries, api) are already part of $subjects;
            //  imploding them would only add the word 'Array' to the text
            $channel_parts = array();
            foreach ($feed as $value) {
                if (!is_array($value)) {
                    $channel_parts[] = $value;
                }
            }
            $feed_common = implode(" ", $channel_parts);

            //  build something that could be indexed
            $html .= "<html>\r\n<head>\r\n<title>".$feed['title']."</title>\r\n<meta name=\"description\" content=\"".$feed['description']." \">\r\n</head>\r\n";
            $html .= "<body>\r\n".$feed_common."\r\n".$subjects."\r\n</body>\r\n</html>\r\n";
        }

        if (strlen($html) < '100') {    //  can't be a valid feed
            printStandardReport('invalidRSS',$command_line, $no_log);
        } else {
            if ($debug > 0 ) {
                printValidFeed($type, $count, $cl);
            }
        }

        return $html;
    }

    //  Print the usage summary for the command line version of the spider.
    //  The text is written to standard output, nothing is returned.
    function commandline_help() {
        //  headline first, afterwards one entry per supported switch
        $usage_lines = array(
            "Usage: php spider.php <options>\n\n",
            "Options:\n",
            " -all\t\t Re-index everything in the database\n",
            " -eall\t\t Erase and afterwards Re-index everything in the database\n",
            " -new\t\t Index only the new sites\n",
            " -erase\t\t Erase database\n",
            " -erased\t\t Index all meanwhile erased sites\n",
            " -preall\t\t Set 'Last indexed' to 0000\n",
            " -u <url>\t Set url to index\n",
            " -f\t\t Set indexing depth to full (unlimited depth)\n",
            " -d <num>\t Set indexing depth to <num>\n",
            " -l\t\t Allow spider to leave the initial domain\n",
            " -r\t\t Set spider to reindex a site\n",
            " -m <string>\t Set the string(s) that an url must include (use \\n as a delimiter between multiple strings)\n",
            " -n <string>\t Set the string(s) that an url must not include (use \\n as a delimiter between multiple strings)\n"
        );

        foreach ($usage_lines as $usage_line) {
            print $usage_line;
        }
    }

    //  Check whether a single link is reachable and report the outcome.
    //
    //  $url                link to be checked
    //  $level              value stored in the 'level' column when a relocation target is queued
    //  $sessid             value stored in the 'id' column of the temp table
    //  $can_leave_domain   handed through to url_purify()
    //  $reindex            if 1, a link that is not 'ok' is handed to check_for_removal()
    //
    //  Nothing is returned. Links whose URL holds 'localhost' at offset 7
    //  (i.e. directly behind 'http://') are only reported as local.
    function link_check($url, $level, $sessid, $can_leave_domain, $reindex) {
        global $command_line, $mysql_table_prefix, $user_agent, $debug, $index_media, $no_log, $clear;

        $deletable  = 0;
        $result     = null;     //  remains null as long as the temp table was not queried

        //  NOTE(review): 'https://localhost' yields offset 8 and is therefore not treated as local - confirm this is intended
        $local_url = strpos($url, 'localhost');

        if ($local_url != '7') {
            $url_status = url_status($url);

            if (strstr($url_status['state'], "Relocation")) {
                $care_excl      = '1';   //  care file suffixed to be excluded
                $relocated      = '1';   //  URL is relocated
                $local_redir    = '';

                //  continue with the purified relocation target, all blanks removed
                $url = preg_replace("/ /i", "", url_purify($url_status['path'], $url, $can_leave_domain, $care_excl, $relocated, $local_redir));
                if ($url <> '') {
                    mysqltest();

                    //  NOTE(review): $url and $sessid are placed into the SQL text as they are;
                    //  verify that url_purify() delivers a value that is safe inside quotes
                    $result = mysql_query("select link from ".$mysql_table_prefix."temp where link='$url' && id = '$sessid'");
                    if ($debug > '0') echo mysql_error();

                    //  a failed query is handled like 'not yet stored', without calling mysql_num_rows() on a non-result
                    $rows = $result ? mysql_num_rows($result) : 0;
                    if ($rows == 0) {
                        mysql_query ("insert into ".$mysql_table_prefix."temp (link, level, id) values ('$url', '$level', '$sessid')");
                        if ($debug > '0') echo mysql_error();
                    }
                }
                //  The former statement  $url_status['state'] == "redirected";  was a bare comparison
                //  without any effect and has been dropped; the state is left untouched as before.
                if ($clear == 1) clean_resource($result, '17') ;
            }

            ini_set("user_agent", $user_agent);
            if ($url_status['state'] == 'ok') {
                printStandardReport('link_okay', $command_line, $no_log);
            } else {
                //  every state different from 'ok' (relocations included) marks the link as removable
                $deletable = 1;
                printUrlStatus($url_status['state'], $command_line);
            }
        } else {
            printStandardReport('link_local', $command_line, $no_log);
        }

        if ($reindex == 1 && $deletable == 1) {
            check_for_removal($url);
        }
    }

    //  Collect the row counts of the main index tables.
    //
    //  Returns an array with the keys 'keywords', 'links', 'index', 'sites',
    //  'categories' and 'media'. 'index' is the sum over the 16 tables
    //  link_keyword0 ... link_keywordf and is always present (0 if nothing
    //  could be counted). Any other key is only set if its query delivered a row.
    function get_Stats() {
        global $mysql_table_prefix, $debug, $clear, $db_con;

        $stats = array();

        //  counters that are read before and behind the link_keyword tables;
        //  the split keeps the key order of the returned array unchanged
        $leading_queries = array(
            'keywords'      => "select count(keyword_id) from ".$mysql_table_prefix."keywords",
            'links'         => "select count(url) from ".$mysql_table_prefix."links"
        );
        $trailing_queries = array(
            'sites'         => "select count(site_id) from ".$mysql_table_prefix."sites",
            'categories'    => "select count(category_id) from ".$mysql_table_prefix."categories",
            'media'         => "select count(media_id) from ".$mysql_table_prefix."media"
        );

        mysqltest();
        foreach ($leading_queries as $name => $query) {
            $result = mysql_query($query);
            if ($debug > '0') echo mysql_error();
            //  a failed query returns false, which must not be handed to mysql_fetch_array()
            if ($result && ($row = mysql_fetch_array($result))) {
                $stats[$name] = $row[0];
            }
        }

        //  start with a defined value, otherwise the first += would work on a missing key
        $stats['index'] = 0;
        for ($i = 0; $i <= 15; $i++) {
            $char = dechex($i);     //  table suffix 0 ... f
            mysqltest();

            $result = mysql_query("select count(link_id) from ".$mysql_table_prefix."link_keyword$char");
            if ($debug > '0') echo mysql_error();
            if ($result && ($row = mysql_fetch_array($result))) {
                $stats['index'] += $row[0];
            }
        }

        mysqltest();
        foreach ($trailing_queries as $name => $query) {
            $result = mysql_query($query);
            if ($debug > '0') echo mysql_error();
            if ($result && ($row = mysql_fetch_array($result))) {
                $stats[$name] = $row[0];
            }
        }

        return $stats;
    }

    //  Index all sites of the 'sites' table that own an empty indexdate.
    //
    //  For every candidate the indexdate is read again and immediately set to
    //  now(), so that other threads see the site as taken. Afterwards the site
    //  is handed to index_site(). Finally the consumed time and the finish
    //  message are reported and the log footer is written.
    //  $started is only read here; it has to be set by the caller.
    function index_new() {
        global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $cl, $started;

        printStandardReport('NewStart',$command_line, $no_log);

        mysqltest();
        $result=mysql_query("select url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
        if ($debug > '0') echo mysql_error();
        while ($result && ($row=mysql_fetch_row($result))) {
            $url = $row[0];
            //  the URL is read back from the database and may hold quotes, so escape it for the following statements
            $sql_url = mysql_real_escape_string($url);

            //  get actual status of indexdate, eventually other threads meanwhile indexed this URL
            $res=mysql_query("select indexdate from ".$mysql_table_prefix."sites where url='$sql_url'");
            if ($debug > '0') echo mysql_error();
            $ind = $res ? mysql_fetch_row($res) : false;
            //  as before, a site without a readable indexdate counts as 'not yet indexed'
            $indexdate = $ind ? $ind[0] : '';

            if ($indexdate == '') {
                // immediately info for all other threads: now indexed by this thread
                $qry = "update ".$mysql_table_prefix."sites set indexdate=now() where url='$sql_url'";
                mysqltest();
                mysql_query ($qry);
                if ($debug > '0') echo mysql_error();

                $depth              = $row[2];
                $include            = $row[3];
                $not_include        = $row[4];
                $can_leave_domain   = $row[5];
                $use_prefcharset    = $row[6];

                if ($can_leave_domain=='') {
                    $can_leave_domain=0;
                }
                //  a depth of -1 stands for unlimited indexing
                if ($depth == -1) {
                    $soption = 'full';
                } else {
                    $soption = 'level';
                }

                //  now index this new site
                //  index_erased() and index_suspended() hand over twelve arguments with $cl as the tenth
                //  and $use_prefcharset as the last one. The former call here stopped after ten arguments,
                //  which placed $use_prefcharset into the position of $cl.
                //  NOTE(review): the 1 as eleventh argument is taken over from index_erased() - confirm
                index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, 1, $use_prefcharset);
            }
        }

        if ($clear == 1) clean_resource($result, '18');
        $ended = time();
        $consumed = $ended - $started;
        printConsumedReport('consumed', $cl, '0', $consumed);
        printStandardReport('NewFinish',$command_line, '0');
        create_footer();
    }

    //  Index all sites of the 'sites' table whose indexdate contains '0000'.
    //
    //  For every candidate the indexdate is read again and immediately set to
    //  now(), so that other threads see the site as taken. Afterwards the site
    //  is handed to index_site(). Finally the consumed time and the
    //  'ReindexFinish' message are reported and the log footer is written.
    function index_erased() {
        global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $started, $cl;

        $started = time();
        printStandardReport('ErasedStart',$command_line, $no_log);

        mysqltest();
        $result=mysql_query("select url, indexdate, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites");
        if ($debug > '0') echo mysql_error();
        while ($result && ($row=mysql_fetch_row($result))) {
            $url = $row[0];
            //  the URL is read back from the database and may hold quotes, so escape it for the following statements
            $sql_url = mysql_real_escape_string($url);

            //  get actual status of indexdate, eventually other threads meanwhile indexed this URL
            $res=mysql_query("select indexdate from ".$mysql_table_prefix."sites where url='$sql_url'");
            if ($debug > '0') echo mysql_error();
            $ind = $res ? mysql_fetch_row($res) : false;
            $indexdate = $ind ? $ind[0] : '';

            if (strstr($indexdate, '0000')) {
                // immediately info for all other threads: now indexed by this thread
                $qry = "update ".$mysql_table_prefix."sites set indexdate=now() where url='$sql_url'";
                mysqltest();
                mysql_query ($qry);
                if ($debug > '0') echo mysql_error();

                $depth              = $row[2];
                $include            = $row[3];
                $not_include        = $row[4];
                $can_leave_domain   = $row[5];
                $use_prefcharset    = $row[6];

                if ($can_leave_domain=='') {
                    $can_leave_domain=0;
                }
                //  a depth of -1 stands for unlimited indexing
                if ($depth == -1) {
                    $soption = 'full';
                } else {
                    $soption = 'level';
                }

                //  now index this erased site
                index_site($url, 1, $depth, $soption, $include, $not_include, $can_leave_domain, $use_robot, $use_nofollow, $cl, 1, $use_prefcharset);
            }
        }

        if ($clear == 1) clean_resource($result, '19');
        $ended = time();
        $consumed = $ended - $started;
        printConsumedReport('consumed', $cl, '0', $consumed);
        //printStandardReport('ErasedFinish',$command_line, '0');
        printStandardReport('ReindexFinish',$command_line, '0');
        create_footer();
    }

    //  Continue indexing for all sites that own an entry in the 'pending' table.
    //
    //  All sites are walked through in URL order. For each site with a pending
    //  entry the individual settings are fetched and the site is handed to
    //  index_site() with $reindex = 0. Finally the consumed time and the finish
    //  message are reported and the log footer is written.
    function index_suspended() {
        global $mysql_table_prefix, $command_line, $debug, $use_robot, $use_nofollow, $no_log, $clear, $started, $cl;

        $started = time();
        $reindex = 0;
        //  NOTE(review): the former code handed an undefined variable $all to index_site().
        //  null keeps exactly that value; index_erased() passes 1 at this position - confirm what is wanted here
        $all = null;
        printStandardReport('SuspendedStart',$command_line, $no_log);

        //  get ID and URL of all sites
        $result1 = mysql_query("SELECT site_id, url from ".$mysql_table_prefix."sites ORDER by url");
        if ($debug > '0') echo mysql_error();

        while ($result1 && ($row1=mysql_fetch_row($result1))) {
            $site_id = $row1[0];
            $url = $row1[1];

            //  check whether this site is pending
            $result2 = mysql_query("SELECT site_id from ".$mysql_table_prefix."pending where site_id =$site_id");
            if ($debug > '0') echo mysql_error();
            $row2 = $result2 ? mysql_fetch_array($result2) : false;

            //  not pending => nothing to continue for this site
            if (!$row2 || $row2['site_id'] != $site_id) {
                continue;
            }

            //  fetch all important data of this site
            //  the URL is read back from the database and may hold quotes, so escape it first
            $sql_url = mysql_real_escape_string($url);
            $result=mysql_query("select url, spider_depth, required, disallowed, can_leave_domain, use_prefcharset from ".$mysql_table_prefix."sites where url='$sql_url'");
            if ($debug > '0') echo mysql_error();
            $row = $result ? mysql_fetch_row($result) : false;

            if ($clear == 1) clean_resource($result, '21') ;

            //  Without a settings row there is nothing reliable to index with. Formerly the values
            //  of the previously handled site (or undefined variables) were used in this case.
            if (!$row) {
                continue;
            }

            $maxlevel           = $row[1];
            $in                 = isset($row[2]) ? $row[2] : "";
            $out                = isset($row[3]) ? $row[3] : "";
            $domaincb           = $row[4];
            $use_prefcharset    = $row[5];

            if ($domaincb=='') {
                $domaincb=0;
            }
            //  a depth of -1 stands for unlimited indexing
            if ($maxlevel == -1) {
                $soption = 'full';
            } else {
                $soption = 'level';
            }

            //  now index the rest of this site
            index_site($url, $reindex, $maxlevel, $soption, $in, $out, $domaincb, $use_robot, $use_nofollow, $cl, $all, $use_prefcharset);
        }

        //  release the list of sites; the per-site result has already been released inside the loop
        if ($clear == 1) clean_resource($result1, '20');
        $ended = time();
        $consumed = $ended - $started;
        printConsumedReport('consumed', $cl, '0', $consumed);
        printStandardReport('SuspendedFinish',$command_line, '0');
        create_footer();
    }

    //  Append the closing 'finished' paragraph to the current log.
    //  The text is handed to LogUpdate() together with the global log handle.
    function create_footer() {
        global $plus_nr, $log_handle, $log_file;

        LogUpdate($log_handle, "<p class='bd'>
                <span class='em'>
                <br /><br />Indexing / Re-indexing finished.<br /><br />
                </span></p>
            ");
    }

    //  Create a new log file and return its handle, opened for writing.
    //
    //  $id     only used for HTML logs, where it becomes part of the file name
    //
    //  The file name is built from $log_dir, $dba_act and the current time;
    //  the suffix depends on $log_format ('text' => .txt, everything else => .html).
    //  A missing log folder (also a nested one) is created first.
    //  The script is terminated with die() if folder or file can't be created.
    function create_logFile($id) {
        global $log_format, $log_dir, $dba_act;

        //  prepare current log file
        $stamp = Date("ymd-H.i.s");
        if ($log_format == 'text') {
            $log_file =  $log_dir."/db".$dba_act."_".$stamp.".txt";
        } else {
            $log_file =  $log_dir."/db".$dba_act."_".$stamp."_".$id.".html";
        }

        //  Make sure the folder exists before the file is opened. Formerly mkdir() was only tried
        //  after a failed fopen(), which reported an existing but unwritable folder as 'cannot create folder'.
        $created = false;
        if (!is_dir($log_dir)) {
            $created = mkdir($log_dir, 0777, true);
            //  is_dir() is asked again, as another thread might have created the folder meanwhile
            if (!$created && !is_dir($log_dir)) {
                die ("Logging option is set, but cannot create folder for logging files.");
            }
        }

        $log_handle = fopen($log_file, 'w');                    //      create a new log file
        if (!$log_handle) {
            if ($created) {
                die ("Logging option is set, folder was created, but cannot open a file for logging.");
            }
            die ("Logging option is set, but cannot open a file for logging.");
        }
        return $log_handle;
    }

    //  Write one message to the real-time log.
    //
    //  $log_handle     handle of the opened log file
    //  $log_msg        text to be appended
    //
    //  The script is terminated with die() if the handle is missing or writing fails.
    function LogUpdate($log_handle, $log_msg){
        //  no usable handle => logging is impossible
        if (empty($log_handle)) {
            die ("Cannot open file for realtime logging. ");
        }

        $written = fwrite($log_handle, $log_msg);
        if ($written === false) {
            die ("Cannot write to file for realtime logging. ");
        }
    }

    //  Delete all files stored in the text cache folder ($textcache_dir).
    //  The folder itself is kept. Files that can't be deleted are skipped silently.
    function clear_TextCache() {
        global $textcache_dir;

        //  nothing to do if the folder is not defined or does not exist
        if ($textcache_dir == '' || !is_dir($textcache_dir)) {
            return;
        }

        $handle = opendir($textcache_dir);
        if ($handle === false) {
            return;
        }

        while (false !== ($file = readdir($handle))) {
            if ($file != "." && $file != "..") {
                //  best effort: sub folders and protected files remain in place
                @unlink("".$textcache_dir."/".$file."");
            }
        }
        closedir($handle);      //  the directory handle was never released before
    }

    //  Delete all files stored in the media cache folder ($mediacache_dir).
    //  The folder itself is kept. Files that can't be deleted are skipped silently.
    function clear_MediaCache() {
        global $mediacache_dir;

        //  nothing to do if the folder is not defined or does not exist
        if ($mediacache_dir == '' || !is_dir($mediacache_dir)) {
            return;
        }

        $handle = opendir($mediacache_dir);
        if ($handle === false) {
            return;
        }

        while (false !== ($file = readdir($handle))) {
            if ($file != "." && $file != "..") {
                //  best effort: sub folders and protected files remain in place
                @unlink("".$mediacache_dir."/".$file."");
            }
        }
        closedir($handle);      //  the directory handle was never released before
    }

    //  Decompress a gzip coded string.
    //
    //  $data   the complete gzip member (header, deflate stream, 8 byte trailer)
    //  $c, $t  not used inside this function, kept for the existing callers
    //
    //  Returns the decompressed string, "error_gz0" if $data does not start
    //  with the gzip magic bytes, or false if the member is truncated or
    //  can't be inflated.
    //
    //  Besides the fixed 10 byte header also the optional header fields
    //  (extra field, file name, comment, header CRC) are skipped. Formerly a
    //  member holding one of them could not be inflated.
    function gz_decode($data, $c, $t) {
        //  check, for really gzip coded data
        if (substr($data, 0, 2) !== "\x1f\x8b") {
            return "error_gz0";
        }

        $len = strlen($data);
        if ($len < 18) {                    //  10 byte header + 8 byte trailer are the minimum
            return false;
        }

        $flags  = ord($data[3]);            //  FLG byte announces the optional header fields
        $pos    = 10;                       //  first byte behind the fixed header

        if ($flags & 0x04) {                //  FEXTRA: 2 byte length (little endian) + payload
            $pos += 2 + (ord($data[$pos]) | (ord($data[$pos + 1]) << 8));
        }
        if ($flags & 0x08) {                //  FNAME: zero terminated file name
            $end = ($pos < $len) ? strpos($data, "\x00", $pos) : false;
            if ($end === false) {
                return false;
            }
            $pos = $end + 1;
        }
        if ($flags & 0x10) {                //  FCOMMENT: zero terminated comment
            $end = ($pos < $len) ? strpos($data, "\x00", $pos) : false;
            if ($end === false) {
                return false;
            }
            $pos = $end + 1;
        }
        if ($flags & 0x02) {                //  FHCRC: 2 byte header checksum
            $pos += 2;
        }

        //  nothing left between header and trailer => truncated member
        if ($pos >= $len - 8) {
            return false;
        }

        //  the trailer (CRC32 and size, 8 bytes) is cut off, as gzinflate() expects the raw deflate stream
        return gzinflate(substr($data, $pos, -8));
    }

    //  Overwrite the indexdate of every site in the 'sites' table.
    //  Attention: the quoted string 'NULL' is written, not the SQL value NULL.
    //  NOTE(review): presumably this ends up as a zero date, which index_erased() searches for ('0000') - confirm
    function pre_all() {
        global $mysql_table_prefix, $debug;

        mysql_query ("update {$mysql_table_prefix}sites set indexdate='NULL'");
        if ($debug > '0') {
            echo mysql_error();
        }
    }

    //  Extract the text that a page writes by means of JavaScript document.write().
    //
    //  $contents   the page source
    //
    //  Returns the arguments of all document.write('...'); calls, joined by
    //  "\r\n" and freed from HTML comments, style blocks, link rel tags and
    //  script blocks. Returns null if no document.write() call was found.
    function extract_js($contents) {
        global $clear;

        $regs       = array();
        $content    = null;     //  defined result also for pages without any document.write()

        if(preg_match_all("/document\.write\((\"|')(.*?)(\"|')\);/si", $contents, $regs)) {
            //  $regs[2] holds the text between the quotes of each call
            $content    = implode("\r\n", $regs[2]);

            //  remove unused parts of the content
            $content    = preg_replace("@<!--.*?-->@si", " ",$content);
            $content    = preg_replace("@<style[^>]*>.*?<\/style>@si", " ", $content);
            $content    = preg_replace("/<link rel[^<>]*>/i", " ", $content);
            $content    = str_replace ("encoding: ''", " ", $content);        //  yes, I've seen such nonsense !
            $content    = preg_replace("@<script[^>]*?>.*?<\/script>@si", " ",$content);
        }

/*
//  if only links and their titles should be found in JavaScript
//  comment the above if preg_match_all loop completely and use this one here
        if(preg_match_all("/<a\s*href(.*?)<\/a>/si", $contents, $regs)) {
            $content = '';
            $content = implode("\r\n", $regs[0]);
        }
*/
        if ($clear == 1) {
            $regs = array ();
            unset ($contents);
        }

        return $content;
    }

    //  Convert a text into UTF-8.
    //
    //  $file           the text to be converted
    //  $charSet        name of the charset $file is coded in
    //  $char_Set       not read; it is overwritten before its first use
    //  $converter_dir  folder holding the sub folder 'charsets' with the tables of class ConvertCharset
    //
    //  First iconv() is tried. If iconv() delivers nothing, class ConvertCharset
    //  is used, with the table derived from $charSet or, if that table file
    //  does not exist, with the one derived from $home_charset.
    //  Returns the converted text, or $file unchanged if no conversion was possible.
    function convertToUTF8($file, $charSet, $char_Set, $converter_dir) {
        //  $cl is required for the two report functions below; it was missing here,
        //  so that they always received an undefined variable
        global $home_charset, $cl;

        //      if installed, first try to use PHP function iconv()
        //      IGNORE => ignore unknown characters
        //      TRANSLIT=> replace unknown characters  with something similar
        //      Attention: TRANSLIT breaks converting, if no 'close to' character will be found
        //      @ is intended: an unknown charset should lead to the fallback, not to a message
        $iconv_file = @iconv($charSet,"UTF-8//IGNORE",$file);
        if (trim($iconv_file) != "") {
            return $iconv_file;
        }

        // iconv is not installed or input charSet not available. We need to use class ConvertCharset
        $char_Set       = str_ireplace ('iso-','',$charSet);
        $converter      = "".$converter_dir."/charsets/".$char_Set.".txt" ;
        $table_found    = is_file($converter);

        if (!$table_found) {                                        //      if this charset table is not available
            $char_Set = str_ireplace ('iso-','',$home_charset);     //      try alternatively the home charset
            printConverterError($charSet, $cl);
            printTryHome($home_charset, $cl);
        }

        if ($table_found || $home_charset != 'UTF-8') {             //  UTF-8 -> UTF-8 would not work
            $NewEncoding    = new ConvertCharset($char_Set, "utf-8");
            $file           = $NewEncoding->Convert($file);
        }

        return $file;
    }

    //  Check whether a string consists of well built UTF-8 sequences only.
    //
    //  $str    the string to be checked
    //
    //  Returns true if every byte above 127 is either a valid lead byte followed
    //  by the announced number of continuation bytes (128 - 191), otherwise false.
    //  Overlong forms and surrogate code points are not detected.
    function check_utf8($str) {
        $len = strlen($str);
        for($i = 0; $i < $len; $i++){
            $c = ord($str[$i]);
            //  Everything above 127 is outside of ASCII. Formerly '> 128' was used here,
            //  so that a single byte 0x80 passed as valid.
            if ($c > 127) {
                if (($c > 247)) return false;           //  no lead byte above 0xF7
                elseif ($c > 239) $bytes = 4;
                elseif ($c > 223) $bytes = 3;
                elseif ($c > 191) $bytes = 2;
                else return false;                      //  continuation byte without lead byte
                if (($i + $bytes) > $len) return false; //  sequence is cut off at the end of the string
                while ($bytes > 1) {
                    $i++;
                    $b = ord($str[$i]);
                    if ($b < 128 || $b > 191) return false;
                    $bytes--;
                }
            }
        }
        return true;
    }

    //  Byte order marks: the code point U+FEFF as it shows up after being encoded in the particular UTF form.
    define ('UTF32_BIG_ENDIAN_BOM'   , "\x00\x00\xFE\xFF");
    define ('UTF32_LITTLE_ENDIAN_BOM', "\xFF\xFE\x00\x00");
    define ('UTF16_BIG_ENDIAN_BOM'   , "\xFE\xFF");
    define ('UTF16_LITTLE_ENDIAN_BOM', "\xFF\xFE");
    define ('UTF8_BOM'               , "\xEF\xBB\xBF");

    //  Detect the UTF encoding of a file by means of its byte order mark.
    //
    //  $filename   path of the file to be inspected
    //
    //  Returns 'UTF-8', 'UTF-32BE', 'UTF-32LE', 'UTF-16BE' or 'UTF-16LE'.
    //  Returns null if the file can't be read or does not start with a known BOM.
    function detect_utf_encoding($filename) {

        //  the longest BOM has 4 bytes, so there is no need to load the complete file
        $head = file_get_contents($filename, false, null, 0, 4);
        if ($head === false) {
            return null;
        }

        $first2 = substr($head, 0, 2);
        $first3 = substr($head, 0, 3);
        //  formerly only 3 bytes were taken here, so that the 4 byte UTF-32 marks could never match
        $first4 = substr($head, 0, 4);

        //  UTF-32LE has to be tested before UTF-16LE, as both start with FF FE
        if ($first3 === UTF8_BOM) return 'UTF-8';
        elseif ($first4 === UTF32_BIG_ENDIAN_BOM) return 'UTF-32BE';
        elseif ($first4 === UTF32_LITTLE_ENDIAN_BOM) return 'UTF-32LE';
        elseif ($first2 === UTF16_BIG_ENDIAN_BOM) return 'UTF-16BE';
        elseif ($first2 === UTF16_LITTLE_ENDIAN_BOM) return 'UTF-16LE';

        return null;
    }

    //  Convert a UTF-16 string that starts with a byte order mark into UTF-8.
    //
    //  $str    the input string
    //
    //  Returns the UTF-8 coded string without BOM. If $str does not start with
    //  a UTF-16 BOM (FE FF or FF FE), it is returned unchanged.
    //  Surrogate pairs are combined into one 4 byte sequence; formerly each half
    //  was encoded on its own, which is no valid UTF-8. A single byte left over
    //  at the end of the input is ignored.
    function utf16_to_utf8($str) {

        //  too short to hold a BOM
        if (strlen($str) < 2) {
            return $str;
        }

        $c0 = ord($str[0]);
        $c1 = ord($str[1]);

        if ($c0 == 0xFE && $c1 == 0xFF) {
            $be = true;         //  big endian
        } else if ($c0 == 0xFF && $c1 == 0xFE) {
            $be = false;        //  little endian
        } else {
            return $str;
        }

        $str = substr($str, 2);
        $len = strlen($str);
        $dec = '';
        for ($i = 0; $i + 1 < $len; $i += 2) {
            $c = ($be) ? ord($str[$i]) << 8 | ord($str[$i + 1]) :
            ord($str[$i + 1]) << 8 | ord($str[$i]);

            //  high surrogate followed by a low surrogate => one code point above U+FFFF
            if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len) {
                $low = ($be) ? ord($str[$i + 2]) << 8 | ord($str[$i + 3]) :
                ord($str[$i + 3]) << 8 | ord($str[$i + 2]);
                if ($low >= 0xDC00 && $low <= 0xDFFF) {
                    $c = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);
                    $i += 2;    //  the low surrogate is consumed as well
                }
            }

            //  as before, code 0 is kept out of the one byte form and ends up in the two byte branch
            if ($c >= 0x0001 && $c <= 0x007F) {
                $dec .= chr($c);
            } else if ($c > 0xFFFF) {
                $dec .= chr(0xF0 | (($c >> 18) & 0x07));
                $dec .= chr(0x80 | (($c >> 12) & 0x3F));
                $dec .= chr(0x80 | (($c >>  6) & 0x3F));
                $dec .= chr(0x80 | (($c >>  0) & 0x3F));
            } else if ($c > 0x07FF) {
                $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
                $dec .= chr(0x80 | (($c >>  6) & 0x3F));
                $dec .= chr(0x80 | (($c >>  0) & 0x3F));
            } else {
                $dec .= chr(0xC0 | (($c >>  6) & 0x1F));
                $dec .= chr(0x80 | (($c >>  0) & 0x3F));
            }
        }
        return $dec;
    }

    //  Check whether a string is well-formed XML.
    //
    //  $buf    the XML source
    //
    //  Returns true if libxml reports no fatal error for $buf. Otherwise a
    //  message is returned, built from the text of the first fatal error, its
    //  line number and the (HTML encoded) source line.
    //  libxml is switched to internal error handling and remains in that mode.
    function XML_IsWellFormed($buf) {

        libxml_use_internal_errors(true);
        libxml_clear_errors();      //  does not accept any argument

        //  loadXML() refuses an empty string without creating a libxml error,
        //  so that an empty input formerly passed as well-formed
        if ((string) $buf === '') {
            return 'Document is empty';
        }

        $doc = new DOMDocument('1.0', 'utf-8');
        $doc->loadXML($buf);

        //  Search the first fatal error. Warnings and recoverable errors in front of it
        //  must not hide it (formerly only the very first entry was inspected).
        $fatal = null;
        foreach (libxml_get_errors() as $error) {
            if ($error->level >= LIBXML_ERR_FATAL) {
                $fatal = $error;
                break;
            }
        }
        if ($fatal === null) {
            return true;
        }

        //  split into source lines (formerly the buffer was split at each letter 'r')
        $lines  = preg_split("/\r\n|\n|\r/", $buf);
        $line   = isset($lines[$fatal->line - 1]) ? $lines[$fatal->line - 1] : '';

        $message = $fatal->message . ' at line ' . $fatal->line . ':<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ' . htmlentities($line);

        return $message;
    }

?>
Return current item: Sphider Plus