<?php
/* robots.spider(tm) copyright 2006 Betsy A. Gamrat all rights reserved
   robots.spider is a trademark of Betsy A. Gamrat

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

    Contact Information: Betsy A. Gamrat hide@address.com

Reads the robots.txt file and spiders the site to check for references to its disallow entries.
*/

// Import GET/POST request variables with the rvar_ prefix (import_request_variables() was removed in PHP 5.4)
import_request_variables('gp','rvar_');
if (!isset($rvar_url) || (trim($rvar_url)=='')) 
	$url='';
else
	$url = $rvar_url;

?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
        <meta name="copyright" content="&copy; 2006 Betsy A. Gamrat All Rights Reserved" />
        <meta name="author" content="Betsy Gamrat" />

        <meta name="description" content="Reads a robots.txt file and spiders the site to check for references to entires in the robots.txt file." />

        <meta name="keywords" content="checker, check, robots.txt, read, search engine, security, protect, hide, index, keep out of, manage, file, directory, directories" />

        <title>robots.spider&trade; --- robots.txt reference checker</title>
        <link rel="stylesheet"  href="index.css" type="text/css" />
</head>
<body>
<h1>robots.spider(tm)</h1>

<h3>URL</h3>

<form action="index.php" method="post" name="check">
        Enter URL of site to check: <input type="text" name="url" value="<?= htmlspecialchars($url) ?>" />&nbsp;&nbsp;
        <input type="submit" value="Go" /><br /><br />
	&copy; Copyright Betsy A. Gamrat 2006
	All rights reserved<br />
	robots.spider(tm) is a trademark of Betsy A. Gamrat
</form>
<?php

// MAX_DEPTH is set to 2 for the demo.  To increase the depth of the scan, increase this value.
// The script may run for a very long time, so be careful.  You may have to increase the runtime
// limit, and you shouldn't run this script during busy periods.
define ("MAX_DEPTH",2);

// These constants must be defined before read_robots_txt() is called below
define ("FILE",'FILE');				// Flag: read robots.txt from a file/URL
define ("TOP",$_SERVER['DOCUMENT_ROOT'].'/');	// Top of the (web) directory structure

// Get the url to scan
import_request_variables('gp','rvar_');
if (!isset($rvar_url) || ($rvar_url=='')) exit();
$url = $rvar_url;

// If http:// was not entered, prepend it to the URL
if ((stristr($url,'http://') === FALSE) && (stristr($url,'https://') === FALSE))
        $url = 'http://'.$url;

// Strip off a path if there is one.  This is a site spider, not a page spider.
$arr_url=parse_url($url);

// Set the scheme and host
$url=$arr_url['scheme'].'://'.$arr_url['host'];

// Add a trailing slash if there isn't one
$len=strlen($url)-1;
if ($url[$len] !== '/')
        $url.='/';
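// For example (illustrative values only): an input of "www.example.com/about/team.html"
// becomes "http://www.example.com/about/team.html" once the scheme is prepended, and the
// parse_url()/rebuild steps above reduce it to "http://www.example.com/" before spidering.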

// $refs is all the references
$refs=array();
// $noreads is the list of files that couldn't or shouldn't be read
$noreads=array();

echo "<hr /><h3>URL: $url</h3><hr />";
$depth=MAX_DEPTH;
$scan_depth=MAX_DEPTH-1;
echo <<< TOP
<p>
This is a list of all the URLs that were read.  The site was scanned to a depth of $depth,
which means the home page plus pages linked up to $scan_depth level(s) deeper.
</p>
<p>
The URLs are presented as links; URLs that could not be opened are also listed.
</p>
<p>
URLs which were extracted from client-side (probably JavaScript) code may be invalid.
</p>
TOP;

// This does all the spidering.  It builds the list of references.
get_references($url);

echo '<hr />';

// $disallow_dir and $disallow_file are the lists of disallowed directories and files from 
// robots.txt
$disallow_dir=array();
$disallow_file=array();
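// As an illustration (hypothetical entries, not from any real robots.txt): a file containing
//	Disallow: /private/
//	Disallow: /old/report.html
// would be split by read_robots_txt() into
//	$disallow_dir  = array('/private');		// trailing slash stripped
//	$disallow_file = array('/old/report.html');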

// If the robots.txt read fails, or there are errors - quit
if (!read_robots_txt($disallow_dir,$disallow_file,FILE,$url))
	exit;

// If there aren't any disallows - quit
if ((count($disallow_dir) == 0) && (count($disallow_file)==0))
{
	echo 'The robots.txt file did not have any disallows<br />';
	exit;
}

// If there is a root level disallow - quit
if (in_array('',$disallow_dir))
{
	echo 'The entire site is disallowed with a Disallow: /<br />';
	exit;
}

// List the disallows
echo '<hr /><h3>robots.txt disallows</h3>';
echo '<h4>Directories</h4>';
foreach ($disallow_dir as $k => $v)
{
	echo '  '.$v;
        echo '<br />';
}

echo '<h4>Files</h4>';
foreach ($disallow_file as $k => $v)
{
        echo '  '.$v;
        echo '<br />';
}

echo '<hr /><h3>References</h3>';
echo <<< REFS
<p>
This is the list of references to disallow entries in the robots.txt file.  Each reference
is listed as the disallow entry, followed by the specific directory or file that was
referenced, and the URL(s) that referred to it.  The URLs are presented as links.
</p>
REFS;

// The keys are used to allow a fast cross-reference between the $refs array and the disallows
$dir_keys=array();
$file_keys=array();

// Sort the reference array by keys
ksort($refs);
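// As an example of the matching below (hypothetical reference): a spidered link stored as
// "old/report.html" parses to $dir = '/old' and $path = '/old/report.html'; the loop reports
// it if either value appears in the disallow lists built from robots.txt.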

// Loop through all the references and check if they are in the disallow arrays
foreach ($refs as $k => $v)
{
        $k_url=@parse_url($k);
        $dir='/'.dirname($k_url['path']);
        $path='/'.$k_url['path'];
	if ($ddir=in_array($dir,$disallow_dir))
	{
		$key=array_search($dir,$disallow_dir);
		$dir_keys[]=$key;
		echo '<br />Reference(s) to '.$disallow_dir[$key]." ($k) in<br />";
	}
	if ($dfile=(in_array($path,$disallow_file) || in_array($dir,$disallow_file)))
	{
		$key=array_search($path,$disallow_file);
		// If the match was on the directory rather than the full path, look up that entry instead
		if ($key === FALSE)
			$key=array_search($dir,$disallow_file);
		$file_keys[]=$key;
		echo '<br />Reference(s) to '.$disallow_file[$key]." ($k) in<br />";
	}

	// Print out all the disallowed elements for that file
	if ($ddir || $dfile)
		foreach ($v as $k1 => $v1)
			echo "   <a href=\"$k1\" target=\"_blank\">$k1</a><br />";
}

echo '<hr /><h3>Analysis</h3>';
echo <<< ANALYSIS
<p>
Search engines spider the site; if there are no references to a directory or file, it will not be
indexed anyway.  A robots.txt file should not refer to unreferenced content, because doing so
allows other visitors to identify areas of your site that you don't want indexed.
</p>
<p>
The read depth limits the number of pages scanned. 
</p>
<p>
This spider only crawled the site itself; links from external sources cannot be detected.  You may be able
to use your site statistics to identify links from external sources.
</p>
ANALYSIS;
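// For instance (a hypothetical entry, not taken from any real site), a robots.txt line such as
//	Disallow: /admin-backup/
// advertises that directory to anyone who downloads the file, even though no page on the
// site links to it.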

// This goes through all the disallows and flags the ones that aren't referenced.
echo '<h4>Directories</h4>';
foreach ($disallow_dir as $k => $v)
{
        echo '  '.$v;
	if (!in_array($k,$dir_keys))
		echo ' <span class="red_text">&laquo; no references</span>';
        echo '<br />';
}

echo '<h4>Files</h4>';
foreach ($disallow_file as $k => $v)
{
        echo '  '.$v;
	if (!in_array($k,$file_keys)) 
		echo ' <span class="red_text">&laquo; no references</span>';
        echo '<br />';
}

echo <<< FOOTER
<hr />
&copy; Copyright Betsy A. Gamrat 2006
All rights reserved<br />
robots.spider(tm) is a trademark of Betsy A. Gamrat
</body>
</html>
FOOTER;

function get_references($url,$depth=0)
{
	global $refs,$noreads;
	global $arr_url;

	// This is used to limit the depth of the scan.
	if ($depth >= MAX_DEPTH)
		return;
	else
		// Indent for each level for readability
		echo '<div class="indent">';

	// Read the file
	$text=@file_get_contents($url);

	// List the file as a link, so it can easily be viewed and checked
	echo "<a href=\"$url\" target=\"_blank\">$url</a><br />\n";
	if ($text)
	{
		// This is used to extract all the references from src, href, url, action, and window.opens
		$ref=run_preg($text,
			"/(?:(?:src|href|url|action|window.open|popup)+\s*[=\(]+\s*[\"'`]*)([\+\w:?=@&\/#._;-]+)(?:[\s\"'`])/i");
		
		// If any references were found
		if (count($ref) > 0)
		{
			// Loop through the references
			foreach ($ref as $k => $v)
			{
				// $url is the url that was passed in to the function and read
				$url_base=@parse_url($url);

				// $new_base is used to setup the base of the referenced url
				$new_base=@parse_url($v);

				// This strips out links within the same file
				$pound = strpos($v,'#');
				if ($pound !== FALSE)
				{
					$target = trim(substr($v,0,$pound));
					if (strlen($target) === 0)	
						continue;
				}

				// This strips out calls to local javascript
				$javascript = strpos($v,'javascript:');
				if ($javascript !== FALSE)
					continue;

				// If there is a 'scheme', such as http://, ftp://, https:// - there is a full url
				if (isset($new_base['scheme']))
				{
					// If this is a link to a different domain, don't follow it
					if (strpos($new_base['host'],$arr_url['host']) === FALSE)
						continue;
					else
						$prefix='';
				}
				else
				{
					// Sometimes there is a //, when you would expect an http://
					if (strpos($v,'//') === 0)
					{
						$v = 'http://'.substr($v,2);
						$prefix = "";
					}
					else
					{
						// Build up the full url for referencing
						if (isset($new_base['host']) && (trim($new_base['host']) != ''))
							$prefix = $arr_url['scheme'].'://'.$new_base['host'].'/';
						else
							$prefix = $arr_url['scheme'].'://'.$arr_url['host'].'/';

						// For links that are pointing to a subdirectory
						if ($v[0] === '/')
							$v = substr($v,1);
	
						// Handle any relative links
						if ($v[0] === '.')
						{
							$rel_path=explode('/',$v);
							if (isset($new_base['path']))
							{
								$arr_path=explode('/',$new_base['path']);
								$new_rel_path="";
								foreach ($arr_path as $k1 => $v1)
								{
									if ($rel_path[$k1] !== '..')
										$new_rel_path .= '/'.$v1;
								}
							}
							$v = substr($new_rel_path,1);
						}
						else
						{
							// Absolute links
							if ($depth == 0)
							{
								$dir = dirname($url_base['path']);
								$new_dir = dirname($new_base['path']);
								if (($dir !== "") && ($new_dir !== ""))
									if (strpos($new_dir,$dir) === FALSE)
										if ($dir !== '/')
											$prefix .= substr($dir,1).'/';
							}
						}	
					}
				}
				// If this reference hasn't been read yet
				if (!isset($refs[$v][$url]))
				{
					$refs[$v][$url]=true;

					// If this file hasn't been rejected on an earlier run
					if (!isset($noreads[$v]))
					{
						// Don't read these types of files
						if (preg_match ("/[.](au|avi|bmp|cab|dcr|fla|gif|gz|ico|mid|mov|mp3|mpeg|ppt|ra|ram|pdf|tar|wav|zip|jpg|jpeg|png|wma|wmv|wm)/i",$v) === 0)
							get_references($prefix.$v,$depth+1);
						else
						{
							echo "$prefix$v not read, looks like a non-text file<br />\n";
							$noreads[$v]=true;
						}
					}
				}
				
			}
		}
	}
	else
		echo "<span class=\"red_text\">Unable to open $url<br /></span>";
	echo '</div>';
}

function run_preg($text,$pattern) 
{
   // Handles the preg for the scan
   preg_match_all ($pattern, $text, $matches);
 
   return (is_array($matches)) ? $matches[1]:FALSE;
}

define ("FILE",'FILE');
define ("TOP",$_SERVER['DOCUMENT_ROOT'].'/');   // Top of the (web) directory structure

function read_robots_txt(&$disallow_dir,&$disallow_file,$source=FILE,$host=TOP)
{
        if ($source===FILE)
        {
                $filename=$host."robots.txt";
                $fp=@fopen($filename,"rt");
                if ($fp != FALSE)
                {
                        $buffer=file($filename);
                        fclose($fp);
                }
                else
                {
                        echo "Unable to open robots.txt<br /><br />";
                        return false;
                }
        }
        else
                $buffer=$source;

        $i=0;
        while ($i < count($buffer))
        {
                $str=$buffer[$i];
                $d=stristr($str,"disallow");
                if ($d !== FALSE)
                {
                        $j=strpos($d,":");
                        if ($j !== FALSE)
                        {
                                $j++;
                                $path=trim(substr($d,$j));
                                $len=strlen($path)-1;
                                if ($path != '')
                                {
                                        if (strrpos($path,"/") === $len)
                                                $disallow_dir[]=substr($path,0,$len);
                                        else
                                                $disallow_file[]=$path;
                                }
                        }
                }
                $i++;
        }
        return true;
}
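
// Usage sketch (hypothetical values, not part of the original script): besides reading
// robots.txt from a URL or the document root, read_robots_txt() also accepts an array of
// lines as $source, which is handy for testing:
//	$dirs=array();
//	$files=array();
//	read_robots_txt($dirs,$files,array("User-agent: *\n","Disallow: /private/\n"),'');
//	// $dirs now holds '/private'; $files stays empty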
?>
