<?php
/* robots.spider(tm) copyright 2006 Betsy A. Gamrat all rights reserved
robots.spider is a trademark of Betsy A. Gamrat
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Contact Information: Betsy A. Gamrat hide@address.com
Reads and checks robots.txt file.
*/
// Fetch the submitted URL so the form field can be pre-filled with it.
// import_request_variables() was removed in PHP 5.4, so the value is read from the
// superglobals instead. POST is consulted first, which matches the former 'gp' order
// (POST values overwrote GET values of the same name).
if (isset($_POST['url']))
    $rvar_url=$_POST['url'];
elseif (isset($_GET['url']))
    $rvar_url=$_GET['url'];
if (!isset($rvar_url) || !is_string($rvar_url) || (trim($rvar_url)==''))
    $url='';
else
    // $url is printed verbatim inside the value="..." attribute of the form below, so it
    // is HTML-escaped here to keep quotes or markup in the input from breaking out of it.
    $url=htmlspecialchars($rvar_url,ENT_QUOTES,'UTF-8');
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="COPYRIGHT" content="© 2006 Betsy A. Gamrat All Rights Reserved" />
<meta name="author" content="Betsy Gamrat" />
<meta name="description" content="Reads a robots.txt file and spiders the site to check for references to entries in the robots.txt file." />
<meta name="keywords" content="checker, check, robots.txt, read, search engine, security, protect, hide, index, keep out of, manage, file, directory, directories" />
<title>robots.spider™ --- robots.txt reference checker</title>
<link rel="stylesheet" href="index.css" type="text/css" />
</head>
<h1>robots.spider(tm)</h1>
<h3>URL</h3>
<form action="index.php" method="post" name="check">
Enter URL of site to check: <input type="text" name="url" value="<?= $url ?>" />
<input type="submit" value="Go" /><br /><br />
© Copyright Betsy A. Gamrat 2006
All rights reserved<br />
robots.spider(tm) is a trademark of Betsy A. Gamrat
</form>
<?php
// Main script: spiders the requested site, reads its robots.txt and reports which disallow
// entries are (or are not) referenced from the pages that were read.

// MAX_DEPTH is set to 2 for the demo. To increase the depth of the scan, increase this value.
// The script may run for a very long time, so be careful. You may have to increase the runtime
// limit, and you shouldn't run this script during the busy hour.
define ("MAX_DEPTH",2);

// read_robots_txt() compares its $source argument with the FILE constant, but the define()
// for that constant sits further down the file and has not executed yet when the function
// is called below. Make sure the constant exists before it is used.
if (!defined('FILE'))
    define ("FILE",'FILE');

// Get the url to scan. import_request_variables() no longer exists (removed in PHP 5.4), so
// the value is taken from the superglobals; POST wins over GET like the former 'gp' order.
if (isset($_POST['url']))
    $rvar_url=$_POST['url'];
elseif (isset($_GET['url']))
    $rvar_url=$_GET['url'];
else
    $rvar_url='';
if (!is_string($rvar_url) || (trim($rvar_url) === '')) exit();
$url=trim($rvar_url);

// If http:// or https:// was not entered at the start, prepend http:// to the URL
if (!preg_match('/^https?:\/\//i',$url))
    $url = 'http://'.$url;

// Strip off a path if there is one. This is a site spider, not a page spider.
// $arr_url is also read by get_references() as a global.
$arr_url=@parse_url($url);
if (!is_array($arr_url) || !isset($arr_url['scheme']) || !isset($arr_url['host']) || ($arr_url['host'] === ''))
{
    echo '<hr />Unable to find a host name in the URL<br />';
    exit;
}

// Set the scheme and host, always ending in a slash
$url=$arr_url['scheme'].'://'.$arr_url['host'].'/';

// $refs is all the references
$refs=array();
// $noreads is the list of files that couldn't or shouldn't be read
$noreads=array();

// The host name comes from user input, so it is escaped before being printed
echo '<hr /><h3>URL: '.htmlspecialchars($url,ENT_QUOTES,'UTF-8').'</h3><hr />';
$depth=MAX_DEPTH;
$scan_depth=MAX_DEPTH-1;
echo <<< TOP
<p>
This is a list of all the URLs that were read. The site was scanned to a depth of $depth
which means the home page, and $scan_depth page(s) deep.
</p>
<p>
The URLs are delivered as links, URLs that could not be opened are also listed.
</p>
<p>
URLs which were extracted from client-side (probably JavaScript) code may be invalid.
</p>
TOP;

// This does all the spidering. It builds the list of references.
get_references($url);
echo '<hr />';

// $disallow_dir and $disallow_file are the lists of disallowed directories and files from
// robots.txt
$disallow_dir=array();
$disallow_file=array();

// If the robots.txt read fails, or there are errors - quit
if (!read_robots_txt($disallow_dir,$disallow_file,FILE,$url))
    exit;

// Force every entry to a string so the strict comparisons below behave the same no matter
// how the entries were produced.
$disallow_dir=array_map('strval',$disallow_dir);
$disallow_file=array_map('strval',$disallow_file);

// If there aren't any disallows - quit
if ((count($disallow_dir) == 0) && (count($disallow_file)==0))
{
    echo 'The robots.txt file did not have any disallows<br />';
    exit;
}

// If there is a root level disallow - quit
if (in_array('',$disallow_dir,true))
{
    echo 'The entire site is disallowed with a Disallow: /<br />';
    exit;
}

// List the disallows (robots.txt is remote content, so it is escaped)
echo '<hr /><h3>robots.txt disallows</h3>';
echo '<h4>Directories</h4>';
foreach ($disallow_dir as $entry)
{
    echo ' '.htmlspecialchars($entry,ENT_QUOTES,'UTF-8');
    echo '<br />';
}
echo '<h4>Files</h4>';
foreach ($disallow_file as $entry)
{
    echo ' '.htmlspecialchars($entry,ENT_QUOTES,'UTF-8');
    echo '<br />';
}

echo '<hr /><h3>References</h3>';
echo <<< REFS
<p>
This is the list of references to disallow entries in the robots.txt file. Each reference
is listed as the disallow entry, followed by the specific directory or file reference, and
the URL that refered to it. The URLs are delivered as links.
</p>
REFS;

// The keys are used to allow a fast cross-reference between the $ref array and the disallows
$dir_keys=array();
$file_keys=array();

// Sort the reference array by keys
ksort($refs);

// Loop through all the references and check if they are in the disallow arrays
foreach ($refs as $k => $v)
{
    $k=(string) $k;
    $k_url=@parse_url($k);
    $raw_path=(is_array($k_url) && isset($k_url['path'])) ? $k_url['path'] : '';

    // Reference keys are either site-relative paths without a leading slash or full URLs
    // whose path already starts with one; normalise both to exactly one leading slash.
    $path='/'.ltrim($raw_path,'/');

    // Directory part without its trailing slash, the same form the disallowed directories
    // are stored in. A reference to a directory itself ("/private/") yields "/private".
    $dir=(string) substr($path,0,strrpos($path,'/'));

    $dir_key=($dir === '') ? FALSE : array_search($dir,$disallow_dir,true);
    $ddir=($dir_key !== FALSE);
    if ($ddir)
    {
        $dir_keys[]=$dir_key;
        echo '<br />Reference(s) to '.htmlspecialchars($disallow_dir[$dir_key],ENT_QUOTES,'UTF-8')
            .' ('.htmlspecialchars($k,ENT_QUOTES,'UTF-8').') in<br />';
    }

    // A "file" disallow may match the referenced path itself or, because an entry written
    // without a trailing slash can still name a directory, the directory that contains it.
    // Look up whichever one actually matched so the right entry is reported.
    $file_key=array_search($path,$disallow_file,true);
    if (($file_key === FALSE) && ($dir !== ''))
        $file_key=array_search($dir,$disallow_file,true);
    $dfile=($file_key !== FALSE);
    if ($dfile)
    {
        $file_keys[]=$file_key;
        echo '<br />Reference(s) to '.htmlspecialchars($disallow_file[$file_key],ENT_QUOTES,'UTF-8')
            .' ('.htmlspecialchars($k,ENT_QUOTES,'UTF-8').') in<br />';
    }

    // Print out all the pages that refer to the disallowed element
    if ($ddir || $dfile)
        foreach ($v as $k1 => $v1)
        {
            $safe_ref=htmlspecialchars((string) $k1,ENT_QUOTES,'UTF-8');
            echo " <a href=\"$safe_ref\" target=\"_blank\">$safe_ref</a><br />";
        }
}

echo '<hr /><h3>Analysis</h3>';
echo <<< ANALYSIS
<p>
Search engines spider the site. If there are no references to a directory or file, it will not be
indexed. robots.txt files should not refer to unreferenced content because it allows other
viewers to identify areas of your site that you don't want indexed.
</p>
<p>
The read depth limits the number of pages scanned.
</p>
<p>
This spider only spidered the site, links from external sources cannot be detected. You may be able
to use the site statistics to identify links from external sources.
</p>
ANALYSIS;

// This goes through all the disallows and flags the ones that aren't referenced.
echo '<h4>Directories</h4>';
foreach ($disallow_dir as $k => $v)
{
    echo ' '.htmlspecialchars($v,ENT_QUOTES,'UTF-8');
    if (!in_array($k,$dir_keys,true))
        echo ' <span class="red_text">« no references</span>';
    echo '<br />';
}
echo '<h4>Files</h4>';
foreach ($disallow_file as $k => $v)
{
    echo ' '.htmlspecialchars($v,ENT_QUOTES,'UTF-8');
    if (!in_array($k,$file_keys,true))
        echo ' <span class="red_text">« no references</span>';
    echo '<br />';
}
echo <<< FOOTER
<hr />
© Copyright Betsy A. Gamrat 2006
All rights reserved<br />
robots.spider(tm) is a trademark of Betsy A. Gamrat
FOOTER;
// Reads one page, prints it as a link, records every same-site reference found in it and
// recurses into the referenced pages until MAX_DEPTH is reached.
//
// $url   - address of the page to read
// $depth - current recursion depth; nothing is read once it reaches MAX_DEPTH
//
// Uses the globals $refs (reference => array of pages that refer to it), $noreads
// (references that were skipped as non-text) and $arr_url (parsed URL of the scanned site;
// its 'scheme' and 'host' entries are used). Site-relative references are stored in $refs
// without a leading slash, references written as full URLs are stored as the full URL.
// Progress is echoed as HTML; nothing is returned.
function get_references($url,$depth=0)
{
    global $refs,$noreads;
    global $arr_url;

    // This is used to limit the depth of the scan.
    if ($depth >= MAX_DEPTH)
        return;

    // Indent for each level for readability
    echo '<div class="indent">';

    // Read the file
    $text=@file_get_contents($url);

    // List the file as a link, so it can easily be viewed and checked. The address comes
    // from user input or from scanned pages, so it is escaped before being printed.
    $safe_url=htmlspecialchars($url,ENT_QUOTES,'UTF-8');
    echo "<a href=\"$safe_url\" target=\"_blank\">$safe_url</a><br />\n";

    if (($text === FALSE) || ($text === ''))
    {
        echo "<span class=\"red_text\">Unable to open $safe_url<br /></span>";
        echo '</div>';
        return;
    }

    // This is used to extract all the references from src, href, url, action, and window.opens
    $ref=run_preg($text,
        "/(?:(?:src|href|url|action|window.open|popup)+\s*[=\(]+\s*[\"'`]*)([\+\w:?=@&\/#._;-]+)(?:[\s\"'`])/i");
    if (!is_array($ref))
        $ref=array();

    // Extensions of files that are not worth reading. Anchored to the end of the path so
    // that names merely containing such a sequence (".austin.html") are still read.
    $non_text="/[.](au|avi|bmp|cab|dcr|fla|gif|gz|ico|mid|mov|mp3|mpeg|ppt|ra|ram|pdf|tar|wav|zip|jpg|jpeg|png|wma|wmv|wm)$/i";

    $site_root=$arr_url['scheme'].'://'.$arr_url['host'].'/';

    // Path of the page being read; relative references are resolved against it
    $url_base=@parse_url($url);
    $base_path=(is_array($url_base) && isset($url_base['path']) && ($url_base['path'] !== ''))
        ? $url_base['path'] : '/';

    // Loop through the references
    foreach ($ref as $v)
    {
        // This strips out links within the same file
        $pound=strpos($v,'#');
        if (($pound !== FALSE) && (trim((string) substr($v,0,$pound)) === ''))
            continue;

        // This strips out calls to local javascript
        if (stripos($v,'javascript:') !== FALSE)
            continue;

        // Sometimes there is a //, when you would expect an http://. Such a reference uses
        // the scheme of the current site and is then treated like any other full url.
        if (strpos($v,'//') === 0)
            $v=$arr_url['scheme'].':'.$v;

        $new_base=@parse_url($v);
        if (!is_array($new_base))
            $new_base=array();

        // If there is a 'scheme', such as http://, ftp://, https:// - there is a full url
        if (isset($new_base['scheme']))
        {
            // Links without a host (mailto:, tel:) or to a different domain are not followed
            if (!isset($new_base['host']) || !robots_spider_same_site($new_base['host'],$arr_url['host']))
                continue;
            $prefix='';
        }
        else
        {
            // Turn root-relative and page-relative links into a clean site-relative path
            $v=robots_spider_resolve($base_path,$v);
            $prefix=$site_root;
        }

        // If this reference hasn't been recorded for this page yet
        if (isset($refs[$v][$url]))
            continue;
        $refs[$v][$url]=true;

        // If this file has been rejected on an earlier run, don't try again
        if (isset($noreads[$v]))
            continue;

        // Only the path decides the file type; host names and query strings are ignored
        $target=@parse_url($prefix.$v);
        $target_path=(is_array($target) && isset($target['path'])) ? $target['path'] : '';

        // Don't read these types of files
        if (preg_match($non_text,$target_path) === 0)
            get_references($prefix.$v,$depth+1);
        else
        {
            echo htmlspecialchars($prefix.$v,ENT_QUOTES,'UTF-8')." not read, looks like a non-text file<br />\n";
            $noreads[$v]=true;
        }
    }
    echo '</div>';
}

// Returns true when $link_host is the scanned host itself or one of its sub-domains.
// The comparison is case-insensitive.
function robots_spider_same_site($link_host,$site_host)
{
    $link_host=strtolower($link_host);
    $site_host=strtolower($site_host);
    if ($link_host === $site_host)
        return true;
    $suffix='.'.$site_host;
    return (strlen($link_host) > strlen($suffix))
        && (substr($link_host,-strlen($suffix)) === $suffix);
}

// Resolves $link (a reference without scheme and host) against $base_path, the path of the
// page it was found on. "." and ".." segments are removed and the result is returned without
// a leading slash. A query string or fragment on $link is kept unchanged.
function robots_spider_resolve($base_path,$link)
{
    // Separate the path from a trailing ?query / #fragment
    $cut=strcspn($link,'?#');
    $path=(string) substr($link,0,$cut);
    $tail=(string) substr($link,$cut);

    if ($path === '')
        // Query-only reference: it points at the current page
        $full=$base_path;
    elseif ($path[0] === '/')
        // Root-relative reference
        $full=$path;
    else
    {
        // Page-relative reference: starts in the directory of the current page
        $slash=strrpos($base_path,'/');
        $base_dir=($slash === FALSE) ? '/' : substr($base_path,0,$slash+1);
        $full=$base_dir.$path;
    }

    $segments=array();
    foreach (explode('/',$full) as $segment)
    {
        if (($segment === '') || ($segment === '.'))
            continue;
        if ($segment === '..')
        {
            // Never climbs above the site root
            array_pop($segments);
            continue;
        }
        $segments[]=$segment;
    }
    $resolved=implode('/',$segments);

    // Keep the trailing slash of a directory reference
    if (($resolved !== '') && (substr($full,-1) === '/'))
        $resolved.='/';

    return $resolved.$tail;
}
// Handles the preg for the scan.
//
// $text    - text to search
// $pattern - PCRE pattern; the values captured by its first group are collected
//
// Returns the array of first-group captures (empty when nothing matched or the pattern has
// no capture group), or FALSE when the pattern could not be executed.
function run_preg($text,$pattern)
{
    $matches=array();
    // preg_match_all() reports failure through its return value, not through $matches
    if (preg_match_all($pattern,$text,$matches) === FALSE)
        return FALSE;
    return isset($matches[1]) ? $matches[1] : array();
}
// Value of $source that tells read_robots_txt() to read robots.txt from $host.
// Both constants are only defined when they do not exist yet, so a definition made earlier
// in the request does not cause a redefinition warning.
if (!defined('FILE'))
    define ("FILE",'FILE');
// Top of the (web) directory structure
if (!defined('TOP'))
    define ("TOP",(isset($_SERVER['DOCUMENT_ROOT']) ? $_SERVER['DOCUMENT_ROOT'] : '').'/');

// Collects the Disallow entries of a robots.txt file.
//
// $disallow_dir  - (by reference) receives entries that end in a slash, stored without that
//                  slash; "Disallow: /" is therefore stored as an empty string
// $disallow_file - (by reference) receives all other non-empty entries unchanged
// $source        - FILE to read "{$host}robots.txt", otherwise the robots.txt content itself
//                  as an array of lines (a single string is split into lines)
// $host          - location prefix of robots.txt, ending in a slash; only used with FILE
//
// Returns false (after printing a message) when robots.txt cannot be read, true otherwise.
// Entries are appended in file order; "#" comments and empty Disallow lines are ignored.
function read_robots_txt(&$disallow_dir,&$disallow_file,$source=FILE,$host=TOP)
{
    // Fall back to the literal marker in case the constant has not been defined yet
    $file_marker=defined('FILE') ? FILE : 'FILE';

    if ($source === $file_marker)
    {
        // Read the file once; it may be a remote URL, so every read is a request
        $buffer=@file($host."robots.txt");
        if ($buffer === FALSE)
        {
            echo "Unable to open robots.txt<br /><br />";
            return false;
        }
    }
    elseif (is_array($source))
        $buffer=$source;
    else
        $buffer=preg_split('/\r\n|\r|\n/',(string) $source);

    foreach ($buffer as $line)
    {
        $line=(string) $line;

        // Everything after a # is a comment
        $hash=strpos($line,'#');
        if ($hash !== FALSE)
            $line=(string) substr($line,0,$hash);

        // Only lines that start with the Disallow field are of interest; the value is
        // whatever follows the first colon, so paths containing a colon stay intact
        if (!preg_match('/^disallow\s*:(.*)$/i',trim($line),$match))
            continue;

        $path=trim($match[1]);
        if ($path === '')
            continue;

        if (substr($path,-1) === '/')
            $disallow_dir[]=(string) substr($path,0,-1);
        else
            $disallow_file[]=$path;
    }
    return true;
}
?>