<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html>
<head>
<title>PHPCrawl - Webcrawler Class</title>
<link rel="stylesheet" type="text/css" href="style.css">
</head>
<body>
<div id="header">
<h1>PHPCrawl Documentation</h1>
For PHPCrawl Version 0.7
</div>
<div id="menu_container">
<div id="menu">
<ul id="menu">
<li><a href="index.html">Introduction &amp; Requirements</a></li>
<li><a href="quickstart.html">Quickstart</a></li>
<li><a href="example.html">Example-Script</a></li>
<li><a href="version_info.html">Version-History</a></li>
<li><a href="testinterface.html">The Testinterface</a></li>
<li><a href="classreference.html">Classreference</a></li>
</ul>
</div>
<div id="download">
<ul id="menu">
<li><a href="download.html">Download PHPCrawl<br></a></li>
<li><a href="http://sourceforge.net/projects/phpcrawl">Sourceforge Projectpage<br></a></li>
</ul>
</div>
<div id="sflogo">
<a href="http://sourceforge.net">
<!--
<img src="http://sflogo.sourceforge.net/sflogo.php?group_id=89439&type=7" width="210" height="62" border="0" alt="SourceForge.net Logo"></a></div>
-->
<img src="img/sflogo.png" width="210" height="62" border="0" alt="SourceForge.net Logo"></a></div>
</div>
<div id="main">
<h2>An Example-Script</h2>
<p>
The following code is a complete example of how to use the class.<br>
The listed script "crawls" a site and just prints out some information about found
pages.<br>
<br>
Please note that this example-script also comes in a file called "example.php" with the phpcrawl-package.
</p>
<p id="code">
&lt;?php<br>
<br>
// It may take a while to crawl a site ...<br>
set_time_limit(10000);<br>
<br>
// Include the phpcrawl-mainclass<br>
include("classes/phpcrawler.class.php");<br>
<br>
// Extend the class and override the handlePageData()-method<br>
class MyCrawler extends PHPCrawler <br>
{<br>
function handlePageData(&$page_data) <br>
{<br>
// Here comes your code.<br>
// Do whatever you want with the information given in the<br>
// array $page_data about a page or file that the crawler has just found.<br>
// See the class-reference for a complete list of the elements <br>
// the array will contain.<br>
// This is just a simple example.<br>
<br>
// Print the URL of the currently requested page or file<br>
echo "Page requested: ".$page_data["url"]."&lt;br&gt;";<br>
<br>
// Print the first line of the header the server sent (HTTP-status)<br>
echo "Status: ".strtok($page_data["header"], "\n")."&lt;br&gt;";<br>
<br>
// Print the referer<br>
echo "Referer-page: ".$page_data["referer_url"]."&lt;br&gt;";<br>
<br>
// Print whether the content was received or not<br>
if ($page_data["received"]==true)<br>
echo "Content received: ".$page_data["bytes_received"]." bytes";<br>
else<br>
echo "Content not received";<br>
<br>
// ...<br>
<br>
// Now you should do something with the content of the currently<br>
// received page or file ($page_data[source]); we skip that in this example<br>
<br>
echo "&lt;br&gt;&lt;br&gt;";<br>
flush();<br>
}<br>
}<br>
<br>
// Now, create an instance of the class, set the behaviour<br>
// of the crawler (see class-reference for more methods)<br>
// and start the crawling-process.<br>
<br>
$crawler = &new MyCrawler();<br>
<br>
// URL to crawl<br>
$crawler->setURL("www.php.net");<br>
<br>
// Only receive content of files with content-type "text/html"<br>
// (regular expression, preg)<br>
$crawler->addReceiveContentType("/text\/html/");<br>
<br>
// Ignore links to pictures, don't even request pictures<br>
// (preg_match)<br>
$crawler->addNonFollowMatch("/.(jpg|gif|png)$/ i");<br>
<br>
// Store and send cookie-data like a browser does<br>
$crawler->setCookieHandling(true);<br>
<br>
// Set the traffic-limit to 1 MB (in bytes,<br>
// for testing we don't want to "suck" the whole site)<br>
$crawler->setTrafficLimit(1000 * 1024);<br>
<br>
// That's enough, now here we go<br>
$crawler->go();<br>
<br>
<br>
// At the end, after the process is finished, we print a short<br>
// report (see method getReport() for more information)<br>
<br>
$report = $crawler->getReport();<br>
<br>
echo "Summary:&lt;br&gt;";<br>
if ($report["traffic_limit_reached"]==true)<br>
echo "Traffic-limit reached &lt;br&gt;";<br>
<br>
echo "Links followed: ".$report["links_followed"]."&lt;br&gt;";<br>
echo "Files received: ".$report["files_received"]."&lt;br&gt;";<br>
echo "Bytes received: ".$report["bytes_received"]."&lt;br&gt;";<br>
<br>
?>
</p>
</div>
</body>
</html>