<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
<head>
<title>PHPCrawl webcrawler library for PHP - Example script</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<link type="text/css" rel="stylesheet" media="all" href="style.css" />
</head>
<body>
<div id="wrapper">
<div id="page" style="width: 950px">
<div id="top">
<h1 style="margin: 0px; float: left;">PHPCrawl webcrawler library/framework</h1>
</div>
<div id="container">
<iframe id="menuframe" src="menu.html" title="Menu" scrolling="no" frameborder="0"></iframe>
<div id="content">
<h3>Tutorial: Example Script</h3><br />
The following code is a simple example of using phpcrawl.<br /><br />
The listed script just "spiders" some pages of www.php.net until a traffic-limit of 1 MB is reached and prints out some information about all found documents.<br /><br />
Please note that this example-script (and others) also comes in a file called "example.php" with the phpcrawl-package. It's recommended to run it from the command line (php CLI).
<p id="code" style="width: 670px">
<span style="color: #000000">
<span style="color: #0000BB">&lt;?php
<br />
<br /></span><span style="color: #FF8000">// It may take a while to crawl a site ...
<br /></span><span style="color: #0000BB">set_time_limit</span><span style="color: #007700">(</span><span style="color: #0000BB">10000</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Include the phpcrawl-mainclass
<br /></span><span style="color: #007700">include(</span><span style="color: #DD0000">"libs/PHPCrawler.class.php"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Extend the class and override the handleDocumentInfo()-method
<br /></span><span style="color: #007700">class </span><span style="color: #0000BB">MyCrawler </span><span style="color: #007700">extends </span><span style="color: #0000BB">PHPCrawler
<br /></span><span style="color: #007700">{
<br /> function </span><span style="color: #0000BB">handleDocumentInfo</span><span style="color: #007700">(</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">)
<br /> {
<br /> </span><span style="color: #FF8000">// Just detect linebreak for output ("\n" in CLI-mode, otherwise "&lt;br&gt;").
<br /> </span><span style="color: #007700">if (</span><span style="color: #0000BB">PHP_SAPI </span><span style="color: #007700">== </span><span style="color: #DD0000">"cli"</span><span style="color: #007700">) </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"\n"</span><span style="color: #007700">;
<br /> else </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"&lt;br /&gt;"</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print the URL and the HTTP-status-Code
<br /> </span><span style="color: #007700">echo </span><span style="color: #DD0000">"Page requested: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">url</span><span style="color: #007700">.</span><span style="color: #DD0000">" ("</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">http_status_code</span><span style="color: #007700">.</span><span style="color: #DD0000">")"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print the referring URL
<br /> </span><span style="color: #007700">echo </span><span style="color: #DD0000">"Referer-page: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">referer_url</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print if the content of the document was received or not
<br /> </span><span style="color: #007700">if (</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">received </span><span style="color: #007700">== </span><span style="color: #0000BB">true</span><span style="color: #007700">)
<br /> echo </span><span style="color: #DD0000">"Content received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">bytes_received</span><span style="color: #007700">.</span><span style="color: #DD0000">" bytes"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br /> else
<br /> echo </span><span style="color: #DD0000">"Content not received"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Now you should do something with the content of the actual
<br /> // received page or file ($DocInfo->source), we skip it in this example
<br />
<br /> </span><span style="color: #007700">echo </span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #0000BB">flush</span><span style="color: #007700">();
<br /> }
<br />}
<br />
<br /></span><span style="color: #FF8000">// Now, create an instance of your class, define the behaviour
<br />// of the crawler (see class-reference for more options and details)
<br />// and start the crawling-process.
<br />
<br /></span><span style="color: #0000BB">$crawler </span><span style="color: #007700">= new </span><span style="color: #0000BB">MyCrawler</span><span style="color: #007700">();
<br />
<br /></span><span style="color: #FF8000">// URL to crawl
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">setURL</span><span style="color: #007700">(</span><span style="color: #DD0000">"www.php.net"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Only receive content of files with content-type "text/html"
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">addContentTypeReceiveRule</span><span style="color: #007700">(</span><span style="color: #DD0000">"#text/html#"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Ignore links to pictures, don't even request pictures
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">addURLFilterRule</span><span style="color: #007700">(</span><span style="color: #DD0000">"#\.(jpg|jpeg|gif|png)$# i"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Store and send cookie-data like a browser does
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">enableCookieHandling</span><span style="color: #007700">(</span><span style="color: #0000BB">true</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Set the traffic-limit to 1 MB (in bytes,
<br />// for testing we don't want to "suck" the whole site)
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">setTrafficLimit</span><span style="color: #007700">(</span><span style="color: #0000BB">1000 </span><span style="color: #007700">* </span><span style="color: #0000BB">1024</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// That's enough, now here we go
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">go</span><span style="color: #007700">();
<br />
<br /></span><span style="color: #FF8000">// At the end, after the process is finished, we print a short
<br />// report (see method getProcessReport() for more information)
<br /></span><span style="color: #0000BB">$report </span><span style="color: #007700">= </span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">getProcessReport</span><span style="color: #007700">();
<br />
<br />if (</span><span style="color: #0000BB">PHP_SAPI </span><span style="color: #007700">== </span><span style="color: #DD0000">"cli"</span><span style="color: #007700">) </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"\n"</span><span style="color: #007700">;
<br />else </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"&lt;br /&gt;"</span><span style="color: #007700">;
<br />
<br />echo </span><span style="color: #DD0000">"Summary:"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Links followed: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">links_followed</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Documents received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">files_received</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Bytes received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">bytes_received</span><span style="color: #007700">.</span><span style="color: #DD0000">" bytes"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Process runtime: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">process_runtime</span><span style="color: #007700">.</span><span style="color: #DD0000">" sec"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br /></span><span style="color: #0000BB">?&gt;</span>
</span>
</p>
</div>
<!--
<?php
include("google_code.php");
?>
-->
</div>
<div id="footer">Copyright © 2003 - 2012 Uwe Hunfeld hide@address.com</div>
</div>
</div>
</body>
</html>