<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" dir="ltr">
<head>
  <!-- Encoding declaration comes first so it precedes every piece of text in the head. -->
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <title>PHPCrawl webcrawler library for PHP</title>
  <!-- Shared stylesheet, resolved relative to this page. -->
  <link rel="stylesheet" type="text/css" media="all" href="style.css" />
</head>
<body>
<div id="wrapper">
<div id="page" style="width: 950px">
<div id="top">
  <!-- Page banner: the title floats left; the version label is offset 670px from the left edge. -->
  <h1 style="float: left; margin: 0px;">PHPCrawl webcrawler library</h1>
  <div style="font-size: 12px; margin-top: 14px; margin-left: 670px;">Docs for version 0.8x</div>
</div>
<div id="container">
<!-- Left sidebar: site navigation, SourceForge badge and PayPal donate button. -->
<div id="left">
  <ul>
    <li><a href="index.html">About PHPCrawl</a></li>
    <li>
      Documentation
      <ul id="submenu">
        <li><a href="requirements.html">Requirements</a></li>
        <!-- A literal ampersand must be written as &amp; in XHTML text and attribute values. -->
        <li><a href="quickstart.html">Installation &amp; Quickstart</a></li>
        <li><a href="example.html">Example</a></li>
        <li><a href="multiprocesses.html">Using multi-processes</a></li>
        <li><a href="multiprocessing_modes.html">Multiprocessing Modes</a></li>
        <li><a href="spidering_huge_websites.html">Spidering huge websites</a></li>
        <li><a href="faq.html">FAQ</a></li>
        <!-- "_blank" is the keyword for a new browsing context; plain "blank" would name a reusable window. -->
        <li><a href="classreferences/index.html" target="_blank"><u>Complete Class References</u></a></li>
      </ul>
    </li>
    <!-- External links opened in a new tab carry rel="noopener" so the target page gets no window.opener handle. -->
    <li class="fat"><a href="http://sourceforge.net/projects/phpcrawl/files/PHPCrawl/" target="_blank" rel="noopener">Download PHPCrawl</a></li>
    <li><a href="testinterface.html">Testinterface</a></li>
    <li><a href="versionhistory.html">Version history</a></li>
    <li><a href="http://sourceforge.net/projects/phpcrawl/forums/forum/307696" target="_blank" rel="noopener">Forum</a></li>
    <li><a href="http://sourceforge.net/tracker/?group_id=89439&amp;atid=590146" target="_blank" rel="noopener">Report a bug</a></li>
  </ul>
  <!-- SourceForge project badge. -->
  <div id="sf">
    <a href="http://sourceforge.net/projects/phpcrawl"><img src="http://sflogo.sourceforge.net/sflogo.php?group_id=89439&amp;type=14" width="150" height="40" alt="Get PHPCrawl at SourceForge.net. Fast, secure and Free Open Source software downloads" /></a>
  </div>
  <!-- PayPal donate button. -->
  <!-- NOTE(review): id="sf" is used twice in this sidebar, but ids must be unique per document.
       Kept as-is because style.css presumably targets #sf; switch both wrappers to a class
       once the stylesheet can be updated alongside. -->
  <div id="sf">
    <form action="https://www.paypal.com/cgi-bin/webscr" method="post">
      <!-- Void elements are self-closed to match the XHTML doctype and the rest of the page. -->
      <input type="hidden" name="cmd" value="_s-xclick" />
      <input type="hidden" name="hosted_button_id" value="M53G4LP6XNHM4" />
      <input type="image" src="https://www.paypalobjects.com/en_US/i/btn/btn_donate_SM.gif" border="0" name="submit" alt="PayPal - The safer, easier way to pay online!" />
      <img alt="" border="0" src="https://www.paypalobjects.com/de_DE/i/scr/pixel.gif" width="1" height="1" />
    </form>
  </div>
</div>
<div id="content">
<h3>Example</h3><br />
The following code is a simple example of using phpcrawl.<br /><br />
The listed script just "spiders" some pages of www.php.net until a traffic limit of 1 MB is reached and prints out some information about each document it finds.<br /><br />
Please note that this example script (and others) also comes with the phpcrawl package, in a file called "example.php". It's recommended to run it from the command line (PHP CLI).
<p id="code" style="width: 670px">
<span style="color: #000000">
<span style="color: #0000BB">&lt;?php
<br />
<br /></span><span style="color: #FF8000">// It may take a while to crawl a site ...
<br /></span><span style="color: #0000BB">set_time_limit</span><span style="color: #007700">(</span><span style="color: #0000BB">10000</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Include the phpcrawl-mainclass
<br /></span><span style="color: #007700">include(</span><span style="color: #DD0000">"libs/PHPCrawler.class.php"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Extend the class and override the handleDocumentInfo()-method
<br /></span><span style="color: #007700">class </span><span style="color: #0000BB">MyCrawler </span><span style="color: #007700">extends </span><span style="color: #0000BB">PHPCrawler
<br /></span><span style="color: #007700">{
<br /> function </span><span style="color: #0000BB">handleDocumentInfo</span><span style="color: #007700">(</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">)
<br /> {
<br /> </span><span style="color: #FF8000">// Just detect linebreak for output ("\n" in CLI-mode, otherwise "&lt;br&gt;").
<br /> </span><span style="color: #007700">if (</span><span style="color: #0000BB">PHP_SAPI </span><span style="color: #007700">== </span><span style="color: #DD0000">"cli"</span><span style="color: #007700">) </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"\n"</span><span style="color: #007700">;
<br /> else </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"&lt;br /&gt;"</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print the URL and the HTTP-status-Code
<br /> </span><span style="color: #007700">echo </span><span style="color: #DD0000">"Page requested: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">url</span><span style="color: #007700">.</span><span style="color: #DD0000">" ("</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">http_status_code</span><span style="color: #007700">.</span><span style="color: #DD0000">")"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print the referring URL
<br /> </span><span style="color: #007700">echo </span><span style="color: #DD0000">"Referer-page: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">referer_url</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Print whether the content of the document was received or not
<br /> </span><span style="color: #007700">if (</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">received </span><span style="color: #007700">== </span><span style="color: #0000BB">true</span><span style="color: #007700">)
<br /> echo </span><span style="color: #DD0000">"Content received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$DocInfo</span><span style="color: #007700">-></span><span style="color: #0000BB">bytes_received</span><span style="color: #007700">.</span><span style="color: #DD0000">" bytes"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br /> else
<br /> echo </span><span style="color: #DD0000">"Content not received"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #FF8000">// Now you should do something with the content of the actual
<br /> // received page or file ($DocInfo->source), we skip it in this example
<br />
<br /> </span><span style="color: #007700">echo </span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />
<br /> </span><span style="color: #0000BB">flush</span><span style="color: #007700">();
<br /> }
<br />}
<br />
<br /></span><span style="color: #FF8000">// Now, create an instance of your class, define the behaviour
<br />// of the crawler (see class-reference for more options and details)
<br />// and start the crawling-process.
<br />
<br /></span><span style="color: #0000BB">$crawler </span><span style="color: #007700">= new </span><span style="color: #0000BB">MyCrawler</span><span style="color: #007700">();
<br />
<br /></span><span style="color: #FF8000">// URL to crawl
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">setURL</span><span style="color: #007700">(</span><span style="color: #DD0000">"www.php.net"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Only receive content of files with content-type "text/html"
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">addContentTypeReceiveRule</span><span style="color: #007700">(</span><span style="color: #DD0000">"#text/html#"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Ignore links to pictures, don't even request pictures
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">addURLFilterRule</span><span style="color: #007700">(</span><span style="color: #DD0000">"#\.(jpg|jpeg|gif|png)$# i"</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Store and send cookie-data like a browser does
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">enableCookieHandling</span><span style="color: #007700">(</span><span style="color: #0000BB">true</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// Set the traffic-limit to 1 MB (in bytes,
<br />// for testing we don't want to "suck" the whole site)
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">setTrafficLimit</span><span style="color: #007700">(</span><span style="color: #0000BB">1000 </span><span style="color: #007700">* </span><span style="color: #0000BB">1024</span><span style="color: #007700">);
<br />
<br /></span><span style="color: #FF8000">// That's enough, now here we go
<br /></span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">go</span><span style="color: #007700">();
<br />
<br /></span><span style="color: #FF8000">// At the end, after the process is finished, we print a short
<br />// report (see method getProcessReport() for more information)
<br /></span><span style="color: #0000BB">$report </span><span style="color: #007700">= </span><span style="color: #0000BB">$crawler</span><span style="color: #007700">-></span><span style="color: #0000BB">getProcessReport</span><span style="color: #007700">();
<br />
<br />if (</span><span style="color: #0000BB">PHP_SAPI </span><span style="color: #007700">== </span><span style="color: #DD0000">"cli"</span><span style="color: #007700">) </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"\n"</span><span style="color: #007700">;
<br />else </span><span style="color: #0000BB">$lb </span><span style="color: #007700">= </span><span style="color: #DD0000">"&lt;br /&gt;"</span><span style="color: #007700">;
<br />
<br />echo </span><span style="color: #DD0000">"Summary:"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Links followed: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">links_followed</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Documents received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">files_received</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Bytes received: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">bytes_received</span><span style="color: #007700">.</span><span style="color: #DD0000">" bytes"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br />echo </span><span style="color: #DD0000">"Process runtime: "</span><span style="color: #007700">.</span><span style="color: #0000BB">$report</span><span style="color: #007700">-></span><span style="color: #0000BB">process_runtime</span><span style="color: #007700">.</span><span style="color: #DD0000">" sec"</span><span style="color: #007700">.</span><span style="color: #0000BB">$lb</span><span style="color: #007700">;
<br /></span><span style="color: #0000BB">?></span>
</span>
</p>
</div>
</div>
</div>
</div>
</body>
</html>