<?PHP
/**
 * Minimal page scraper: downloads a URL with cURL and returns the fragments
 * of the downloaded text that lie between a start marker and an end marker.
 */
class tagSpider {
    /** @var mixed Never assigned by this class; kept so the public property list is unchanged. */
    public $crl = null;
    /** @var mixed cURL handle created by the most recent fetchPage() call. */
    public $ch = null;
    /** @var string Body of the last fetched page; "" when nothing was fetched or the fetch failed. */
    public $html = "";
    /** @var int Binary-transfer flag; kept for compatibility but no longer passed to cURL. */
    public $binary = 0;
    /** @var string URL given to the most recent fetchPage() call. */
    public $url = "";

    /**
     * Resets the spider to its empty state.
     *
     * A method named after the class is not run as a constructor on PHP 8,
     * so initialisation lives in __construct().
     */
    public function __construct() {
        $this->html = "";
        $this->binary = 0;
        $this->url = "";
    }

    /**
     * Legacy PHP 4-style entry point, kept so explicit calls such as
     * $spider->tagSpider() or parent::tagSpider() continue to work.
     */
    public function tagSpider() {
        $this->__construct();
    }

    /**
     * Downloads $url and stores the response body in $this->html.
     *
     * @param string $url Address to download.
     * @return bool true when a body was received; false when $url is not a
     *              non-empty string or cURL reported a failure. On failure
     *              $this->html is "" so stale content is never parsed.
     */
    public function fetchPage($url) {
        $this->url = $url;
        $this->html = "";

        if (!is_string($url) || $url === "") {
            return false;
        }

        $this->ch = curl_init(); // start cURL instance
        if ($this->ch === false) {
            return false;
        }

        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1); // return the data instead of printing it
        curl_setopt($this->ch, CURLOPT_URL, $this->url); // set the url to download
        // CURLOPT_BINARYTRANSFER is intentionally not set: the PHP manual documents
        // it as having no effect since PHP 5.1.3 when CURLOPT_RETURNTRANSFER is used.

        $body = curl_exec($this->ch); // false on failure
        curl_close($this->ch);

        if (!is_string($body)) {
            return false;
        }

        $this->html = $body;
        return true;
    }

    /**
     * Returns every fragment of $this->html that starts with $beg_tag and ends
     * with the nearest following $close_tag (case-insensitive, may span lines).
     *
     * NOTE(review): both arguments are interpolated into the regular expression
     * unescaped, so regex metacharacters in them are interpreted as pattern
     * syntax. That is left as-is because callers may rely on it.
     *
     * @param string $beg_tag   Start marker.
     * @param string $close_tag End marker.
     * @return array List of matched fragments, markers included; empty when
     *               there is no content, no match, or the pattern is invalid.
     */
    public function parse_array($beg_tag, $close_tag) {
        if (!is_string($this->html) || $this->html === "") {
            return array();
        }

        $matching_data = array();
        $found = preg_match_all("($beg_tag.*$close_tag)siU", $this->html, $matching_data);
        if ($found === false || !isset($matching_data[0])) {
            return array();
        }

        return $matching_data[0];
    }
}
/**
 * Returns capture group 1 of the first match of $pattern in $subject,
 * or "" when the pattern does not match (or has no such group).
 *
 * @param string $pattern Complete PCRE pattern including delimiters and flags.
 * @param string $subject Text to search.
 * @return string
 */
function craigslist_first_match($pattern, $subject) {
    $m = array();
    if (preg_match($pattern, $subject, $m) === 1 && isset($m[1])) {
        return $m[1];
    }
    return '';
}

// Emits an XML rendering of the page named by ?url=...
// A URL containing ".html" is treated as a single posting; anything else is
// treated as a listing page made of <p>...</p> rows.
header('Content-Type: application/xml; charset=UTF-8');
$writer = new XMLWriter();
$writer->openURI('php://output');
$writer->setIndent(true);
$writer->startDocument('1.0', 'utf-8');
$writer->startElement('XML');
$writer->writeElement('document', 'Craigslist XML');
$writer->writeElement('description', 'Craigslist XML Lists');

// The URL comes straight from the request, so accept only http/https; other
// schemes (file://, gopher://, ...) must never reach cURL.
// NOTE(review): any http(s) host is still accepted - consider restricting the
// host to an allowlist if this script is publicly reachable.
$page = '';
if (isset($_GET['url']) && is_string($_GET['url'])) { // e.g. http://fortmyers.craigslist.org/web
    $scheme = parse_url($_GET['url'], PHP_URL_SCHEME);
    if (is_string($scheme) && in_array(strtolower($scheme), array('http', 'https'), true)) {
        $page = $_GET['url'];
    }
}
$urlrun = $page;
$writer->writeElement('link', $urlrun);

$is_posting = (stripos($urlrun, '.html') !== false);
if ($is_posting) {
    $stag = '<section class="body">';
    $etag = '<footer>';
} else {
    $stag = '<p';
    $etag = '</p>';
}

// Without a usable URL there is nothing to fetch; the document skeleton is still emitted.
$linkarray = array();
if ($urlrun !== '') {
    $tspider = new tagSpider();
    $tspider->fetchPage($urlrun);
    $linkarray = $tspider->parse_array($stag, $etag);
    if (!is_array($linkarray)) {
        $linkarray = array();
    }
}

if ($is_posting) {
    foreach ($linkarray as $list) {
        // Turn the reply "mailto:addr?subject=..&amp;body=.." href into a query
        // string so parse_str() can split it into mailto / subject / body.
        $reply_href = craigslist_first_match("'<span id=\"replytext\">Reply to:</span> <a href=\"(.*?)\">'si", $list);
        $reply_url = str_replace('?', '&', $reply_href);
        $reply_url = str_replace('mailto:', 'mailto=', $reply_url);
        $reply_array = array();
        parse_str(str_replace('amp;', '&', $reply_url), $reply_array);

        $compensation = craigslist_first_match("' -->Compensation: (.*?)</li>'si", $list);
        $location = craigslist_first_match("' -->Location: (.*?)</li>'si", $list);
        $post_body = craigslist_first_match("'<section id=\"postingbody\">(.*?)</section>'si", $list);
        $images = array();
        preg_match_all("'href=\"http:\/\/images.craigslist.org\/(.*?)\"'si", $list, $images);

        $writer->startElement('post');
        // Missing or non-string reply parts are written as empty elements.
        foreach (array('mailto', 'subject', 'body') as $field) {
            $value = (isset($reply_array[$field]) && is_string($reply_array[$field])) ? $reply_array[$field] : '';
            $writer->writeElement('reply-' . $field, $value);
        }
        $writer->writeElement('compensation', $compensation);
        $writer->writeElement('location', $location);
        $writer->writeElement('post-body', $post_body);
        if (isset($images[1]) && is_array($images[1])) {
            foreach ($images[1] as $img) {
                $writer->writeElement('image', 'http://images.craigslist.org/' . $img);
            }
        }
        $writer->endElement();
    }
} else {
    $server_name = isset($_SERVER['SERVER_NAME']) ? $_SERVER['SERVER_NAME'] : '';
    foreach ($linkarray as $list) {
        $href = craigslist_first_match('/href="([^"]*)"/i', $list);
        // Rows without a link cannot describe a posting, and links containing
        // "index" are excluded as before.
        if ($href === '' || stripos($href, 'index') !== false) {
            continue;
        }

        $title = craigslist_first_match("'<a href=\".*?\">(.*?)</a>'si", $list);
        $price = craigslist_first_match("'<span class=\"itempp\"> (.*?)</span>'si", $list);
        $loc = craigslist_first_match("'<span class=\"itempn\"><font size=\"-1\"> (.*?)</font></span>'si", $list);

        $writer->startElement('item');
        $writer->writeElement('url', $href);
        $writer->writeElement('title', $title);
        // The target is a query-string value, so it must be URL-encoded to survive
        // any "&", "?" or "#" it contains.
        $writer->writeElement('new_url', $server_name . '/craigslist.php?url=' . urlencode($href));
        $writer->writeElement('price', $price);
        $writer->writeElement('location', $loc);
        $writer->endElement();
    }
}
$writer->endElement();
$writer->endDocument();
?>