Location: PHPKode > scripts > PHP Scraper > php-scraper/class_scraper.php
<?
/* 	XHTML Screen Scraper PHP Class version 0.3
	------------------------------------------
	Copyright (c) 2004 Antonio Mota Rodrigues - antoniorodrigues_at_omnisinal.com
	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 2 of the License, or
	(at your option) any later version.
	
	Keywords: html scraper, html screen scraper, html to array, convert html tables into arrays
	extract data from web pages, get html page as array, scraping, scrapers, xhtml to array,
	xhtml2array
*/


class scraper {

	
	// ----------------------------------
	function browse($s_url, $s_user_agent) {

		print "scraper: browse: Calling $s_url...\n";
		$o_ch = curl_init();

		curl_setopt ($o_ch, CURLOPT_URL, $s_url);
		curl_setopt ($o_ch, CURLOPT_USERAGENT, $s_user_agent);
		curl_setopt ($o_ch, CURLOPT_HEADER, 0);
		curl_setopt ($o_ch, CURLOPT_RETURNTRANSFER, 1);
		$s_html = curl_exec ($o_ch);
		curl_close ($o_ch);
		unset($o_ch);
		
		// Clean html ---------------------
		for ($ascii = 0; $ascii <= 9; $ascii++) $s_html = str_replace(chr($ascii), "", $s_html);
		for ($ascii = 11; $ascii < 32; $ascii++) $s_html = str_replace(chr($ascii), "", $s_html);
		for ($ascii = 127; $ascii <= 255; $ascii++) $s_html = str_replace(chr($ascii), "", $s_html);

		if (!$s_html) print "scraper: WARNING: no results...\n\n";
		return $s_html;

	} //end function
	
	
	// ------------------------------------------------------------------
	function extract ($s_html, $s_start_pattern, $s_end_pattern, $s_model) {
		
		print "scraper: OK. extracting...\n";

		$a_result = array();

		// Cut first block -----------------------
		$i_pos = strpos($s_html, $s_start_pattern);
		$s_html = substr($s_html, $i_pos);

		// Cut last block ----------------------
		$i_pos = strpos($s_html, $s_end_pattern);
		$s_html = substr($s_html, 0, $i_pos);
		
		$s_model = strtolower($s_model);
		$a_model = explode ("\n", $s_model);
		$i_model = count($a_model);
		if (!$a_model[$i_model - 1]) unset($a_model[$i_model - 1]);
		
		$a_html = explode ("<", $s_html);
		$i_cnt = count($a_html);

		// Extract data within tags -----
		for ($f = 0; $f < $i_cnt; $f++) {
			$tag = "<" . $a_html[$f];
			$closepos = strpos ($tag, ">");
			$value = substr($tag, $closepos + 1, strlen($tag) - $closepos);
			$tag = substr($tag,0,strlen($tag) - strlen($value));
			$a_html[$f] = strtolower($tag);
			$dat[$f] = $value;
		}

		$pat = 0;
		$a_pat = array();
		
		for ($f=0; $f < $i_cnt; $f++) {
		
			if ($a_model[$pat]=="<field>") {
				
				// Get data --------
				$value = $dat[$f-1];
				$value = str_replace ("\t", "", $value);
				$value = str_replace ("\n", "", $value);
				$value = str_replace ("\r", "", $value);
				$value = trim ($value);
				if (!$value) $value = "{e}"; 

				array_push($a_pat,$dat[$f-1]);
				$pat++;
				$f--;
			
			} else {
				
				// check pattern ---------------------
				if (substr($a_model[$pat],0,1) == "<") {
					$result = strpos (" " . $a_html[$f], $a_model[$pat],0);
				} else {
					$result = strpos (" " . strtolower($dat[$f]), $a_model[$pat],0);
				}
	
				if (is_integer($result)) { $pat++; }
			}

			if ($pat == count($a_model)-1) {
				$pat = 0;
				if (count($a_pat)) array_push($a_result, $a_pat);
				$a_pat = array();
			}

		} // end for each tag

		return $a_result;

	} // end function


} // end class

?>
Return current item: PHP Scraper