Location: PHPKode > scripts > Site Map Generator > site-map-generator/SiteMapGenerator.class.php
<?php

/**
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>. 
 */

/**
 * This class can help you to generate a sitemap to enhance the way Google
 * (and other search engines) indexes your page. According to this documentation:
 * https://www.google.com/webmasters/tools/docs/en/protocol.html
 * 
 * @author Felipe Ribeiro - http://feliperibeiro.com <hide@address.com>
 * @date Dec 25th, 2007 
 * 
 */
class SiteMapGenerator {

	/**
	 * The url of the site
	 *
	 * @var string
	 */
	private $site;
	
	/**
	 * The HTML content of the current page the system is navigating
	 *
	 * @var string
	 */
	private $html;
	
	/**
	 * The url of the actual page the system is navigating
	 *
	 * @var string
	 */
	private $actual;
	
	/**
	 * A hash table containing all the links
	 *
	 * @var array
	 */
	private $hash;
	
	
	
	/**
	 * Constructs the SiteMapGenerator Object
	 *
	 * @param string $site the URL of the site that you want to generate the sitemap
	 * @param bool $navigate if you want the SiteMapGenerator to already start navigating through the site, default=true
	 */
	public function __construct($site,$navigate=true) {
		$this->site = $site;
		$this->hash = array();
		if($navigate) {
			$this->link($site);
		}
	}

	/**
	 * Gets the title of the current page, i.e. the text between <title> and </title> 
	 *
	 * @return string title
	 */
	public function getTitle() {
		$preg2="/<title>(.*?)<\/title>/i";
		$title = array();
		preg_match($preg2,$this->html,$title);
		return $title[1];
	}

	/**
	 * Returns all the <b>internal</b> links of the current page.
	 *
	 * @return array containing the links
	 */
	public function getLinks() {
		$preg = "/<a.*? href=(\"|')(.*?)(\"|').*?>(.*?)<\/a>/i";  
		$links = array();
		preg_match_all($preg,$this->html,$links);
		$urlsTemp = $links[2];
		$linksTemp = $links[4];
		$urls = array();
		$baseUrl = '';
		$linkTexts = array();

		// set base url

		// make sure we use real url in case of redirection
		$headers = get_headers($this->actual, 1);
		if (isset($headers['Location'])) {
			if (strpos($headers['Location'], $this->site) !== false) {
				$baseUrl = $headers['Location'];
			} else {
				$baseUrl = $this->site . '/' . $headers['Location'];
			}
		} else {
			$baseUrl = $this->actual;
		}

		if ( !(substr($baseUrl, -1) == '/') ) {
			if ( is_dir($baseUrl) ) { // if base url is a dir without trailing slash
				$baseUrl .= '/';
			} else if ( $baseUrl != $this->site ) { // base url is a file and not the root
				$baseUrl = str_replace(basename($baseUrl), '', $baseUrl);
			}
		}

		foreach ($urlsTemp as $i=>$url){
			if(strstr($url,$this->site)) { //If it has link to absolute url path with domain
				$urls[] = $url;
				//$linkTexts[] = $linksTemp[$i];
			} else if( substr($url, 0, 1) == '/' ) { //If it has link to absolute url path without domain
				$urls[] = $this->site . $url;
				//$linkTexts[] = $linksTemp[$i];
			} else if( !preg_match("/^(mailto|http|\#)/",$url) ) { //If it has link to a relative URL.

				if (substr($url,0,2) == './') {
					$urls[] = $baseUrl . substr($url,2);
				} else if (substr($url,0,3) == '../') {
					$pad = substr_count($url,'../');
					$baseUrlTemp = $baseUrl;
					for ($i=1; $i<=$pad+1; $i++) {
						$baseUrlTemp = substr($baseUrlTemp, 0, strrpos($baseUrlTemp, '/') );
					}
					$urls[] = $baseUrlTemp . '/' . str_replace('../','',$url);
				} else {
					$urls[] = $baseUrl . $url;
				}

			}
		}

		return $urls;
	}
	
	/**
	 * Redirect the object to a new page
	 *
	 * @param string $link the URL that the system will visit
	 */
	public function link($link) {
		// remove hash portion of url to make sure the url isn't reprocessed
		$hashPos = strpos($link,'#');
		if ($hashPos !== false) $link = substr($link, 0, $hashPos);
		if(!empty($link) && !isset($this->hash[$link])) {
			$this->actual = $link;
			$this->navigate();
		}
	}
	
	/**
	 * Navigates through the page collecting the data
	 *
	 */
	public function navigate() {
		$this->html = file_get_contents($this->actual);
		
		if ($this->html) { // make sure we have something to parse
			// remove html comments in order to avoid hidden urls
			$this->html = preg_replace('/<!--.*?-->/s','',$this->html);
			$title = $this->getTitle();
			$this->hash[$this->actual] = $title;
		
			if($title==null || $title=="") {
				$this->hash[$this->actual] = "Untitled";
			}
			
			$links = $this->getLinks();
			foreach($links as $link) {
				$this->link($link);			
			}
		}
	}
	
	/**
	 * Returns the Hashtable with the links and titles of each page. e.g
	 * 
	 * <code>
	 * Array (
	 * "http://blog.feliperibeiro.com" => "Felipe Ribeiro Home"
	 * )
	 * </code>
	 * @return array
	 */
	public function getHash() {
		return $this->hash;
	}
	
	/**
	 * Generates the SiteMap in XML format, according to the default specs
	 * https://www.google.com/webmasters/tools/docs/en/protocol.html
	 * @return string well formed xml
	 */
	public function generateSiteMap() {
		$xml = new SimpleXMLElement("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"></urlset>");
		foreach($this->hash as $url=>$title) {
			$urlNode = $xml->addChild("url");
			$urlNode->addChild("loc",$url);
			$priority = 0.5;
			if($url == $this->site) {
				$priority = 1.0;
			}
			$urlNode->addChild("priority",$priority);
		}
		return $xml->asXML();
	}
	
}
?>
Return current item: Site Map Generator