<?php
/**
* Syntax-based normalization of URIs
*
* This normalises URIs based on the specification RFC 3986
* http://www.apps.ietf.org/rfc/rfc3986.html
*
* Example usage:
* <code>
* require_once 'URLNormalizer.php';
*
* $url = 'eXAMPLE://a/./b/../b/%63/%7bfoo%7d';
* $un = new URLNormalizer();
* $un->setUrl( $url );
* echo $un->normalize();
*
* // result: "example://a/b/c/%7Bfoo%7D"
* </code>
*
* @author Glen Scott <hide@address.com>
*/
class URLNormalizer {
    /**
     * Names of the URL components produced by parse_url() that this class stores.
     * Each name is also the name of a private property below.
     */
    private static $componentNames = array( 'scheme', 'host', 'port', 'user', 'pass', 'path', 'query', 'fragment' );

    /**
     * Characters that RFC 3986 section 2.3 classifies as "unreserved"
     * (ALPHA / DIGIT / "-" / "." / "_" / "~").
     */
    private static $unreservedChars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~';

    private $url;
    private $scheme;
    private $host;
    private $port;
    private $user;
    private $pass;
    private $path;
    private $query;
    private $fragment;

    /**
     * Create a normalizer with every URL component initialised to an empty string.
     */
    public function __construct() {
        foreach ( self::$componentNames as $name ) {
            $this->$name = '';
        }
    }

    /**
     * Return the URL exactly as it was last passed to setUrl().
     */
    public function getUrl() {
        return $this->url;
    }

    /**
     * Store a URL and split it into its components with parse_url().
     *
     * Every component is overwritten on each call, so values parsed from a
     * previously set URL never leak into the new one. Components that are
     * absent from the URL (or all of them, when parsing fails) become ''.
     *
     * @param string $url URL to parse
     * @return bool true when parse_url() could parse the URL, false otherwise
     */
    public function setUrl( $url ) {
        $this->url = $url;

        $components = parse_url( $this->url );
        $parsed     = is_array( $components );

        foreach ( self::$componentNames as $name ) {
            // parse_url() returns the port as an integer; store everything as a string.
            $this->$name = ( $parsed && isset( $components[ $name ] ) ) ? (string) $components[ $name ] : '';
        }

        return $parsed;
    }

    /**
     * Return the stored scheme (lower-cased once normalize() has been called).
     */
    public function getScheme() {
        return $this->scheme;
    }

    /**
     * Apply syntax-based normalization (RFC 3986 section 6.2.2) and return the URL.
     *
     * The scheme and host are lower-cased. In the path, percent-escapes are
     * upper-cased, escapes of unreserved characters are decoded and dot
     * segments are removed. User info, port, query and fragment are emitted
     * unchanged, each with its proper delimiter.
     *
     * The normalized scheme, host and path are written back to the object.
     * Every step is idempotent, so calling this method repeatedly returns the
     * same string.
     *
     * @return string the normalized URL
     */
    public function normalize() {
        if ( $this->path !== '' ) {
            // Case normalization: "%7b" -> "%7B" (either hex digit may be lower case).
            $path = preg_replace_callback(
                '/%[0-9a-f]{2}/i',
                function ( $match ) {
                    return strtoupper( $match[0] );
                },
                $this->path
            );
            // Percent-encoding normalization.
            $path = $this->urlDecodeUnreservedChars( $path );
            // Path segment normalization.
            $this->path = $this->removeDotSegments( $path );
        }

        $this->scheme = strtolower( $this->scheme );
        $this->host   = strtolower( $this->host );

        $result = '';
        if ( $this->scheme !== '' ) {
            $result .= $this->scheme . ':';
        }

        // parse_url() reports no host for URLs such as "file:///etc/passwd", so the
        // original text is also checked for the "//" that introduces an authority.
        $hasAuthority = $this->host !== ''
            || preg_match( '!^(?:[a-z][a-z0-9+.\-]*:)?//!i', (string) $this->url ) === 1;

        if ( $hasAuthority ) {
            $result .= '//';
            if ( $this->user !== '' || $this->pass !== '' ) {
                $result .= $this->user;
                if ( $this->pass !== '' ) {
                    $result .= ':' . $this->pass;
                }
                $result .= '@';
            }
            $result .= $this->host;
            if ( $this->port !== '' ) {
                $result .= ':' . $this->port;
            }
        }

        $result .= $this->path;

        if ( $this->query !== '' ) {
            $result .= '?' . $this->query;
        }
        if ( $this->fragment !== '' ) {
            $result .= '#' . $this->fragment;
        }

        return $result;
    }

    /**
     * Decode percent-escapes that represent unreserved characters
     * (RFC 3986 section 2.3). All other escapes are left untouched.
     *
     * The hex digits are matched case-insensitively and the string is scanned
     * in a single pass, so text produced by decoding is never decoded again.
     *
     * @param string $string text that may contain percent-escapes
     * @return string text with unreserved characters decoded
     */
    public function urlDecodeUnreservedChars( $string ) {
        $unreserved = self::$unreservedChars;

        return preg_replace_callback(
            '/%([0-9A-Fa-f]{2})/',
            function ( $match ) use ( $unreserved ) {
                $char = chr( hexdec( $match[1] ) );
                return strpos( $unreserved, $char ) !== false ? $char : $match[0];
            },
            $string
        );
    }

    /**
     * Remove "." and ".." segments from a path, following the
     * remove_dot_segments algorithm of RFC 3986 section 5.2.4.
     *
     * @param string $path path component to clean up
     * @return string path without dot segments
     */
    public function removeDotSegments( $path ) {
        $path   = (string) $path;
        $output = '';

        // Compare against '' rather than using empty(): the path "0" is not empty.
        while ( $path !== '' ) {
            if ( preg_match( '!^\.\.?/!', $path, $match ) ) {
                // A: drop a leading "../" or "./".
                $path = (string) substr( $path, strlen( $match[0] ) );
            }
            elseif ( preg_match( '!^/\.(?:/|$)!', $path, $match ) ) {
                // B: "/./" or a trailing "/." collapses to "/".
                $path = '/' . substr( $path, strlen( $match[0] ) );
            }
            elseif ( preg_match( '!^/\.\.(?:/|$)!', $path, $match ) ) {
                // C: "/../" or a trailing "/.." collapses to "/" and pops the last
                // output segment together with its preceding "/" (if any).
                // ".." must be a complete segment, so "/..foo" is not affected.
                $path   = '/' . substr( $path, strlen( $match[0] ) );
                $output = preg_replace( '!/?[^/]*$!', '', $output, 1 );
            }
            elseif ( $path === '.' || $path === '..' ) {
                // D: a lone "." or ".." is discarded.
                $path = '';
            }
            else {
                // E: move the first segment (with its leading "/", if any) to the output.
                preg_match( '!^/?[^/]*!', $path, $match );
                $output .= $match[0];
                $path    = (string) substr( $path, strlen( $match[0] ) );
            }
        }

        return $output;
    }
}