<?php
/**
* IMDB Parser
*
* Parses data from the Internet Movie Database
*
* @package Engines
* @author Andreas Gohr <hide@address.com>
* @link http://www.imdb.com Internet Movie Database
* @version $Id: imdb.php,v 1.52 2009/02/28 12:09:50 andig2 Exp $
*/
// Base URL of the IMDB site; every request and link URL built by the functions
// in this file starts with it.
$GLOBALS['imdbServer'] = 'http://www.imdb.com';
// Prefix prepended to the ids this engine returns (search results, cast
// entries) and stripped again before an id is placed into an IMDB URL.
// NOTE(review): both values are written through $GLOBALS rather than as plain
// variables, presumably so they reach global scope even when this file is
// included from inside a function - confirm before changing the form.
$GLOBALS['imdbIdPrefix'] = 'imdb:';
/**
 * Describe this engine
 *
 * @todo Include image search capabilities etc in meta information
 * @return array Engine descriptor with the keys 'name' (the string 'IMDB')
 *               and 'stable' (the integer 1)
 */
function imdbMeta()
{
    $meta = array();
    $meta['name'] = 'IMDB';
    $meta['stable'] = 1;

    return $meta;
}
/**
 * Get Url to search IMDB for a movie
 *
 * Builds a link to the same '/find?q=' endpoint that imdbSearch() requests,
 * so a user following the link sees the result list the engine itself parses
 * (the previous '/Find?<title>' form used a different, legacy path).
 *
 * @author Andreas Goetz <hide@address.com>
 * @param string $title The search string (url-encoded by this function)
 * @return string The search URL (GET)
 */
function imdbSearchUrl($title)
{
    global $imdbServer;

    return $imdbServer.'/find?q='.urlencode($title);
}
/**
 * Get Url to visit IMDB for a specific movie
 *
 * A leading engine prefix (global $imdbIdPrefix) is removed from the id
 * before it is placed into the '/title/tt<id>/' path; ids without the
 * prefix are used unchanged.
 *
 * @author Andreas Goetz <hide@address.com>
 * @param string $id The movie's external id
 * @return string The visit URL
 */
function imdbContentUrl($id)
{
    global $imdbServer;
    global $imdbIdPrefix;

    // quote the prefix so it is matched literally even if it ever contains
    // regex metacharacters
    $id = preg_replace('/^'.preg_quote($imdbIdPrefix, '/').'/', '', $id);

    return $imdbServer.'/title/tt'.$id.'/';
}
/**
 * Search a Movie
 *
 * Requests the IMDB '/find' page for the given title. When the response URL
 * is an individual title page (direct hit) a single entry is built from that
 * page's <title>; otherwise every title link followed by a "(year)" in the
 * response body becomes one entry. HTTP errors are appended to the global
 * $CLIENTERROR and parsing continues on whatever was returned.
 *
 * @author Tiago Fonseca <hide@address.com>
 * @author Charles Morgan <hide@address.com>
 * @param string  $title The search string
 * @param boolean $aka   Use AKA search for foreign language titles
 * @return array  Key 'encoding' plus numerically indexed entries; each entry
 *                has 'id' and 'title' and either 'subtitle' (direct hit) or
 *                'year' (result list)
 */
function imdbSearch($title, $aka=null)
{
    global $imdbServer;
    global $imdbIdPrefix;
    global $CLIENTERROR;
    global $cache;

    $url = $imdbServer.'/find?q='.urlencode($title);
    if ($aka) $url .= ';s=tt;site=aka';

    $resp = httpClient($url, $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";

    // a failed request may carry neither a final url nor a body
    $respUrl  = isset($resp['url'])  ? $resp['url']  : '';
    $respData = isset($resp['data']) ? $resp['data'] : '';

    $data = array();

    // add encoding
    $data['encoding'] = engine_get_encoding($resp);

    // direct match (redirecting to individual title)?
    if (preg_match('/^'.preg_quote($imdbServer,'/').'\/[Tt]itle(\?|\/tt)([0-9?]+)\/?/', $respUrl, $single))
    {
        // Title: "<main> - <sub> (year)"; the page title may be missing or
        // may not contain a subtitle part
        $fullTitle = '';
        if (preg_match('/<title>(.*?) \([1-2][0-9][0-9][0-9].*?\)<\/title>/i', $respData, $m))
        {
            $fullTitle = trim($m[1]);
        }
        // explode() replaces split(), which no longer exists in PHP 7+;
        // array_pad() guarantees both parts exist
        $parts = array_pad(explode(' - ', $fullTitle, 2), 2, '');

        $info             = array();
        $info['id']       = $imdbIdPrefix.$single[2];
        $info['title']    = trim($parts[0]);
        $info['subtitle'] = trim($parts[1]);
        $data[]           = $info;
    }
    // multiple matches
    else if (preg_match_all('#<a href="/title/tt(\d+)/?".*?>(.+?)</a>\s*\(([0-9?]+)\)?#i', $respData, $multi, PREG_SET_ORDER))
    {
        foreach ($multi as $row)
        {
            // fix for images in search results (re-apply search to title):
            // when the captured text itself starts with another title link
            // only the text after that inner link is the title
            if (preg_match('#<a href="/title/tt(\d+)/?".*?>(.+?)$#i', $row[2], $row2)) $row[2] = $row2[2];

            $info          = array();
            $info['id']    = $imdbIdPrefix.$row[1];
            $info['title'] = $row[2];
            $info['year']  = $row[3];
            $data[]        = $info;
        }
    }

    return $data;
}
/**
 * Return the trimmed first capture group of a pattern.
 *
 * @param string $pattern PCRE pattern with at least one capture group
 * @param string $subject Text to search
 * @return string Trimmed content of group 1, or '' when the pattern does not match
 */
function _imdbMatchFirst($pattern, $subject)
{
    if (preg_match($pattern, $subject, $m) && isset($m[1])) return trim($m[1]);
    return '';
}

/**
 * Return the first capture group of every match of a pattern.
 *
 * @param string $pattern PCRE pattern with at least one capture group
 * @param string $subject Text to search
 * @return array List of group 1 contents in match order, empty when nothing matches
 */
function _imdbMatchAll($pattern, $subject)
{
    if (preg_match_all($pattern, $subject, $m, PREG_PATTERN_ORDER)) return $m[1];
    return array();
}

/**
 * Fetches the data for a given IMDB-ID
 *
 * Loads three pages for the title (main page, 'fullcredits' and
 * 'plotsummary') and extracts the fields with regular expressions. Fields
 * that cannot be found are returned as empty strings; 'genres' and 'cast'
 * are only present when at least the corresponding section was found.
 * HTTP errors are appended to the global $CLIENTERROR.
 *
 * @author Tiago Fonseca <hide@address.com>
 * @author Victor La <hide@address.com>
 * @param string $imdbID IMDB-ID, with or without the engine id prefix
 * @return array Result data with the keys encoding, title, subtitle, year,
 *               coverurl, mpaa, bbfc, runtime, director, rating, country,
 *               language, plot and optionally genres (array) and cast
 *               (one "actor::character::id" line per cast member)
 */
function imdbData($imdbID)
{
    global $imdbServer;
    global $imdbIdPrefix;
    global $CLIENTERROR;
    global $cache;

    // strip the engine prefix; quoted so it is always matched literally
    $imdbID = preg_replace('/^'.preg_quote($imdbIdPrefix, '/').'/', '', $imdbID);

    $data = array(); // result

    // fetch mainpage (trailing / avoids a redirect)
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/', $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $page = isset($resp['data']) ? $resp['data'] : '';

    // add encoding
    $data['encoding'] = engine_get_encoding($resp);

    // Titles: "<main> - <sub> (year)". explode() replaces split(), which no
    // longer exists in PHP 7+; array_pad() covers titles without a subtitle.
    $fullTitle = _imdbMatchFirst('/<TITLE>(.*?) \([1-2][0-9][0-9][0-9].*?\)<\/TITLE>/i', $page);
    list($t, $s) = array_pad(explode(' - ', $fullTitle, 2), 2, '');
    $data['title']    = html_clean($t);
    $data['subtitle'] = html_clean($s);

    // Year
    $data['year'] = _imdbMatchFirst('/<A HREF="\/Sections\/Years\/[1-2][0-9][0-9][0-9]\/?">([1-2][0-9][0-9][0-9])<\/A>/i', $page);

    // Cover URL
    $data['coverurl'] = _imdbMatchFirst('/name="poster".+?<IMG.+?(http:\/\/.+?\.(jpe?g|gif))/i', $page);

    // MPAA Rating
    $data['mpaa'] = _imdbMatchFirst('/<A HREF="\/mpaa">MPAA<\/A>: ?<\/h5>(.+?)<\/div>/is', $page);

    // UK BBFC Rating
    $data['bbfc'] = _imdbMatchFirst('/>\s*UK:(.*?)<\/a>\s+/s', $page);

    // Runtime (thousands separators removed)
    $runtime = _imdbMatchFirst('/Runtime:?<\/h5>:?.*?([0-9,]+).*?<\/TD>/si', $page);
    $data['runtime'] = str_replace(',', '', $runtime);

    // Director
    // TODO: Update templates to use multiple directors
    $directorBlock = _imdbMatchFirst('/<h5>Directors?:<\/h5>(.+?)<\/div>/si', $page);
    $directors = _imdbMatchAll('/<A HREF="\/Name[?\/].+?">(.+?)<\/A>/si', $directorBlock);
    $data['director'] = trim(join(', ', $directors));

    // Rating
    $data['rating'] = _imdbMatchFirst('/<b>([0-9.]+)\/10<\/b>[^<]*<a href="ratings" class="tn15more">[0-9,]+ votes<\/a>/si', $page);

    // Countries
    $countryBlock = _imdbMatchFirst('/<h5>Country:<\/h5>(.+?)<\/div>/si', $page);
    $countries = _imdbMatchAll('/<A HREF="\/Sections\/Countries\/.+?\/">(.+?)<\/A>/si', $countryBlock);
    $data['country'] = trim(join(', ', $countries));

    // Languages (searched on the whole page, not inside a section)
    $languages = _imdbMatchAll('/<A HREF="\/Sections\/Languages\/.+?\/">(.+?)<\/A>/si', $page);
    $data['language'] = trim(strtolower(join(', ', $languages)));

    // Plot outline (movies in their early stages have the plot here but not yet in plotsummary?)
    $outline = _imdbMatchFirst('/<h5>Plot Outline:<\/h5>\s+(.*?)<\/div>/si', $page);

    // Genres (as Array)
    $genreBlock = _imdbMatchFirst('/<h5>Genre:<\/h5>(.+?)<\/div>/si', $page);
    foreach (_imdbMatchAll('/<A HREF="\/Sections\/Genres\/.+?\/">(.+?)<\/A>/si', $genreBlock) as $genre)
    {
        $data['genres'][] = trim($genre);
    }

    // Plot (simple- from main page). Falls back to the outline instead of
    // overwriting it with an empty string when there is no "Plot:" section.
    $plot = _imdbMatchFirst('/<h5>Plot:<\/h5>(.+?)(\||<\/div>)/si', $page);
    $data['plot'] = ($plot !== '') ? $plot : $outline;

    // Fetch credits
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/fullcredits', $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $credits = isset($resp['data']) ? $resp['data'] : '';

    // Cast
    if (preg_match_all('/<table class="cast">(.*?)<\/table>/si', $credits, $tables))
    {
        $allcast = implode('', $tables[1]);
        $cast    = '';

        if (preg_match_all('#<td class="nm"><a href="/name/(.*?)/?">(.*?)</a>.*?<td class="char">(.*?)</td>#si', $allcast, $rows, PREG_SET_ORDER))
        {
            foreach ($rows as $row)
            {
                $actorid   = trim(strip_tags($row[1]));
                $actor     = trim(strip_tags($row[2]));
                $character = trim(strip_tags($row[3]));
                $cast     .= $actor.'::'.$character.'::'.$imdbIdPrefix.$actorid."\n";
            }
        }

        // remove html entities and replace with simple space
        $data['cast'] = html_clean($cast);

        // "/ ... " sometimes appears in series (e.g. Scrubs); the dots are
        // escaped so only the literal ellipsis is removed, not any three
        // characters following a slash
        $data['cast'] = preg_replace('#/ \.\.\. #', '', $data['cast']);
    }

    // Fetch plot
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/plotsummary', $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $plotPage = isset($resp['data']) ? $resp['data'] : '';

    // Plot: the first summary paragraph, when present, replaces the main page plot
    $summary = _imdbMatchFirst('/<P CLASS="plotpar">(.+?)<\/P>/is', $plotPage);
    if ($summary !== '')
    {
        // Replace the HTML quote entity with a plain double quote. The entity
        // is spelled in two pieces so that tooling which decodes HTML
        // entities in source files cannot turn this into a no-op.
        $summary = str_replace('&'.'quot;', '"', $summary);

        // Begin removal of 'Written by' section
        $summary = preg_replace('/<a href="\/SearchPlotWriters.*?<\/a>/', '', $summary);
        $summary = str_replace('Written by', '', $summary);
        $summary = preg_replace('/<i>\s+<\/i>/', ' ', $summary);
        // End of removal of 'Written by' section

        $data['plot'] = preg_replace('/\s+/s', ' ', $summary);
    }
    $data['plot'] = html_clean($data['plot']);

    return $data;
}
/**
 * Get Url to visit IMDB for a specific actor
 *
 * With an id the direct '/name/<id>/' page is linked; without one the link
 * is a name lookup built from the entity-decoded actor name.
 *
 * @author Michael Kollmann <hide@address.com>
 * @param string $name The actor's name
 * @param string $id The actor's external id
 * @return string The visit URL
 */
function imdbActorUrl($name, $id)
{
    global $imdbServer;
    global $imdbIdPrefix;

    if ($id)
    {
        // direct link to the actor page
        $target = 'name/'.urlencode($id).'/';
    }
    else
    {
        // no id known: fall back to a lookup by name
        $target = 'Name?'.urlencode(html_entity_decode_all($name));
    }

    return $imdbServer.'/'.$target;
}
/**
 * Parses Actor-Details
 *
 * Loads the actor page (directly by id, or through a name lookup) and, when
 * the response is a list of names instead of an actor page, follows the
 * first link of the best matching section. The headshot link and image are
 * then read from the loaded page.
 *
 * @author Andreas Goetz <hide@address.com>
 * @param string $name    Name of the Actor
 * @param string $actorid External id of the actor; when empty the name is used
 * @return array Empty when no headshot was found, otherwise one entry holding
 *               the headshot link target (index 0) and the image URL (index 1)
 */
function imdbActor($name, $actorid)
{
    global $imdbServer;
    global $cache;

    // search directly by id or via name?
    if ($actorid)
    {
        $path = '/name/'.urlencode($actorid).'/';
    }
    else
    {
        $path = '/Name?'.urlencode($name);
    }
    $page = httpClient($imdbServer.$path, $cache);

    // result list sections, best one first; the first section present wins
    $sections = array(
        '#<b>Popular Names</b>.+?<a\s+href="(.*?)">#i',
        '#<b>Names \(Exact Matches\)</b>.+?<a\s+href="(.*?)">#i',
        '#<b>Names \(Approx Matches\)</b>.+?<a\s+href="(.*?)">#i'
    );

    // if not direct match load best match
    foreach ($sections as $section)
    {
        if (!preg_match($section, $page['data'], $hit)) continue;

        $target = $hit[1];
        // relative links need the server in front
        if (!preg_match('/http/i', $target)) $target = $imdbServer.$target;

        $page = httpClient($target, true);
        break;
    }

    // now we should have loaded the best match
    $result = array();
    if (preg_match('/<a\s+name="headshot"\s+href="(.+?)">\s*<img\s+.*?src="(.*?)"/i', $page['data'], $hit))
    {
        $result[0] = array($hit[1], $hit[2]);
    }

    return $result;
}
?>