Location: PHPKode > projects > VideoDB > videodb/engines/imdb.php
<?php
/**
 * IMDB Parser
 *
 * Parses data from the Internet Movie Database
 *
 * @package Engines
 * @author  Andreas Gohr    <hide@address.com>
 * @link    http://www.imdb.com  Internet Movie Database
 * @version $Id: imdb.php,v 1.52 2009/02/28 12:09:50 andig2 Exp $
 */

// Base URL of the IMDB site; every request and visit URL built in this file starts with it
$GLOBALS['imdbServer']   = 'http://www.imdb.com';
// Prefix that marks an id as an IMDB id: prepended to ids returned by the parser,
// stripped again before a title URL is built
$GLOBALS['imdbIdPrefix'] = 'imdb:';

/**
 * Describe this engine
 *
 * @todo    Include image search capabilities etc in meta information
 * @return  array   Associative array with the engine's 'name' and its 'stable' flag
 */
function imdbMeta()
{
    $meta           = array();
    $meta['name']   = 'IMDB';
    $meta['stable'] = 1;

    return $meta;
}


/**
 * Get Url to search IMDB for a movie
 *
 * Builds the URL with the same "/find?q=" endpoint that imdbSearch() requests,
 * so the link shown to the user and the page parsed by the engine agree.
 *
 * @author  Andreas Goetz <hide@address.com>
 * @param   string  $title  The search string (url-encoded by this function)
 * @return  string          The search URL (GET)
 */
function imdbSearchUrl($title)
{
    global $imdbServer;

    return $imdbServer.'/find?q='.urlencode($title);
}

/**
 * Get Url to visit IMDB for a specific movie
 *
 * A leading engine prefix ($imdbIdPrefix) is removed from the id before the
 * URL is built; ids without the prefix are used as they are.
 *
 * @author  Andreas Goetz <hide@address.com>
 * @param   string  $id The movie's external id
 * @return  string      The visit URL
 */
function imdbContentUrl($id)
{
    global $imdbServer;
    global $imdbIdPrefix;

    // quote the prefix so it is matched literally even if it ever contains
    // regex metacharacters or the pattern delimiter
    $id = preg_replace('/^'.preg_quote($imdbIdPrefix, '/').'/', '', $id);

    return $imdbServer.'/title/tt'.$id.'/';
}

/**
 * Search a Movie
 *
 * Requests the IMDB find page for the given title and returns the hits.
 * If the request ended on a single title page, that title is the only hit;
 * otherwise every title link followed by "(year)" in the result page is a hit.
 * Request errors are appended to the global $CLIENTERROR.
 *
 * The result always carries an 'encoding' entry (from engine_get_encoding())
 * next to the numerically indexed hits. A direct hit has the keys 'id',
 * 'title' and 'subtitle'; a list hit has 'id', 'title' and 'year'.
 *
 * @author  Tiago Fonseca <hide@address.com>
 * @author  Charles Morgan <hide@address.com>
 * @param   string  title   The search string
 * @param   boolean aka     Use AKA search for foreign language titles
 * @return  array           'encoding' plus one associative array per hit
 */
function imdbSearch($title, $aka=null)
{
    global $imdbServer;
    global $imdbIdPrefix;
    global $CLIENTERROR;
    global $cache;

    $url = $imdbServer.'/find?q='.urlencode($title);
    if ($aka) $url .= ';s=tt;site=aka';

    $resp = httpClient($url, $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";

    // a failed request may come back without body or final URL; fall back to
    // empty strings so the matching below simply finds nothing
    $html     = isset($resp['data']) ? $resp['data'] : '';
    $finalUrl = isset($resp['url'])  ? $resp['url']  : '';

    $data = array();

    // add encoding
    $data['encoding'] = engine_get_encoding($resp);

    // direct match (redirecting to individual title)?
    if (preg_match('/^'.preg_quote($imdbServer,'/').'\/[Tt]itle(\?|\/tt)([0-9?]+)\/?/', $finalUrl, $single))
    {
        $info       = array();
        $info['id'] = $imdbIdPrefix.$single[2];

        // Title: text of the <title> tag in front of the "(year...)" suffix
        $fullTitle = '';
        if (preg_match('/<title>(.*?) \([1-2][0-9][0-9][0-9].*?\)<\/title>/i', $html, $m))
        {
            $fullTitle = trim($m[1]);
        }

        // explode() instead of split(), which no longer exists as of PHP 7;
        // padding yields an empty subtitle when the title has no ' - ' part
        list($t, $s)        = array_pad(explode(' - ', $fullTitle, 2), 2, '');
        $info['title']      = trim($t);
        $info['subtitle']   = trim($s);

        $data[]     = $info;
    }

    // multiple matches
    else if (preg_match_all('#<a href="/title/tt(\d+)/?".*?>(.+?)</a>\s*\(([0-9?]+)\)?#i', $html, $multi, PREG_SET_ORDER))
    {
        foreach ($multi as $row)
        {
            // fix for images in search results (re-apply search to title)
            if (preg_match('#<a href="/title/tt(\d+)/?".*?>(.+?)$#i', $row[2], $row2)) $row[2] = $row2[2];

            $info           = array();
            $info['id']     = $imdbIdPrefix.$row[1];
            $info['title']  = $row[2];
            $info['year']   = $row[3];
            $data[]         = $info;
        }
    }

    return $data;
}

/**
 * Returns the first capture group of a pattern
 *
 * Private helper of imdbData(): spares every field lookup its own check for
 * a failed match.
 *
 * @param   string  $pattern  PCRE pattern with at least one capture group
 * @param   string  $subject  Text to search in
 * @return  string            Untrimmed content of group 1, '' if nothing matched
 */
function _imdbFirstGroup($pattern, $subject)
{
    return (preg_match($pattern, $subject, $m) && isset($m[1])) ? $m[1] : '';
}

/**
 * Returns the first capture group of every match of a pattern
 *
 * Private helper of imdbData() for link lists (directors, countries, ...).
 *
 * @param   string  $pattern  PCRE pattern with at least one capture group
 * @param   string  $subject  Text to search in
 * @return  array             Untrimmed group 1 of each match, empty array if none
 */
function _imdbAllGroups($pattern, $subject)
{
    preg_match_all($pattern, $subject, $m, PREG_PATTERN_ORDER);
    return isset($m[1]) ? $m[1] : array();
}

/**
 * Fetches the data for a given IMDB-ID
 *
 * Loads the title's main page, its full credits page and its plot summary
 * page and extracts the fields with regular expressions. Request errors are
 * appended to the global $CLIENTERROR; fields that cannot be found come back
 * as empty strings.
 *
 * Keys always present: encoding, title, subtitle, year, coverurl, mpaa, bbfc,
 * runtime, director, rating, country, language, plot.
 * 'genres' (array) is only set when genres were found, 'cast' only when a cast
 * table was found. Each cast line has the form "actor::character::prefixed id".
 *
 * @author  Tiago Fonseca <hide@address.com>
 * @author  Victor La <hide@address.com>
 * @param   int   IMDB-ID (with or without the engine's id prefix)
 * @return  array Result data
 */
function imdbData($imdbID)
{
    global $imdbServer;
    global $imdbIdPrefix;
    global $CLIENTERROR;
    global $cache;

    // strip the engine prefix; quoted so it is always matched literally
    $imdbID = preg_replace('/^'.preg_quote($imdbIdPrefix, '/').'/', '', $imdbID);
    $data   = array(); // result

    // fetch mainpage
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/', $cache);     // added trailing / to avoid redirect
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $html = isset($resp['data']) ? $resp['data'] : '';

    // add encoding
    $data['encoding'] = engine_get_encoding($resp);

    // Titles: explode() instead of split(), which no longer exists as of PHP 7;
    // padding yields an empty subtitle when the title has no ' - ' part
    $fullTitle        = trim(_imdbFirstGroup('/<TITLE>(.*?) \([1-2][0-9][0-9][0-9].*?\)<\/TITLE>/i', $html));
    list($t, $s)      = array_pad(explode(' - ', $fullTitle, 2), 2, '');
    $data['title']    = html_clean($t);
    $data['subtitle'] = html_clean($s);

    // Year
    $data['year']     = trim(_imdbFirstGroup('/<A HREF="\/Sections\/Years\/[1-2][0-9][0-9][0-9]\/?">([1-2][0-9][0-9][0-9])<\/A>/i', $html));

    // Cover URL
    $data['coverurl'] = trim(_imdbFirstGroup('/name="poster".+?<IMG.+?(http:\/\/.+?\.(jpe?g|gif))/i', $html));

    // MPAA Rating
    $data['mpaa']     = trim(_imdbFirstGroup('/<A HREF="\/mpaa">MPAA<\/A>: ?<\/h5>(.+?)<\/div>/is', $html));

    // UK BBFC Rating
    $data['bbfc']     = trim(_imdbFirstGroup('/>\s*UK:(.*?)<\/a>\s+/s', $html));

    // Runtime (thousands separators removed)
    $data['runtime']  = str_replace(',', '', trim(_imdbFirstGroup('/Runtime:?<\/h5>:?.*?([0-9,]+).*?<\/TD>/si', $html)));

    // Director
    // TODO: Update templates to use multiple directors
    $section          = _imdbFirstGroup('/<h5>Directors?:<\/h5>(.+?)<\/div>/si', $html);
    $data['director'] = trim(join(', ', _imdbAllGroups('/<A HREF="\/Name[?\/].+?">(.+?)<\/A>/si', $section)));

    // Rating
    $data['rating']   = trim(_imdbFirstGroup('/<b>([0-9.]+)\/10<\/b>[^<]*<a href="ratings" class="tn15more">[0-9,]+ votes<\/a>/si', $html));

    // Countries
    $section          = _imdbFirstGroup('/<h5>Country:<\/h5>(.+?)<\/div>/si', $html);
    $data['country']  = trim(join(', ', _imdbAllGroups('/<A HREF="\/Sections\/Countries\/.+?\/">(.+?)<\/A>/si', $section)));

    // Languages (collected from the whole page, lower-cased)
    $data['language'] = trim(strtolower(join(', ', _imdbAllGroups('/<A HREF="\/Sections\/Languages\/.+?\/">(.+?)<\/A>/si', $html))));

    // Plot (movies in their early stages have the plot here but not yet in plotsummary?)
    $outline = trim(_imdbFirstGroup('/<h5>Plot Outline:<\/h5>\s+(.*?)<\/div>/si', $html));
    if ($outline !== '') $data['plot'] = $outline;

    // Genres (as Array)
    $section = _imdbFirstGroup('/<h5>Genre:<\/h5>(.+?)<\/div>/si', $html);
    foreach (_imdbAllGroups('/<A HREF="\/Sections\/Genres\/.+?\/">(.+?)<\/A>/si', $section) as $genre)
    {
        $data['genres'][] = trim($genre);
    }

    // Plot (simple- from main page); only replaces the outline when it was
    // actually found, so the outline stays available as fallback
    $plot = trim(_imdbFirstGroup('/<h5>Plot:<\/h5>(.+?)(\||<\/div>)/si', $html));
    if ($plot !== '' || !isset($data['plot'])) $data['plot'] = $plot;

    // Fetch credits
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/fullcredits', $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $html = isset($resp['data']) ? $resp['data'] : '';

    // Cast
    if (preg_match_all('/<table class="cast">(.*?)<\/table>/si', $html, $match))
    {
        $allcast = implode('', $match[1]);

        // start empty so a cast table without parsable rows yields an empty cast
        $cast = '';
        if (preg_match_all('#<td class="nm"><a href="/name/(.*?)/?">(.*?)</a>.*?<td class="char">(.*?)</td>#si', $allcast, $rows, PREG_SET_ORDER))
        {
            foreach ($rows as $row)
            {
                $actorid    = trim(strip_tags($row[1]));
                $actor      = trim(strip_tags($row[2]));
                $character  = trim(strip_tags($row[3]));
                $cast      .= $actor.'::'.$character.'::'.$imdbIdPrefix.$actorid."\n";
            }
        }

        // remove html entities and replace &nbsp; with simple space
        // (as stated for html_clean() by the original comment; helper not visible here)
        $data['cast'] = html_clean($cast);

        // sometimes appearing in series (e.g. Scrubs)
        // NOTE(review): the dots are unescaped, so any three characters between
        // "/ " and " " match - confirm whether a literal "..." was intended
        $data['cast'] = preg_replace('#/ ... #', '', $data['cast']);
    }

    // Fetch plot
    $resp = httpClient($imdbServer.'/title/tt'.$imdbID.'/plotsummary', $cache);
    if (!$resp['success']) $CLIENTERROR .= $resp['error']."\n";
    $html = isset($resp['data']) ? $resp['data'] : '';

    // Plot: the summary page wins over the main page when it has text
    $summary = trim(_imdbFirstGroup('/<P CLASS="plotpar">(.+?)<\/P>/is', $html));
    if ($summary !== '')
    {
        $summary = str_replace('&#34;', '"', $summary);     //Replace HTML " with "
        //Begin removal of 'Written by' section
        $summary = preg_replace('/<a href="\/SearchPlotWriters.*?<\/a>/', '', $summary);
        $summary = str_replace('Written by', '', $summary);
        $summary = preg_replace('/<i>\s+<\/i>/', ' ', $summary);
        //End of removal of 'Written by' section
        $summary = preg_replace('/\s+/s', ' ', $summary);

        $data['plot'] = $summary;
    }
    $data['plot'] = html_clean($data['plot']);

    return $data;
}

/**
 * Get Url to visit IMDB for a specific actor
 *
 * With an id the actor's page is addressed directly; without one the URL
 * is a name lookup for the entity-decoded actor name.
 *
 * @author  Michael Kollmann <hide@address.com>
 * @param   string  $name   The actor's name (only used when no id is given)
 * @param   string  $id     The actor's external id
 * @return  string          The visit URL
 */
function imdbActorUrl($name, $id)
{
    global $imdbServer;
    global $imdbIdPrefix;   // not read here; declaration kept from the original

    if ($id)
    {
        return $imdbServer.'/name/'.urlencode($id).'/';
    }

    $decodedName = html_entity_decode_all($name);
    return $imdbServer.'/Name?'.urlencode($decodedName);
}

/**
 * Parses Actor-Details
 *
 * Loads the actor's page (directly by id, otherwise through a name lookup)
 * and extracts the headshot link and image. If the first response is a
 * result list, the first link of the best available list is followed.
 * Not sure if this can be made a one-step process?
 *
 * @author                   Andreas Goetz <hide@address.com>
 * @param  string  $name     Name of the Actor (used when no id is given)
 * @param  string  $actorid  External id of the Actor, may be empty
 * @return array             Empty, or one entry array(headshot link, image URL)
 */
function imdbActor($name, $actorid)
{
    global $imdbServer;
    global $cache;

    // search directly by id or via name?
    if ($actorid)
    {
        $path = '/name/'.urlencode($actorid).'/';
    }
    else
    {
        $path = '/Name?'.urlencode($name);
    }
    $resp = httpClient($imdbServer.$path, $cache);

    // result list headings in order of preference; the first one present wins
    $listPatterns = array(
        '#<b>Popular Names</b>.+?<a\s+href="(.*?)">#i',
        '#<b>Names \(Exact Matches\)</b>.+?<a\s+href="(.*?)">#i',
        '#<b>Names \(Approx Matches\)</b>.+?<a\s+href="(.*?)">#i',
    );

    // if not direct match load best match
    foreach ($listPatterns as $pattern)
    {
        if (!preg_match($pattern, $resp['data'], $hit)) continue;

        // links without "http" in them are prefixed with the server
        $target = $hit[1];
        if (!preg_match('/http/i', $target)) $target = $imdbServer.$target;

        $resp = httpClient($target, true);
        break;
    }

    // now we should have loaded the best match
    $result = array();
    if (preg_match('/<a\s+name="headshot"\s+href="(.+?)">\s*<img\s+.*?src="(.*?)"/i', $resp['data'], $shot))
    {
        $result[] = array($shot[1], $shot[2]);
    }

    return $result;
}

?>
Return current item: VideoDB