Location: PHPKode > scripts > IMDB Parser Script > parser.class.php
<?php
/*
 * This class is a parser for IMDb movie/series/documentary/video game/episode names, ratings and aka titles.
 * Since IMDb has a huge database and this class is written in PHP, all the data can't be parsed at once.
 * Just set up a cron job to run every ~5 minutes and keep reading/storing the output of the run method.
 *
 * Usage example:
 * $imdb = new IMDB_Parser();
 * $output_array = $imdb->run();
 * //do something with $output_array, like printing it in a readable manner.
 * //eg. echo "<pre>".print_r($output_array,true)."</pre>";
 * //Use $imdb->current_action to tell which kind of data the output holds: names, aka titles or ratings.
 *
 * Author: R. Beltran (hide@address.com)
 *
 * Copyleft, no rights reserved. Use at your own risk.
 * IMDB content is copyrighted and you must ask for permission before publishing any of the parsed content.
 */

/**
 * Incremental parser for the IMDB plain-text list files.
 *
 * Each call to run() performs exactly one step of a six-step cycle and
 * persists its progress in "<temp_dir>status" (format "<last_action>#<seek_pointer>"):
 *   - even last_action (0, 2, 4): download and gunzip the matching *.gz file;
 *   - odd last_action (1, 3, 5): read the next chunk of the unpacked file and parse it.
 */
class IMDB_Parser {

    /** @var string Working directory, with trailing slash. Must have read/write permission. */
    public $temp_dir = './temp/';

    /** @var string Base URL the *.gz list files are downloaded from. */
    public $source_url = 'ftp://ftp.fu-berlin.de/misc/movies/database/';

    /** @var int Approximate number of lines parsed on each execution. */
    public $stream_lines = 20000;

    /** @var int Index into $file of the step to execute (restored from the status file). */
    private $last_action;

    /**
     * @var int last_action + 1 at the start of run().
     *          2 = movies.list, 4 = aka-titles.list, 6 = ratings.list; odd values are download steps.
     */
    public $current_action;

    /** @var int Byte offset at which the next chunk of the current list file starts. */
    private $seek_pointer;

    /** @var array Files handled by each step: every archive is followed by its unpacked name. */
    private $file = array('movies.list.gz', 'movies.list', 'aka-titles.list.gz', 'aka-titles.list', 'ratings.list.gz', 'ratings.list');

    /**
     * Executes the next pending step and stores the updated progress.
     *
     * @return array Parsed records for a parsing step; an empty array for a download step.
     * @throws Exception When a download/unpack step fails or every file has already been processed.
     */
    public function run() {

        $this->reload_status();

        // After the last file hits EOF last_action runs past the end of $file;
        // report that explicitly instead of failing on an undefined index.
        if ($this->last_action >= count($this->file)) {
            throw new Exception('All IMDB list files have already been processed; delete '.$this->temp_dir.'status to start over.');
        }

        $this->current_action = $this->last_action + 1;
        $r = array();

        if (($this->last_action % 2) == 0) {
            $this->fetch_file($this->file[$this->last_action]);
            $this->last_action++;
        } else {
            $stream = $this->stream_file($this->file[$this->last_action]);
            $r = $this->parse_stream($stream);
        }

        $this->save_status();

        return $r;
    }

    /**
     * Downloads a gzipped list file into temp_dir and unpacks it in place.
     *
     * @param string $filename Archive name; must end in ".gz".
     * @return string The name of the unpacked file (the archive name without ".gz").
     * @throws Exception When $filename is not a *.gz name or gzip reports a failure.
     */
    private function fetch_file($filename) {

        if (substr($filename, -3) != '.gz') {
            throw new Exception("File $filename != *.gz");
        }

        $target = $this->temp_dir.$filename;

        // Every argument is escaped so unusual characters in the configured
        // directory or URL cannot alter the shell command.
        @exec('wget -q '.escapeshellarg($this->source_url.$filename).' -O '.escapeshellarg($target));

        // Short pause carried over from the original implementation
        // (presumably to let the download settle before unpacking).
        sleep(1);

        // Unpack inside temp_dir (not a hard-coded ./temp/) and capture stderr,
        // because gzip reports its errors there and not on stdout.
        $output = array();
        $status = 0;
        exec('gzip -d '.escapeshellarg($target).' 2>&1', $output, $status);

        if ($status !== 0) {
            throw new Exception("File $filename not in gz format: ".implode(' ', $output));
        }

        return substr($filename, 0, -3);
    }

    /**
     * Reads the next chunk of a list file, starting at the stored seek pointer.
     *
     * Reading stops shortly after stream_lines lines, preferably on a short
     * (near-empty) line so that a record group is not cut in half. On EOF the
     * seek pointer is reset and last_action advances to the next step;
     * otherwise the current offset is remembered for the next run.
     *
     * @param string $filename Name of the unpacked list file inside temp_dir.
     * @return string The chunk that was read; an empty string when the file cannot be opened.
     */
    private function stream_file($filename) {

        $stream = '';
        $fh = fopen($this->temp_dir.$filename, 'r');

        if ($fh) {

            fseek($fh, $this->seek_pointer);

            $lines = 0;
            while (!feof($fh)) {

                $current = fgets($fh, 999);
                if ($current === false) break; // nothing left to read (EOF or read error)

                $lines++;
                $stream .= $current;

                // Stop once past the line budget: either on a short line (a
                // suitable place to break) or at the latest 10 lines over it.
                if ($lines > $this->stream_lines && (strlen($current) < 5 || $lines > ($this->stream_lines + 10))) break;
            }

            if (feof($fh)) {
                // Whole file consumed: move on to the next step.
                $this->seek_pointer = 0;
                $this->last_action++;
            } else {
                $this->seek_pointer = ftell($fh);
            }

            fclose($fh);
        }

        return $stream;
    }

    /**
     * Extracts records from a chunk according to current_action.
     *
     * @param string $stream Raw chunk of the list file.
     * @return array For actions 2 and 6: preg_match_all() matches in PREG_SET_ORDER.
     *               For action 4: one list per title block, holding the match for the
     *               title line followed by the matches for its "(aka ...)" lines.
     *               Empty when nothing matched or the action is not a parsing step.
     */
    private function parse_stream(&$stream) {

        $r = array();

        switch ($this->current_action) {

            case 2: //movies.list

                preg_match_all("/^(.*?)(?:.\(([\d]+?)[\/I^\(]*?\))(?:.\((.*?)\))?/m", $stream, $r, PREG_SET_ORDER);
                break;

            case 4: //aka-titles.list

                // Title blocks are separated by blank lines.
                foreach (explode("\n\n", $stream) as $row) {
                    $r2 = array();
                    foreach (explode("\n", $row) as $key => $line) {
                        if ($key === 0) {
                            // The first line of a block must be the title itself; skip the block otherwise.
                            if (!preg_match("/^(.+?)(?:.\((\d+).*?\)(?:.\((.+?)\))?)/", $line, $matches)) continue 2;
                            $r2[] = $matches;
                        } elseif (preg_match("/.*?\(aka.(.*?)(?:.\()(\d+).*?\)\)(?:.*?\((.*?)\))?/", $line, $matches)) {
                            $r2[] = $matches;
                        }
                    }

                    $r[] = $r2;
                }

                break;

            case 6: //ratings.list

                preg_match_all("/^.*?([\d\.]+).*?(\d+).*?([\d\.]+)..(.*?)(?:.\((\d+)[\/I]*\))(?:.\((.*?)\))?(?:.\{(.*?)(?:.\(.([\d\.]+)))?/m", $stream, $r, PREG_SET_ORDER);
                break;

        }

        return $r;
    }

    /**
     * Restores last_action and seek_pointer from the status file.
     *
     * A missing, unreadable or incomplete status file means a fresh start (0#0).
     *
     * @return bool Always true.
     */
    private function reload_status() {

        $path = $this->temp_dir.'status';
        $data = is_file($path) ? file_get_contents($path) : false;
        $e = ($data === false) ? array() : explode('#', trim($data));

        // Cast to int so the arithmetic in run() never sees a non-numeric string,
        // and clamp at zero so a corrupted file cannot yield negative offsets.
        $this->last_action = isset($e[0]) ? max(0, (int) $e[0]) : 0;
        $this->seek_pointer = isset($e[1]) ? max(0, (int) $e[1]) : 0;

        return true;
    }

    /**
     * Persists last_action and seek_pointer as "<last_action>#<seek_pointer>".
     */
    private function save_status() {

        file_put_contents($this->temp_dir.'status', $this->last_action.'#'.$this->seek_pointer);
    }

}

?>
Return current item: IMDB Parser Script