<?php
/*
* This class is a parser for IMDb movie/series/documentary/videogame/episode names, ratings and aka titles.
* Since IMDb has a huge database and this class is written in PHP, the data cannot all be parsed in a single run.
* Set up a cron job that calls run() every ~5 minutes and read/store its output each time.
*
* Usage example:
* $imdb = new IMDB_Parser();
* $output_array = $imdb->run();
* //do something with $output_array, like printing it in a readable manner.
* //eg. echo "<pre>".print_r($output_array,true)."</pre>";
* //Use $imdb->current_action to tell what kind of data the output holds: names, aka titles or ratings.
*
* Author: R. Beltran (hide@address.com)
*
* Copyleft, no rights reserved. Use at your own risk.
* IMDB content is copyrighted and you must ask for permission before publishing any of the parsed content.
*/
/**
 * Incremental parser for the IMDB plain-text list files (movies, aka titles, ratings).
 *
 * Each call to run() performs exactly one step and persists its progress in
 * "<temp_dir>status" (format "<last_action>#<seek_pointer>"), so the whole data
 * set can be processed across many short executions:
 *  - even steps (0, 2, 4) download and decompress one *.gz list file;
 *  - odd steps (1, 3, 5) read roughly $stream_lines lines of the decompressed
 *    file, starting at the saved offset, and return the parsed matches.
 */
class IMDB_Parser {
    /** @var string Working directory (with trailing slash). Must be readable and writable. */
    public $temp_dir = './temp/';

    /** @var string Base URL the list files are downloaded from. */
    public $source_url = 'ftp://ftp.fu-berlin.de/misc/movies/database/';

    /** @var int Approximate number of lines parsed on each execution. */
    public $stream_lines = 20000;

    /** @var int Index into $file of the step to perform next. */
    private $last_action = 0;

    /** @var int Set by run() to last_action + 1: 2 = movies.list, 4 = aka-titles.list, 6 = ratings.list. */
    public $current_action;

    /** @var int Byte offset in the current list file where the next read starts. */
    private $seek_pointer = 0;

    /** @var array Step table: each archive to download is followed by the file to parse. */
    private $file = array('movies.list.gz', 'movies.list', 'aka-titles.list.gz', 'aka-titles.list', 'ratings.list.gz', 'ratings.list');

    /**
     * Performs the next pending step and saves the updated progress.
     *
     * @return array Parsed matches for a parsing step; an empty array for a download step.
     * @throws Exception When every file has already been processed, when a download or
     *                   decompression fails, or when the progress cannot be saved.
     */
    public function run() {
        $this->reload_status();

        // Past the end of the step table there is nothing left to do. Fail with a
        // clear message instead of indexing a non-existent entry.
        if (!isset($this->file[$this->last_action])) {
            throw new Exception('All IMDB list files have already been processed (status '.$this->last_action.'); reset the status file to start over.');
        }

        $this->current_action = $this->last_action + 1;
        $r = array();

        if (($this->last_action % 2) == 0) {
            $this->fetch_file($this->file[$this->last_action]);
            $this->last_action++;
        } else {
            // stream_file() advances last_action itself once the file is exhausted.
            $stream = $this->stream_file($this->file[$this->last_action]);
            $r = $this->parse_stream($stream);
        }

        $this->save_status();
        return $r;
    }

    /**
     * Downloads a gzip archive into the working directory and decompresses it in place.
     *
     * @param string $filename Archive name; must end in ".gz".
     * @return string Name of the decompressed file (the archive name without ".gz").
     * @throws Exception When the name is not *.gz, or when wget or gzip report a failure.
     */
    private function fetch_file($filename) {
        if (substr($filename, -3) != '.gz') {
            throw new Exception("File $filename != *.gz");
        }

        $target = $this->temp_dir.$filename;

        $output = array();
        $status = 0;
        exec('wget -q '.escapeshellarg($this->source_url.$filename).' -O '.escapeshellarg($target), $output, $status);
        if ($status !== 0) {
            throw new Exception("File $filename could not be downloaded (wget exit code $status)");
        }

        sleep(1);

        // gzip reports problems on stderr and through its exit code, so capture both.
        // -f lets a fresh download replace a stale decompressed copy.
        $output = array();
        $status = 0;
        exec('gzip -d -f '.escapeshellarg($target).' 2>&1', $output, $status);
        if ($status !== 0 || !empty($output)) {
            throw new Exception("File $filename not in gz format: ".implode("\n", $output));
        }

        return substr($filename, 0, -3);
    }

    /**
     * Reads the next chunk of a list file, starting at the saved byte offset.
     *
     * Reading stops once more than $stream_lines lines were read and either a short
     * line (fewer than 5 bytes) is hit or 10 extra lines were consumed. When the end
     * of the file is reached the offset is reset and the step counter is advanced;
     * otherwise the new offset is remembered for the next execution.
     *
     * @param string $filename File name relative to the working directory.
     * @return string The chunk that was read; an empty string if the file cannot be opened.
     */
    private function stream_file($filename) {
        $stream = '';

        $fh = fopen($this->temp_dir.$filename, "r");
        if ($fh === false) {
            // Progress is left untouched so the same step is retried next time.
            return $stream;
        }

        fseek($fh, (int) $this->seek_pointer);
        $lines = 0;
        while (!feof($fh)) {
            $current = fgets($fh, 999);
            if ($current === false) {
                // Nothing more could be read (normally the end of the file).
                break;
            }
            $lines++;
            $stream .= $current;
            //break if we're too far past the max lines allowed or if we found a suitable place to break.
            if ($lines > $this->stream_lines && (strlen($current) < 5 || $lines > ($this->stream_lines + 10))) {
                break;
            }
        }

        if (feof($fh)) {
            $this->seek_pointer = 0;
            $this->last_action++;
        } else {
            $position = ftell($fh);
            if ($position !== false) {
                $this->seek_pointer = $position;
            }
        }
        fclose($fh);

        return $stream;
    }

    /**
     * Extracts the records from a chunk of text according to $current_action.
     *
     * @param string $stream Raw text as returned by stream_file().
     * @return array For actions 2 and 6 the preg_match_all() sets (PREG_SET_ORDER);
     *               for action 4 one entry per title block, holding the title match
     *               followed by its "(aka ...)" matches. Empty when nothing matched
     *               or the action is not a parsing step.
     */
    private function parse_stream($stream) {
        $r = array();

        switch ($this->current_action) {
            case 2: //movies.list
                if (preg_match_all("/^(.*?)(?:.\(([\d]+?)[\/I^\(]*?\))(?:.\((.*?)\))?/m", $stream, $r, PREG_SET_ORDER) === false) {
                    $r = array();
                }
                break;

            case 4: //aka-titles.list
                // Title blocks are separated by blank lines; the first line of a block
                // is the title, the following lines are its alternative titles.
                foreach (explode("\n\n", $stream) as $row) {
                    $r2 = array();
                    foreach (explode("\n", $row) as $key => $line) {
                        if ($key == 0) {
                            if (preg_match("/^(.+?)(?:.\((\d+).*?\)(?:.\((.+?)\))?)/", $line, $matches)) {
                                $r2[] = $matches;
                            } else {
                                // Not a title block at all: skip the whole block.
                                continue 2;
                            }
                        } elseif (preg_match("/.*?\(aka.(.*?)(?:.\()(\d+).*?\)\)(?:.*?\((.*?)\))?/", $line, $matches)) {
                            $r2[] = $matches;
                        }
                    }
                    $r[] = $r2;
                }
                break;

            case 6: //ratings.list
                if (preg_match_all("/^.*?([\d\.]+).*?(\d+).*?([\d\.]+)..(.*?)(?:.\((\d+)[\/I]*\))(?:.\((.*?)\))?(?:.\{(.*?)(?:.\(.([\d\.]+)))?/m", $stream, $r, PREG_SET_ORDER) === false) {
                    $r = array();
                }
                break;
        }

        return $r;
    }

    /**
     * Loads the progress saved by a previous execution.
     *
     * A missing, unreadable or incomplete status file means "start from the
     * beginning" (step 0, offset 0).
     *
     * @return bool Always true.
     */
    private function reload_status() {
        $this->last_action = 0;
        $this->seek_pointer = 0;

        $path = $this->temp_dir.'status';
        if (!is_file($path)) {
            return true;
        }

        $data = file_get_contents($path);
        if ($data === false) {
            return true;
        }

        $e = explode('#', trim($data));
        $this->last_action = (int) $e[0];
        $this->seek_pointer = isset($e[1]) ? max(0, (int) $e[1]) : 0;
        return true;
    }

    /**
     * Persists the current progress as "<last_action>#<seek_pointer>".
     *
     * @throws Exception When the status file cannot be written.
     */
    private function save_status() {
        $written = file_put_contents($this->temp_dir.'status', $this->last_action.'#'.$this->seek_pointer);
        if ($written === false) {
            throw new Exception('Could not write status file in '.$this->temp_dir);
        }
    }
}
?>