Location: PHPKode > projects > Obsessive Website Statistics > ows/scripts/upload_log.php
#!/usr/bin/php
<?php
/*
	$Id: upload_log.php 107 2007-09-14 16:26:46Z randomperson83 $

	Obsessive Web Statistics
    Copyright (C) 2007 Dustin Spicuzza <hide@address.com>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
	
	This script updates the database with log file data. At one time, this was simple. 
	Then options and (possible) robustness were added, so it probably needs to be reworked.
	
	
*/

	global $cfg;
	
	$base = realpath(dirname(__FILE__) . '/../include/');
	require "$base/base.inc.php";
	require "$base/apache_log_parser.php";
	require "$base/plugin.inc.php";
	require "$base/analysis.inc.php";
	require "$base/dimensions.inc.php";
	
	require_cli();

	// Expected invocation: filename domain [debug].
	// Print the usage text and stop when fewer than two real arguments were
	// supplied, or when the first argument is one of the help switches.
	$help_switches = array('--help', '-help', '-h', '-?');
	$show_usage = true;
	if ($argc >= 3)
		$show_usage = in_array($argv[1], $help_switches);

	if ($show_usage){
		$usage_text =
"Usage: $argv[0] [-q] filename domain [debug]

This program uploads log files into the OWS database. You need to provide the filename to parse (it can either be a text file or a gzipped file), and the domain to upload it for. It will display output every 10000 lines parsed, but it may take awhile to upload the files if you have giant log files. You may want to schedule this with cron.

If filename is specified as -, the program will read the logfile data from STDIN (this can be slow for large files)

If the third argument is debug, then the program will show some useful debugging information and exit. This can be useful if it is telling you that all the lines are bad.
";
		echo wordwrap($usage_text);
		exit;
	}
	
	echo "\n";

	// positional arguments: log file first, website (domain) second
	$file = $argv[1];
	$website = $argv[2];

	// the website has to pass validation before anything else happens
	if (!validate_website_table($website))
		exit;

	// the fact table is named after the domain, with dots turned into underscores
	$fact_table = strtr($website, '.', '_');
	$s_fact_table = db_escape_string($fact_table);
	$options = get_website_options($website);

	// load the plugins up front: if loading fails we want to stop before any
	// information is inserted into the database. $plugins is not used further
	// down in this script, but the lookup is still performed at this point.
	if (!load_plugins(''))
		exit;
	$plugins = get_plugins('analysis');
	
	// take the analysis lock for this website; stop when Locked() reports true
	// NOTE(review): presumably that means another run holds the lock -- confirm
	// against AnalysisLock
	$lock = new AnalysisLock($website);
	if ($lock->Locked())
		exit;

	// build the dimension set and hand it to a fresh Analysis instance
	$dimensions = compile_dimensions(false);
	if ($dimensions === false)
		exit;

	$analysis = new Analysis();
	if (!$analysis->Initialize($website,$dimensions))
		exit;

	// line parser built from the log format configured for this website
	$apache_log = new ApacheLogRegex($options['log_format']);

	// an optional third argument of "debug" switches to dump-and-exit mode
	$debug = ($argc > 3 && $argv[3] == 'debug');

	if ($debug){
		// print the configured format, the regex derived from it and the
		// field names it captures, then stop without reading the log file
		echo "\n==== Debug Info ====\nLog Format:\n\n$options[log_format]\n\nUsing regular expression: \n\n" . $apache_log->regex() . "\n\nMatching fields (case-sensitive fields present in the log_format string):\n\n";
		print_r($apache_log->names());
		echo "\n====\n\n";
		exit;
	}
	

	
	// NOTE(review): $existing_lines is not read anywhere else in this script;
	// kept in case included code expects the global to exist -- confirm
	$existing_lines = 0;

	// Resolve the input: "-" means read the log from STDIN, anything else
	// has to be an existing path.
	if ($file == '-')
		$file = 'php://stdin';
	else if (!file_exists($file))
		die("*** File $file does not exist, foo!\n");

	// gzopen() is used for both plain-text and gzipped logs (see usage text)
	if (($hfile = gzopen($file,'r')) === false)
		die("*** Could not open file $file for reading!\n");

	// find how many lines we already read, for information purposes;
	// MAX(id) is fetched alongside so the resume logic can tell whether
	// the fact table holds any previous data
	$result = db_query("SELECT COUNT(id),MAX(id) FROM $s_fact_table");
	if (!db_has_rows($result))
		die("*** Error retrieving record count!\n");

	list($num_lines,$num_last) = db_fetch_row($result);

	// normalise missing values (e.g. MAX(id) on an empty table) to 0
	if ($num_lines == null)
		$num_lines = 0;
	if ($num_last == null)
		$num_last = 0;

	echo ">>> $num_lines existing lines for $fact_table... \n";

	/*
		If it recognizes the first line, then it assumes that the file is
		the same file it read before, and seeks to the place it left off. 
		It then verifies the last entry that it just sought to, to ensure that
		nothing went wrong, and then uploads the remaining lines into the database.
		
		If it does not recognize the first line, then it assumes the file 
		is either a new file or that the file somehow had the beginning of it
		chopped off. So it keeps reading lines until it finds a line newer than
		the last one in the database.
	*/
	
	// dont move these: both are handed to analyzeFile() further down and must
	// exist even when the resume logic below is skipped
	$first_read_line = null;
	$rows = array();

	// Three per-website config values carry the resume state between runs:
	// uploadlog_first_line, uploadlog_last_line and uploadlog_filepos.
	// filepos is the value of the last gztell read, not including the final
	// line, so the first line read after gzseeking there should be the last line.
	$is_new_file = true;

	if ($num_lines == 0 || $num_last == 0)
		echo "No previous data found. Uploading from beginning...\n";
	else {

		// grab information that we retained previously, we can use these for comparison
		$first_db_line = get_config_var($website,'uploadlog_first_line');
		$last_db_line = get_config_var($website,'uploadlog_last_line');

		// keep the raw value around: intval() turns a missing (null) entry
		// into 0, which would make the "no previous file information" check
		// below impossible to trigger
		$raw_db_filepos = get_config_var($website,'uploadlog_filepos');
		$db_filepos = intval($raw_db_filepos);

		// we cant allow this
		if ($last_db_line === null)
			die ("!!! ERROR: database state is inconsistant! No last line found in database!\n");

		// grab a timestamp from the last_db_line
		if (($parsed_line = $apache_log->parse(trim($last_db_line))) === null)
			die("*** Last line in config db was invalid!\n");

		if (($last_db_time = strtotime(str_replace('/',' ',$parsed_line['Date']) . ' ' . $parsed_line['Time'])) === false)
			die("*** Last line in config db did not contain a valid time!\n");


		if ($file == 'php://stdin')
			echo "!!! STDIN detected. Scanning entire input.\n";
		else if ($first_db_line === null || $raw_db_filepos === null)
			echo ">>> No previous file information found. Assuming new file...\n";
		else {

			// so grab the first line from the file, and compare it to our stored value
			if (($line = gzgets($hfile,65535)) === false){
				if (gzeof($hfile))
					die("*** Warning: reached end of file when reading first line.\n");
				else
					die("*** Error reading logfile when reading first line.\n");
			}

			// set this, doesn't matter whether its good or bad
			$first_read_line = $line;

			if (trim($line) == trim($first_db_line)){

				// ok, so far so good.. jump to where the previous run stopped
				gzseek($hfile,$db_filepos);

				// now, read the line and see if it matches the last line we read
				if (($line = gzgets($hfile,65535)) === false){
					if (gzeof($hfile))
						die("*** Reached end of logfile when seeking for new position.\n");
					else
						die("*** Error reading logfile when seeking for new position.\n");
				}

				// does it match?
				if (trim($line) == trim($last_db_line)){

					// A textual match is treated as good enough; no further
					// field-by-field sanity check against the database is done.
					$is_new_file = false;

					echo ">>> Successfully found last read line, now adding new lines to DB\n";
				}
			}
		}

		// rewind back to the beginning, and try and search for the oldest line that corresponds with
		// the SQL database.. 
		if ($is_new_file){

			// STDIN has not been read from yet on this path, so only real
			// files are rewound
			if ($file != 'php://stdin'){
				echo "!!! Could not identify logfile from previous data, scanning from the beginning...\n";

				if (gzrewind($hfile) === false)
					die("*** Error: could not move file pointer to beginning!\n");
			}

			// NOTE(review): updated below but not read anywhere in this script
			$last_read_tell = 0;

			// keep reading a line until we find a newer line, or possibly the same line
			while (!gzeof($hfile)){

				if (($line = gzgets($hfile,65535)) === false){

					// if the file isn't compressed, then the last line returns a false
					if (gzeof($hfile))
						break;
					die("*** While searching for new lines, an error occurred reading a line from $file for some reason\n");
				}

				$line = trim($line);

				// set this, doesn't matter whether its good or bad
				if ($first_read_line == null)
					$first_read_line = $line;

				// lines the parser rejects (null) are skipped silently
				if (($parsed_line = $apache_log->parse($line)) !== null){

					if (trim($last_db_line) == $line){

						// found the existing line
						echo ">>> Found last line from DB in logfile! Continuing from that point\n";
						break;

					}else if (strtotime(str_replace('/',' ',$parsed_line['Date']) . ' ' . $parsed_line['Time']) > $last_db_time){

						// found something newer, didn't find the last line.. oh well
						echo "!!! Warning: found a line with a new date that does not exist in DB!\n";

						// add the line so we dont have to rewind backwards
						$rows[] = $parsed_line;

						break;

					}

					// last line that wasn't an error.. 
					$last_read_tell = gztell($hfile);
				}
			}
		}
	}
	
	// the stream is now positioned where new data starts: run the analysis,
	// passing along any row already parsed while searching and the first
	// line that was read from the file
	$analysis->analyzeFile($website, $hfile, $rows, $first_read_line, $apache_log);

	gzclose($hfile);

	// closing banner with the finish time
	$ended_at = date("D M j G:i:s T Y");
	echo "\n\n>>> Ended at " . $ended_at . "\n\n";


?>
Return current item: Obsessive Website Statistics