Location: PHPKode > projects > OpenBizMap > openbizmap/mydata/script/odb_dedup.txt
<?php
/*
OpenDataBag - Data Web Interface
Copyright (C) 2004 Nawara

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
*/

// INIT  ---------------------------------------------
// No execution time limit: the loop further down runs until check_version() stops it.
set_time_limit(0);

// Helper scripts; the cfg_data_path constant must already be defined by the caller.
include cfg_data_path.'/script/inc_functions.txt';
include cfg_data_path.'/script/inc_data.txt';
include cfg_data_path.'/script/sec_basic.txt';

// Settings consumed by the main loop.
$similarity=92;          // passed to find_similar() as the similarity threshold
$check_range_days=300;   // only records whose sysValidFrom is newer than this many days are scanned
$replace=0;              // non-zero: apply the active dedup rules to the data files
$find=1;                 // non-zero: search for new rule candidates
$sleep=1;                // non-zero: pause between work steps
$test=0;                 // non-zero: process a single fixed data file only

// DEDUP  ---------------------------------------------
// Run identifier: timestamp followed by a three-digit random suffix.
$deamonid=date('Ymd His ');
$deamonid.=rand(100,999);
// Re-assert the unlimited time limit after the includes have run.
set_time_limit(0);
echo INFO, ' ', $deamonid, CRLF;

// Print the effective settings, one per line.
echo CRLF,
     ' sleep=', $sleep, CRLF,
     ' find=', $find, CRLF,
     ' replace=', $replace, CRLF,
     ' check range days=', $check_range_days, CRLF,
     ' similarity=', $similarity, CRLF;

echo CRLF, 'working...';


// DEDUP MAIN LOOP  ---------------------------------------------
// Runs for as long as check_version() (provided by the included scripts)
// returns a truthy value. Each pass:
//   1. collects the data/dat_*.txt files (or one fixed file when $test is set),
//   2. when $replace is set, applies the active dedup rules to every
//      currently valid record and saves the changed records,
//   3. when $find is set, asks find_similar() for near-duplicates of the
//      configured variable groups ($cfg_dedup_array) and saves a new,
//      inactive rule for each candidate that search() does not already find.
// A record counts as "currently valid" when sysValidTo is 99991231,
// sysValid is 1 and sysValidFrom is not later than today.
// NOTE(review): read(), search(), save(), line2array() and find_similar()
// are defined outside this file; their exact contracts should be confirmed.
while(check_version())
{
  // Rebuild the list on every pass so files removed since the previous
  // pass are not reopened.
  $file_array=array();

  if($test)
    $file_array[cfg_data_path.'/data/dat_chsaab212.txt']=1;
  else
  {
    $res=opendir(cfg_data_path.'/data');
    if($res!==false)
    {
      // Strict comparison: a directory entry named "0" must not end the scan.
      while(($fl=readdir($res))!==false)
        if(substr($fl,0,4)=='dat_' and $fl!='dat_.txt')
          $file_array[cfg_data_path.'/data/'.$fl]=1;
      closedir($res);
    }
  }

  // Nothing to process: wait instead of spinning on check_version().
  if(count($file_array)==0)
  {
    if($sleep) sleep(5);
    continue;
  }

  $replace_array=array();

  foreach($file_array as $dat_file=>$tmp)
  {
    // Bag name = file name without the "dat_" prefix and the 4-character extension.
    $curr_bag=substr(basename($dat_file),4,-4);

    //replacing by rules ---------------------------------------------------
    if($replace)
    {
      // Rules are reloaded for every file; start from an empty list so
      // entries from a previous, longer rule set cannot linger.
      $replace_array=array();
      $dedup_active_rules_array=search('sysGroupBy=dedup Active=1 MyBag=dedup'); //.$curr_bag
      if(is_array($dedup_active_rules_array))
      {
        foreach($dedup_active_rules_array as $dedup_active_rule_line)
        {
          if(read($dedup_active_rule_line,'Active'))
          {
            $replace_array[]=array(
              'sysId'=>read($dedup_active_rule_line,'sysId'),
              'search'=>read($dedup_active_rule_line,'Change Filter'),
              'from'=>read($dedup_active_rule_line,'Change From'),
              'to'=>read($dedup_active_rule_line,'Change To')
            );
          }
        }
      }

      $fp=fopen($dat_file,'rb');
      if($fp!==false)
      {
        $today=date('Ymd');

        // fgets() returns false at end of file, which ends the loop without
        // processing a bogus final "line".
        while(($line=fgets($fp,2048))!==false)
        {
          $changed=0;
          $changes='';

          if(read($line,'sysValidTo')=='99991231' and read($line,'sysValid')=='1' and read($line,'sysValidFrom')<=$today)
          {
            foreach($replace_array as $rule)
            {
              // A rule with an empty filter or empty source text never matched
              // on PHP < 8 (stristr() rejected an empty needle); skip such
              // rules explicitly so the result does not depend on the PHP version.
              if((string)$rule['search']==='' or (string)$rule['from']==='')
                continue;

              // The record must contain the filter text and the text to change
              // (both case-insensitive), and must not itself be a dedup rule.
              if(stristr($line,$rule['search'])===false)
                continue;
              if(stristr($line,$rule['from'])===false)
                continue;
              if(stristr($line,'sysGroupBy=dedup')!==false)
                continue;

              // str_replace() is case-sensitive, so the line may stay unchanged
              // even though the case-insensitive checks above matched.
              $line1=str_replace($rule['from'],$rule['to'],$line);
              if($line1!==$line)
              {
                $line=$line1;
                $changes.=' '.$rule['sysId'];
                $changed=1;
              }
            }

            if($changed)
            {
              $variable_array=line2array($line);
              $variable_array['sysComment']='Dedup fix:'.$changes;
              $variable_array['sysModInfo']=date('Ymd H:i').' Dedup Deamon';
              save($variable_array);
              //echo CRLF.' REPLACED '.read($line,'sysId').' '.$changes;
              echo CRLF.CRLF.' FIX bag='.$curr_bag.' rule='.read($line,'sysId').' changes='.$changes;
              if($sleep) sleep(1);
            }

            // Occasional extra pause (about 1 in 200 valid records).
            if(rand(0,200)<1)
              if($sleep) sleep(1);
          }
          //echo '/'; ob_flush();
        }
        fclose($fp);
      }
    }
    //replacing by rules ---------------------------------------------------

    //echo CRLF.'BAG:'.$curr_bag;

    foreach($cfg_dedup_array as $config_line)
    {
      // Each config line is a comma-separated list of variable names that
      // together form one dedup pattern.
      $pattern_variable_array=array();
      foreach(explode(',',$config_line) as $variable_name)
      {
        $variable_name=trim($variable_name);
        if(strlen($variable_name))
          $pattern_variable_array[$variable_name]='';
      }

      //echo CRLF.' RULE:'.$config_line;

      //finding rules ---------------------------------------------------
      if($find)
      {
        $fp=fopen($dat_file,'rb');
        if($fp!==false)
        {
          // Date bounds are the same for every line of the file: compute once.
          $today=date('Ymd');
          $range_start=date('Ymd',mktime(0,0,0,(int)date('m'),(int)date('d')-$check_range_days,(int)date('Y')));

          while(($line=fgets($fp,2048))!==false)
          {
            if(read($line,'sysValidTo')=='99991231' and read($line,'sysValid')=='1' and read($line,'sysValidFrom')<=$today and read($line,'sysValidFrom')>$range_start)
            {
              $result_array=array();
              $variable_array=array();

              // Collect the pattern's variables from this record; give up on
              // the record as soon as one of them is empty.
              foreach(array_keys($pattern_variable_array) as $variable_name)
              {
                $variable_array[$variable_name]=read($line,$variable_name);

                if(strlen((string)$variable_array[$variable_name])==0)
                {
                  $variable_array=array();
                  break;
                }
              }

              if(count($variable_array)==count($pattern_variable_array))
              {
                $result_array=find_similar($variable_array,60,$similarity,1,$curr_bag,$sleep,read($line,'sysId'));
                //echo '.'; ob_flush();
              }

              if(is_array($result_array) and count($result_array))
              {
                if($sleep) sleep(2);

                // Each result key is read as a space-separated sequence of
                // (pattern word, candidate word) pairs; the value is the
                // similarity figure printed with a percent sign.
                foreach($result_array as $replacement=>$sim)
                {
                  $replacement_array=explode(' ',$replacement);
                  $replacement_left='';
                  $replacement_right='';
                  $filter='';
                  $token_count=count($replacement_array);
                  for($ii=0;$ii<$token_count;$ii+=2)
                  {
                    $pattern=(string)$replacement_array[$ii];
                    // An odd number of tokens leaves the last pair incomplete.
                    $word=isset($replacement_array[$ii+1])?(string)$replacement_array[$ii+1]:'';

                    // Strict comparison so numeric-looking tokens such as
                    // "01" and "1" are not treated as the same word.
                    if($pattern===$word)
                      $filter.=$pattern.' ';
                    else
                    {
                      $replacement_left.=$pattern.' ';
                      $replacement_right.=$word.' ';
                    }
                  }

                  // New rules are stored inactive (Active=0).
                  $replacement_variable_array=array();
                  $replacement_variable_array['sysGroupBy']='dedup';
                  $replacement_variable_array['sysId']='';
                  $replacement_variable_array['Active']='0';
                  $replacement_variable_array['Rule Type']=$config_line;
                  $replacement_variable_array['Change Filter']=trim($filter);
                  $replacement_variable_array['Change From']=trim($replacement_left);
                  $replacement_variable_array['Change To']=trim($replacement_right);
                  $replacement_variable_array['Similarity']=$sim.'%';
                  $replacement_variable_array['MyBag']='dedup';//substr(basename($dat_file),4,-4);

                  //~ echo CRLF.'-----------------------'.CRLF; ob_flush();
                  //~ print_r($variable_array);
                  //~ print_r($result_array);
                  //~ echo CRLF.'-----------------------'.CRLF; ob_flush();

                  // Only propose a rule when neither side starts with a
                  // non-zero number and no matching rule is found by search().
                  // The explicit float cast replaces "0+string", which warns
                  // on PHP 7 and throws on PHP 8 for non-numeric text.
                  $left_is_text=((float)trim($replacement_left)==0);
                  $right_is_text=((float)trim($replacement_right)==0);
                  if($left_is_text and $right_is_text and count(search('dedup '.$filter.' '.$replacement_left.' '.$replacement_right.' MyBag=dedup '))==0)
                  {
                    save($replacement_variable_array);
                    //echo CRLF.'  FOUND '.$sim.'% '.$filter.' | '.$replacement_left.' '.$replacement_right.' ';
                    echo CRLF.CRLF.' NEW RULE bag='.$curr_bag.' rule=('.$config_line.')'.' sim='.$sim.'% filter='.$filter.' replace=('.$replacement_left.' '.$replacement_right.')';
                  }
                }
                if($sleep) sleep(1);
              }
            }

            // Push pending output out; ob_flush() complains when no output
            // buffer is active, so check first.
            if(ob_get_level()>0)
              ob_flush();
          }
          fclose($fp);
        }
      }
      //finding rules ---------------------------------------------------

    }//end foreach config_line
    if($sleep) sleep(5);
  }//end foreach file
}

?>
Return current item: OpenBizMap