<?php
/**
*
* Name Extractor
* Copyright (c) 2012 Peter Kahl. All rights reserved.
* Use of this source code is governed by a GNU General Public License
* that can be found in the LICENSE file.
*
* https://github.com/peterkahl/name-extractor
*
*/
/**
 * Derives a human-readable name from an e-mail address or from a
 * "Display Name <address>" string.
 *
 * The local part of the address is split on separators (. _ -) or, failing
 * that, segmented into words by greedy look-ups in a dictionary.
 */
class name_extractor {
    /**
     * Dictionary words stored as array KEYS (flipped in the constructor) so
     * that membership tests are hash look-ups.
     */
    protected $dict_array = array();

    /**
     * Length limit used when probing the dictionary.
     * NOTE(review): presumably the length of the longest dictionary entry;
     * confirm against dictionary-names.php.
     */
    protected $longest_word = 14;

    //------------------------------------------------------------------
    /**
     * Loads dictionary-names.php from this file's directory. That file is
     * expected to define $dictArray; its values become the look-up keys.
     */
    public function __construct() {
        require dirname(__FILE__).'/dictionary-names.php';
        $this->dict_array = array_flip($dictArray); // speed trick
    }

    //------------------------------------------------------------------
    /**
     * Extracts a capitalised name from an address string.
     *
     * Resolution order:
     *  1. text in front of '<' (display name), if not empty;
     *  2. local part shorter than 4 characters, returned capitalised;
     *  3. local part containing '.', '_' or '-' (first one present, in that
     *     priority): that separator becomes a space;
     *  4. local part found verbatim in the dictionary;
     *  5. digits become spaces and every fragment longer than 2 characters
     *     is segmented with breakString().
     *
     * @param string $str e-mail address, optionally "Name <address>"
     * @return string the extracted name (input is lower-cased first)
     */
    public function extract_name($str) {
        $str = trim(strtolower((string) $str));

        // "First Last <address>" form
        $pos_lt = strpos($str, '<');
        if ($pos_lt !== false) {
            // Strip whitespace and surrounding quotes so that
            // '"John Smith" <...>' yields 'John Smith', not '"john Smith"'.
            $name = trim(substr($str, 0, $pos_lt), " \t\n\r\0\x0B\"'");
            if ($name !== '') {
                return $this->ucfirst_words($name);
            }
            // no display name: continue with what follows '<'
            $str = (string) substr($str, $pos_lt + 1);
        }

        // Keep the local part only. A named variable is indexed here because
        // reset(explode(...)) passes a temporary by reference and raises a
        // notice.
        $parts = explode('@', $str);
        $str   = $parts[0];

        // name is short?
        if (strlen($str) < 4) {
            return ucfirst($str);
        }

        // Separator present? Only the first separator found (in this
        // priority order) is converted to spaces; the others are left as-is.
        foreach (array('.', '_', '-') as $sep) {
            if (strpos($str, $sep) !== false) {
                return $this->ucfirst_words(str_replace($sep, ' ', $str));
            }
        }

        // whole local part is a dictionary word?
        if (strlen($str) <= $this->longest_word
            && array_key_exists($str, $this->dict_array)) {
            return ucfirst($str);
        }

        // must break string: digit runs act as word boundaries
        $spaced = preg_replace('#[0-9]{1,}#', ' ', $str);
        if (strlen($spaced) <= 3) {
            return $str; // too little left to segment; keep the local part
        }

        $pieces = array();
        foreach (explode(' ', $spaced) as $frag) {
            if ($frag === '') {
                continue; // produced by leading/trailing digits
            }
            if (strlen($frag) > 2) {
                $frag = $this->ucfirst_words($this->breakString($frag));
            }
            $pieces[] = ucfirst($frag);
        }
        return implode(' ', $pieces);
    }

    //------------------------------------------------------------------
    /**
     * Segments a string into dictionary words.
     *
     * At every position the longest dictionary match starting there is
     * taken; when nothing matches, a single character is consumed. Runs of
     * consecutive unmatched characters are then glued into one fragment.
     *
     * @param string $str text to segment (expected lower-case, like the keys)
     * @return array list of fragments in original order; empty for ''
     */
    public function breakString($str) {
        $str        = (string) $str;
        $str_length = strlen($str);
        if ($str_length === 0) {
            return array(); // nothing to segment
        }
        $maxlen = min($str_length, $this->longest_word);

        // pass 1: greedy longest-match segmentation
        $word = array();
        for ($n = 0; $n < $str_length; ) {
            $best = $str[$n]; // fallback: consume one character
            $test = $best;
            // grow the candidate one character at a time (it reaches at most
            // $maxlen + 1 characters) and remember the longest match
            for ($m = 1; $m <= $maxlen && ($n + $m) < $str_length; $m++) {
                $test .= $str[$n + $m];
                if (array_key_exists($test, $this->dict_array)) {
                    $best = $test;
                }
            }
            $word[] = $best;
            $n += strlen($best);
        }

        // pass 2: glue together runs of single characters
        $new = array();
        $run = '';
        foreach ($word as $val) {
            if (strlen($val) > 1) {
                if ($run !== '') {
                    $new[] = $run;
                    $run   = '';
                }
                $new[] = $val;
            }
            else {
                $run .= $val;
            }
        }
        if ($run !== '') {
            $new[] = $run;
        }
        return $new; // array
    }

    //------------------------------------------------------------------
    /**
     * Upper-cases the first character of every word and joins the words
     * with single spaces. Empty fragments (e.g. from repeated separators)
     * are skipped so the result never contains doubled spaces.
     *
     * @param array|string $arr list of words, or a space-separated string
     * @return string
     */
    public function ucfirst_words($arr) {
        if (!is_array($arr)) {
            $arr = explode(' ', $arr);
        }
        $out = array();
        foreach ($arr as $val) {
            if ($val === '') {
                continue;
            }
            $out[] = ucfirst($val);
        }
        return implode(' ', $out);
    }
}
//----------------------------------------------------------------------
?>