<?php
#
# requires php 5.2+
#
#
# open and parse html
#
$html = file_get_contents($argv[1]);
$html = str_ireplace(array('<br>', '<br/>', '<br />'), "\n", $html);
$doc = new DOMDocument();
@$doc->loadHTML($html);
#
# get an array of <tr>'s we care about
#
$all_trs = get_elts_by_tag($doc, 'tr');
$trs = array();
foreach ($all_trs as $item){
if ($item->getAttribute('class') == 'not_in_proposal'){
continue;
}
if (!preg_match('/^e-\w{3}$/', $item->getAttribute('id'))){
continue;
}
if (!(7 === count(get_elts_by_tag($item, 'td')))){
continue;
}
$trs[] = $item;
}
fprintf(STDERR, "trs count:" . count($trs)."\n");
#
# iterate over the <tr>'s, extracting the data we need
#
$items = array();
foreach ($trs as $tr){
$tds = get_elts_by_tag($tr, 'td');
$item = array(
'mapid' => parse_mapid($tds[0]),
'unicode' => parse_unicode($tds[1]),
'char_name' => parse_char_name($tds[2]),
'docomo' => parse_mobile($tds[3]),
'au' => parse_mobile($tds[4]),
'softbank' => parse_mobile($tds[5]),
'google' => parse_google($tds[6]),
);
$items[] = $item;
}
fprintf(STDERR, "codepoint count:".count($items)."\n");
#
# filter invalid codepoints
#
fprintf(STDERR, "filter only_kaomoji ; like e-554 -> [A] -> [A] -> [A] -> [A]\n");
$items = filter_only_kaomoji($items);
fprintf(STDERR, "codepoint count:".count($items)."\n");
#
# export the catalog
#
echo "<"."?php \$catalog = ";
var_export($items);
echo "; ?".">";
function get_elts_by_tag($root, $tagname){
$nodelist = $root->getElementsByTagName($tagname);
$result = array();
$len = $nodelist->length;
for($i=0; $i<$len; $i++) {
$result[] = $nodelist->item($i);
}
return $result;
}
function parse_mapid($elt) {
// like <a href="#e-19E">e-19E</a>
$links = get_elts_by_tag($elt, 'a');
if(empty($links)) return null;
return $links[0]->getAttribute("href");
}
function parse_unicode($elt) {
//like U+1F469
return get_unicode_chars($elt);
}
function parse_char_name($elt) {
$lines = array_filter(array_map('trim', explode("\n", $elt->textContent)));
if(count($lines[0])>0) {
$result['title'] = $lines[0];
unset($lines[0]);
$result['desc'] = join("\n", $lines);
return $result;
}
return null;
}
function parse_mobile($elt){
if (!has_image($elt)){
return array(
'kaomoji' => trim($elt->textContent)
);
}else{
return array(
'number' => get_number_chars($elt),
'number_old' => get_old_num_chars($elt),
'unicode' => get_unicode_chars($elt),
'sjis' => get_sjis_chars($elt),
'jis' => get_jis_chars($elt),
);
}
}
function parse_google($elt){
return array(
'unicode' => get_unicode_chars($elt),
);
}
function has_image($elt) {
return count(get_elts_by_tag($elt, 'img')) > 0;
}
function get_unicode_chars($elt){ return get_some_chars($elt, '!U\+(\w{4,5})!u', 16); }
function get_sjis_chars($elt){ return get_some_chars($elt, '/\bSJIS-(\w{4})/u', 16); }
function get_jis_chars($elt){ return get_some_chars($elt, '/\bJIS-(\w{4})/u', 16); }
function get_number_chars($elt){ return get_some_chars($elt, '/\#([0-9]{1,})/'); }
function get_old_num_chars($elt){ return get_some_chars($elt, '/\#old([0-9]{1,})/'); }
function get_some_chars($elt, $rx, $base=10){
$out = array();
if (preg_match_all($rx, $elt->textContent, $m)){
foreach ($m[1] as $n){
$out[] = intval($n, $base);
}
}
return $out;
}
function filter_only_kaomoji($mapping) {
$result = array();
foreach($mapping as $map) {
if(isset($map['docomo']['kaomoji'])
&& isset($map['au']['kaomoji'])
&& isset($map['softbank']['kaomoji']))
{
continue;
}
else {
$result[] = $map;
}
}
return $result;
}
function filter_chars_group($mapping) {
$result = array();
foreach($mapping as $map) {
if( @preg_match('/\+$/', $map['docomo']['number'])
|| @preg_match('/\+$/', $map['au']['number'])
|| @preg_match('/\+$/', $map['softbank']['number'])
){
continue;
}
else {
$result[] = $map;
}
}
return $result;
}
function fix_geta_mark($mapping) {
$result = array();
foreach($mapping as $map) {
if(isset($map['docomo']['kaomoji'])
&& $map['docomo']['kaomoji'] == 'ã') {
$map['docomo']['kaomoji'] = '';
}
if(isset($map['au']['kaomoji'])
&& $map['au']['kaomoji'] == 'ã') {
$map['au']['kaomoji'] = '';
}
if(isset($map['softbank']['kaomoji'])
&& $map['softbank']['kaomoji'] == 'ã') {
$map['softbank']['kaomoji'] = '';
}
$result[] = $map;
}
return $result;
}
function get_all_kaomoji($mapping) {
$arr = array();
foreach($mapping as $map) {
if(isset($map['docomo']['kaomoji']) ) {
$arr[ $map['docomo']['kaomoji'] ] = '1';
}
if(isset($map['au']['kaomoji']) ) {
$arr[ $map['au']['kaomoji'] ] = '1';
}
if(isset($map['softbank']['kaomoji']) ) {
$arr[ $map['softbank']['kaomoji'] ] = '1';
}
}
return array_keys($arr);
}