2010年8月9日月曜日

DokuWiki-2009-12-25c その2

indexer.php(2009-12-25)

12-25版。mecabのために手を入れてた部分は関係なさそう。

--- indexer.2009-02-14.php 2010-08-09 14:36:23.000000000 +0900
+++ indexer.2009-12-25.php 2010-08-09 14:36:40.000000000 +0900
@@ -74,7 +74,7 @@
         fwrite($fh,$line);
     }
     fclose($fh);
-    if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+    if(isset($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']);
     io_rename($fn.'.tmp', $fn.'.idx');
     return true;
 }
@@ -574,12 +574,16 @@
     // merge found pages into final result array
     $final = array();
-    foreach(array_keys($result) as $word){
+    foreach($result as $word => $res){
         $final[$word] = array();
-        foreach($result[$word] as $wid){
+        foreach($res as $wid){
             $hits = &$docs[$wid];
             foreach ($hits as $hitkey => $hitcnt) {
-                $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey];
+                if (!isset($final[$word][$hitkey])) {
+                    $final[$word][$hitkey] = $hitcnt;
+                } else {
+                    $final[$word][$hitkey] += $hitcnt;
+                }
             }
         }
     }
@@ -664,7 +668,9 @@
     if (empty($page_idx)) return;
     $pagewords = array();
     $len = count($page_idx);
-    for ($n=0;$n<$len;$n++) $pagewords[] = array();
+    for ($n=0;$n<$len;$n++){
+        $pagewords[] = array();
+    }
     unset($page_idx);
     $n=0;

indexer.php(2009-12-25mecab対応)

--- indexer.2009-12-25.php 2010-08-09 15:03:07.000000000 +0900
+++ indexer.2009-12-25.mod.php 2010-08-09 15:13:33.000000000 +0900
@@ -45,6 +45,8 @@
                 ']?');
 define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
 
+define('PRE_TOKENIZER', '/usr/bin/mecab -O wakati');
+
 /**
  * Measure the length of a string.
  * Differs from strlen in handling of asian characters.
@@ -52,11 +54,16 @@
  * @author Tom N Harris <tnharris@whoopdedo.org>
  */
 function wordlen($w){
-    $l = strlen($w);
+    //$l = strlen($w);
+    $l = utf8_strlen($w);
+
+    /*
     // If left alone, all chinese "words" will get put into w3.idx
     // So the "length" of a "word" is faked
     if(preg_match('/'.IDX_ASIAN2.'/u',$w))
         $l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF
+    */
+
     return $l;
 }
 
@@ -220,6 +227,28 @@
     list($page,$body) = $data;
 
+    if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+        $dspec = array(
+            0 => array("pipe", "r"),
+            1 => array("pipe", "w"),
+            2 => array("file", "/dev/null", "w")
+        );
+        $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+        if(is_resource($process)) {
+            stream_set_blocking($pipes[0], FALSE);
+            stream_set_blocking($pipes[1], FALSE);
+            fwrite($pipes[0], $body . "\n");
+            fclose($pipes[0]);
+
+            $body = '';
+            while(!feof($pipes[1])) {
+                $body .= fgets($pipes[1], 32768);
+            }
+            fclose($pipes[1]);
+            proc_close($process);
+        }
+    }
+
     $body = strtr($body, "\r\n\t", '   ');
     $tokens = explode(' ', $body);
     $tokens = array_count_values($tokens); // count the frequency of each token
@@ -489,7 +518,8 @@
             $wild |= 2;
             $wlen -= 1;
         }
-        if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+        //if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+        if (preg_match('/[^0-9A-Za-z]/u', $string) && $wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
         if(!isset($tokens[$xword])){
             $tokenlength[$wlen][] = $xword;
         }
@@ -632,12 +662,36 @@
  */
 function idx_tokenizer($string,&$stopwords,$wc=false){
     $words = array();
+
+    if(function_exists(proc_open) && defined('PRE_TOKENIZER')) {
+        $dspec = array(
+            0 => array("pipe", "r"),
+            1 => array("pipe", "w"),
+            2 => array("file", "/dev/null", "w")
+        );
+        $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+        if(is_resource($process)) {
+            stream_set_blocking($pipes[0], FALSE);
+            stream_set_blocking($pipes[1], FALSE);
+            fwrite($pipes[0], $string . "\n");
+            fclose($pipes[0]);
+            $string = '';
+            while(!feof($pipes[1])) {
+                $string .= fgets($pipes[1], 32768);
+            }
+            fclose($pipes[1]);
+            proc_close($process);
+        }
+    }
+
     $wc = ($wc) ? '' : $wc = '\*';
     if(preg_match('/[^0-9A-Za-z]/u', $string)){
+        /*
         // handle asian chars as single words (may fail on older PHP version)
         $asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
         if(!is_null($asia)) $string = $asia; //recover from regexp failure
+        */
         $arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
         foreach ($arr as $w) {

つづく

0 件のコメント: