indexer.php (2009-12-25)
This is the 2009-12-25 upstream version; the parts I had patched for mecab don't seem to be affected.
--- indexer.2009-02-14.php 2010-08-09 14:36:23.000000000 +0900
+++ indexer.2009-12-25.php 2010-08-09 14:36:40.000000000 +0900
@@ -74,7 +74,7 @@
fwrite($fh,$line);
}
fclose($fh);
- if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
+ if(isset($conf['fperm'])) chmod($fn.'.tmp', $conf['fperm']);
io_rename($fn.'.tmp', $fn.'.idx');
return true;
}
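
The isset() guard matters because a stock install may leave $conf['fperm'] undefined; a minimal sketch of the difference (hypothetical $conf):

$conf = array();              // 'fperm' left unset, as in a default install
if($conf['fperm']) {}         // raises an E_NOTICE for the undefined index
if(isset($conf['fperm'])) {}  // quietly evaluates to false instead
                              // (note: isset() is also true for a falsy value such as 0)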
@@ -574,12 +574,16 @@
// merge found pages into final result array
$final = array();
- foreach(array_keys($result) as $word){
+ foreach($result as $word => $res){
$final[$word] = array();
- foreach($result[$word] as $wid){
+ foreach($res as $wid){
$hits = &$docs[$wid];
foreach ($hits as $hitkey => $hitcnt) {
- $final[$word][$hitkey] = $hitcnt + $final[$word][$hitkey];
+ if (!isset($final[$word][$hitkey])) {
+ $final[$word][$hitkey] = $hitcnt;
+ } else {
+ $final[$word][$hitkey] += $hitcnt;
+ }
}
}
}
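
The rewritten loop is the usual initialize-or-accumulate pattern, avoiding the E_NOTICE the old one-liner triggered on the first hit for each key (and the foreach($result as $word => $res) form saves the repeated $result[$word] lookups); the same pattern in isolation, as a minimal sketch:

$counts = array();
foreach(array('a', 'b', 'a') as $key){
    if(!isset($counts[$key])){
        $counts[$key] = 1;      // first occurrence: initialize
    } else {
        $counts[$key] += 1;     // later occurrences: accumulate
    }
}
// $counts is array('a' => 2, 'b' => 1), with no undefined-index notice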
@@ -664,7 +668,9 @@
if (empty($page_idx)) return;
$pagewords = array();
$len = count($page_idx);
- for ($n=0;$n<$len;$n++) $pagewords[] = array();
+ for ($n=0;$n<$len;$n++){
+ $pagewords[] = array();
+ }
unset($page_idx);
$n=0;
indexer.php (2009-12-25, with mecab support)
--- indexer.2009-12-25.php 2010-08-09 15:03:07.000000000 +0900
+++ indexer.2009-12-25.mod.php 2010-08-09 15:13:33.000000000 +0900
@@ -45,6 +45,8 @@
']?');
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
+define('PRE_TOKENIZER', '/usr/bin/mecab -O wakati');
+
/**
* Measure the length of a string.
* Differs from strlen in handling of asian characters.
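
mecab's -O wakati mode echoes the input back as space-separated tokens, which is exactly the form the whitespace-splitting code downstream expects; a quick check (assumes mecab and a standard dictionary such as IPAdic are installed):

echo shell_exec("echo '形態素解析のテスト' | /usr/bin/mecab -O wakati");
// prints: 形態素 解析 の テスト   (segmentation depends on the dictionary)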
@@ -52,11 +54,16 @@
* @author Tom N Harris <tnharris@whoopdedo.org>
*/
function wordlen($w){
- $l = strlen($w);
+ //$l = strlen($w);
+ $l = utf8_strlen($w);
+
+ /*
// If left alone, all chinese "words" will get put into w3.idx
// So the "length" of a "word" is faked
if(preg_match('/'.IDX_ASIAN2.'/u',$w))
$l += ord($w) - 0xE1; // Lead bytes from 0xE2-0xEF
+ */
+
return $l;
}
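
With utf8_strlen (DokuWiki's multibyte helper from inc/utf8.php), the length bucket is chosen by character count rather than the faked byte-based length, which only makes sense once mecab has already split the text into real words; the effect in brief:

// strlen('検索')      == 6  -- UTF-8 byte count
// utf8_strlen('検索') == 2  -- character count
// so a two-character word from mecab is filed under w2.idx
// instead of a byte-derived bucket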
@@ -220,6 +227,28 @@
list($page,$body) = $data;
+ if(function_exists('proc_open') && defined('PRE_TOKENIZER')) {
+ $dspec = array(
+ 0 => array("pipe", "r"),
+ 1 => array("pipe", "w"),
+ 2 => array("file", "/dev/null", "w")
+ );
+ $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+ if(is_resource($process)) {
+ stream_set_blocking($pipes[0], FALSE);
+ stream_set_blocking($pipes[1], FALSE);
+ fwrite($pipes[0], $body . "\n");
+ fclose($pipes[0]);
+
+ $body = '';
+ while(!feof($pipes[1])) {
+ $body .= fgets($pipes[1], 32768);
+ }
+ fclose($pipes[1]);
+ proc_close($process);
+ }
+ }
+
$body = strtr($body, "\r\n\t", ' ');
$tokens = explode(' ', $body);
$tokens = array_count_values($tokens); // count the frequency of each token
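
The same proc_open block reappears verbatim in idx_tokenizer below, so it is a natural candidate for a shared helper. A minimal sketch (the name pre_tokenize is hypothetical; it reads stdout blocking after closing stdin, which can deadlock on bodies larger than the pipe buffer, the non-blocking flags in the patch being one workaround for that):

function pre_tokenize($text){
    if(!function_exists('proc_open') || !defined('PRE_TOKENIZER'))
        return $text;                         // fall back to the raw text
    $dspec = array(
        0 => array('pipe', 'r'),              // mecab reads the raw text on stdin
        1 => array('pipe', 'w'),              // segmented text comes back on stdout
        2 => array('file', '/dev/null', 'w'), // discard parser warnings
    );
    $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
    if(!is_resource($process)) return $text;
    fwrite($pipes[0], $text . "\n");
    fclose($pipes[0]);                        // EOF makes mecab flush its output
    $out = stream_get_contents($pipes[1]);
    fclose($pipes[1]);
    proc_close($process);
    return $out;
}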
@@ -489,7 +518,8 @@
$wild |= 2;
$wlen -= 1;
}
- if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ //if ($wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
+ if (preg_match('/[^0-9A-Za-z]/u', $string) && $wlen < IDX_MINWORDLENGTH && $wild == 0 && !is_numeric($xword)) continue;
if(!isset($tokens[$xword])){
$tokenlength[$wlen][] = $xword;
}
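
Read literally, the added preg_match restricts the minimum-length rule to input that contains a non-alphanumeric character, so short pure-ASCII tokens are no longer dropped; a sketch of the two branches (IDX_MINWORDLENGTH defaults to 3 in a stock DokuWiki):

// 'ab'   -> no non-alphanumeric char, guard is false, the token is kept
// '本ab' -> guard applies as before, the short token is still skipped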
@@ -632,12 +662,36 @@
*/
function idx_tokenizer($string,&$stopwords,$wc=false){
$words = array();
+
+ if(function_exists('proc_open') && defined('PRE_TOKENIZER')) {
+ $dspec = array(
+ 0 => array("pipe", "r"),
+ 1 => array("pipe", "w"),
+ 2 => array("file", "/dev/null", "w")
+ );
+ $process = proc_open(PRE_TOKENIZER, $dspec, $pipes);
+ if(is_resource($process)) {
+ stream_set_blocking($pipes[0], FALSE);
+ stream_set_blocking($pipes[1], FALSE);
+ fwrite($pipes[0], $string . "\n");
+ fclose($pipes[0]);
+ $string = '';
+ while(!feof($pipes[1])) {
+ $string .= fgets($pipes[1], 32768);
+ }
+ fclose($pipes[1]);
+ proc_close($process);
+ }
+ }
+
$wc = ($wc) ? '' : $wc = '\*';
if(preg_match('/[^0-9A-Za-z]/u', $string)){
+ /*
// handle asian chars as single words (may fail on older PHP version)
$asia = @preg_replace('/('.IDX_ASIAN.')/u',' \1 ',$string);
if(!is_null($asia)) $string = $asia; //recover from regexp failure
+ */
$arr = explode(' ', utf8_stripspecials($string,' ','\._\-:'.$wc));
foreach ($arr as $w) {
To be continued.