Zend Framework 1.0 的搜索组件 Zend_Search_Lucene 已经做得相当不错了,但是对于中文搜索,需要提供自己的分词方式。网上有不少解决方案,一一测试过后,还是被转载得比较多的那篇文章里的方法可行;已经分不清原文作者是谁了,我只把源代码贴在这里。接下来,我希望利用这个解决joomla-开源天空的中文搜索问题。
中文分词的类如下:
<?php
require_once 'Zend/Search/Lucene/Analysis/Analyzer.php';
require_once 'Zend/Search/Lucene/Analysis/Analyzer/Common.php';
/**
 * Bigram analyzer for mixed Chinese / ASCII text (input must be UTF-8).
 *
 * Tokenization rules, applied to the input after reset() has cleaned it:
 *  - a run of ASCII digits becomes one token;
 *  - a run of ASCII letters becomes one token (lower-cased by reset());
 *  - a run of non-ASCII characters is emitted as overlapping two-character
 *    tokens ("ABCD" -> "AB", "BC", "CD");
 *  - a non-ASCII character with no non-ASCII neighbour to pair with produces
 *    no token (behaviour kept from the original implementation);
 *  - any other byte is treated as a separator and skipped.
 *
 * Token start/end positions are byte offsets into the cleaned input.
 */
class Phpbean_Lucene_Analyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common {
    /**
     * Separators replaced by a single space in reset().
     *
     * The first group is ASCII punctuation/whitespace, the second group is
     * CJK and full-width punctuation. NOTE(review): the original listing held
     * ASCII duplicates (":", ")", "(", ...) in the second group, which look
     * like full-width forms flattened during copy/paste; the full-width forms
     * are listed explicitly here - confirm against the intended list.
     *
     * @var array
     */
    private static $_separators = array(
        ',', '/', '\\', '.', ';', ':', '"', '!', '~', '`', '^', '(', ')', '?',
        '-', "\t", "\n", "'", '<', '>', "\r", '$', '&', '%', '#', '@', '+',
        '=', '{', '}', '[', ']', ' ',
        ':', ')', '(', '.', '。', ',', '!', ';', '?', '“', '”', '‘', '’',
        '[', ']', '、', '—', ' ', '《', '》', '-', '…', '【', '】', '　',
    );

    /** @var string Bytes that may form a number token. */
    private static $_asciiDigits = '0123456789';

    /** @var string Bytes that may form a word token. */
    private static $_asciiLetters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /** @var int Byte offset of the next unread byte in $this->_input. */
    private $_position;

    /** @var array|string Stop words that reset() blanks out of the input. */
    private $_cnStopWords = array('的','是','地','了');

    /** @var int Byte length of $this->_input, refreshed by nextToken(). */
    private $_inputLength;

    /**
     * Replace the stop-word list.
     *
     * The list is applied inside reset(), so it only affects input that is
     * reset after this call.
     *
     * @param array|string $cnStopWords Stop words passed as the search argument of str_replace()
     * @return void
     */
    public function setCnStopWords($cnStopWords){
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the read position and cleans $this->_input in place:
     * separators and stop words become spaces, ASCII letters are lower-cased.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;
        if ($this->_input === null) {
            // Nothing to clean; nextToken() returns null for null input.
            return;
        }
        $this->_input = str_replace(self::$_separators, ' ', $this->_input);
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
        // ASCII-only lower-casing: strtolower() is locale-sensitive on older
        // PHP versions and may alter bytes that belong to multibyte characters.
        $this->_input = strtr(
            $this->_input,
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
            'abcdefghijklmnopqrstuvwxyz'
        );
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }
        $this->_inputLength = strlen($this->_input);

        while ($this->_position < $this->_inputLength) {
            // Skip the spaces that reset() substituted for separators.
            while ($this->_position < $this->_inputLength
                && $this->_input[$this->_position] === ' ') {
                $this->_position++;
            }
            if ($this->_position >= $this->_inputLength) {
                // Only trailing spaces were left; do not index past the end.
                return null;
            }

            $termStartPosition = $this->_position;
            // Bytes to step back after a bigram so the next one overlaps it.
            $overlap = 0;

            if (ord($this->_input[$this->_position]) > 127) {
                $charCount = 0;
                while ($this->_position < $this->_inputLength
                    && ord($this->_input[$this->_position]) > 127) {
                    $charLength = $this->_charLengthAt($this->_position);
                    $this->_position += $charLength;
                    $charCount++;
                    if ($charCount === 2) {
                        $overlap = $charLength;
                        break;
                    }
                }
                if ($charCount === 1) {
                    // Unpaired character: consumed without producing a token.
                    continue;
                }
            } else {
                $run = strspn($this->_input, self::$_asciiDigits, $this->_position);
                if ($run === 0) {
                    $run = strspn($this->_input, self::$_asciiLetters, $this->_position);
                }
                if ($run === 0) {
                    // Byte that is neither digit, letter nor known separator
                    // (e.g. "_", "*", "|"): skip it instead of ending the
                    // stream, so the remaining input is still tokenized.
                    $this->_position++;
                    continue;
                }
                $this->_position += $run;
            }

            $token = new Zend_Search_Lucene_Analysis_Token(
                substr(
                    $this->_input,
                    $termStartPosition,
                    $this->_position - $termStartPosition
                ),
                $termStartPosition,
                $this->_position
            );
            // normalize() may return null, in which case the token is dropped
            // and scanning continues with the next candidate.
            $token = $this->normalize($token);
            $this->_position -= $overlap;
            if ($token !== null) {
                return $token;
            }
        }
        return null;
    }

    /**
     * Byte length of the UTF-8 character starting at $offset.
     *
     * Derived from the lead byte; stray continuation bytes and invalid lead
     * bytes count as one byte so scanning always advances. The result is
     * clamped so it never reaches past the end of the input.
     *
     * @param int $offset Byte offset into $this->_input, below $this->_inputLength
     * @return int
     */
    private function _charLengthAt($offset)
    {
        $lead = ord($this->_input[$offset]);
        if ($lead < 0xC0) {
            $length = 1;
        } elseif ($lead < 0xE0) {
            $length = 2;
        } elseif ($lead < 0xF0) {
            $length = 3;
        } elseif ($lead < 0xF8) {
            $length = 4;
        } else {
            $length = 1;
        }
        return min($length, $this->_inputLength - $offset);
    }
}
?>
注意:文件要保存为 UTF-8 编码格式;在建立索引和执行查询时,都要把默认分析器设置为这个中文分析器。
作者:tenfoon 星期三, 28 五月 2008 19:43 |
注意要保存为utf-8编码格式,在做索引和查询的时候设置默认的分析器是这个中文分析器就可以了。 应该怎么设置为默认? |
作者:admin 星期四, 29 五月 2008 00:11 |
这句话就可以实现: Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Phpbean_Lucene_Analyzer()); |
作者:tenfoon 星期四, 29 五月 2008 19:24 |
加入后搜索结果为空 [php] //Monkey's 二元分词 function sp_str($str) { //所有汉字后添加ASCII的0字符,此法是为了排除特殊中文拆分错误的问题 $str=preg_replace("/[\x80-\xff]{2}/","\\0".chr(0x00),$str); //拆分的分割符 $search = array(",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "\t", "\n", "'", "", "\r", "\r\n", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":", ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’", "[", "]", "、", "—", " ", "《", "》", "-", "…", "【", "】",); //替换所有的分割符为空格 $str = str_replace($search,' ',$str); //用正则匹配半角单个字符或者全角单个字符,存入数组$ar preg_match_all("/[\x80-\xff]?./",$str,$ar);$ar=$ar[0]; //去掉$ar中ASCII为0字符的项目 for ($i=0;$i M [1] => o [2] => n [3] => k [4] => e [5] => y [6] => [7] => s [8] => [9] => 二 [10] => 元 [11] => 分 [12] => 词 ) */ //把连续的半角存成一个数组下标,或者全角的每2个字符存成一个数组的下标 for ($ar_str='',$i=0;$i0 and $sw!=$oldsw) $ar_str.=" "; if ($sw==1) $ar_str.=$ar[$i]; else if (strlen($ar[$i+1])==2) $ar_str.=$ar[$i].$ar[$i+1].' '; elseif ($oldsw==1 or $oldsw==0) $ar_str.=$ar[$i]; $oldsw=$sw; } //去掉连续的空格 $ar_str=trim(preg_replace("# {1,}#i"," ",$ar_str));//$ar_str = "Monkey s 二元 元分 分词" //返回拆分后的结果 return explode(' ',$ar_str); } # print_r(sp_str("Monkey's 二元分词")); /* Array ( [0] => Monkey [1] => s [2] => 二元 [3] => 元分 [4] => 分词 ) */ [/php] [php] require_once('Zend/Search/Lucene.php'); Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Phpbean_Lucene_Analyzer()); $index = Zend_Search_Lucene::open('/tmp/caoyao-index'); $userQuery = Zend_Search_Lucene_Search_QueryParser::parse($keywords, "utf-8"); $query = new Zend_Search_Lucene_Search_Query_MultiTerm(); $sp_str = sp_str($keywords); foreach($sp_str as $keyword) { $query->addTerm(new Zend_Search_Lucene_Index_Term($keyword), null); } $query = new Zend_Search_Lucene_Search_Query_Boolean(); $query->addSubquery($userQuery, null /* required */); [/php] |
作者:admin 星期五, 30 五月 2008 16:47 |
你的索引也要用这个分析器(analyzer)来建立。 |
作者:tenfoon 星期五, 30 五月 2008 20:38 |
有一点我不太明白。 当在做indexes 的时候,是不是也要设置默认Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Phpbean_Lucene_Analyzer()); [php] require_once('Zend/Search/Lucene.php'); Zend_Search_Lucene_Analysis_Analyzer::setDefault(new Phpbean_Lucene_Analyzer()); $index = Zend_Search_Lucene::open('/tmp/caoyao-index'); $query = new Zend_Search_Lucene_Search_Query_Boolean(); try { $query = Zend_Search_Lucene_Search_QueryParser::parse($keywords); } catch (Zend_Search_Lucene_Search_QueryParserException $e) { echo "Query syntax error: " . $e->getMessage() . "\n"; } $hits = $index->find($query); $totalrecords = count($hits); $success = 1; [/php] |
作者:admin 星期五, 30 五月 2008 21:04 |
要设置的。 |