能高效地匹配字符串,具体原理就不搬了,这边给出PHP的实现代码:
root = $this->createNode();
foreach ($keywords as $keyword) {
$this->addKeyword($keyword);
}
$this->buildFailIndex();
}
/**
* 创建节点
* @param string $value
* @return stdClass
*/
private function createNode($value = "")
{
$node = new stdClass();
$node->value = $value;
$node->next = array();
$node->fail = NULL;
$node->len = 0; // keyword长度,0表示非末尾,非0表示keyword的长度
return $node;
}
/**
* 添加关键词
* @param $keyword
*/
private function addKeyword($keyword)
{
$keyword = trim($keyword);
if (!$keyword) {
return;
}
$cur = $this->root;
$matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($keyword)));
for ($i = 1; isset($matches[$i]); $i++) {
$v = $matches[$i];
if (!isset($cur->next[$v])) {
$node = $this->createNode($v);
$cur->next[$v] = $node;
}
if (!isset($matches[$i+1])) {
$cur->next[$v]->len = $i;
}
$cur = $cur->next[$v];
}
}
/**
* 构造fail指针
*/
private function buildFailIndex()
{
$queue = array();
foreach ($this->root->next as $node) {
$node->fail = $this->root;
$queue[] = $node;
}
while ($queue) {
$node = array_shift($queue);
foreach ($node->next as $child_node) {
$val = $child_node->value;
$p = $node->fail;
while ($p != NULL) {
if (isset($p->next[$val])) {
$child_node->fail = $p->next[$val];
break;
}
$p = $p->fail;
}
if ($p === NULL) {
$child_node->fail = $this->root;
}
$queue[] = $child_node;
}
}
}
/**
* 搜索
* @param $content
* @return array
*/
public function search($content)
{
$pos_list = array();
$p = $this->root;
$matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($content)));
for ($i = 1; isset($matches[$i]); $i++) {
$val = $matches[$i];
while (!isset($p->next[$val]) && $p != $this->root) {
$p = $p->fail;
}
$p = isset($p->next[$val]) ? $p->next[$val] : $this->root;
$temp = $p;
while ($temp != $this->root) {
if ($temp->len) {
$pos_list[] = array($i - $temp->len, $i);
}
$temp = $temp->fail;
}
}
return $pos_list;
}
}
Example:
search($content);
foreach ($res as $pos) {
echo mb_substr($content,$pos[0],$pos[1] - $pos[0],'UTF-8'),PHP_EOL;
}