1. Add references to jieba.NET 0.38.2, JiebaNet.Segment 1.6.0, and Lucene.Net 3.0.3 (one way to install them is sketched after step 3).
2. Copy the Resources folder (the jieba dictionary files) into the project root directory.
3. Export the data to be searched into txt files.
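The references in step 1 can be added through the NuGet Package Manager Console. A minimal sketch, assuming the package IDs jieba.NET and Lucene.Net; the segmenter/analyser assemblies ship inside jieba.NET in some versions and as separate packages in others, so check NuGet for the exact IDs your project needs:

PM> Install-Package jieba.NET -Version 0.38.2
PM> Install-Package Lucene.Net -Version 3.0.3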
4. Override Lucene's Analyzer and Tokenizer with the two classes below:
using System.Collections.Generic;
using System.IO;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;

public class JiebaAnalyzer : Analyzer
{
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    private static ISet<string> StopWords;

    static JiebaAnalyzer()
    {
        var stopWordsFile = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
        if (File.Exists(stopWordsFile))
        {
            var lines = File.ReadAllLines(stopWordsFile);
            StopWords = new HashSet<string>();
            foreach (var line in lines)
            {
                StopWords.Add(line.Trim());
            }
        }
        else
        {
            StopWords = DefaultStopWords;
        }
    }

    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        var seg = new JiebaSegmenter();
        TokenStream result = new JiebaTokenizer(seg, reader);
        // This filter is required because the query parser lower-cases query terms
        result = new LowerCaseFilter(result);
        result = new StopFilter(true, result, StopWords);
        return result;
    }
}
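To sanity-check the analyzer, its TokenStream can be consumed directly. A minimal sketch; the field name "content" and the sample sentence are arbitrary, and the class is assumed to sit in (or import) the same namespace as the JiebaAnalyzer above:

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public static class AnalyzerDemo
{
    public static void Run()
    {
        // Feed a sample string through JiebaAnalyzer and print each term it produces.
        var analyzer = new JiebaAnalyzer();
        TokenStream stream = analyzer.TokenStream("content", new StringReader("小明硕士毕业于中国科学院计算所"));
        var termAttr = stream.GetAttribute<ITermAttribute>();
        while (stream.IncrementToken())
        {
            Console.WriteLine(termAttr.Term);
        }
    }
}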
using System.Collections.Generic;
using System.IO;
using System.Linq;
using JiebaNet.Segmenter;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

public class JiebaTokenizer : Tokenizer
{
    private JiebaSegmenter segmenter;
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private ITypeAttribute typeAtt;
    private List<Token> tokens;
    private int position = -1;

    public JiebaTokenizer(JiebaSegmenter seg, TextReader input) : this(seg, input.ReadToEnd()) { }

    public JiebaTokenizer(JiebaSegmenter seg, string input)
    {
        segmenter = seg;
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
        tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
    }

    public override bool IncrementToken()
    {
        ClearAttributes();
        position++;
        if (position < tokens.Count)
        {
            var token = tokens[position];
            termAtt.SetTermBuffer(token.Word);
            offsetAtt.SetOffset(token.StartIndex, token.EndIndex);
            typeAtt.Type = "Jieba";
            return true;
        }
        End();
        return false;
    }

    public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return segmenter.Tokenize(text, mode);
    }
}
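The tokenizer above simply walks the Token list returned by JiebaSegmenter.Tokenize. A minimal sketch of what those tokens contain (the sample sentence is arbitrary):

using System;
using JiebaNet.Segmenter;

public static class TokenizeDemo
{
    public static void Run()
    {
        var seg = new JiebaSegmenter();
        foreach (var token in seg.Tokenize("小明硕士毕业于中国科学院计算所", TokenizerMode.Search))
        {
            // Each Token carries the segmented word plus its character offsets,
            // which IncrementToken copies into the term/offset attributes above.
            Console.WriteLine("{0}  start={1}  end={2}", token.Word, token.StartIndex, token.EndIndex);
        }
    }
}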
5. Create the index:
using System.IO;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Store;
using SignalSmart.Models;

private void LuceneCreate()
{
    string indexPath = Context.Server.MapPath("ListFolder"); // where the index files are stored
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
    bool isUpdate = IndexReader.IndexExists(directory); // check whether an index library already exists
    if (isUpdate)
    {
        // If the index directory is locked (e.g. the indexing process crashed), unlock it first.
        // Lucene.Net locks the index automatically before writing and unlocks it on Close;
        // this is not for concurrent writers, only to recover from a stale lock.
        if (IndexWriter.IsLocked(directory))
        {
            IndexWriter.Unlock(directory); // force-unlock; could be handled more gracefully
        }
    }
    // Create the writer: IndexWriter(index directory, analyzer used for segmentation, create-new flag, max field length).
    // Note: opening the directory with an IndexWriter locks the index files automatically.
    IndexWriter writer = new IndexWriter(directory, new Jieba.Common.JiebaAnalyzer(), !isUpdate,
        IndexWriter.MaxFieldLength.UNLIMITED);
    // Read all data files
    string[] filelist = System.IO.Directory.GetFiles(Server.MapPath("listfolder/upload/"));
    // Avoid duplicate documents; deleting by a term that does not exist removes 0 documents
    // writer.DeleteDocuments(new Term("id")); // like: delete from t where id = i
    writer.DeleteAll();
    foreach (string item in filelist)
    {
        if (!File.Exists(item))
        {
            continue;
        }
        string contents = File.ReadAllText(item);
        string[] strlist = System.Text.RegularExpressions.Regex.Split(contents, "<换行>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
        // Build the index entry: one Document corresponds to one record
        Document document = new Document();
        // Each Document defines its own fields; field names are user-defined and values are strings.
        // Field.Store.YES stores the original text in addition to the analyzed terms,
        // so results can be displayed without another database lookup.
        document.Add(new Field("id", strlist[0], Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Fields that need full-text search use Field.Index.ANALYZED: the content is stored
        // as analyzed terms, otherwise the later fuzzy/keyword queries cannot match.
        // WITH_POSITIONS_OFFSETS also stores term positions and offsets, not just the terms.
        document.Add(new Field("title", strlist[1], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.Add(new Field("content", strlist[2], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.Add(new Field("imageurl", strlist[3], Field.Store.YES, Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS));
        // Write the document into the index library
        writer.AddDocument(document);
    }
    writer.Close();     // Close also releases the index lock
    directory.Close();  // don't forget to Close, otherwise the new index won't be searchable
    Response.Write("Index files created successfully!");
}
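LuceneCreate expects each txt file under listfolder/upload/ to hold one record whose four fields are joined by the literal separator <换行>, in the order id, title, content, imageurl. A hypothetical example file (all values made up):

1<换行>Sample article title<换行>Sample article body text used for full-text search<换行>/listfolder/upload/images/1.jpg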
6. Call the search method below for fast lookups:
using System;
using System.Collections.Generic;
using Lucene.Net.Index;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using System.IO;
using System.Linq;

public List<MySearchUnit> Search(string indexPath, string _flag, string keyword, int PageIndex, int PageSize, out int TotalCount)
{
    FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
    IndexReader reader = IndexReader.Open(directory, true);
    // Query conditions. A PhraseQuery could be used directly instead, e.g.
    // query.Add(new Term("msg", keyword)), which is roughly: where contains("msg", keyword)
    Jieba.Common.JiebaAnalyzer analyzer = new Jieba.Common.JiebaAnalyzer();
    BooleanQuery bq = new BooleanQuery();
    Lucene.Net.Util.Version version = Lucene.Net.Util.Version.LUCENE_30;
    if (_flag != "")
    {
        QueryParser qpflag = new QueryParser(version, "flag", analyzer);
        Query qflag = qpflag.Parse(_flag);
        bq.Add(qflag, Occur.SHOULD); // SHOULD: optional clause (OR semantics)
    }
    Query queryKeyword = null;
    if (keyword != "")
    {
        string[] arrResult = CutWords(keyword);
        string[] fields = new string[arrResult.Length]; // one field name per segmented term
        for (int i = 0; i < arrResult.Length; i++)
        {
            fields[i] = "title";
        }
        queryKeyword = MultiFieldQueryParser.Parse(version, arrResult, fields, analyzer);
        bq.Add(queryKeyword, Occur.SHOULD); // SHOULD: optional clause (OR semantics)
    }
    TopScoreDocCollector collector = TopScoreDocCollector.Create(100, false);
    IndexSearcher searcher = new IndexSearcher(reader); // the reader was opened read-only above
    searcher.Search(bq, collector);
    if (PageIndex < 1) PageIndex = 1;
    if (collector == null || collector.TotalHits == 0)
    {
        TotalCount = 0;
        return null;
    }
    else
    {
        int start = PageSize * (PageIndex - 1); // first hit of the requested page
        int limit = PageSize;                   // number of hits per page
        ScoreDoc[] hits = collector.TopDocs(start, limit).ScoreDocs;
        List<MySearchUnit> list = new List<MySearchUnit>();
        TotalCount = collector.TotalHits;
        foreach (ScoreDoc sd in hits) // walk the hits of this page
        {
            try
            {
                Document doc = searcher.Doc(sd.Doc);
                string id = doc.Get("id");
                string title = doc.Get("title");
                string content = doc.Get("content");
                // string flag = doc.Get("flag");
                string imageurl = doc.Get("imageurl");
                // string updatetime = doc.Get("updatetime");
                // Highlight the keyword in the snippets. The prefix/suffix strings appear to have
                // been stripped when the post was published (typically HTML tags such as <font>).
                PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter = new PanGu.HighLight.SimpleHTMLFormatter("", "");
                PanGu.HighLight.Highlighter highlighter = new PanGu.HighLight.Highlighter(simpleHTMLFormatter, new PanGu.Segment());
                highlighter.FragmentSize = 50;
                content = highlighter.GetBestFragment(keyword, content);
                string titlehighlight = highlighter.GetBestFragment(keyword, title);
                if (titlehighlight != "") title = titlehighlight;
                list.Add(new MySearchUnit(id, title, content, "", imageurl, ""));
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
        return list;
    }
    // A stopwatch could be used to time the query:
    // Response.Write("Query time: " + st.ElapsedMilliseconds + " ms");
}
protected string[] CutWords(string keyword)
{
    var segment = new JiebaNet.Segmenter.JiebaSegmenter();
    var result = segment.CutForSearch(keyword).ToList();
    return result.ToArray();
}
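A hypothetical caller, assuming Search lives on the same page class as LuceneCreate (page 1, 10 hits per page; the keyword is arbitrary):

private void LuceneSearchDemo()
{
    int totalCount;
    string indexPath = Server.MapPath("ListFolder"); // the same folder LuceneCreate writes to
    List<MySearchUnit> results = Search(indexPath, "", "中国科学院", 1, 10, out totalCount);
    if (results == null)
    {
        Response.Write("No hits");
    }
    else
    {
        // totalCount is the full hit count; results holds only the requested page.
        Response.Write(totalCount + " hits; page 1 contains " + results.Count + " items");
    }
}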