Tried the latest Lucene release, but it is not compatible with paoding 2.0, so I was forced to drop back to Lucene 2.9 and continue from there.
Add the jars needed for the integration to the project (they are included in the attachment).
Add an environment variable PAODING_DIC_HOME set to E:\paoding-analysis\dic (the dic directory is wherever paoding-analysis-2.0.4-beta.zip was unpacked).
In paoding-dic-home.properties, add:
#paoding.dic.home=dic
paoding.dic.home=E:/paoding-analysis/dic
In paoding-knives.properties:
paoding.knife.class.letterKnife=net.paoding.analysis.knife.LetterKnife
paoding.knife.class.numberKnife=net.paoding.analysis.knife.NumberKnife
paoding.knife.class.cjkKnife=net.paoding.analysis.knife.CJKKnife
Put both files in the root of the project's src directory.
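Before wiring Paoding into Lucene, it is worth a quick smoke test to confirm the dictionary path is actually picked up. Below is a minimal sketch (the class name and sample sentence are my own; it also assumes Lucene 2.9's bridge from the old TokenStream API to the attribute-based one, which is the same bridge the indexing code below relies on):
import java.io.StringReader;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class PaodingSmokeTest {
    public static void main(String[] args) throws Exception {
        // if paoding.dic.home cannot be resolved, this fails here instead of silently indexing nothing
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("content",
                new StringReader("中文分词效果测试"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());
        }
        ts.close();
    }
}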
The full indexing and search example (LuceneChinese.java):
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@SuppressWarnings("deprecation")
public class LuceneChinese {
// directory containing the files to index
private static final String DATA_DIR = "E:\\test\\file";
// directory where the index is written
private static final String INDEX_DIR = "E:\\test\\index";
// name of the field holding the document content
private static final String FIELD_NAME = "content";
public static void main(String[] args) throws Exception {
createIndex();
search("");
}
/**
* Build the index.
*/
public static void createIndex() {
System.out.println("-------------------建立索引开始-----------------------");
long timeStart = System.currentTimeMillis();
try {
// PaodingChineseAnalyzer extends PaodingAnalyzer and overrides tokenStream() to add stop-word and lowercase filtering
Analyzer analyzer = new PaodingChineseAnalyzer(new File(
"E:\\stopwords.txt"));
IndexWriter writer = new IndexWriter(FSDirectory.open(new File(
INDEX_DIR)), analyzer, true,
IndexWriter.MaxFieldLength.LIMITED);
// recursively index every file under the data directory
indexDoc(writer, new File(DATA_DIR));
// optimize the index; this can speed up searching
writer.optimize();
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
long timeEnd = System.currentTimeMillis();
System.out.println("-------------------建立索引耗时: "
+ (timeEnd - timeStart) + " 毫秒-----------------------");
}
/**
* Search the index and print highlighted fragments.
*
* @param queryString the query; when empty, the user is prompted on stdin
* @throws IOException
* @throws ParseException
*/
public static void search(String queryString) throws IOException,
ParseException {
// prompt for a keyword when none was supplied
if (queryString == null || queryString.isEmpty()) {
System.out.print("Search for:");
InputStreamReader in = new InputStreamReader(System.in);
BufferedReader reader = new BufferedReader(in);
queryString = reader.readLine();
if (queryString == null || queryString.isEmpty()) {
System.exit(0);
}
}
long timeStart = System.currentTimeMillis();
// open the index directory
Directory directory = FSDirectory.open(new File(INDEX_DIR));
// PaodingChineseAnalyzer (see below) extends PaodingAnalyzer; the same analyzer is used for querying as for indexing
Analyzer analyzer = new PaodingChineseAnalyzer();
IndexReader reader = IndexReader.open(directory, true);
QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
Query query = parser.parse(queryString);
// create the searcher
Searcher searcher = new IndexSearcher(directory);
query = query.rewrite(reader);
Hits hits = searcher.search(query);
// highlight markup; the default formatter wraps matches in <b></b>
// BoldFormatter formatter = new BoldFormatter();
SimpleHTMLFormatter shf = new
SimpleHTMLFormatter("<span style=\"color:red\">",
"</span>");
// build the highlighter from the formatter and a query scorer
Highlighter highlighter = new Highlighter(shf, new QueryScorer(
query));
// fragmenter: split the text into fragments of at most 50 characters
highlighter.setTextFragmenter(new SimpleFragmenter(50));
System.out.println("共搜索到: " + hits.length() + " 条资源");
for (int i = 0; i < hits.length(); i++) {
String text = hits.doc(i).get(FIELD_NAME);
String path = hits.doc(i).get("path");
int maxNumFragmentsRequired = 5;
String fragmentSeparator = "...";
TermPositionVector tpv = (TermPositionVector) reader
.getTermFreqVector(hits.id(i), FIELD_NAME);
TokenStream tokenStream = TokenSources.getTokenStream(tpv);
String result = highlighter.getBestFragments(tokenStream, text,
maxNumFragmentsRequired, fragmentSeparator);
System.out.println("\n文件路径:" + path);
System.out.println("\n" + result);
}
reader.close();
System.out.println("共搜索到: " + hits.length() + " 条资源");
long timeEnd = System.currentTimeMillis();
System.out.println("-------------------查询耗时: " + (timeEnd - timeStart)
+ " 毫秒-----------------------");
}
/**
* Index a single file, or recursively index a directory.
*
* @param writer the IndexWriter documents are added to
* @param root the file or directory to index
*/
private static void indexDoc(IndexWriter writer, File root) {
// skip files that cannot be read
if (root.canRead()) {
if (root.isDirectory()) {
File[] files = root.listFiles();
if (files.length != 0) {
for (int i = 0; i < files.length; i++) {
// recurse into subdirectories
indexDoc(writer, files[i]);
}
}
} else {
try {
// read the file content (the files are assumed to be GBK-encoded text)
InputStream in = new FileInputStream(root);
byte b[] = new byte[in.available()];
in.read(b);
in.close();
String content = new String(b, "GBK");
// create a Lucene document
Document d = new Document();
// store and index the content, keeping term vectors for highlighting
d.add(new Field(FIELD_NAME, content, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS));
// also add the path: stored, and indexed as a single token (NOT_ANALYZED)
d.add(new Field("path", root.getAbsolutePath(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
// add the document to the index
writer.addDocument(d);
System.out.println("add file: " + root.getAbsolutePath());
} catch (FileNotFoundException e) {
System.out.println("file not found, ignored.");
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
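The Hits class used in search() is deprecated in Lucene 2.9; it is one of the warnings hidden by @SuppressWarnings("deprecation"). For reference, here is a sketch of the same search call and result loop written against the TopDocs API instead; the highlighter setup in between stays as it is, the cap of 100 hits is an arbitrary value for illustration, and TopDocs/ScoreDoc come from org.apache.lucene.search:
// instead of: Hits hits = searcher.search(query);
TopDocs topDocs = searcher.search(query, 100); // 100 = arbitrary cap on returned hits
System.out.println("共搜索到: " + topDocs.totalHits + " 条资源");
for (ScoreDoc sd : topDocs.scoreDocs) {
    Document doc = searcher.doc(sd.doc);
    String text = doc.get(FIELD_NAME);
    String path = doc.get("path");
    TermPositionVector tpv = (TermPositionVector) reader
            .getTermFreqVector(sd.doc, FIELD_NAME);
    TokenStream tokenStream = TokenSources.getTokenStream(tpv);
    String result = highlighter.getBestFragments(tokenStream, text, 5, "...");
    System.out.println("\n文件路径:" + path);
    System.out.println("\n" + result);
}
PaodingChineseAnalyzer.java, the custom analyzer used above: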
import java.io.File;
import java.io.Reader;
import java.util.Set;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import net.paoding.analysis.analyzer.PaodingTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
public class PaodingChineseAnalyzer extends PaodingAnalyzer {
private static String[] stopWords = {
"www", "的", "和", "与", "时", "在",
"是", "被", "所", "那", "这", "有",
"将", "会", "为", "对", "了", "过",
"去" };
@SuppressWarnings("unchecked")
private Set stopSet;
public PaodingChineseAnalyzer() {
stopSet = StopFilter.makeStopSet(stopWords);
}
public PaodingChineseAnalyzer(String[] stopWords) {
stopSet = StopFilter.makeStopSet(stopWords);
}
// load stop words from an external file
public PaodingChineseAnalyzer(File stopwordsFile) {
try {
stopSet = WordlistLoader.getWordSet(stopwordsFile);
} catch (Exception e) {
e.printStackTrace();
}
}
// tokenize with Paoding, then apply the filter chain
@SuppressWarnings("deprecation")
public final TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new PaodingTokenizer(reader, getKnife(),
createTokenCollector());
// stop-word and lowercase filtering; Lucene ships many other TokenFilters that could be chained here
result = new StopFilter(result, stopSet);
result = new LowerCaseFilter(result);
return result;
}
}
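One of the deprecation warnings in PaodingChineseAnalyzer comes from the two-argument StopFilter constructor. Lucene 2.9 offers a variant that takes an explicit enablePositionIncrements flag; below is a sketch of tokenStream() using it, assuming the StopFilter(boolean, TokenStream, Set) constructor available in 2.9, with behaviour otherwise unchanged:
public final TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new PaodingTokenizer(reader, getKnife(),
            createTokenCollector());
    // true: keep position increments across removed stop words
    result = new StopFilter(true, result, stopSet);
    result = new LowerCaseFilter(result);
    return result;
}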