During my internship, my lead asked me to use Lucene (without Solr) to index the data a colleague needed to query, so that lookups would be faster.
The data to be queried lives in two tables, a street table and a district (administrative division) table, where one district maps to N streets. The search throughput target is 1,000 queries per second.
Database query
SELECT area_id, area_name, parent_id, parent_name, area_level, `status`, address
FROM area_code_table LEFT JOIN
  (SELECT City, District, GROUP_CONCAT(Address) AS address
   FROM table_address
   GROUP BY City, District) AS addr
ON area_code_table.area_name = addr.District
AND area_code_table.parent_name = addr.City
Besides the LEFT JOIN, the query relies on MySQL's GROUP_CONCAT, a row-to-string aggregation that folds the grouped rows into one comma-separated value: three rows aa, bb, cc become the single string aa,bb,cc. This puts all of a district's addresses into one column, which greatly simplifies the later lookup.
Project structure
The project uses iBATIS, a lightweight persistence framework, and is organized with Maven. Maven makes pulling in jars very convenient, but packaging the project turned out to be a major headache.
Main program for creating the index
package com.xmmy.areaindex.index;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.xmmy.areaindex.db.AreaDaoImpl;
import com.xmmy.areaindex.params.IndexParams;
import com.xmmy.areaindex.pojo.Area;
import com.xmmy.areaindex.util.DateUtil;
public class IndexCreate {
public static void main(String[] args) {
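//The original main() body is empty; one likely way to drive it (the DAO method name
//listAllAreas() below is a hypothetical placeholder, not taken from the original project):
//List<Area> areaList = new AreaDaoImpl().listAllAreas();
//new IndexCreate().createAreaIndex(areaList);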
}
public void createAreaIndex(List<Area> areaList) throws IOException {
//Folder where the index files are stored
File indexFile = new File(IndexParams.indexAddressSavePath);
//Create the Directory object
Directory directory = FSDirectory.open(indexFile);
//Bigram (two-character) CJK analyzer, kept as an alternative:
//Analyzer cjkAnalyzer = new CJKAnalyzer();
//Unigram (single-character) analysis with StandardAnalyzer
Analyzer standardAnalyzer = new StandardAnalyzer();
//Create the IndexWriterConfig
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_4, standardAnalyzer);
//IndexWriter settings
indexWriterConfig.setMaxBufferedDocs(10000);
//Create the IndexWriter
IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig);
int n = 0;
//Build the FieldType
FieldType fieldType = new FieldType();
fieldType.setIndexed(true); //index the field
fieldType.setStored(true); //store the original value
fieldType.setStoreTermVectors(true);
fieldType.setTokenized(true);
fieldType.setStoreTermVectorPositions(true);// store term positions
fieldType.setStoreTermVectorOffsets(true);// store term offsets
Document doc = null;
try {
//System.out.println("areaList length = " + areaList.size());
for (Area area : areaList) {
//Replace any null fields in the area object before indexing
checkAreaNull(area);
//Create a new Lucene document
doc = new Document();
//Add the fields to the document
doc.add(new Field("area_id", area.getArea_id() + "", fieldType));
//Boost the area_name field so district-name matches rank higher
Field area_name = new Field("area_name", area.getArea_name(), fieldType);
area_name.setBoost(10);
doc.add(area_name);
doc.add(new Field("parent_id", area.getParent_id() + "", fieldType));
doc.add(new Field("parent_name", area.getParent_name(), fieldType));
doc.add(new Field("area_level", area.getArea_level() + "", fieldType));
doc.add(new Field("status", area.getStatus() + "", fieldType));
doc.add(new Field("address", area.getAddress(), fieldType));
n++;
indexWriter.addDocument(doc);
}
System.out.println("indexwriter = " + indexWriter.numDocs());
} catch (Exception e) {
//Print the record that caused the failure
System.out.println(areaList.get(n).toString());
e.printStackTrace();
} finally {
indexWriter.commit();
indexWriter.forceMerge(1);
indexWriter.close();
}
}
/**
* Check the Area object for null values and substitute placeholder strings
*/
public void checkAreaNull(Area area) {
if (area.getArea_name() == null) {
area.setArea_name("无area_name");
}
if (area.getParent_name() == null) {
area.setParent_name("无parent_name");
}
if (area.getAddress() == null) {
area.setAddress("无address");
}
}
}
The index is built in batches on multiple threads; in testing it indexed 1.6 million rows without problems.
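The threading code itself is not shown here, so the following is only a minimal sketch of how the batching could look: it assumes a single IndexWriter shared by all worker threads (IndexWriter is thread-safe) and a hypothetical buildDocument(Area) helper standing in for the Field-building code in createAreaIndex. It is an illustration, not the project's actual implementation.
package com.xmmy.areaindex.index;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexWriter;
import com.xmmy.areaindex.pojo.Area;
public class BatchIndexSketch {
    //Index one batch per task on a fixed thread pool; the shared IndexWriter is thread-safe
    public void indexInBatches(final IndexWriter writer, List<Area> areas, int batchSize)
            throws IOException, InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(4); //thread count is a guess
        for (int from = 0; from < areas.size(); from += batchSize) {
            final List<Area> batch = areas.subList(from, Math.min(from + batchSize, areas.size()));
            pool.submit(new Runnable() {
                public void run() {
                    for (Area area : batch) {
                        try {
                            writer.addDocument(buildDocument(area)); //hypothetical helper, defined below
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            });
        }
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
        writer.commit();
    }
    //Hypothetical helper: builds one document the same way createAreaIndex does (only two fields shown)
    private Document buildDocument(Area area) {
        FieldType fieldType = new FieldType();
        fieldType.setIndexed(true);
        fieldType.setStored(true);
        fieldType.setTokenized(true);
        Document doc = new Document();
        doc.add(new Field("area_name", area.getArea_name(), fieldType));
        doc.add(new Field("address", area.getAddress(), fieldType));
        return doc;
    }
}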
Searching the index
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.xmmy.areaindex.params.IndexParams;
import com.xmmy.areaindex.params.SearchParams;
import com.xmmy.areaindex.pojo.Area;
import com.xmmy.areaindex.pojo.AreaCode;
import com.xmmy.areaindex.pojo.AreaIndex;
import com.xmmy.areaindex.util.GsonUtil;
public class SearchIndex {
//Maximum number of hits to return
static Integer maxDocNum = 50;
//Fields to search against
String[] fields = new String[]{"area_name", "address"};
//Query parser covering both fields
QueryParser queryParser = new MultiFieldQueryParser(Version.LATEST, fields, new StandardAnalyzer());
//IndexReader
IndexReader indexReader = null;
//IndexSearcher
IndexSearcher indexSearcher = null;
public SearchIndex() {
try {
this.indexReader = DirectoryReader.open(FSDirectory.open(new File(IndexParams.indexOldAreaSavePath)));
this.indexSearcher = new IndexSearcher(this.indexReader);
} catch (IOException e) {
e.printStackTrace();
}
}
protected List<AreaCode> search(String text) throws Exception {
List<AreaCode> result = new ArrayList<AreaCode>();
Query query = queryParser.parse(text);
TopDocs topDocs = indexSearcher.search(query, maxDocNum);
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (ScoreDoc scoreDoc : scoreDocs) {
Document document = indexSearcher.doc(scoreDoc.doc);
//Result object
AreaCode areaCode = new AreaCode();
//Populate the object
//If the hit came from the address field, mark source as 1, otherwise 0
if (document.get("address").contains(text.replace("\"", ""))) {
areaCode.setSource(1);
}else {
areaCode.setSource(0);
}
areaCode.setArea_id(Integer.parseInt(document.get("area_id")));
areaCode.setArea_name(document.get("area_name"));
areaCode.setParent_id(Integer.parseInt(document.get("parent_id")));
areaCode.setParent_name(document.get("parent_name"));
areaCode.setArea_level(Integer.parseInt(document.get("area_level")));
if (areaCode.getSource() == 1) {
areaCode.setFreq(getTF(document.get("address"), text));
}else {
areaCode.setFreq(1);
}
result.add(areaCode);
//System.out.println(result.size());
}
return result;
}
public static Integer getTF(String address, String text) throws Exception {
//Number of occurrences (frequency)
//String text = "\"尖山街\"";
int freq = 0;
StringTokenizer st = new StringTokenizer(address, ",", false);
while (st.hasMoreElements()) {
if (st.nextToken().contains(text.replace("\"", ""))) {
freq++;
}
}
//System.out.println("freq = " + freq);
return freq;
}
public List<AreaCode> searchByArea(String q) {
List<AreaCode> result = null;
//Build the query string
String query = "area_name:\"" + q + "\"";
try {
result = this.search(query);
} catch (Exception e) {
e.printStackTrace();
result = null;
}
return result;
}
public List<AreaCode> searchByAddress(String q){
List<AreaCode> result = null;
//Build the query string
String query = "address:\"" + q + "\"";
try {
result = this.search(query);
} catch (Exception e) {
e.printStackTrace();
result = null;
}
return result;
}
public static void main(String[] args) {
//Wrap the keyword in double quotes to force an exact (phrase) match
// String text = "area_name:\"湖南省1\"";//searches only the area_name field
//text = "\"思明区\"";//matches docs whose searched fields (area_name, address) contain the full phrase 思明区
SearchIndex searchIndex = new SearchIndex();
/**
* Timing test: roughly a thousand queries, one per line of the input file
*/
ArrayList<String> line = new ArrayList<String>();
try {
InputStreamReader read = new InputStreamReader(new FileInputStream(new File("F:\\search.txt")));
BufferedReader bf = new BufferedReader(read);
String lineTxt = null;
while ((lineTxt = bf.readLine()) != null) {
line.add(lineTxt);
}
//System.out.println(line.size());
} catch (Exception e1) {
e1.printStackTrace();
}
Iterator<String> it = line.iterator();
long start = System.currentTimeMillis();
String searchText = null;
while (it.hasNext()) {
searchText = "\"" + it.next() + "\"";
try {
//System.out.println(searchText);
searchIndex.search(searchText);
} catch (Exception e) {
e.printStackTrace();
}
}
long end = System.currentTimeMillis();
System.out.println("耗时: " + (end - start));
}
}
The project needs the term frequency of each hit, but after going through the Lucene documentation I could not find a suitable API, so I implemented it myself (the getTF method above counts occurrences in the stored address string).
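Because the FieldType above stores term vectors, the frequency of a single analyzed token can in principle also be read back from the index itself. The sketch below shows that route as a possible alternative, not the code used in this project; it only works per token, and since StandardAnalyzer splits Chinese text into single-character terms it cannot give the frequency of a multi-character phrase directly, which is presumably why getTF counts matches in the stored address string instead.
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
public class TermVectorFreqSketch {
    //Read a single token's frequency in one document from its stored term vector (Lucene 4.x API)
    public static long termFreqFromVector(IndexReader reader, int docId, String field, String token)
            throws IOException {
        Terms terms = reader.getTermVector(docId, field); //term vector of this one document
        if (terms == null) {
            return 0;
        }
        TermsEnum te = terms.iterator(null); //Lucene 4.x signature takes a reusable TermsEnum
        return te.seekExact(new BytesRef(token)) ? te.totalTermFreq() : 0;
    }
}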