package com.mzsx.tika; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; public class TikaOpera { publicString tikaTool(File f) throws IOException, TikaException { Tikatika = new Tika(); Metadata metadata = new Metadata(); metadata.set(Metadata.AUTHOR, "空号"); metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName()); Stringstr = tika.parseToString(new FileInputStream(f), metadata); for(String name : metadata.names()) { System.out.println(name + ":" + metadata.get(name)); } returnstr; } publicString fileToTxt(File f) { Parserparser = new AutoDetectParser(); InputStream is = null; try { Metadata metadata = new Metadata(); metadata.set(Metadata.AUTHOR, "空号"); metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName()); is =new FileInputStream(f); ContentHandler handler = new BodyContentHandler(); ParseContext context = new ParseContext(); context.set(Parser.class, parser); parser.parse(is, handler, metadata, context); for(String name : metadata.names()) { System.out.println(name + ":" + metadata.get(name)); } return handler.toString(); } catch(FileNotFoundException e) { e.printStackTrace(); } catch(IOException e) { e.printStackTrace(); } catch(SAXException e) { e.printStackTrace(); } catch(TikaException e) { e.printStackTrace(); }finally { try{ if(is != null) is.close(); }catch (IOException e) { e.printStackTrace(); } } returnnull; } }
// 测试代码 (test code)
package com.mzsx.test;

import java.io.File;
import java.io.IOException;

import org.apache.tika.exception.TikaException;
import org.junit.Test;

import com.mzsx.tika.TikaOpera;

/** JUnit driver for {@link TikaOpera}: runs both extraction paths on a sample PDF. */
public class TikaOperaTest {

    // Parser-API path; errors are swallowed inside fileToTxt, so this never throws.
    @Test
    public void testTika01() {
        TikaOpera iu = new TikaOpera();
        System.out.println(iu.fileToTxt(new File("d:/Java学习手记.pdf")));
    }

    // Facade path; propagates IO/Tika failures so JUnit reports them.
    @Test
    public void testToka02() throws IOException, TikaException {
        TikaOpera iu = new TikaOpera();
        System.out.println(iu.tikaTool(new File("d:/Java学习手记.pdf")));
    }
}
package com.mzsx.highlighter; import java.io.File; import java.io.IOException; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; importorg.apache.lucene.index.CorruptIndexException; importorg.apache.lucene.queryParser.MultiFieldQueryParser; importorg.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; importorg.apache.lucene.search.highlight.Formatter; importorg.apache.lucene.search.highlight.Fragmenter; importorg.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; importorg.apache.lucene.search.highlight.QueryScorer; importorg.apache.lucene.search.highlight.SimpleHTMLFormatter; importorg.apache.lucene.search.highlight.SimpleSpanFragmenter; import org.apache.lucene.search.similar.MoreLikeThis; importorg.apache.lucene.search.vectorhighlight.FastVectorHighlighter; importorg.apache.lucene.search.vectorhighlight.FieldQuery; import org.apache.lucene.util.Version; import com.chenlb.mmseg4j.analysis.MMSegAnalyzer; import com.mzsx.index.IndexReaderContext; import com.mzsx.write.DirectoryConext; public class HighlighterOpera { privatestatic MMSegAnalyzer analyzer=new MMSegAnalyzer(newFile("D:\\luceneIndex\\dic")); publicvoid simpleLighter(){ try { String txt = "我爱北京天安门,天安门上彩旗飞,伟大领袖毛主席,指引我们向前进,向前进!!!\n想起身离开东京法律思考的机会那个上的讲话那伟大的个圣诞。那是肯定激发了深刻的机会拉萨宽带计费了那个傻大姐华纳公司的机会节贺卡就是对话框那是国天安门际 北京电话卡开始觉啊北京得人们大会堂 北京!!!!"; //查询 Query query = new QueryParser(Version.LUCENE_35, "f",analyzer).parse("北京伟大"); //高亮的查询评分 QueryScorer scorer = new QueryScorer(query); //高亮的分段 Fragmenter fragment = new SimpleSpanFragmenter(scorer); //高亮的格式 Formatter formatter = new SimpleHTMLFormatter("<spanstyle='color:red'>", "</span>"); //设置高亮 Highlighter highlighter = new 
Highlighter(formatter,scorer); //设置高亮段 highlighter.setTextFragmenter(fragment); String str = highlighter.getBestFragment(analyzer, "f", txt); System.out.println(str); } catch(IOException e) { e.printStackTrace(); } catch(InvalidTokenOffsetsException e) { e.printStackTrace(); } catch(ParseException e) { e.printStackTrace(); } } publicvoid searcherByHighlighter(String name) { try { IndexSearcher searcher = newIndexSearcher(IndexReaderContext.getIndexReader(DirectoryConext.getDirectory("D:/luceneIndex/index"))); //QueryParser parser = newQueryParser(Version.LUCENE_35,"title",a); MultiFieldQueryParser parser = newMultiFieldQueryParser(Version.LUCENE_35, newString[]{"filename","contents"}, analyzer); Query query = parser.parse(name); TopDocs tds = searcher.search(query, 20); MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader()); mlt.setFieldNames(newString[]{"filename","contents"}); mlt.setMinDocFreq(1); mlt.setMinTermFreq(1); System.out.println("总共:"+tds.totalHits); for(ScoreDoc sd:tds.scoreDocs) { Document doc = searcher.doc(sd.doc); String title = doc.get("filename"); title = lighterStr(analyzer, query, title, "filename"); System.out.println(title); String contents = doc.get("contents"); contents = lighterStr(analyzer, query, contents, "contents"); System.out.println(contents); System.out.println("**************************************************************************************************"); Query moreLike = mlt.like(sd.doc); TopDocs stds = searcher.search(moreLike, 10); for(ScoreDoc ssd:stds.scoreDocs) { Document d = searcher.doc(ssd.doc); System.out.println(d.get("filename")); } } searcher.close(); } catch(CorruptIndexException e) { e.printStackTrace(); } catch(IOException e) { e.printStackTrace(); } catch(ParseException e) { e.printStackTrace(); } catch(InvalidTokenOffsetsException e) { e.printStackTrace(); } } publicvoid searcherByFastHighlighter(String name) { try { FastVectorHighlighter fvh = new FastVectorHighlighter(false,false); IndexSearcher 
searcher = newIndexSearcher(IndexReaderContext.getIndexReader(DirectoryConext.getDirectory("D:/luceneIndex/index"))); //QueryParser parser = newQueryParser(Version.LUCENE_35,"title",a); MultiFieldQueryParser parser = newMultiFieldQueryParser(Version.LUCENE_35, newString[]{"filename","contents"}, analyzer); Query query = parser.parse(name); FieldQuery fq = fvh.getFieldQuery(query); TopDocs tds = searcher.search(query, 20); for(ScoreDoc sd:tds.scoreDocs) { String highTitle = fvh.getBestFragment(fq, searcher.getIndexReader(),sd.doc, "filename", 100); System.out.println(highTitle); System.out.println("-------------------------"); String highContent = fvh.getBestFragment(fq, searcher.getIndexReader(), sd.doc,"contents",100); System.out.println(highContent); System.out.println("**********************************************************************"); } searcher.close(); } catch(CorruptIndexException e) { //TODO Auto-generated catch block e.printStackTrace(); } catch(IOException e) { //TODO Auto-generated catch block e.printStackTrace(); } catch(ParseException e) { //TODO Auto-generated catch block e.printStackTrace(); } } privateString lighterStr(Analyzer a,Query query,String txt,String fieldname) throwsIOException, InvalidTokenOffsetsException { Stringstr = null; QueryScorer scorer = new QueryScorer(query); Fragmenterfragmenter = new SimpleSpanFragmenter(scorer); Formatter fmt = new SimpleHTMLFormatter("<b>","</b>"); Highlighter lighter = new Highlighter(fmt, scorer); lighter.setTextFragmenter(fragmenter); str =lighter.getBestFragments(a.tokenStream(fieldname,new StringReader(txt)),txt, 3,"......\n"); if(str==null)return txt; returnstr; } }
// 测试代码 (test code)
package com.mzsx.test;

import org.junit.Test;

import com.mzsx.highlighter.HighlighterOpera;

/** JUnit driver exercising each highlighting demo in {@link HighlighterOpera}. */
public class HighlighterTest {

    @Test
    public void simpleLighter() {
        HighlighterOpera opera = new HighlighterOpera();
        opera.simpleLighter();
    }

    @Test
    public void searcherByHighlighter() {
        HighlighterOpera opera = new HighlighterOpera();
        opera.searcherByHighlighter("台");
    }

    @Test
    public void searcherByFastHighlighter() {
        HighlighterOpera opera = new HighlighterOpera();
        opera.searcherByFastHighlighter("台");
    }
}
1.让solr和tomcat整合
(1)、将solr中的example中的solr拷贝到要作为服务器的位置
(2)、将相应的solr的web程序也拷贝出来
(3)、修改solr-->home文件夹中的solrconfig.xml设置data的路径:
D:\luceneIndex\solr\home\conf\solrconfig.xml
<dataDir>${solr.data.dir:D:\luceneIndex\solr\home\data}</dataDir>
(4)、设置相应的tomcat的context
C:\apache-tomcat-6.0.18\conf\server.xml
<Context path="/solr" docBase="D:\luceneIndex\solr\server\solr" debug="0" crossContext="true"> </Context>
(5)、为context设置相应的环境变量,说明solr的主目录的地址
<Context path="/solr" docBase="D:\luceneIndex\solr\server\solr" debug="0" crossContext="true"> <Environment name="solr/home" type="java.lang.String" value="D:\luceneIndex\solr\home" override="true"/> </Context>
(6)、取消VelocityResponseWriter这种输出格式
<!-- <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" enable="${solr.velocity.enabled:true}"/> -->
(7)、加入中文分词
在 solr 主目录(D:\luceneIndex\solr\home)下的 conf\schema.xml 中进行如下配置:
(7.1)、将中文分词的包拷贝到server的lib中
D:\luceneIndex\solr\server\solr\WEB-INF\lib
(7.2)、将中文分词添加到FieldType中
D:\luceneIndex\solr\home\conf\schema.xml
<!-- MMSeg中文分词器 -->
<fieldType name="textComplex" class="solr.TextField">
  <analyzer>
    <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="complex" dicPath="D:/luceneIndex/dic"/>
  </analyzer>
</fieldType>
<fieldType name="textMaxWord" class="solr.TextField">
  <analyzer>
    <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="max-word" dicPath="D:/luceneIndex/dic"/>
  </analyzer>
</fieldType>
<fieldType name="textSimple" class="solr.TextField">
  <analyzer>
    <tokenizer class="com.chenlb.mmseg4j.solr.MMSegTokenizerFactory" mode="simple" dicPath="D:/luceneIndex/dic"/>
  </analyzer>
</fieldType>
2.solrJ的使用
(1).创建SolrServer
// Base URL of the local Solr instance; every test talks to this server.
private final static String URL = "http://localhost:8080/solr";
private CommonsHttpSolrServer server = null;

/** Creates the SolrJ client before each test; a bad URL is only logged. */
@Before
public void init() {
    try {
        server = new CommonsHttpSolrServer(URL);
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }
}
3.solr添加文档
配置D:\luceneIndex\solr\home\conf\schema.xml:
<defaultSearchField>msg_all</defaultSearchField>
<field name="msg_title" type="textComplex" indexed="true" stored="true"/>
<field name="msg_content" type="textComplex" indexed="true" stored="true"/>
<field name="msg_all" type="textComplex" indexed="true" stored="false" multiValued="true"/>
<copyField source="msg_title" dest="msg_all"/>
<copyField source="msg_content" dest="msg_all"/>
@Test publicvoid test01() { try{ SolrInputDocumentdoc = new SolrInputDocument(); //id是唯一的主键,当多次添加的时候,最后添加的相同id的域会覆盖前面的域 doc.addField("id","1"); doc.addField("msg_title","这是我的第一个solrj的程序"); doc.addField("msg_content","我的solrj的程序究竟能不能跑得起来呢?"); server.add(doc); server.commit(); }catch (MalformedURLException e) { e.printStackTrace(); }catch (SolrServerException e) { e.printStackTrace(); }catch (IOException e) { e.printStackTrace(); } }
/** Adds two documents in one batch via a document list. */
@Test
public void test02() {
    try {
        List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("id", "2");
        doc.addField("msg_title", "很好!solr可以工作了");
        doc.addField("msg_content", "slor总算可以正式工作了");
        docs.add(doc);
        doc = new SolrInputDocument();
        doc.addField("id", "3");
        doc.addField("msg_title", "测试一下solr的添加");
        doc.addField("msg_content", "看看能不能添加一个列表信息");
        docs.add(doc);
        server.add(docs);
        server.commit();
    } catch (SolrServerException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/** Adds documents from annotated Java beans via addBeans. */
@Test
public void test03() {
    try {
        List<Message> msgs = new ArrayList<Message>();
        msgs.add(new Message("4", "基于java bean的添加",
                new String[] { "通过java bean完成添加", "javabean的添加附件" }));
        msgs.add(new Message("5", "基于java bean的列表数据的添加",
                new String[] { "测试如何通过一个对象完成添加", "通过对象完成添加的附件" }));
        server.addBeans(msgs);
        server.commit();
    } catch (SolrServerException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
4.solr的查询
@Test publicvoid test04() { try{ //定义查询字符串 SolrQueryquery = new SolrQuery("*"); query.setStart(0); query.setRows(3); QueryResponseresp = server.query(query); //查询出来的结果都保存在SolrDocumentList中 SolrDocumentListsdl = resp.getResults(); System.out.println(sdl.getNumFound()); for(SolrDocumentsd:sdl) { // System.out.println(sd); System.out.println(sd.getFieldValue("msg_title")+","+sd.getFieldValue("msg_content")); } }catch (SolrServerException e) { e.printStackTrace(); } }
@Test publicvoid test05() { try{ SolrQueryquery = new SolrQuery("*"); query.setStart(0); query.setRows(3); QueryResponseresp = server.query(query); //可以直接查询相应的bean对象,但是不是很常用 List<Message>list = resp.getBeans(Message.class); System.out.println(list.size()); for(Messagemsg:list) { System.out.println(msg.getTitle()); } }catch (SolrServerException e) { e.printStackTrace(); } }
5.高亮查询
@Test publicvoid test06() { try{ SolrQueryquery = new SolrQuery("测试"); query.setHighlight(true).setHighlightSimplePre("<spanclass='highligter'>") .setHighlightSimplePost("</span>") .setStart(0).setRows(5); query.setParam("hl.fl","msg_title,msg_content"); QueryResponseresp = server.query(query); //查询出来的结果都保存在SolrDocumentList中 SolrDocumentListsdl = resp.getResults(); System.out.println(sdl.getNumFound()); for(SolrDocumentsd:sdl) { Stringid = (String)sd.getFieldValue("id"); System.out.println(resp.getHighlighting().get(id).get("msg_content")); } }catch (SolrServerException e) { e.printStackTrace(); } }