Lucene 5 TokenStream
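
The following JUnit test builds a small in-memory index with AnsjAnalyzer, then opens a TokenStream over a Chinese sentence and prints the standard token attributes (term, type, flags, position increment, payload, and offsets). Notes on what each attribute means follow the listing.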

package com.lucene5.dream;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;

import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

public class TokenStreamTest {
	static Analyzer analyzer;
	static Directory d;
	static IndexWriterConfig conf;
	static IndexWriter indexWriter;

	private static final FieldType DOUBLE_FIELD_TYPE_STORED_SORTED = new FieldType();

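	// FieldType for the price field: stored, indexed as a trie-encoded double
	// (DOCS only, no norms), with NUMERIC doc values so results can be sorted
	// by price without un-inverting the field.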
	static {
		DOUBLE_FIELD_TYPE_STORED_SORTED.setTokenized(true);
		DOUBLE_FIELD_TYPE_STORED_SORTED.setOmitNorms(true);
		DOUBLE_FIELD_TYPE_STORED_SORTED.setIndexOptions(IndexOptions.DOCS);
		DOUBLE_FIELD_TYPE_STORED_SORTED.setNumericType(FieldType.NumericType.DOUBLE);
		DOUBLE_FIELD_TYPE_STORED_SORTED.setStored(true);
		DOUBLE_FIELD_TYPE_STORED_SORTED.setDocValuesType(DocValuesType.NUMERIC);
		DOUBLE_FIELD_TYPE_STORED_SORTED.freeze();
	}

	@BeforeClass
	public static void setup() throws Exception {
		analyzer = new AnsjAnalyzer("user");
		d = new RAMDirectory();
		conf = new IndexWriterConfig(analyzer);
		indexWriter = new IndexWriter(d, conf);
		InputStream is = TokenStreamTest.class.getResourceAsStream("/data/data");
		BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
		String line = null;
		while ((line = br.readLine()) != null) {
			String[] elements = line.split("##"); // line format: category##brandName##productName##price
			Document document = new Document();
			StringField category = new StringField("category", elements[0], Store.YES);
			TextField brandName = new TextField("brandName", elements[1], Store.YES);
			TextField productName = new TextField("productName", elements[2], Store.YES);
			DoubleField price = new DoubleField("price", Double.valueOf(elements[3]), DOUBLE_FIELD_TYPE_STORED_SORTED);
			document.add(category);
			document.add(brandName);
			document.add(productName);
			document.add(price);
			indexWriter.addDocument(document);
		}
		indexWriter.commit(); // keep the writer open; it is closed once in teardown()
		br.close();
		is.close();
	}

	@AfterClass
	public static void teardown() {
		try {
			indexWriter.close();
			analyzer.close();
			d.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	@Test
	public void testTokenStreamAttributes() {
		StringReader reader = new StringReader("我爱北京天安门,我在天安门广场上看见很多人");
		// try-with-resources closes the TokenStream even if tokenization fails
		try (TokenStream tokenStream = analyzer.tokenStream("message", reader)) {
			// addAttribute (unlike getAttribute) never throws: it registers the
			// attribute if the analyzer's chain did not already provide it
			CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
			TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
			FlagsAttribute flagsAttribute = tokenStream.addAttribute(FlagsAttribute.class);
			PositionIncrementAttribute positionIncrementAttribute = tokenStream
					.addAttribute(PositionIncrementAttribute.class);
			PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
			OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				System.err.print("term:" + charTermAttribute.toString());
				System.err.print(" type:" + typeAttribute.type());
				System.err.print(" flags:" + flagsAttribute.getFlags());
				System.err.print(" posInc:" + positionIncrementAttribute.getPositionIncrement());
				System.err.print(" payload:" + payloadAttribute.getPayload());
				System.err.print(" start:" + offsetAttribute.startOffset());
				System.err.print(" end:" + offsetAttribute.endOffset());
				System.err.println();
			}
			tokenStream.end();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

/**
 * OffsetAttribute
 * 
 * Records the start and end character offsets of the current term within the
 * original text.
 * 
 * TypeAttribute
 * 
 * Describes the lexical type of the token.
 * 
 * FlagsAttribute
 * 
 * Similar to TypeAttribute, but serves a different purpose. Suppose you need
 * to attach specific information to a token and make that information
 * available further down the analyzer chain: you can pass it as flags.
 * TokenFilters can then perform any specific action based on the flags of the
 * token.
 * 
 * PayloadAttribute
 * 
 * Stores a payload at each index position; payloads are generally useful in
 * scoring when used with payload-based queries. Because a payload is stored at
 * every position, it is best to keep the number of bytes per term small to
 * avoid overloading the index with a massive amount of data.
 */
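
To make the FlagsAttribute and PayloadAttribute descriptions concrete, here is a minimal sketch of a custom TokenFilter, assuming it is wired into an analyzer chain after the tokenizer. MarkLongTokenFilter and LONG_TOKEN_FLAG are hypothetical names used for illustration, not part of Lucene or Ansj. The filter raises a flag bit on tokens longer than two characters and stores the term length as a one-byte payload, which a downstream TokenFilter or a payload-based query could inspect.

package com.lucene5.dream;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

// Hypothetical example class, not part of the test above.
public final class MarkLongTokenFilter extends TokenFilter {

	// Hypothetical flag bit; Lucene itself assigns no meaning to flag values.
	public static final int LONG_TOKEN_FLAG = 1;

	private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
	private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
	private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

	public MarkLongTokenFilter(TokenStream input) {
		super(input);
	}

	@Override
	public boolean incrementToken() throws IOException {
		if (!input.incrementToken()) {
			return false; // end of stream
		}
		if (termAtt.length() > 2) {
			// Raise a flag bit that later TokenFilters in the chain can check.
			flagsAtt.setFlags(flagsAtt.getFlags() | LONG_TOKEN_FLAG);
			// Store the term length as a one-byte payload at this position.
			payloadAtt.setPayload(new BytesRef(new byte[] { (byte) termAtt.length() }));
		}
		return true;
	}
}

To use such a filter, wrap the tokenizer output with it in a custom Analyzer's createComponents. Flags travel with each token through the rest of the chain but are not indexed, while payloads are written into the index postings at each position.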

