/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.io.FastStringReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;

import java.io.Reader;
import java.util.List;
import java.util.Map;

@AnalysisSettingsRequired
public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {

    private final SynonymMap synonymMap;
    private final boolean ignoreCase;

    @Inject
    public SynonymTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env,
                                     IndicesAnalysisService indicesAnalysisService,
                                     Map<String, TokenizerFactoryFactory> tokenizerFactories,
                                     @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettingsService.getSettings(), name, settings);

        // reader over the synonym rules
        Reader rulesReader = null;
        // inline rules configured via the `synonyms` setting
        if (settings.getAsArray("synonyms", null) != null) {
            List<String> rules = Analysis.getWordList(env, settings, "synonyms");
            StringBuilder sb = new StringBuilder();
            for (String line : rules) {
                sb.append(line).append(System.getProperty("line.separator"));
            }
            rulesReader = new FastStringReader(sb.toString());
        } else if (settings.get("synonyms_path") != null) {
            // rules file configured via the `synonyms_path` setting:
            // open a reader over the file at the configured path
            rulesReader = Analysis.getReaderFromFile(env, settings, "synonyms_path");
        } else {
            throw new IllegalArgumentException("synonym requires either `synonyms` or `synonyms_path` to be configured");
        }

        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        boolean expand = settings.getAsBoolean("expand", true);

        // tokenizer used to analyze the synonym rules themselves (defaults to whitespace)
        String tokenizerName = settings.get("tokenizer", "whitespace");

        // look up the TokenizerFactoryFactory: first among the index-level factories, then the node-level ones
        TokenizerFactoryFactory tokenizerFactoryFactory = tokenizerFactories.get(tokenizerName);
        if (tokenizerFactoryFactory == null) {
            tokenizerFactoryFactory = indicesAnalysisService.tokenizerFactoryFactory(tokenizerName);
        }
        if (tokenizerFactoryFactory == null) {
            throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
        }
        final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.create(tokenizerName,
                Settings.builder().put(indexSettingsService.getSettings()).put(settings).build());

        // analyzer used only to tokenize the synonym rules while building the SynonymMap
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };

        try {
            SynonymMap.Builder parser = null;

            if ("wordnet".equalsIgnoreCase(settings.get("format"))) {
                parser = new WordnetSynonymParser(true, expand, analyzer);
                // parse the rules stream in WordNet prolog format
                ((WordnetSynonymParser) parser).parse(rulesReader);
            } else {
                // default: Solr synonym format
                parser = new SolrSynonymParser(true, expand, analyzer);
                ((SolrSynonymParser) parser).parse(rulesReader);
            }

            synonymMap = parser.build();
        } catch (Exception e) {
            throw new IllegalArgumentException("failed to build synonyms", e);
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // fst is null means no synonyms; otherwise delegate to Lucene's SynonymFilter
        return synonymMap.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonymMap, ignoreCase);
    }
}
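To make the factory's behavior concrete, here is a minimal, self-contained sketch of the same building blocks it uses: SolrSynonymParser turns Solr-format rules into a SynonymMap, and SynonymFilter applies that map to a token stream. The rules, field name, and class name below are made up for illustration, and the exact order of injected tokens may vary by Lucene version.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SynonymFilterSketch {
    public static void main(String[] args) throws Exception {
        // Solr-format rules, the same syntax the `synonyms` setting accepts
        String rules = "i-pod, ipod\nuniverse, cosmos => universe";

        Analyzer analyzer = new WhitespaceAnalyzer();

        // dedup = true, expand = true mirrors the factory's defaults above
        SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
        parser.parse(new StringReader(rules));
        SynonymMap synonymMap = parser.build();

        // wrap an arbitrary token stream, just like create(TokenStream) does
        try (TokenStream ts = new SynonymFilter(analyzer.tokenStream("f", "my ipod"), synonymMap, true)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints "my", then "ipod" and the injected "i-pod" at the same position
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}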
    /**
     * Obtains a reader over the synonym rules file configured under the given setting prefix.
     *
     * @return <code>null</code> if no value is set for the given setting prefix
     * @throws IllegalArgumentException if the Reader can not be instantiated
     */
    public static Reader getReaderFromFile(Environment env, Settings settings, String settingPrefix) {
        String filePath = settings.get(settingPrefix, null);

        if (filePath == null) {
            return null;
        }

        // resolve the path relative to the config directory
        final Path path = env.configFile().resolve(filePath);

        try {
            return FileSystemUtils.newBufferedReader(path.toUri().toURL(), Charsets.UTF_8);
        } catch (IOException ioe) {
            String message = String.format(Locale.ROOT, "IOException while reading %s_path: %s", settingPrefix, ioe.getMessage());
            throw new IllegalArgumentException(message);
        }
    }
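For orientation, the two configuration branches above (inline `synonyms` vs. a `synonyms_path` file resolved against the config directory) correspond to per-filter settings objects like the ones sketched below. This is only a sketch of the Settings the factory receives; the file name analysis/synonyms.txt is made up for illustration.

import org.elasticsearch.common.settings.Settings;

public class SynonymSettingsSketch {

    // Inline rules: the factory joins these lines and wraps them in a FastStringReader.
    static Settings inlineRules() {
        return Settings.builder()
                .put("type", "synonym")
                .putArray("synonyms", "i-pod, ipod", "universe, cosmos => universe")
                .put("ignore_case", true)
                .put("expand", true)
                .build();
    }

    // File-based rules: getReaderFromFile() resolves the path against the config directory.
    static Settings fileRules() {
        return Settings.builder()
                .put("type", "synonym")
                .put("synonyms_path", "analysis/synonyms.txt")
                .build();
    }
}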
package org.apache.lucene.analysis.synonym;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
/**
* Parser for wordnet prolog format
* <p>
* See http://wordnet.princeton.edu/man/prologdb.5WN.html for a description of the format.
* This class is a SynonymMap.Parser subclass.
* @lucene.experimental
*/

// TODO: allow you to specify syntactic categories (e.g. just nouns, etc)
public class WordnetSynonymParser extends SynonymMap.Parser {
  private final boolean expand;

  public WordnetSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
    super(dedup, analyzer);
    this.expand = expand;
  }

  @Override
  public void parse(Reader in) throws IOException, ParseException {
    // parse the input line by line
    LineNumberReader br = new LineNumberReader(in);
    try {
      String line = null;
      String lastSynSetID = "";
      CharsRef synset[] = new CharsRef[8];
      int synsetSize = 0;

      while ((line = br.readLine()) != null) {
        String synSetID = line.substring(2, 11);

        // a new synset id closes the previous synset
        if (!synSetID.equals(lastSynSetID)) {
          addInternal(synset, synsetSize);
          synsetSize = 0;
        }

        if (synset.length <= synsetSize + 1) {
          synset = Arrays.copyOf(synset, synset.length * 2);
        }

        synset[synsetSize] = parseSynonym(line, new CharsRefBuilder());
        synsetSize++;
        lastSynSetID = synSetID;
      }

      // final synset in the file
      addInternal(synset, synsetSize);
    } catch (IllegalArgumentException e) {
      ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
      ex.initCause(e);
      throw ex;
    } finally {
      br.close();
    }
  }

  private CharsRef parseSynonym(String line, CharsRefBuilder reuse) throws IOException {
    if (reuse == null) {
      reuse = new CharsRefBuilder();
    }

    // the synonym text is the single-quoted word; '' is the escape for a literal quote
    int start = line.indexOf('\'') + 1;
    int end = line.lastIndexOf('\'');

    String text = line.substring(start, end).replace("''", "'");
    return analyze(text, reuse);
  }

  private void addInternal(CharsRef synset[], int size) {
    if (size <= 1) {
      return; // nothing to do
    }

    if (expand) {
      // map every entry of the synset to every other entry
      for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
          add(synset[i], synset[j], false);
        }
      }
    } else {
      // map every entry to the first entry only
      for (int i = 0; i < size; i++) {
        add(synset[i], synset[0], false);
      }
    }
  }
}
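As a quick illustration of the format this parser expects, the sketch below feeds two WordNet prolog facts with the same synset id through WordnetSynonymParser. The second fact is invented for the example and the class name is arbitrary; zero-based characters 2 to 10 of each line carry the synset id that groups entries, and the quoted word is what parseSynonym() extracts.

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;

public class WordnetParserSketch {
    public static void main(String[] args) throws Exception {
        // two prolog facts sharing synset id 100001740; parseSynonym() extracts
        // the quoted word (a doubled '' would unescape to a single quote)
        String wordnet =
                "s(100001740,1,'entity',n,1,11).\n" +
                "s(100001740,2,'physical thing',n,1,0).\n";

        Analyzer analyzer = new WhitespaceAnalyzer();

        // dedup = true, expand = true, as in the factory's wordnet branch
        WordnetSynonymParser parser = new WordnetSynonymParser(true, true, analyzer);
        parser.parse(new StringReader(wordnet));
        SynonymMap map = parser.build();

        // the fst is non-null once at least one synset with two or more entries was added
        System.out.println("synonym map built, fst != null: " + (map.fst != null));
    }
}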