You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by "denis.zhdanov" <de...@gmail.com> on 2013/09/09 08:05:50 UTC
SynonymFilter benefit over explicit field composition
Hello, I recently started using Lucene and checking its built-in synonym
processing facilities. So, the main question so far is what is the benefit
of using /SynonymFilter/ over explicitly adding synonyms as document
fields? The former has an obvious drawback: it doesn't support transitive
relations. Consider a simple example below - registering pairs (/"first"/,
/"second"/) and (/"first"/, /"third"/) as synonyms; indexing /"second"/;
searching against /"third"/; no match:
package com.my.social.search.lucene;import
org.apache.lucene.analysis.Analyzer;import
org.apache.lucene.analysis.TokenStream;import
org.apache.lucene.analysis.Tokenizer;import
org.apache.lucene.analysis.core.LowerCaseFilter;import
org.apache.lucene.analysis.core.StopFilter;import
org.apache.lucene.analysis.en.EnglishAnalyzer;import
org.apache.lucene.analysis.en.EnglishPossessiveFilter;import
org.apache.lucene.analysis.en.PorterStemFilter;import
org.apache.lucene.analysis.miscellaneous.LengthFilter;import
org.apache.lucene.analysis.standard.StandardFilter;import
org.apache.lucene.analysis.standard.StandardTokenizer;import
org.apache.lucene.analysis.synonym.SynonymFilter;import
org.apache.lucene.analysis.synonym.SynonymMap;import
org.apache.lucene.analysis.util.StopwordAnalyzerBase;import
org.apache.lucene.document.Document;import
org.apache.lucene.document.Field;import
org.apache.lucene.document.TextField;import
org.apache.lucene.index.DirectoryReader;import
org.apache.lucene.index.IndexWriter;import
org.apache.lucene.index.IndexWriterConfig;import
org.apache.lucene.queryparser.classic.ParseException;import
org.apache.lucene.queryparser.classic.QueryParser;import
org.apache.lucene.search.*;import
org.apache.lucene.store.RAMDirectory;import
org.apache.lucene.util.CharsRef;import org.apache.lucene.util.Version;import
org.jetbrains.annotations.NotNull;import java.io.IOException;import
java.io.Reader;/** * @author Denis Zhdanov * @since 9/5/13 12:10 AM */public
class LuceneTest { public static void main(String[] args) throws
IOException, ParseException { RAMDirectory dir = new RAMDirectory();
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("first"), new CharsRef("second"), true);
builder.add(new CharsRef("first"), new CharsRef("third"), true);
MyAnalyzer analyzer = new MyAnalyzer(builder.build()); try
(IndexWriter writer = new IndexWriter(dir, new
IndexWriterConfig(Version.LUCENE_44, analyzer))) { Document
document = new Document(); document.add(new TextField("tag",
"second", Field.Store.YES)); writer.addDocument(document);
} IndexSearcher searcher = new
IndexSearcher(DirectoryReader.open(dir)); QueryParser queryParser =
new QueryParser(Version.LUCENE_44, "tag", analyzer); Query query =
queryParser.parse("third"); TopDocs hits = searcher.search(query,
null, 10); for (ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = searcher.doc(scoreDoc.doc);
System.out.println(doc.get("tag")); //Explanation explain =
searcher.explain(query, scoreDoc.doc);
//System.out.println(explain); } } private static class
MyAnalyzer extends StopwordAnalyzerBase { private final SynonymMap
synonyms; MyAnalyzer(@NotNull SynonymMap synonyms) {
super(Version.LUCENE_44); this.synonyms = synonyms; }
@Override protected Analyzer.TokenStreamComponents
createComponents(String fieldName, Reader reader) { final
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new EnglishPossessiveFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result); result = new
SynonymFilter(result, synonyms, true); result = new
PorterStemFilter(result); return new
Analyzer.TokenStreamComponents(source, result); } }}
That means that I need to explicitly register all possible pairs from a set
of synonyms to get the SynonymFilter-based approach to work (I have a large set of
English synonyms (built from the Gutenberg dictionary) where every synonym
group contains more than two words). The only possible benefit of using
/SynonymFilter/ that I see so far is phrase search, where synonym position matters:
package com.my.social.search.lucene;import
org.apache.lucene.analysis.Analyzer;import
org.apache.lucene.analysis.TokenStream;import
org.apache.lucene.analysis.Tokenizer;import
org.apache.lucene.analysis.core.LowerCaseFilter;import
org.apache.lucene.analysis.core.StopFilter;import
org.apache.lucene.analysis.en.EnglishAnalyzer;import
org.apache.lucene.analysis.en.EnglishPossessiveFilter;import
org.apache.lucene.analysis.en.PorterStemFilter;import
org.apache.lucene.analysis.miscellaneous.LengthFilter;import
org.apache.lucene.analysis.standard.StandardFilter;import
org.apache.lucene.analysis.standard.StandardTokenizer;import
org.apache.lucene.analysis.synonym.SynonymFilter;import
org.apache.lucene.analysis.synonym.SynonymMap;import
org.apache.lucene.analysis.util.StopwordAnalyzerBase;import
org.apache.lucene.document.Document;import
org.apache.lucene.document.Field;import
org.apache.lucene.document.TextField;import
org.apache.lucene.index.DirectoryReader;import
org.apache.lucene.index.IndexWriter;import
org.apache.lucene.index.IndexWriterConfig;import
org.apache.lucene.queryparser.classic.ParseException;import
org.apache.lucene.queryparser.classic.QueryParser;import
org.apache.lucene.search.*;import
org.apache.lucene.store.RAMDirectory;import
org.apache.lucene.util.CharsRef;import org.apache.lucene.util.Version;import
org.jetbrains.annotations.NotNull;import java.io.IOException;import
java.io.Reader;/** * @author Denis Zhdanov * @since 9/5/13 12:10 AM */public
class LuceneTest { public static void main(String[] args) throws
IOException, ParseException { RAMDirectory dir = new RAMDirectory();
SynonymMap.Builder builder = new SynonymMap.Builder(true);
builder.add(new CharsRef("first"), new CharsRef("second"), true);
MyAnalyzer analyzer = new MyAnalyzer(builder.build()); try
(IndexWriter writer = new IndexWriter(dir, new
IndexWriterConfig(Version.LUCENE_44, analyzer))) { Document
document = new Document(); document.add(new TextField("tag",
"second point or number", Field.Store.YES));
writer.addDocument(document); document = new Document();
document.add(new TextField("tag", "first number dummy", Field.Store.YES));
writer.addDocument(document); } IndexSearcher searcher = new
IndexSearcher(DirectoryReader.open(dir)); QueryParser queryParser =
new QueryParser(Version.LUCENE_44, "tag", analyzer); Query query =
queryParser.parse("\"second number\""); TopDocs hits =
searcher.search(query, null, 10); for (ScoreDoc scoreDoc :
hits.scoreDocs) { Document doc = searcher.doc(scoreDoc.doc);
System.out.println(doc.get("tag")); //Explanation explain =
searcher.explain(query, scoreDoc.doc);
//System.out.println(explain); } } private static class
MyAnalyzer extends StopwordAnalyzerBase { private final SynonymMap
synonyms; MyAnalyzer(@NotNull SynonymMap synonyms) {
super(Version.LUCENE_44); this.synonyms = synonyms; }
@Override protected Analyzer.TokenStreamComponents
createComponents(String fieldName, Reader reader) { final
Tokenizer source = new StandardTokenizer(matchVersion, reader);
TokenStream result = new StandardFilter(matchVersion, source);
result = new EnglishPossessiveFilter(matchVersion, result);
result = new LowerCaseFilter(matchVersion, result); result = new
SynonymFilter(result, synonyms, true); result = new
PorterStemFilter(result); return new
Analyzer.TokenStreamComponents(source, result); } }}
I googled to find out whether my understanding is correct, but
unfortunately found no results. That's why I decided to ask the
community before digging into the Lucene sources.
--
View this message in context: http://lucene.472066.n3.nabble.com/SynonymFilter-benefit-over-explicit-field-composition-tp4088819.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.
Re: SynonymFilter benefit over explicit field composition
Posted by "denis.zhdanov" <de...@gmail.com>.
Just realised that it's not necessary to add all possible synonym pairs -
it's enough to choose any synonym from a synonym group and register all
other synonyms against it. Then it's only necessary to index that 'key'
synonym.
--
View this message in context: http://lucene.472066.n3.nabble.com/SynonymFilter-benefit-over-explicit-field-composition-tp4088819p4088820.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org