You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Christian Kaufhold <Ch...@bsb-muenchen.de> on 2017/07/25 06:52:34 UTC
synonyms
Hi,
I am not able to add synonyms to the lucene index.
I condensed my problem into the following class which is based on a Hello World example.
The idea behind the code was to add a document with universität and the synonym 'Hochschule' (highschool)
so that Lucene finds universität when I query Hochschule.
But it doesn't and I checked the index contents with a term iterator.
Hochschule is not added to the index:
package test;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
public class LuceneHelloWorld {
public static void main(String[] args) throws Exception {
Analyzer analyzer = getAnalyzer();
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
IndexWriter writer = new IndexWriter(directory, config);
addDoc( writer, "people", "Hello Universität" );
addDoc( writer, "world", "Hello World" );
addDoc( writer, "people", "Hello people" );
writer.close();
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher (reader);
QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
test( parser, searcher, "Hochschule");
test( parser, searcher, "Hello");
test( parser, searcher, "people");
test( parser, searcher, "universität");
printIndexTerms( reader);
}
public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
Document document = new Document ();
document.add(new TextField("title", title, Field.Store.YES));
document.add(new TextField("content", content, Field.Store.YES));
writer.addDocument(document);
}
public static Analyzer getAnalyzer(){
//return new StandardAnalyzer(Version.LUCENE_48);
//return new SynonymAnalyzer();
//return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
// TODO Auto-generated method stub
Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
filter = new LowerCaseFilter(Version.LUCENE_48,filter);
SynonymMap mySynonymMap = null;
try {
//mySynonymMap = buildSynonym();
SynonymMap.Builder builder = new SynonymMap.Builder(true);
//loadSynonyms(builder);
builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
mySynonymMap = builder.build();
} catch (IOException e) {
// TODO Auto-generated catch bl filter = new SynonymFilter(filter, mySynonymMap, false);
return new TokenStreamComponents(source, filter);
}
};
}
static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
Query query = parser.parse(queryStr);
TopDocs results = searcher.search(query, 10);
System.out.println( "#Hits: " + results.totalHits + " : " + queryStr);
}
public static void printIndexTerms( IndexReader reader) throws Exception {
Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
BytesRef byteRef;
while ( (byteRef = iterator.next())!=null){
String term = byteRef.utf8ToString();
System.out.println(" term: " + term);
}
}
}
output:
#Hits: 0 : Hochschule
#Hits: 3 : Hello
#Hits: 1 : people
#Hits: 1 : universität
term: hello
term: people
term: universität
term: world
thanks in advance
christian
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: synonyms
Posted by Christian Kaufhold <Ch...@bsb-muenchen.de>.
Yep, you hit the point.
Thank you so much!
Output is now
#Hits: 1 : Hochschule
#Hits: 3 : Hello
#Hits: 1 : people
#Hits: 1 : universität
term: hello
term: hochschule
term: people
term: universität
term: world
>>> Alan Woodward <al...@flax.co.uk> 07/25/17 9:14 AM >>>
You have a LowerCaseFilter before your SynonymFilter, which means that the entries in your SynonymMap need to be all lowercase or they won’t be matched.
Alan Woodward
www.flax.co.uk
> On 25 Jul 2017, at 07:52, Christian Kaufhold <Ch...@bsb-muenchen.de> wrote:
>
> Hi,
>
> I am not able to add synonyms to the lucene index.
> I condensed my problem into the following class which is based on a Hello World example.
> The idea behind the code was to add a document with universität and the synonym 'Hochschule' (highschool)
> so that lucene finds universität wenn I query Hochschule.
> But it doesn't and I checked the index contents with a term iterator.
> Hochschule is not added to the index:
>
>
> package test;
>
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> import org.apache.lucene.analysis.core.LowerCaseFilter;
> import org.apache.lucene.analysis.standard.ClassicTokenizer;
> import org.apache.lucene.analysis.standard.StandardFilter;
> import org.apache.lucene.analysis.synonym.SynonymFilter;
> import org.apache.lucene.analysis.synonym.SynonymMap;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.*;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.CharsRef;
> import org.apache.lucene.util.Version;
>
> import java.io.IOException;
> import java.io.Reader;
>
> public class LuceneHelloWorld {
>
> public static void main(String[] args) throws Exception {
>
> Analyzer analyzer = getAnalyzer();
> Directory directory = new RAMDirectory();
> IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
>
> IndexWriter writer = new IndexWriter(directory, config);
> addDoc( writer, "people", "Hello Universität" );
> addDoc( writer, "world", "Hello World" );
> addDoc( writer, "people", "Hello people" );
> writer.close();
>
> IndexReader reader = DirectoryReader.open(directory);
> IndexSearcher searcher = new IndexSearcher (reader);
> QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
>
> test( parser, searcher, "Hochschule");
> test( parser, searcher, "Hello");
> test( parser, searcher, "people");
> test( parser, searcher, "universität");
>
> printIndexTerms( reader);
>
> }
>
> public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
> Document document = new Document ();
> document.add(new TextField("title", title, Field.Store.YES));
> document.add(new TextField("content", content, Field.Store.YES));
> writer.addDocument(document);
> }
>
> public static Analyzer getAnalyzer(){
> //return new StandardAnalyzer(Version.LUCENE_48);
> //return new SynonymAnalyzer();
> //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
> return new Analyzer() {
> @Override
> protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
> // TODO Auto-generated method stub
> Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
> TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
> filter = new LowerCaseFilter(Version.LUCENE_48,filter);
> SynonymMap mySynonymMap = null;
> try {
> //mySynonymMap = buildSynonym();
> SynonymMap.Builder builder = new SynonymMap.Builder(true);
> //loadSynonyms(builder);
> builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
> builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
> mySynonymMap = builder.build();
> } catch (IOException e) {
> // TODO Auto-generated catch bl filter = new SynonymFilter(filter, mySynonymMap, false);
> return new TokenStreamComponents(source, filter);
> }
> };
> }
>
> static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
> Query query = parser.parse(queryStr);
> TopDocs results = searcher.search(query, 10);
> System.out.println( "#Hits: " + results.totalHits + " : " + queryStr);
>
> }
>
> public static void printIndexTerms( IndexReader reader) throws Exception {
> Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
> TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
> BytesRef byteRef;
> while ( (byteRef = iterator.next())!=null){
> String term = byteRef.utf8ToString();
> System.out.println(" term: " + term);
> }
> }
> }
>
> output:
>
> #Hits: 0 : Hochschule
> #Hits: 3 : Hello
> #Hits: 1 : people
> #Hits: 1 : universität
> term: hello
> term: people
> term: universität
> term: world
>
>
> thanks in advance
> christian
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
Re: synonyms
Posted by Alan Woodward <al...@flax.co.uk>.
You have a LowerCaseFilter before your SynonymFilter, which means that the entries in your SynonymMap need to be all lowercase or they won’t be matched.
Alan Woodward
www.flax.co.uk
> On 25 Jul 2017, at 07:52, Christian Kaufhold <Ch...@bsb-muenchen.de> wrote:
>
> Hi,
>
> I am not able to add synonyms to the lucene index.
> I condensed my problem into the following class which is based on a Hello World example.
> The idea behind the code was to add a document with universität and the synonym 'Hochschule' (highschool)
> so that lucene finds universität wenn I query Hochschule.
> But it doesn't and I checked the index contents with a term iterator.
> Hochschule is not added to the index:
>
>
> package test;
>
>
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> import org.apache.lucene.analysis.core.LowerCaseFilter;
> import org.apache.lucene.analysis.standard.ClassicTokenizer;
> import org.apache.lucene.analysis.standard.StandardFilter;
> import org.apache.lucene.analysis.synonym.SynonymFilter;
> import org.apache.lucene.analysis.synonym.SynonymMap;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.*;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.CharsRef;
> import org.apache.lucene.util.Version;
>
> import java.io.IOException;
> import java.io.Reader;
>
> public class LuceneHelloWorld {
>
> public static void main(String[] args) throws Exception {
>
> Analyzer analyzer = getAnalyzer();
> Directory directory = new RAMDirectory();
> IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
>
> IndexWriter writer = new IndexWriter(directory, config);
> addDoc( writer, "people", "Hello Universität" );
> addDoc( writer, "world", "Hello World" );
> addDoc( writer, "people", "Hello people" );
> writer.close();
>
> IndexReader reader = DirectoryReader.open(directory);
> IndexSearcher searcher = new IndexSearcher (reader);
> QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
>
> test( parser, searcher, "Hochschule");
> test( parser, searcher, "Hello");
> test( parser, searcher, "people");
> test( parser, searcher, "universität");
>
> printIndexTerms( reader);
>
> }
>
> public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
> Document document = new Document ();
> document.add(new TextField("title", title, Field.Store.YES));
> document.add(new TextField("content", content, Field.Store.YES));
> writer.addDocument(document);
> }
>
> public static Analyzer getAnalyzer(){
> //return new StandardAnalyzer(Version.LUCENE_48);
> //return new SynonymAnalyzer();
> //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
> return new Analyzer() {
> @Override
> protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
> // TODO Auto-generated method stub
> Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
> TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
> filter = new LowerCaseFilter(Version.LUCENE_48,filter);
> SynonymMap mySynonymMap = null;
> try {
> //mySynonymMap = buildSynonym();
> SynonymMap.Builder builder = new SynonymMap.Builder(true);
> //loadSynonyms(builder);
> builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
> builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
> mySynonymMap = builder.build();
> } catch (IOException e) {
> // TODO Auto-generated catch bl filter = new SynonymFilter(filter, mySynonymMap, false);
> return new TokenStreamComponents(source, filter);
> }
> };
> }
>
> static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
> Query query = parser.parse(queryStr);
> TopDocs results = searcher.search(query, 10);
> System.out.println( "#Hits: " + results.totalHits + " : " + queryStr);
>
> }
>
> public static void printIndexTerms( IndexReader reader) throws Exception {
> Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
> TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
> BytesRef byteRef;
> while ( (byteRef = iterator.next())!=null){
> String term = byteRef.utf8ToString();
> System.out.println(" term: " + term);
> }
> }
> }
>
> output:
>
> #Hits: 0 : Hochschule
> #Hits: 3 : Hello
> #Hits: 1 : people
> #Hits: 1 : universität
> term: hello
> term: people
> term: universität
> term: world
>
>
> thanks in advance
> christian
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>