You are viewing a plain text version of this content. The canonical link for it is here.

Posted to java-user@lucene.apache.org by Christian Kaufhold <Ch...@bsb-muenchen.de> on 2017/07/25 06:52:34 UTC

synonyms

Hi,

I am not able to add synonyms to the lucene index.
I condensed my problem into the following class which is based on a Hello World example.
The idea behind the code was to add a document containing 'Universität' together with the synonym 'Hochschule' (highschool),
so that Lucene finds 'Universität' when I query 'Hochschule'.
But it doesn't and I checked the index contents with a term iterator.
Hochschule is not added to the index:


package test;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;

public class LuceneHelloWorld {

    public static void main(String[] args) throws Exception {

        Analyzer analyzer = getAnalyzer();
        Directory directory = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);

        IndexWriter writer = new IndexWriter(directory, config);
        addDoc( writer, "people", "Hello Universität" );
        addDoc( writer, "world", "Hello World" );
        addDoc( writer, "people", "Hello people" );
        writer.close();

        IndexReader reader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher (reader);
        QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);

        test( parser, searcher, "Hochschule");
        test( parser, searcher, "Hello");
        test( parser, searcher, "people");
        test( parser, searcher, "universität");

        printIndexTerms( reader);

    }

    public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
        Document document = new Document ();
        document.add(new TextField("title", title, Field.Store.YES));
        document.add(new TextField("content", content, Field.Store.YES));
        writer.addDocument(document);
    }

    public static Analyzer getAnalyzer(){
        //return new StandardAnalyzer(Version.LUCENE_48);
        //return new SynonymAnalyzer();
        //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
        return new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                // TODO Auto-generated method stub
                Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
                TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
                filter = new LowerCaseFilter(Version.LUCENE_48,filter);
                SynonymMap mySynonymMap = null;
                try {
                    //mySynonymMap = buildSynonym();
                    SynonymMap.Builder builder = new SynonymMap.Builder(true);
                    //loadSynonyms(builder);
                    builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
                    builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
                    mySynonymMap = builder.build();
                } catch (IOException e) {
                    // TODO Auto-generated catch bl               filter = new SynonymFilter(filter, mySynonymMap, false);
                return new TokenStreamComponents(source, filter);
            }
        };
    }

    static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
        Query query = parser.parse(queryStr);
        TopDocs results = searcher.search(query, 10);
        System.out.println(  "#Hits: " + results.totalHits + " : " + queryStr);

    }

    public static void printIndexTerms( IndexReader reader) throws Exception {
        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
        TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
        BytesRef byteRef;
        while ( (byteRef = iterator.next())!=null){
            String term = byteRef.utf8ToString();
            System.out.println("  term: " + term);
        }
    }
}

output:

#Hits: 0 : Hochschule
#Hits: 3 : Hello
#Hits: 1 : people
#Hits: 1 : universität
  term: hello
  term: people
  term: universität
  term: world


thanks in advance
christian


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: synonyms

Posted by Christian Kaufhold <Ch...@bsb-muenchen.de>.

Yep, you hit the point. 

Thank you so much!

Output is now

#Hits: 1 : Hochschule
#Hits: 3 : Hello
#Hits: 1 : people
#Hits: 1 : universität
  term: hello
  term: hochschule
  term: people
  term: universität
  term: world

>>> Alan Woodward <al...@flax.co.uk> 07/25/17 9:14 AM >>>
You have a LowerCaseFilter before your SynonymFilter, which means that the entries in your SynonymMap need to be all lowercase or they won’t be matched.

Alan Woodward
www.flax.co.uk


> On 25 Jul 2017, at 07:52, Christian Kaufhold <Ch...@bsb-muenchen.de> wrote:
> 
> Hi,
> 
> I am not able to add synonyms to the lucene index.
> I condensed my problem into the following class which is based on a Hello World example.
> The idea behind the code was to add a document with universität and the synonym 'Hochschule' (highschool)
> so that lucene finds universität wenn I query Hochschule.
> But it doesn't and I checked the index contents with a term iterator.
> Hochschule is not added to the index:
> 
> 
> package test;
> 
> 
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> import org.apache.lucene.analysis.core.LowerCaseFilter;
> import org.apache.lucene.analysis.standard.ClassicTokenizer;
> import org.apache.lucene.analysis.standard.StandardFilter;
> import org.apache.lucene.analysis.synonym.SynonymFilter;
> import org.apache.lucene.analysis.synonym.SynonymMap;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.*;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.CharsRef;
> import org.apache.lucene.util.Version;
> 
> import java.io.IOException;
> import java.io.Reader;
> 
> public class LuceneHelloWorld {
> 
>    public static void main(String[] args) throws Exception {
> 
>        Analyzer analyzer = getAnalyzer();
>        Directory directory = new RAMDirectory();
>        IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
> 
>        IndexWriter writer = new IndexWriter(directory, config);
>        addDoc( writer, "people", "Hello Universität" );
>        addDoc( writer, "world", "Hello World" );
>        addDoc( writer, "people", "Hello people" );
>        writer.close();
> 
>        IndexReader reader = DirectoryReader.open(directory);
>        IndexSearcher searcher = new IndexSearcher (reader);
>        QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
> 
>        test( parser, searcher, "Hochschule");
>        test( parser, searcher, "Hello");
>        test( parser, searcher, "people");
>        test( parser, searcher, "universität");
> 
>        printIndexTerms( reader);
> 
>    }
> 
>    public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
>        Document document = new Document ();
>        document.add(new TextField("title", title, Field.Store.YES));
>        document.add(new TextField("content", content, Field.Store.YES));
>        writer.addDocument(document);
>    }
> 
>    public static Analyzer getAnalyzer(){
>        //return new StandardAnalyzer(Version.LUCENE_48);
>        //return new SynonymAnalyzer();
>        //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
>        return new Analyzer() {
>            @Override
>            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
>                // TODO Auto-generated method stub
>                Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
>                TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
>                filter = new LowerCaseFilter(Version.LUCENE_48,filter);
>                SynonymMap mySynonymMap = null;
>                try {
>                    //mySynonymMap = buildSynonym();
>                    SynonymMap.Builder builder = new SynonymMap.Builder(true);
>                    //loadSynonyms(builder);
>                    builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
>                    builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
>                    mySynonymMap = builder.build();
>                } catch (IOException e) {
>                    // TODO Auto-generated catch block
>                    e.printStackTrace();
>                }
>                filter = new SynonymFilter(filter, mySynonymMap, false);
>                return new TokenStreamComponents(source, filter);
>            }
>        };
>    }
> 
>    static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
>        Query query = parser.parse(queryStr);
>        TopDocs results = searcher.search(query, 10);
>        System.out.println(  "#Hits: " + results.totalHits + " : " + queryStr);
> 
>    }
> 
>    public static void printIndexTerms( IndexReader reader) throws Exception {
>        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
>        TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
>        BytesRef byteRef;
>        while ( (byteRef = iterator.next())!=null){
>            String term = byteRef.utf8ToString();
>            System.out.println("  term: " + term);
>        }
>    }
> }
> 
> output:
> 
> #Hits: 0 : Hochschule
> #Hits: 3 : Hello
> #Hits: 1 : people
> #Hits: 1 : universität
>  term: hello
>  term: people
>  term: universität
>  term: world
> 
> 
> thanks in advance
> christian
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

Re: synonyms

Posted by Alan Woodward <al...@flax.co.uk>.

You have a LowerCaseFilter before your SynonymFilter, which means that the entries in your SynonymMap need to be all lowercase or they won’t be matched.

Alan Woodward
www.flax.co.uk


> On 25 Jul 2017, at 07:52, Christian Kaufhold <Ch...@bsb-muenchen.de> wrote:
> 
> Hi,
> 
> I am not able to add synonyms to the lucene index.
> I condensed my problem into the following class which is based on a Hello World example.
> The idea behind the code was to add a document with universität and the synonym 'Hochschule' (highschool)
> so that lucene finds universität wenn I query Hochschule.
> But it doesn't and I checked the index contents with a term iterator.
> Hochschule is not added to the index:
> 
> 
> package test;
> 
> 
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Tokenizer;
> import org.apache.lucene.analysis.core.LowerCaseFilter;
> import org.apache.lucene.analysis.standard.ClassicTokenizer;
> import org.apache.lucene.analysis.standard.StandardFilter;
> import org.apache.lucene.analysis.synonym.SynonymFilter;
> import org.apache.lucene.analysis.synonym.SynonymMap;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.document.TextField;
> import org.apache.lucene.index.*;
> import org.apache.lucene.queryparser.classic.QueryParser;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.RAMDirectory;
> import org.apache.lucene.util.BytesRef;
> import org.apache.lucene.util.CharsRef;
> import org.apache.lucene.util.Version;
> 
> import java.io.IOException;
> import java.io.Reader;
> 
> public class LuceneHelloWorld {
> 
>    public static void main(String[] args) throws Exception {
> 
>        Analyzer analyzer = getAnalyzer();
>        Directory directory = new RAMDirectory();
>        IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_48, analyzer);
> 
>        IndexWriter writer = new IndexWriter(directory, config);
>        addDoc( writer, "people", "Hello Universität" );
>        addDoc( writer, "world", "Hello World" );
>        addDoc( writer, "people", "Hello people" );
>        writer.close();
> 
>        IndexReader reader = DirectoryReader.open(directory);
>        IndexSearcher searcher = new IndexSearcher (reader);
>        QueryParser parser = new QueryParser ( Version.LUCENE_48, "content", analyzer);
> 
>        test( parser, searcher, "Hochschule");
>        test( parser, searcher, "Hello");
>        test( parser, searcher, "people");
>        test( parser, searcher, "universität");
> 
>        printIndexTerms( reader);
> 
>    }
> 
>    public static void addDoc( IndexWriter writer, String title, String content) throws Exception {
>        Document document = new Document ();
>        document.add(new TextField("title", title, Field.Store.YES));
>        document.add(new TextField("content", content, Field.Store.YES));
>        writer.addDocument(document);
>    }
> 
>    public static Analyzer getAnalyzer(){
>        //return new StandardAnalyzer(Version.LUCENE_48);
>        //return new SynonymAnalyzer();
>        //return new SynonymFromStandardAnalyzer(Version.LUCENE_48);
>        return new Analyzer() {
>            @Override
>            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
>                // TODO Auto-generated method stub
>                Tokenizer source = new ClassicTokenizer(Version.LUCENE_48, reader);
>                TokenStream filter = new StandardFilter(Version.LUCENE_48, source);
>                filter = new LowerCaseFilter(Version.LUCENE_48,filter);
>                SynonymMap mySynonymMap = null;
>                try {
>                    //mySynonymMap = buildSynonym();
>                    SynonymMap.Builder builder = new SynonymMap.Builder(true);
>                    //loadSynonyms(builder);
>                    builder.add(new CharsRef("Hochschule"), new CharsRef("Universität"), true);
>                    builder.add(new CharsRef("Universität"), new CharsRef("Hochschule"), true);
>                    mySynonymMap = builder.build();
>                } catch (IOException e) {
>                    // TODO Auto-generated catch block
>                    e.printStackTrace();
>                }
>                filter = new SynonymFilter(filter, mySynonymMap, false);
>                return new TokenStreamComponents(source, filter);
>            }
>        };
>    }
> 
>    static void test( QueryParser parser, IndexSearcher searcher, String queryStr) throws Exception {
>        Query query = parser.parse(queryStr);
>        TopDocs results = searcher.search(query, 10);
>        System.out.println(  "#Hits: " + results.totalHits + " : " + queryStr);
> 
>    }
> 
>    public static void printIndexTerms( IndexReader reader) throws Exception {
>        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
>        TermsEnum iterator =terms.iterator(TermsEnum.EMPTY);
>        BytesRef byteRef;
>        while ( (byteRef = iterator.next())!=null){
>            String term = byteRef.utf8ToString();
>            System.out.println("  term: " + term);
>        }
>    }
> }
> 
> output:
> 
> #Hits: 0 : Hochschule
> #Hits: 3 : Hello
> #Hits: 1 : people
> #Hits: 1 : universität
>  term: hello
>  term: people
>  term: universität
>  term: world
> 
> 
> thanks in advance
> christian
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>