You are viewing a plain text version of this content. The canonical link for it is here.

Posted to solr-user@lucene.apache.org by Pranav Prakash <pr...@gmail.com> on 2012/08/01 20:21:55 UTC

Exact match on few fields, fuzzy on others

Hi Folks,

I am using Solr 3.4 and my document schema has attributes - title,
transcript, author_name. Presently, I am using DisMax to search for a user
query across transcript. I would also like to do an exact search on
author_name so that for a query "Albert Einstein", I would want to get all
the documents which contain Albert or Einstein in transcript and also those
documents which have author_name exactly as 'Albert Einstein'.

Can we do this with the dismax query parser? The schema for both fields is
below:

 <!-- Field type "text_commongrams" (solr.TextField). It declares one <analyzer>
      with no type attribute, so the same chain serves indexing and querying.
      Elements are kept in the order originally posted.
      NOTE(review): RemoveDuplicatesTokenFilterFactory is the first filter,
      ahead of the stemming, synonym and word-delimiter filters; confirm that
      this position is intended. -->
 <fieldType name="text_commongrams" class="solr.TextField">
   <analyzer>
     <charFilter class="solr.HTMLStripCharFilterFactory"/>
     <tokenizer class="solr.StandardTokenizerFactory"/>
     <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
     <filter class="solr.TrimFilterFactory"/>
     <filter class="solr.LowerCaseFilterFactory"/>
     <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
     <!-- The CommonGrams and Stop filters read the same word list. -->
     <filter class="solr.CommonGramsFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
     <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
     <filter class="solr.WordDelimiterFilterFactory"
             generateWordParts="1"
             generateNumberParts="1"
             catenateWords="1"
             catenateNumbers="1"
             catenateAll="0"
             preserveOriginal="1"/>
   </analyzer>
 </fieldType>
<!-- Field type "text_standard" (solr.TextField). It declares one <analyzer>
     with no type attribute, so the same chain serves indexing and querying.
     Elements are kept in the order originally posted; the synonym filter
     here uses expand="false".
     NOTE(review): RemoveDuplicatesTokenFilterFactory runs before the final
     Snowball stemming filter; confirm that this position is intended. -->
<fieldType name="text_standard" class="solr.TextField">
  <analyzer>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            preserveOriginal="1"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldType>

 <!-- title: type text_commongrams, indexed and stored, single-valued. -->
 <field name="title" type="text_commongrams" indexed="true" stored="true" multiValued="false"/>
 <!-- author_name: type text_standard, indexed but not stored. -->
 <field name="author_name" type="text_standard" indexed="true" stored="false"/>


--
*Pranav Prakash*

"temet nosce"

Re: Exact match on few fields, fuzzy on others

Posted by Jack Krupansky <ja...@basetechnology.com>.

Try edismax with the pf2 option, which will automatically boost documents
that contain occurrences of adjacent terms as you have suggested.

See:
http://wiki.apache.org/solr/ExtendedDisMax

-- Jack Krupansky

-----Original Message----- 
From: Pranav Prakash
Sent: Wednesday, August 01, 2012 1:21 PM
To: solr-user@lucene.apache.org
Subject: Exact match on few fields, fuzzy on others

Hi Folks,

I am using Solr 3.4 and my document schema has attributes - title,
transcript, author_name. Presently, I am using DisMax to search for a user
query across transcript. I would also like to do an exact search on
author_name so that for a query "Albert Einstein", I would want to get all
the documents which contain Albert or Einstein in transcript and also those
documents which have author_name exactly as 'Albert Einstein'.

Can we do this with the dismax query parser? The schema for both fields is
below:

<!-- Field type "text_commongrams" (solr.TextField). It declares one <analyzer>
     with no type attribute, so the same chain serves indexing and querying.
     Elements are kept in the order originally posted.
     NOTE(review): RemoveDuplicatesTokenFilterFactory is the first filter,
     ahead of the stemming, synonym and word-delimiter filters; confirm that
     this position is intended. -->
<fieldType name="text_commongrams" class="solr.TextField">
  <analyzer>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <!-- The CommonGrams and Stop filters read the same word list. -->
    <filter class="solr.CommonGramsFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
    <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            preserveOriginal="1"/>
  </analyzer>
</fieldType>
<!-- Field type "text_standard" (solr.TextField). It declares one <analyzer>
     with no type attribute, so the same chain serves indexing and querying.
     Elements are kept in the order originally posted; the synonym filter
     here uses expand="false".
     NOTE(review): RemoveDuplicatesTokenFilterFactory runs before the final
     Snowball stemming filter; confirm that this position is intended. -->
<fieldType name="text_standard" class="solr.TextField">
  <analyzer>
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" words="stopwords_en.txt" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            preserveOriginal="1"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
  </analyzer>
</fieldType>

<!-- title: type text_commongrams, indexed and stored, single-valued. -->
<field name="title" type="text_commongrams" indexed="true" stored="true" multiValued="false"/>
<!-- author_name: type text_standard, indexed but not stored. -->
<field name="author_name" type="text_standard" indexed="true" stored="false"/>


--
*Pranav Prakash*

"temet nosce"