You are viewing a plain text version of this content. The canonical link for it is here.

Posted to solr-user@lucene.apache.org by Rohan Thakur <ro...@gmail.com> on 2013/04/05 13:14:29 UTC

solr spell suggestions help

hi all

I had some issues with solr spell suggestions.

1) First of all, I wanted to know whether index-based spell suggestions are
in any way better than the direct spell suggestions that Solr 4.1 provides?

 2) Next, I wanted to know whether there is a way to get suggestions for a
word by providing only its first few characters (a prefix). For example, when
I query sam I should get samsung as one of the suggestions.

3) I also wanted to know why I am not getting suggestions for words that
differ by two or more characters. For example, if I query for wirlpool, which
has 8 characters, I get the suggestion whirlpool, which has 9 characters and
is the correct spelling. But when I query for wirlpol, which has 7
characters, it says that the spelling is wrong but does not show any
suggestions. Similarly, if I search for pansonic (8 char) it provides
panasonic (9 char) as a suggestion, but when I remove one more character,
that is, search for panonic (7 char), it does not return any suggestions.
How can I correct this? Even when I search for ipo it does not return ipod
as a suggestion.

4) One more thing I want to get clear: when I search for microwave ovan, it
does not report any misspelling even though ovan is wrong; it returns the
results for microwave and says the query is correct. This happens whenever
one of the terms in the query is correct while the others are incorrect: it
does not point out the misspelled term but just returns the results for the
correct word. How can I correct this? Similarly, when I query for
microvave oven it shows the results for oven and says that the query is
correct.

5) One more case: when I query plntronies (correct word is: plantronics)
it does not return any suggestion, but when I query for plantronies it returns
plantronics as a suggestion. Why is that happening?

*my schema.xml is:*
<fieldType name="tSpell" class="solr.TextField" positionIncrementGap="100"
omitNorms="true">
      <analyzer type="index">
          <charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\\\[\]\(\)\-\,\/\+" replacement=" "/>
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
          <filter class="solr.LengthFilterFactory" min="2" max="20"/>
          <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
          <filter class="solr.LowerCaseFilterFactory"/>
          <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
       <analyzer type="query">
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
          <filter class="solr.LengthFilterFactory" min="2" max="20"/>
          <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
          <filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt"/>
          <filter class="solr.LowerCaseFilterFactory"/>
          <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
       </analyzer>
     </fieldType>

<field name="spell" type="tSpell" indexed="true" stored="true" />
<copyField source="title" dest="spell" />



*my solrconfig.xml is :*

<searchComponent name="spellcheck" class="solr.SpellCheckComponent">



    <!-- Multiple "Spell Checkers" can be declared and used by this
         component
      -->

    <!-- a spellchecker built from a field of the main index -->
    <lst name="spellchecker">
    <!--
        Optional, it is required when more than one spellchecker is
configured.
        Select non-default name with spellcheck.dictionary in request
handler.
    -->
      *<str name="name">default</str>*

      <str name="classname">solr.DirectSolrSpellChecker</str>
      <!-- the spellcheck distance measure used, the default is the
internal levenshtein -->
      <!--
        Load tokens from the following field for spell checking,
        analyzer for the field's type as defined in schema.xml are used
    -->
  *    <str name="field">spell</str>
      <str name="distanceMeasure">internal</str>
      <!-- minimum accuracy needed to be considered a valid spellcheck
suggestion -->
      <float name="accuracy">0.3</float>
      <!-- the maximum #edits we consider when enumerating terms: can be 1
or 2 -->
      <int name="maxEdits">1</int>
      <!-- the minimum shared prefix when enumerating terms -->
      <int name="minPrefix">1</int>
      <!-- maximum number of inspections per result. -->
      <int name="maxInspections">5</int>
      <!-- minimum length of a query term to be considered for correction
-->
      <int name="minQueryLength">4</int>
      <!-- maximum threshold of documents a query term can appear to be
considered for correction -->
      <float name="maxQueryFrequency">0.01</float>
      <!-- uncomment this to require suggestions to occur in 1% of the
documents
          <float name="thresholdTokenFrequency">.01</float>
      -->
    </lst>*

    <!-- a spellchecker that can break or combine words.  See "/spell"
handler below for usage -->
    *<lst name="spellchecker">
      <str name="name">wordbreak</str>
      <str name="classname">solr.WordBreakSolrSpellChecker</str>
      <str name="field">spell</str>
      <str name="combineWords">true</str>
      <str name="breakWords">true</str>
      <int name="maxChanges">3</int>
       <!--  <int name="minBreakLength">5</int> -->
    </lst>*

    <!-- a spellchecker that uses a different distance measure -->

     *  <lst name="spellchecker">
         <str name="name">jarowinkler</str>
         <str name="field">spell</str>
         <str name="classname">solr.DirectSolrSpellChecker</str>
         <str
name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
       </lst>*



    <!-- a spellchecker that use an alternate comparator

         comparatorClass be one of:
          1. score (default)
          2. freq (Frequency first, then score)
          3. A fully qualified class name
      -->
    <!--
       <lst name="spellchecker">
         <str name="name">freq</str>
         <str name="field">lowerfilt</str>
         <str name="classname">solr.DirectSolrSpellChecker</str>
         <str name="comparatorClass">freq</str>
      -->

    <!-- A spellchecker that reads the list of words from a file -->

     <!--  <lst name="spellchecker">
         <str name="classname">solr.FileBasedSpellChecker</str>
         <str name="name">file</str>
         <str name="sourceLocation">spellings.txt</str>
         <str name="characterEncoding">UTF-8</str>
         <str name="spellcheckIndexDir">./spellcheckerFile</str>
       </lst>
     -->
     <!-- This field type's analyzer is used by the QueryConverter to
tokenize the value for "q" parameter -->
   *    <str name="queryAnalyzerFieldType">tSpell</str>
  </searchComponent>*


 <!--
    The SpellingQueryConverter to convert raw (CommonParams.Q) queries into
tokens.  Uses a simple regular expression
    to strip off field markup, boosts, ranges, etc. but it is not
guaranteed to match an exact parse from the query parser.

    Optional, defaults to solr.SpellingQueryConverter
   -->

* <queryConverter name="queryConverter"
class="solr.SpellingQueryConverter"/>*

  <!-- A request handler for demonstrating the spellcheck component.

       NOTE: This is purely as an example.  The whole purpose of the
       SpellCheckComponent is to hook it into the request handler that
       handles your normal user queries so that a separate request is
       not needed to get suggestions.

       IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
       NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!

       See http://wiki.apache.org/solr/SpellCheckComponent for details
       on the request parameters.
    -->
  *<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="df">spell</str>
      <!-- Solr will use suggestions from both the 'default' spellchecker
           and from the 'wordbreak' spellchecker and combine them.
           collations (re-written queries) can include a combination of
           corrections from both spellcheckers -->
      <str name="spellcheck.dictionary">default</str>
      <str name="spellcheck.dictionary">wordbreak</str>
         <!--<str name="spellcheck.dictionary">jarowinkler</str> -->
     <!-- <str name="spellcheck.dictionary">file</str> -->
      <!-- omp = Only More Popular -->
      <str name="spellcheck.onlyMorePopular">false</str>
      <str name="spellcheck">on</str>
      <str name="spellcheck.extendedResults">true</str>
      <str name="spellcheck.count">10</str>
      <str name="spellcheck.alternativeTermCount">5</str>
      <str name="spellcheck.maxResultsForSuggest">5</str>
      <str name="spellcheck.collate">true</str>
      <str name="spellcheck.collateExtendedResults">true</str>
      <str name="spellcheck.maxCollationTries">10</str>
      <str name="spellcheck.maxCollations">5</str>
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>
*



thanks in advance
regards
Rohan

Re: solr spell suggestions help

Posted by Rohan Thakur <ro...@gmail.com>.

hi all

I have resolved all of the issues except the 4th one. They were related to
the distance measure I was using: the default, Levenshtein, is very basic and
did not work well. I am now using the Jaro-Winkler distance measure, which is
better and gives exactly the results I was looking for. The 4th issue I think
is a Solr issue, and a patch has also been released for it:
https://issues.apache.org/jira/browse/SOLR-2585 I am applying this patch
now and will let you know whether it works correctly.

thanks
regards
Rohan


On Fri, Apr 5, 2013 at 4:44 PM, Rohan Thakur <ro...@gmail.com> wrote:

> hi all
>
> I had some issues with solr spell suggestions.
>
> 1) first of all I wanted to know is indexbased spell suggestions better
> then directspell suggestions that solr 4.1 provides in any way?
>
>  2) then I wanted to know is their way I can get suggestions for words
> providing only few prefix for the word. like when I query sam I should get
> samsung as one of suggestion.
>
> 3) also I wanted to know why am I not getting suggestions for the words
> that have more then 2 character difference between them like if I query for
> wirlpool wich has 8 characters I get suggestion as whirlpool which is 9
> characters and correct spelling but when I query for wirlpol which is 7
> characters it says that this is false spelling but does not show any
> suggestions. even like if I search for pansonic(8 char) it provides
> panasonic(9 char) as suggestion but when I remove one more character that
> is is search for panonic(7 char) it does not return any suggestions?? how
> can I correct this? even when I search for ipo it does not return ipod as
> suggestions?
>
> 4) one more thing I want to get clear that when I search for microwave
> ovan it does not give any miss spell even when ovan is wrong it provides
> the result for microwave saying the query is correct...this is the case
> when one of the term in the query is correct while others are incorrect it
> does not point out the wrong spelling one but reutrns the result for
> correct word thats it how can I correct this? similar is the case when I
> query for microvave oven is shows the result for oven saying that the query
> is correct..
>
> 5) one more case is when I query plntronies (correct word is: plantronics)
> it does not return any solution but when I query for plantronies it returns
> the plantronics as suggestions why is that happening?
>
> *my schema.xml is:*
> <fieldType name="tSpell" class="solr.TextField" positionIncrementGap="100"
> omitNorms="true">
>       <analyzer type="index">
>           <charFilter class="solr.PatternReplaceCharFilterFactory"
> pattern="\\\[\]\(\)\-\,\/\+" replacement=" "/>
>           <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>           <filter class="solr.LengthFilterFactory" min="2" max="20"/>
>           <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt"/>
>           <filter class="solr.LowerCaseFilterFactory"/>
>           <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>        </analyzer>
>        <analyzer type="query">
>           <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>           <filter class="solr.LengthFilterFactory" min="2" max="20"/>
>           <filter class="solr.SynonymFilterFactory"
> synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>           <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="stopwords.txt"/>
>           <filter class="solr.LowerCaseFilterFactory"/>
>           <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>        </analyzer>
>      </fieldType>
>
> <field name="spell" type="tSpell" indexed="true" stored="true" />
> <copyField source="title" dest="spell" />
>
>
>
> *my solrconfig.xml is :*
>
> <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
>
>
>
>     <!-- Multiple "Spell Checkers" can be declared and used by this
>          component
>       -->
>
>     <!-- a spellchecker built from a field of the main index -->
>     <lst name="spellchecker">
>     <!--
>         Optional, it is required when more than one spellchecker is
> configured.
>         Select non-default name with spellcheck.dictionary in request
> handler.
>     -->
>       *<str name="name">default</str>*
>
>       <str name="classname">solr.DirectSolrSpellChecker</str>
>       <!-- the spellcheck distance measure used, the default is the
> internal levenshtein -->
>       <!--
>         Load tokens from the following field for spell checking,
>         analyzer for the field's type as defined in schema.xml are used
>     -->
>   *    <str name="field">spell</str>
>       <str name="distanceMeasure">internal</str>
>       <!-- minimum accuracy needed to be considered a valid spellcheck
> suggestion -->
>       <float name="accuracy">0.3</float>
>       <!-- the maximum #edits we consider when enumerating terms: can be 1
> or 2 -->
>       <int name="maxEdits">1</int>
>       <!-- the minimum shared prefix when enumerating terms -->
>       <int name="minPrefix">1</int>
>       <!-- maximum number of inspections per result. -->
>       <int name="maxInspections">5</int>
>       <!-- minimum length of a query term to be considered for correction
> -->
>       <int name="minQueryLength">4</int>
>       <!-- maximum threshold of documents a query term can appear to be
> considered for correction -->
>       <float name="maxQueryFrequency">0.01</float>
>       <!-- uncomment this to require suggestions to occur in 1% of the
> documents
>           <float name="thresholdTokenFrequency">.01</float>
>       -->
>     </lst>*
>
>     <!-- a spellchecker that can break or combine words.  See "/spell"
> handler below for usage -->
>     *<lst name="spellchecker">
>       <str name="name">wordbreak</str>
>       <str name="classname">solr.WordBreakSolrSpellChecker</str>
>       <str name="field">spell</str>
>       <str name="combineWords">true</str>
>       <str name="breakWords">true</str>
>       <int name="maxChanges">3</int>
>        <!--  <int name="minBreakLength">5</int> -->
>     </lst>*
>
>     <!-- a spellchecker that uses a different distance measure -->
>
>      *  <lst name="spellchecker">
>          <str name="name">jarowinkler</str>
>          <str name="field">spell</str>
>          <str name="classname">solr.DirectSolrSpellChecker</str>
>          <str
> name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
>        </lst>*
>
>
>
>     <!-- a spellchecker that use an alternate comparator
>
>          comparatorClass be one of:
>           1. score (default)
>           2. freq (Frequency first, then score)
>           3. A fully qualified class name
>       -->
>     <!--
>        <lst name="spellchecker">
>          <str name="name">freq</str>
>          <str name="field">lowerfilt</str>
>          <str name="classname">solr.DirectSolrSpellChecker</str>
>          <str name="comparatorClass">freq</str>
>       -->
>
>     <!-- A spellchecker that reads the list of words from a file -->
>
>      <!--  <lst name="spellchecker">
>          <str name="classname">solr.FileBasedSpellChecker</str>
>          <str name="name">file</str>
>          <str name="sourceLocation">spellings.txt</str>
>          <str name="characterEncoding">UTF-8</str>
>          <str name="spellcheckIndexDir">./spellcheckerFile</str>
>        </lst>
>      -->
>      <!-- This field type's analyzer is used by the QueryConverter to
> tokenize the value for "q" parameter -->
>    *    <str name="queryAnalyzerFieldType">tSpell</str>
>   </searchComponent>*
>
>
>  <!--
>     The SpellingQueryConverter to convert raw (CommonParams.Q) queries
> into tokens.  Uses a simple regular expression
>     to strip off field markup, boosts, ranges, etc. but it is not
> guaranteed to match an exact parse from the query parser.
>
>     Optional, defaults to solr.SpellingQueryConverter
>    -->
>
> * <queryConverter name="queryConverter"
> class="solr.SpellingQueryConverter"/>*
>
>   <!-- A request handler for demonstrating the spellcheck component.
>
>        NOTE: This is purely as an example.  The whole purpose of the
>        SpellCheckComponent is to hook it into the request handler that
>        handles your normal user queries so that a separate request is
>        not needed to get suggestions.
>
>        IN OTHER WORDS, THERE IS REALLY GOOD CHANCE THE SETUP BELOW IS
>        NOT WHAT YOU WANT FOR YOUR PRODUCTION SYSTEM!
>
>        See http://wiki.apache.org/solr/SpellCheckComponent for details
>        on the request parameters.
>     -->
>   *<requestHandler name="/spell" class="solr.SearchHandler"
> startup="lazy">
>     <lst name="defaults">
>       <str name="df">spell</str>
>       <!-- Solr will use suggestions from both the 'default' spellchecker
>            and from the 'wordbreak' spellchecker and combine them.
>            collations (re-written queries) can include a combination of
>            corrections from both spellcheckers -->
>       <str name="spellcheck.dictionary">default</str>
>       <str name="spellcheck.dictionary">wordbreak</str>
>          <!--<str name="spellcheck.dictionary">jarowinkler</str> -->
>      <!-- <str name="spellcheck.dictionary">file</str> -->
>       <!-- omp = Only More Popular -->
>       <str name="spellcheck.onlyMorePopular">false</str>
>       <str name="spellcheck">on</str>
>       <str name="spellcheck.extendedResults">true</str>
>       <str name="spellcheck.count">10</str>
>       <str name="spellcheck.alternativeTermCount">5</str>
>       <str name="spellcheck.maxResultsForSuggest">5</str>
>       <str name="spellcheck.collate">true</str>
>       <str name="spellcheck.collateExtendedResults">true</str>
>       <str name="spellcheck.maxCollationTries">10</str>
>       <str name="spellcheck.maxCollations">5</str>
>     </lst>
>     <arr name="last-components">
>       <str>spellcheck</str>
>     </arr>
>   </requestHandler>
> *
>
>
>
> thanks in advance
> regards
> Rohan
>