You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Soumyanayan Kar <so...@rebaca.com> on 2013/04/01 07:07:46 UTC
RE: Solr Phonetic Search Highlight issue in search results

Hi Erick,

Thanks for the reply. But help me understand this: if Solr is able to
isolate the two documents which contain the term "fact", the phonetic
equivalent of the search term "fakt", then why is it unable to
highlight the terms based on the same logic it uses to search the documents?

Also, it is correctly highlighting the results in other searches which are
also approximate searches and not exact ones, e.g. Fuzzy or Synonym
search. In these cases too, the highlighted terms in the search results are
far from the actual search term, but they are still getting correctly
highlighted.

Maybe I am getting it completely wrong but it looks like there is something
wrong with my implementation.

Thanks & Regards,

Soumya.


-----Original Message-----
From: Erick Erickson [mailto:erickerickson@gmail.com] 
Sent: 27 March 2013 06:07 AM
To: solr-user@lucene.apache.org
Subject: Re: Solr Phonetic Search Highlight issue in search results

How would you expect it to highlight successfully? The term is "fakt";
there's nothing built in (and, indeed, couldn't be) to un-phoneticize it into
"fact" and apply that to the Content field. The whole point of phonetic
processing is to do a lossy translation from the word into some variant,
losing precision all the way...

So this behavior is unsurprising...

Best
Erick




On Tue, Mar 26, 2013 at 7:28 AM, Soumyanayan Kar
<soumyanayan.kar@rebaca.com> wrote:

> When we are issuing a query with Phonetic Search, it is returning the 
> correct documents but not returning the highlights. When we use 
> Stemming or Synonym searches we are getting the proper highlights.
>
>
>
> For example, when we execute a phonetic query for the term
> fakt (ContentSearchPhonetic:fakt) in the Solr Admin interface, it
> returns two documents containing the term "fact" (the phonetic token
> equivalent), but the list of highlights is empty, as shown in the
> response below.
>
>
>
>     <response>
>
>     <lst name="responseHeader">
>
>     <int name="status">0</int>
>
>     <int name="QTime">16</int>
>
>     <lst name="params">
>
>       <str name="q">ContentSearchPhonetic:fakt</str>
>
>       <str name="wt">xml</str>
>
>     </lst>
>
>   </lst>
>
>     <result name="response" numFound="2" start="0">
>
>         <doc>
>
>           <long name="DocId">1</long>
>
>           <str name="DocTitle">Doc 1</str>
>
>           <str name="Content">Anyway, this game was excellent and was 
> well worth the time.  The graphics are truly amazing and the sound 
> track was pretty pleasant also. The  preacher was in  fact a 
> thief.</str>
>
>           <long name="_version_">1430480998833848320</long>
>
>         </doc>
>
>         <doc>
>
>           <long name="DocId">2</long>
>
>           <str name="DocTitle">Doc 2</str>
>
>           <str name="Content">stunning. The  preacher was in  fact an 
> excellent thief who  had stolen the original manuscript of Hamlet  
> from an exhibit on the  Riviera, where  he also  acquired his 
> remarkable and tan.</str>
>
>           <long name="_version_">1430480998841188352</long>
>
>         </doc>
>
>       </result>
>
>       <lst name="highlighting">
>
>         <lst name="1"/>
>
>         <lst name="2"/>
>
>       </lst>
>
>     </response>
>
>
>
> Relevant section of Solr schema:
>
>
>
>     <field name="DocId" type="long" indexed="true" stored="true"
> required="true"/>
>
>     <field name="DocTitle" type="string" indexed="false" stored="true"
> required="true"/>
>
>     <field name="Content" type="text_general" indexed="false"
> stored="true" required="true"/>
>
>
>
>     <field name="ContentSearch" type="text_general" indexed="true"
> stored="false" multiValued="true"/>
>
>     <field name="ContentSearchStemming" type="text_stem" indexed="true"
> stored="false" multiValued="true"/>
>
>     <field name="ContentSearchPhonetic" type="text_phonetic"
> indexed="true" stored="false" multiValued="true"/>
>
>     <field name="ContentSearchSynonym" type="text_synonym" indexed="true"
> stored="false" multiValued="true"/>
>
>
>
>     <uniqueKey>DocId</uniqueKey>
>
>     <copyField source="Content" dest="ContentSearch"/>
>
>     <copyField source="Content" dest="ContentSearchStemming"/>
>
>     <copyField source="Content" dest="ContentSearchPhonetic"/>
>
>     <copyField source="Content" dest="ContentSearchSynonym"/>
>
>
>
>     <fieldType name="text_stem" class="solr.TextField" >
>
>       <analyzer>
>
>          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>
>          <filter class="solr.SnowballPorterFilterFactory"/>
>
>       </analyzer>
>
>     </fieldType>
>
>
>
>     <fieldType name="text_phonetic" class="solr.TextField" >
>
>       <analyzer>
>
>          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>
>          <filter class="solr.PhoneticFilterFactory"
> encoder="DoubleMetaphone" inject="false"/>
>
>       </analyzer>
>
>     </fieldType>
>
>
>
>     <fieldType name="text_synonym" class="solr.TextField" >
>
>     <analyzer>
>
>       <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>
>       <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>
>     </analyzer>
>
>     </fieldType>
>
>
>
> Relevant section of Solr config:
>
>
>
>     <requestHandler name="/select" class="solr.SearchHandler">
>
>     <!-- default values for query parameters can be specified, these
>
>          will be overridden by parameters in the request
>
>       -->
>
>      <lst name="defaults">
>
>        <str name="echoParams">explicit</str>
>
>        <int name="rows">100</int>
>
>        <str name="df">ContentSearch</str>
>
>      <bool name="hl">true</bool>
>
>     <str name="hl.fl">Content</str>
>
>     <str name="f.Content.hl.fragsize">150</str>
>
>       <str name="f.Content.hl.snippets">40</str>
>
>      </lst>
>
>     </requestHandler>
>
>     <searchComponent class="solr.HighlightComponent" name="highlight">
>
>     <highlighting>
>
>     <!-- Configure the standard fragmenter -->
>
>     <!-- This could most likely be commented out in the "default" case 
> -->
>
>     <fragmenter name="gap"
>
>                 default="true"
>
>                 class="solr.highlight.GapFragmenter">
>
>       <lst name="defaults">
>
>         <int name="hl.fragsize">100</int>
>
>       </lst>
>
>     </fragmenter>
>
>
>
>     <!-- A regular-expression-based fragmenter
>
>          (for sentence extraction)
>
>       -->
>
>     <fragmenter name="regex"
>
>                 class="solr.highlight.RegexFragmenter">
>
>       <lst name="defaults">
>
>         <!-- slightly smaller fragsizes work better because of slop 
> -->
>
>         <int name="hl.fragsize">70</int>
>
>         <!-- allow 50% slop on fragment sizes -->
>
>         <float name="hl.regex.slop">0.5</float>
>
>         <!-- a basic sentence pattern -->
>
>         <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
>
>       </lst>
>
>     </fragmenter>
>
>
>
> Has anyone experienced this kind of behaviour before? Need some 
> direction for troubleshooting.
>
>
>
> Soumya.
>
>
>
>
>
>