You are viewing a plain text version of this content. The canonical link for it is here.

Posted to solr-user@lucene.apache.org by Andreas Owen <ao...@conx.ch> on 2014/03/20 10:48:36 UTC

wrong results with wdf & ngtf

Is there a way to tell NGramFilterFactory while indexing that numbers shall
never be tokenized? Then the query should be able to find numbers.

 

Or do I have to change the minimum n-gram size for numbers (not alpha) to 1, if
that is possible? In other words, index the whole number as a single token
instead of all of its possible n-gram tokens.

 

The Solr analysis screen shows that only WDF (WordDelimiterFilter) has no
underscore in its tokens; the rest have it. Can I tell the query to search
numbers differently with NGTF (NGramFilter), WT (WhitespaceTokenizer), LCF
(LowerCaseFilter) or whatever?

 

I also tried <filter class="solr.WordDelimiterFilterFactory"
types="at-under-alpha.txt"/> with these type mappings in at-under-alpha.txt:

                @ => ALPHA

                _ => ALPHA

 

I have gotten nearly everything to work. There are two queries where I don't
get back what I want.

 

                "avaloq frage 1"               -> only returns if i set
minGramSize=1 while indexing

                "yh_cug"                            -> query parser doesn't
remove "_" but the indexer does (WDF) so there is no match

 

Is there a way to also query the whole phrase "avaloq frage 1" without
tokenizing it?

 

Fieldtype:

 

<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">

      <analyzer type="index"> 

                               <tokenizer
class="solr.StandardTokenizerFactory"/>

                                <filter
class="solr.LowerCaseFilterFactory"/>

                               <filter
class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/> 

                               <filter class="solr.StopFilterFactory"
ignoreCase="true" words="lang/stopwords_de.txt" format="snowball"
enablePositionIncrements="true"/> <!-- remove common words -->

                                <filter
class="solr.GermanNormalizationFilterFactory"/>

                               <filter
class="solr.SnowballPorterFilterFactory" language="German"/> <!-- remove
noun/adjective inflections like plural endings -->


                               <filter class="solr.NGramFilterFactory"
minGramSize="3" maxGramSize="15"/>

                               <filter
class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="1" catenateNumbers="1"
catenateAll="0" splitOnCaseChange="1"/>

                   </analyzer>

                   <analyzer type="query">

                                               <tokenizer
class="solr.WhiteSpaceTokenizerFactory"/>

                                               <filter
class="solr.LowerCaseFilterFactory"/>

                                               <filter
class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/> 

                                               <filter
class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_de.txt" format="snowball"
enablePositionIncrements="true"/> <!-- remove common words -->

                                               <filter
class="solr.GermanNormalizationFilterFactory"/>

                                               <filter
class="solr.SnowballPorterFilterFactory" language="German"/>

      </analyzer>

</fieldType>

 

 

Solrconfig:

 

> <queryParser name="synonym_edismax"

> class="solr.SynonymExpandingExtendedDismaxQParserPlugin">

>   <lst name="synonymAnalyzers">

> <lst name="myCoolAnalyzer">

>   <lst name="tokenizer">

> <str name="class">standard</str>

>   </lst>

>   <lst name="filter">

> <str name="class">shingle</str>

> <str name="outputUnigramsIfNoShingles">true</str>

> <str name="outputUnigrams">true</str>

> <str name="minShingleSize">2</str>

> <str name="maxShingleSize">4</str>

>   </lst>

>   <lst name="filter">

> <str name="class">synonym</str>

> <str name="tokenizerFactory">solr.KeywordTokenizerFactory</str>

> <str name="synonyms">synonyms.txt</str>

> <str name="expand">true</str>

> <str name="ignoreCase">true</str>

>   </lst>

> </lst>

>   </lst>

> </queryParser>

> 

> <requestHandler name="/select2" class="solr.SearchHandler">

>      <lst name="defaults">

>        <str name="echoParams">explicit</str>

>        <int name="rows">10</int>

>        <str name="defType">synonym_edismax</str>

>    <str name="synonyms">true</str>

>    <str name="qf">plain_text^10 editorschoice^200

> title^20 h_*^14

> tags^10 thema^15 inhaltstyp^6 breadcrumb^6 doctype^10

> contentmanager^5 links^5

> last_modified^5 url^5

>    </str>

>    <str name="bq">(expiration:[NOW TO *] OR (*:* 

> -expiration:*))^6</str>

>    <str name="bf">div(clicks,max(displays,1))^8</str> <!-- tested -->

> 

>        <str name="df">text</str>

>    <str name="fl">*,path,score</str>

>    <str name="wt">json</str>

>    <str name="q.op">AND</str>

> 

>    <!-- Highlighting defaults -->

>        <str name="hl">on</str>

>        <str name="hl.fl">plain_text,title</str>

>    <str name="hl.fragSize">200</str>

>    <str name="hl.simple.pre">&lt;b&gt;</str>

>        <str name="hl.simple.post">&lt;/b&gt;</str>

> 

> <!-- <lst name="invariants"> -->

>     <str name="facet">on</str>

> <str name="facet.mincount">1</str>

>         <str name="facet.field">{!ex=inhaltstyp_s}inhaltstyp_s</str>

> <str name="f.inhaltstyp_s.facet.sort">index</str>

> <str name="facet.field">{!ex=doctype}doctype</str>

> <str name="f.doctype.facet.sort">index</str>

> <str name="facet.field">{!ex=thema_f}thema_f</str>

> <str name="f.thema_f.facet.sort">index</str>

> <str name="facet.field">{!ex=author_s}author_s</str>

> <str name="f.author_s.facet.sort">index</str>

> <str

> name="facet.field">{!ex=sachverstaendiger_s}sachverstaendiger_s</str>

> <str name="f.sachverstaendiger_s.facet.sort">index</str>

> <str name="facet.field">{!ex=veranstaltung_s}veranstaltung_s</str>

> <str name="f.veranstaltung_s.facet.sort">index</str>

> <str name="facet.date">{!ex=last_modified}last_modified</str>

> <str name="facet.date.gap">+1MONTH</str>

> <str name="facet.date.end">NOW/MONTH+1MONTH</str>

> <str name="facet.date.start">NOW/MONTH-36MONTHS</str>

> <str name="facet.date.other">after</str>

> 

>        </lst>

> </requestHandler>

Re: wrong results with wdf & ngtf

Posted by Jack Krupansky <ja...@basetechnology.com>.

What indexed text are you expecting the "avaloq frage 1" query to match 
against?

I just noticed that you have two distinct calls to WDF in your index 
analyzer.

I think you're going to need to go back and clearly state all of the term 
requirements for both indexing and query. Show all the use cases, both index 
and query. You have too many balls in the air right now for anybody to be 
confident about what you're really trying to do.

-- Jack Krupansky

-----Original Message----- 
From: Andreas Owen
Sent: Thursday, March 20, 2014 5:48 AM
To: solr-user@lucene.apache.org
Subject: wrong results with wdf & ngtf

Is there a way to tell NGramFilterFactory while indexing that numbers shall
never be tokenized? Then the query should be able to find numbers.



Or do I have to change the minimum n-gram size for numbers (not alpha) to 1, if
that is possible? In other words, index the whole number as a single token
instead of all of its possible n-gram tokens.



The Solr analysis screen shows that only WDF (WordDelimiterFilter) has no
underscore in its tokens; the rest have it. Can I tell the query to search
numbers differently with NGTF (NGramFilter), WT (WhitespaceTokenizer), LCF
(LowerCaseFilter) or whatever?



I also tried <filter class="solr.WordDelimiterFilterFactory"
types="at-under-alpha.txt"/> with these type mappings in at-under-alpha.txt:

                @ => ALPHA

                _ => ALPHA



I have gotten nearly everything to work. There are two queries where I don't
get back what I want.



                "avaloq frage 1"               -> only returns if i set
minGramSize=1 while indexing

                "yh_cug"                            -> query parser doesn't
remove "_" but the indexer does (WDF) so there is no match



Is there a way to also query the whole phrase "avaloq frage 1" without
tokenizing it?



Fieldtype:



<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">

      <analyzer type="index">

                               <tokenizer
class="solr.StandardTokenizerFactory"/>

                                <filter
class="solr.LowerCaseFilterFactory"/>

                               <filter
class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/>

                               <filter class="solr.StopFilterFactory"
ignoreCase="true" words="lang/stopwords_de.txt" format="snowball"
enablePositionIncrements="true"/> <!-- remove common words -->

                                <filter
class="solr.GermanNormalizationFilterFactory"/>

                               <filter
class="solr.SnowballPorterFilterFactory" language="German"/> <!-- remove
noun/adjective inflections like plural endings -->


                               <filter class="solr.NGramFilterFactory"
minGramSize="3" maxGramSize="15"/>

                               <filter
class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="1" catenateNumbers="1"
catenateAll="0" splitOnCaseChange="1"/>

                   </analyzer>

                   <analyzer type="query">

                                               <tokenizer
class="solr.WhiteSpaceTokenizerFactory"/>

                                               <filter
class="solr.LowerCaseFilterFactory"/>

                                               <filter
class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/>

                                               <filter
class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_de.txt" format="snowball"
enablePositionIncrements="true"/> <!-- remove common words -->

                                               <filter
class="solr.GermanNormalizationFilterFactory"/>

                                               <filter
class="solr.SnowballPorterFilterFactory" language="German"/>

      </analyzer>

</fieldType>





Solrconfig:



> <queryParser name="synonym_edismax"

> class="solr.SynonymExpandingExtendedDismaxQParserPlugin">

>   <lst name="synonymAnalyzers">

> <lst name="myCoolAnalyzer">

>   <lst name="tokenizer">

> <str name="class">standard</str>

>   </lst>

>   <lst name="filter">

> <str name="class">shingle</str>

> <str name="outputUnigramsIfNoShingles">true</str>

> <str name="outputUnigrams">true</str>

> <str name="minShingleSize">2</str>

> <str name="maxShingleSize">4</str>

>   </lst>

>   <lst name="filter">

> <str name="class">synonym</str>

> <str name="tokenizerFactory">solr.KeywordTokenizerFactory</str>

> <str name="synonyms">synonyms.txt</str>

> <str name="expand">true</str>

> <str name="ignoreCase">true</str>

>   </lst>

> </lst>

>   </lst>

> </queryParser>

>

> <requestHandler name="/select2" class="solr.SearchHandler">

>      <lst name="defaults">

>        <str name="echoParams">explicit</str>

>        <int name="rows">10</int>

>        <str name="defType">synonym_edismax</str>

>    <str name="synonyms">true</str>

>    <str name="qf">plain_text^10 editorschoice^200

> title^20 h_*^14

> tags^10 thema^15 inhaltstyp^6 breadcrumb^6 doctype^10

> contentmanager^5 links^5

> last_modified^5 url^5

>    </str>

>    <str name="bq">(expiration:[NOW TO *] OR (*:*

> -expiration:*))^6</str>

>    <str name="bf">div(clicks,max(displays,1))^8</str> <!-- tested -->

>

>        <str name="df">text</str>

>    <str name="fl">*,path,score</str>

>    <str name="wt">json</str>

>    <str name="q.op">AND</str>

>

>    <!-- Highlighting defaults -->

>        <str name="hl">on</str>

>        <str name="hl.fl">plain_text,title</str>

>    <str name="hl.fragSize">200</str>

>    <str name="hl.simple.pre">&lt;b&gt;</str>

>        <str name="hl.simple.post">&lt;/b&gt;</str>

>

> <!-- <lst name="invariants"> -->

>     <str name="facet">on</str>

> <str name="facet.mincount">1</str>

>         <str name="facet.field">{!ex=inhaltstyp_s}inhaltstyp_s</str>

> <str name="f.inhaltstyp_s.facet.sort">index</str>

> <str name="facet.field">{!ex=doctype}doctype</str>

> <str name="f.doctype.facet.sort">index</str>

> <str name="facet.field">{!ex=thema_f}thema_f</str>

> <str name="f.thema_f.facet.sort">index</str>

> <str name="facet.field">{!ex=author_s}author_s</str>

> <str name="f.author_s.facet.sort">index</str>

> <str

> name="facet.field">{!ex=sachverstaendiger_s}sachverstaendiger_s</str>

> <str name="f.sachverstaendiger_s.facet.sort">index</str>

> <str name="facet.field">{!ex=veranstaltung_s}veranstaltung_s</str>

> <str name="f.veranstaltung_s.facet.sort">index</str>

> <str name="facet.date">{!ex=last_modified}last_modified</str>

> <str name="facet.date.gap">+1MONTH</str>

> <str name="facet.date.end">NOW/MONTH+1MONTH</str>

> <str name="facet.date.start">NOW/MONTH-36MONTHS</str>

> <str name="facet.date.other">after</str>

>

>        </lst>

> </requestHandler>