You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Jyotirmoy Sundi <su...@gmail.com> on 2014/03/20 19:06:02 UTC

Shingles in Solr for bigrams, trigrams in parsed_query

Hi Folks,
       I am using shingles to index bigrams/trigrams. The same filter is also
used at query time in the schema.xml file. But when I run the query in debug
mode for a collection, I don't see the bigrams in the parsed_query. Any idea
what I might be missing?
solr/colection/select?q=best%20price&debugQuery=on

<str name="parsedquery_toString">text:best text:price</str>
I was hoping to see
<str name="parsedquery_toString">text:best text:price text:best price</str>

My schema file looks like this:
 <types>
    <fieldType name="string" class="solr.StrField" sortMissingLast="true"
omitNorms="true"/>
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>

    <fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" />
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.LengthFilterFactory" min="3" max="50" />
        <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="1" preserveOriginal="1"
splitOnCaseChange="0" splitOnNumerics="0" stemEnglishPossessive="1"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.TrimFilterFactory" />
</analyzer>

      <analyzer type="query">
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.LengthFilterFactory" min="3" max="50" />
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="0" stemEnglishPossessive="1"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" />
        <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
        <!--filter class="solr.CommonGramsFilterFactory"
words="stopwords.txt" ignoreCase="true"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" /-->
 </analyzer>
    </fieldType>
 </types>



-- 
Best Regards,
Jyotirmoy Sundi

Re: Shingles in solr for bigrams,trigrams in parsed_query

Posted by Jyotirmoy Sundi <su...@gmail.com>.
Hi Jack,
          Thanks for your response, but if I try q="best quality and best
price", the parsedquery comes out as follows, which contains a lot of unwanted
combinations. I am just looking for unigrams, bigrams, and trigrams.

"debug":{
    "rawquerystring":"\"best quality and best price\"",
    "querystring":"\"best quality and best price\"",
    "*parsedquery*":"MultiPhraseQuery(text:\"(best best_best quality
best quality best quality _ best quality _ best) (quality quality _
quality _ best quality _ best price) (_ best _ best price _ best
price_best) (best best_best price best price) price\")",
    "*parsedquery_toString*":"text:\"(best best_best quality best
quality best quality _ best quality _ best) (quality quality _ quality
_ best quality _ best price) (_ best _ best price _ best price_best)
(best best_best price best price) price\"",
    "explain":{},
    "QParser":"LuceneQParser",

..




On Sun, Mar 23, 2014 at 11:31 AM, Jack Krupansky <ja...@basetechnology.com> wrote:

>
> The query parser only presents the query terms one at a time to the
> analyzer, so your analyzer doesn't see both terms on one analysis call.
>
> If you enclose your query terms in quotes as a single phrase, you should
> see multiple terms being processed.
>
> q="best price"
>
> -- Jack Krupansky
>
> -----Original Message----- From: Jyotirmoy Sundi
> Sent: Thursday, March 20, 2014 2:06 PM
> To: solr-user@lucene.apache.org
> Subject: Singles in solr for bigrams,trigrams in parsed_query
>
> Hi Folks,
>       I am using singles to index bigrams/trigrams. The same is also used
> for query in the schema.xml file. But when I run the query in debug mode
> for a collections, I dont see the bigrams in the parsed_query . Any idea
> what I might be missing.
> solr/colection/select?q=best%20price&debugQuery=on
>
> <str name="parsedquery_toString">text:best text:price</str>
> I was hoping to see
> <str name="parsedquery_toString">text:best text:price text:best
> price</str>
>
> My schema files looks like this:
> <types>
>    <fieldType name="string" class="solr.StrField" sortMissingLast="true"
> omitNorms="true"/>
>    <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
> omitNorms="true" positionIncrementGap="0"/>
>
>    <fieldType name="text" class="solr.TextField"
> positionIncrementGap="100">
>      <analyzer type="index">
>        <charFilter class="solr.HTMLStripCharFilterFactory"/>
>        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" />
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.LengthFilterFactory" min="3" max="50" />
>        <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="0" generateNumberParts="0" catenateWords="1"
> catenateNumbers="1" catenateAll="1" preserveOriginal="1"
> splitOnCaseChange="0" splitOnNumerics="0" stemEnglishPossessive="1"/>
>        <filter class="solr.StopFilterFactory"/>
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>        <filter class="solr.TrimFilterFactory" />
> </analyzer>
>
>      <analyzer type="query">
>        <filter class="solr.LowerCaseFilterFactory"/>
>        <filter class="solr.LengthFilterFactory" min="3" max="50" />
>        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>        <filter class="solr.StopFilterFactory"/>
>        <filter class="solr.TrimFilterFactory" />
>        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>        <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
> splitOnNumerics="0" stemEnglishPossessive="1"/>
>        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" />
>        <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
> ignoreCase="true"/>
>        <!--filter class="solr.CommonGramsFilterFactory"
> words="stopwords.txt" ignoreCase="true"/>
>        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" /-->
> </analyzer>
>    </fieldType>
> </types>
>
>
>
> --
> Best Regards,
> Jyotirmoy Sundi
>



-- 
Best Regards,
Jyotirmoy Sundi

Re: Shingles in solr for bigrams,trigrams in parsed_query

Posted by Jack Krupansky <ja...@basetechnology.com>.
The query parser only presents the query terms one at a time to the 
analyzer, so your analyzer doesn't see both terms on one analysis call.

If you enclose your query terms in quotes as a single phrase, you should see 
multiple terms being processed.

q="best price"

-- Jack Krupansky

-----Original Message----- 
From: Jyotirmoy Sundi
Sent: Thursday, March 20, 2014 2:06 PM
To: solr-user@lucene.apache.org
Subject: Singles in solr for bigrams,trigrams in parsed_query

Hi Folks,
       I am using singles to index bigrams/trigrams. The same is also used
for query in the schema.xml file. But when I run the query in debug mode
for a collections, I dont see the bigrams in the parsed_query . Any idea
what I might be missing.
solr/colection/select?q=best%20price&debugQuery=on

<str name="parsedquery_toString">text:best text:price</str>
I was hoping to see
<str name="parsedquery_toString">text:best text:price text:best price</str>

My schema files looks like this:
<types>
    <fieldType name="string" class="solr.StrField" sortMissingLast="true"
omitNorms="true"/>
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
omitNorms="true" positionIncrementGap="0"/>

    <fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
      <analyzer type="index">
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" />
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.LengthFilterFactory" min="3" max="50" />
        <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="1" preserveOriginal="1"
splitOnCaseChange="0" splitOnNumerics="0" stemEnglishPossessive="1"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.TrimFilterFactory" />
</analyzer>

      <analyzer type="query">
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.LengthFilterFactory" min="3" max="50" />
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true"/>
        <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="0" stemEnglishPossessive="1"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" />
        <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
ignoreCase="true"/>
        <!--filter class="solr.CommonGramsFilterFactory"
words="stopwords.txt" ignoreCase="true"/>
        <filter class="solr.ShingleFilterFactory" minShingleSize="2"
maxShingleSize="4" outputUnigrams="true" /-->
</analyzer>
    </fieldType>
</types>



-- 
Best Regards,
Jyotirmoy Sundi 


Re: Shingles in solr for bigrams,trigrams in parsed_query

Posted by Dmitry Kan <so...@gmail.com>.
Hi,

Query rewrite happens down the chain, after query parsing. For example a
wildcard query triggers an index based query rewrite where terms matching
the wildcard are added into the original query.

In your case, it looks like the query rewrite will generate the ngrams and add
them into the original query.

So just make sure that the analysis page shows what you expect on both the
indexing and querying sides.

Out of curiosity: what are you trying to achieve with the query side
shingles? Isn't just index time shingles enough?


On Thu, Mar 20, 2014 at 8:06 PM, Jyotirmoy Sundi <su...@gmail.com> wrote:

> Hi Folks,
>        I am using singles to index bigrams/trigrams. The same is also used
> for query in the schema.xml file. But when I run the query in debug mode
> for a collections, I dont see the bigrams in the parsed_query . Any idea
> what I might be missing.
> solr/colection/select?q=best%20price&debugQuery=on
>
> <str name="parsedquery_toString">text:best text:price</str>
> I was hoping to see
> <str name="parsedquery_toString">text:best text:price text:best price</str>
>
> My schema files looks like this:
>  <types>
>     <fieldType name="string" class="solr.StrField" sortMissingLast="true"
> omitNorms="true"/>
>     <fieldType name="int" class="solr.TrieIntField" precisionStep="0"
> omitNorms="true" positionIncrementGap="0"/>
>
>     <fieldType name="text" class="solr.TextField"
> positionIncrementGap="100">
>       <analyzer type="index">
>         <charFilter class="solr.HTMLStripCharFilterFactory"/>
>         <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" />
>         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>         <filter class="solr.LowerCaseFilterFactory"/>
>         <filter class="solr.LengthFilterFactory" min="3" max="50" />
>         <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="0" generateNumberParts="0" catenateWords="1"
> catenateNumbers="1" catenateAll="1" preserveOriginal="1"
> splitOnCaseChange="0" splitOnNumerics="0" stemEnglishPossessive="1"/>
>         <filter class="solr.StopFilterFactory"/>
>         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>         <filter class="solr.TrimFilterFactory" />
> </analyzer>
>
>       <analyzer type="query">
>         <filter class="solr.LowerCaseFilterFactory"/>
>         <filter class="solr.LengthFilterFactory" min="3" max="50" />
>         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>         <filter class="solr.StopFilterFactory"/>
>         <filter class="solr.TrimFilterFactory" />
>         <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
> ignoreCase="true" expand="true"/>
>         <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
> splitOnNumerics="0" stemEnglishPossessive="1"/>
>         <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" />
>         <filter class="solr.CommonGramsFilterFactory" words="stopwords.txt"
> ignoreCase="true"/>
>         <!--filter class="solr.CommonGramsFilterFactory"
> words="stopwords.txt" ignoreCase="true"/>
>         <filter class="solr.ShingleFilterFactory" minShingleSize="2"
> maxShingleSize="4" outputUnigrams="true" /-->
>  </analyzer>
>     </fieldType>
>  </types>
>
>
>
> --
> Best Regards,
> Jyotirmoy Sundi
>



-- 
Dmitry
Blog: http://dmitrykan.blogspot.com
Twitter: http://twitter.com/dmitrykan