You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Wasim (JIRA)" <ji...@apache.org> on 2015/03/16 15:19:38 UTC
[jira] [Updated] (SOLR-7250) In spellcheck.extendedResults=true
freq value of suggestion differs from its actual origFreq
[ https://issues.apache.org/jira/browse/SOLR-7250?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Wasim updated SOLR-7250:
------------------------
Description:
The original frequency does not match the suggestion frequency in Solr.
Output for "whs is" - (73), which is a suggestion of "who is", differs from its actual original frequency (94).
For your reference, I am attaching two images of the output.
My schema.xml
<field name="gram" type="textSpell" indexed="true" stored="true" required="true" multiValued="false"/>
<field name="gram_ci" type="textSpellCi" indexed="true" stored="false" multiValued="false"/>
<copyField source="gram" dest="gram_ci"/>
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
</fieldType>
<fieldType name="textSpellCi" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
</fieldType>
solrconfig.xml
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<str name="queryAnalyzerFieldType">textSpellCi</str>
<lst name="spellchecker">
<str name="name">default</str>
<str name="field">gram_ci</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="distanceMeasure">internal</str>
<float name="accuracy">0.5</float>
<int name="maxEdits">2</int>
<int name="minPrefix">0</int>
<int name="maxInspections">5</int>
<int name="minQueryLength">2</int>
<float name="maxQueryFrequency">0.99</float>
<str name="comparatorClass">freq</str>
<float name="thresholdTokenFrequency">0.0</float>
</lst>
</searchComponent>
<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
<lst name="defaults">
<str name="df">gram_ci</str>
<str name="spellcheck.dictionary">default</str>
<str name="spellcheck">on</str>
<str name="spellcheck.extendedResults">true</str>
<str name="spellcheck.count">15</str>
<str name="spellcheck.alternativeTermCount">10</str>
<str name="spellcheck.onlyMorePopular">false</str>
</lst>
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler>
was:
The original frequency does not match the suggestion frequency in Solr.
Output for "whs is" - (73), which is a suggestion of "who is", differs from its actual original frequency (94).
For your reference, I am attaching two images of the output.
My schema.xml
<field name="gram" type="textSpell" indexed="true" stored="true" required="true" multiValued="false"/>
<field name="gram_ci" type="textSpellCi" indexed="true" stored="false" multiValued="false"/>
<copyField source="gram" dest="gram_ci"/>
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
</fieldType>
<fieldType name="textSpellCi" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
</analyzer>
</fieldType>
solrconfig.xml
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<str name="queryAnalyzerFieldType">textSpellCi</str>
<lst name="spellchecker">
<str name="name">default</str>
<str name="field">gram_ci</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="distanceMeasure">internal</str>
<float name="accuracy">0.5</float>
<int name="maxEdits">2</int>
<int name="minPrefix">0</int>
<int name="maxInspections">5</int>
<int name="minQueryLength">2</int>
<float name="maxQueryFrequency">0.99</float>
<str name="comparatorClass">freq</str>
<float name="thresholdTokenFrequency">0.0</float>
</lst>
</searchComponent>
<requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
<lst name="defaults">
<str name="df">gram_ci</str>
<str name="spellcheck.dictionary">default</str>
<str name="spellcheck">on</str>
<str name="spellcheck.extendedResults">true</str>
<str name="spellcheck.count">15</str>
<str name="spellcheck.alternativeTermCount">10</str>
<str name="spellcheck.onlyMorePopular">false</str>
</lst>
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler>
> In spellcheck.extendedResults=true freq value of suggestion differs from its actual origFreq
> --------------------------------------------------------------------------------------------
>
> Key: SOLR-7250
> URL: https://issues.apache.org/jira/browse/SOLR-7250
> Project: Solr
> Issue Type: New Feature
> Environment: solr 4.10.4
> Reporter: Wasim
> Original Estimate: 24h
> Remaining Estimate: 24h
>
> The original frequency does not match the suggestion frequency in Solr.
> Output for "whs is" - (73), which is a suggestion of "who is", differs from its actual original frequency (94).
> For your reference, I am attaching two images of the output.
> My schema.xml
> <field name="gram" type="textSpell" indexed="true" stored="true" required="true" multiValued="false"/>
> <field name="gram_ci" type="textSpellCi" indexed="true" stored="false" multiValued="false"/>
> <copyField source="gram" dest="gram_ci"/>
> <fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
> <analyzer type="index">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
> </analyzer>
> <analyzer type="query">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
> </analyzer>
> </fieldType>
> <fieldType name="textSpellCi" class="solr.TextField" positionIncrementGap="100">
> <analyzer type="index">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.LowerCaseFilterFactory"/>
> <filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
> </analyzer>
> <analyzer type="query">
> <tokenizer class="solr.StandardTokenizerFactory"/>
> <filter class="solr.LowerCaseFilterFactory"/>
> <filter class="solr.ShingleFilterFactory" maxShingleSize="5" minShingleSize="2" outputUnigrams="true"/>
> </analyzer>
> </fieldType>
> solrconfig.xml
> <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
> <str name="queryAnalyzerFieldType">textSpellCi</str>
> <lst name="spellchecker">
> <str name="name">default</str>
> <str name="field">gram_ci</str>
> <str name="classname">solr.DirectSolrSpellChecker</str>
> <str name="distanceMeasure">internal</str>
> <float name="accuracy">0.5</float>
> <int name="maxEdits">2</int>
> <int name="minPrefix">0</int>
> <int name="maxInspections">5</int>
> <int name="minQueryLength">2</int>
> <float name="maxQueryFrequency">0.99</float>
> <str name="comparatorClass">freq</str>
> <float name="thresholdTokenFrequency">0.0</float>
> </lst>
> </searchComponent>
> <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
> <lst name="defaults">
> <str name="df">gram_ci</str>
> <str name="spellcheck.dictionary">default</str>
> <str name="spellcheck">on</str>
> <str name="spellcheck.extendedResults">true</str>
> <str name="spellcheck.count">15</str>
> <str name="spellcheck.alternativeTermCount">10</str>
> <str name="spellcheck.onlyMorePopular">false</str>
> </lst>
> <arr name="last-components">
> <str>spellcheck</str>
> </arr>
> </requestHandler>
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org