You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by "fabio.bozzo" <f....@3-w.it> on 2015/01/27 09:58:17 UTC

Suggesting broken words with solr.WordBreakSolrSpellChecker

I indexed an electronics e-commerce product catalog.

This is a typical document from my collection:


"docs": [
      {
        "prezzo_vendita_d": 39.9,
        "codice_produttore_s": "DK00150020",
        "codice_s": "5.BAT.27407",
        "descrizione": "BATTERIA GO PRO HERO ",
        "barcode_interno_s": "185323000958",
        "categoria": "Batterie",
        "prezzo_acquisto_d": 16.12,
        "marchio": "GO PRO",
        "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
        "id": "27407",
        "_version_": 1491274123542790100
      },
  {
    "codice_produttore_s": "DK0052043",
    "codice_s": "05.SP.42760",
    "id": "42760",
    "marchio": "SP GADGETS",
    "barcode_interno_s": "4028017520430",
    "prezzo_acquisto_d": 34.4,
    "data_aggiornamento_dt": "2014-11-04T00:00:00Z",
    "descrizione": "SP POS CASE GOPRO OLIVE LARGE",
    "prezzo_vendita_d": 59.95,
    "_version_": 1491274406746390500
  }
...]
I want my spellchecker to suggest "go pro" to users searching "gopro"
(without whitespace).

I also want users searching "go pro" to find "gopro" products, too.

Here's a little bit of my configuration:

*schema.xml*
<field name="marchio" type="string" indexed="true" stored="true"/>
        <field name="categoria" type="string" indexed="true" stored="true"/>
        <field name="fornitore" type="string" indexed="true" stored="true"/>
        <field name="descrizione" type="string" indexed="true"
stored="true"/>

        <field name="catch_all_original" type="text_general" indexed="true"
stored="false" multiValued="true" />
        <field name="catch_all" type="text_it" indexed="true" stored="false"
multiValued="true" />

<copyField source="marchio" dest="catch_all" />
    <copyField source="categoria" dest="catch_all" />
    <copyField source="descrizione" dest="catch_all" />
    <copyField source="fornitore" dest="catch_all" />

    <copyField source="marchio" dest="catch_all_original" />
    <copyField source="categoria" dest="catch_all_original" />
    <copyField source="descrizione" dest="catch_all_original" />
    <copyField source="fornitore" dest="catch_all_original" />
...

        <fieldType name="text_it" class="solr.TextField"
positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
preserveOriginal="1" />

                <filter class="solr.ElisionFilterFactory" ignoreCase="true"
articles="lang/contractions_it.txt"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.ASCIIFoldingFilterFactory"/>
                <filter class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_it.txt" format="snowball" />
                <filter class="solr.ItalianLightStemFilterFactory"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
preserveOriginal="1" />

                <filter class="solr.ElisionFilterFactory" ignoreCase="true"
articles="lang/contractions_it.txt"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.ASCIIFoldingFilterFactory"/>
                <filter class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_it.txt" format="snowball" />

                <filter class="solr.ItalianLightStemFilterFactory"/>
                <filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
            </analyzer>
        </fieldType>

<br />

*solrconfig.xml*
<requestHandler name="/select" class="solr.SearchHandler">

        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <int name="rows">10</int>
            <str name="df">catch_all</str>

            <str name="spellcheck">on</str>
            <str name="spellcheck.dictionary">default</str>
            <str name="spellcheck.dictionary">wordbreak</str>
            <str name="spellcheck.extendedResults">false</str>
            <str name="spellcheck.count">5</str>
            <str name="spellcheck.alternativeTermCount">2</str>
            <str name="spellcheck.maxResultsForSuggest">5</str>
            <str name="spellcheck.collate">true</str>
            <str name="spellcheck.collateExtendedResults">true</str>
            <str name="spellcheck.maxCollationTries">5</str>
            <str name="spellcheck.maxCollations">3</str>
        </lst>

        <arr name="last-components">
            <str>spellcheck</str>
        </arr>

    </requestHandler>
...
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">

        <str name="queryAnalyzerFieldType">text_general</str>

        <lst name="spellchecker">
            <str name="name">default</str>
            <str name="field">catch_all_original</str>
            <str name="classname">solr.DirectSolrSpellChecker</str>
            <str name="distanceMeasure">internal</str>
            <float name="accuracy">0.5</float>
            <int name="maxEdits">2</int>  
            <int name="minPrefix">1</int>
            <int name="maxInspections">5</int>
            <int name="minQueryLength">4</int>
            <float name="maxQueryFrequency">0.01</float>
        </lst>

        <lst name="spellchecker">
            <str name="name">wordbreak</str>
            <str name="classname">solr.WordBreakSolrSpellChecker</str>      
            <str name="field">catch_all_original</str>
            <str name="combineWords">true</str>
            <str name="breakWords">true</str>
            <int name="maxChanges">10</int>
            <int name="minBreakLength">3</int>
        </lst>

    </searchComponent>


*Is the spellchecker the right solution or is this the case for something
else, like the "more like this" feature?*

Thank you



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172.html
Sent from the Solr - User mailing list archive at Nabble.com.

RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "Dyer, James" <Ja...@ingramcontent.com>.
A minBreakLength of 1 is not too small a value; in fact, it’s the default value.  Of course the more combinations it has to try, the slower it will run, but the penalty is small enough you're not going to notice.  The only problem you might have is if you use a lot of 1-character stop-words, you might get these stop-words back as nonsense suggestions (assuming you do not filter stop words for your spelling dictionary field, but do remove them on the query field).  But I'd try it if I were you.  It's probably the best option in your case.

James Dyer
Ingram Content Group

-----Original Message-----
From: fabio.bozzo [mailto:f.bozzo@3-w.it] 
Sent: Friday, January 30, 2015 5:45 PM
To: solr-user@lucene.apache.org
Subject: Re: Suggesting broken words with solr.WordBreakSolrSpellChecker

Nice! It works indeed!
Sorry I didn't notice that before.

But what if I want the same for the iPhone?
I mean suggesting "I phone" for users who searched "iphone". A minBreakLength
of 1 is just too small, isn't it?

Il sabato 31 gennaio 2015, Dyer, James-2 [via Lucene] <
ml-node+s472066n4183176h42@n3.nabble.com> ha scritto:

> You need to decrease this to 2 or less because the length of "go" is <3.
>
> <int name="minBreakLength">3</int>
>
> James Dyer
> Ingram Content Group
>
>
> -----Original Message-----
> From: fabio.bozzo [mailto:[hidden email]
> <http:///user/SendEmail.jtp?type=node&node=4183176&i=0>]
> Sent: Wednesday, January 28, 2015 4:55 PM
> To: [hidden email] <http:///user/SendEmail.jtp?type=node&node=4183176&i=1>
> Subject: RE: Suggesting broken words with solr.WordBreakSolrSpellChecker
>
> I tried increasing my alternativeTermCount to 5 and enabling extended
> results.
> I also added a filter fq parameter to clarify what I mean:
>
> *Querying for "go pro" is good:*
>
> {
>   "responseHeader": {
>     "status": 0,
>     "QTime": 2,
>     "params": {
>       "q": "go pro",
>       "indent": "true",
>       "fq": "marchio:\"GO PRO\"",
>       "rows": "1",
>       "wt": "json",
>       "spellcheck.extendedResults": "true",
>       "_": "1422485581792"
>     }
>   },
>   "response": {
>     "numFound": 27,
>     "start": 0,
>     "docs": [
>       {
>         "codice_produttore_s": "DK00150020",
>         "codice_s": "5.BAT.27407",
>         "id": "27407",
>         "marchio": "GO PRO",
>         "barcode_interno_s": "185323000958",
>         "prezzo_acquisto_d": 16.12,
>         "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
>         "descrizione": "BATTERIA GO PRO HERO ",
>         "prezzo_vendita_d": 39.9,
>         "categoria": "Batterie",
>         "_version_": 1491583424191791000
>       },
>
>      ............
>
>     ]
>   },
>   "spellcheck": {
>     "suggestions": [
>       "go pro",
>       {
>         "numFound": 1,
>         "startOffset": 0,
>         "endOffset": 6,
>         "origFreq": 433,
>         "suggestion": [
>           {
>             "word": "gopro",
>             "freq": 2
>           }
>         ]
>       },
>       "correctlySpelled",
>       false,
>       "collation",
>       [
>         "collationQuery",
>         "gopro",
>         "hits",
>         3,
>         "misspellingsAndCorrections",
>         [
>           "go pro",
>           "gopro"
>         ]
>       ]
>     ]
>   }
> }
>
> While querying for "gopro" is not:
>
> {
>   "responseHeader": {
>     "status": 0,
>     "QTime": 6,
>     "params": {
>       "q": "gopro",
>       "indent": "true",
>       "fq": "marchio:\"GO PRO\"",
>       "rows": "1",
>       "wt": "json",
>       "spellcheck.extendedResults": "true",
>       "_": "1422485629480"
>     }
>   },
>   "response": {
>     "numFound": 3,
>     "start": 0,
>     "docs": [
>       {
>         "codice_produttore_s": "DK0030010",
>         "codice_s": "5.VID.39163",
>         "id": "38814",
>         "marchio": "GO PRO",
>         "barcode_interno_s": "818279012477",
>         "prezzo_acquisto_d": 150.84,
>         "data_aggiornamento_dt": "2014-12-24T00:00:00Z",
>         "descrizione": "VIDEOCAMERA GO-PRO HERO 3 WHITE NUOVO SLIM",
>         "prezzo_vendita_d": 219,
>         "categoria": "Fotografia",
>         "_version_": 1491583425479442400
>       },
> ............
>     ]
>   },
>   "spellcheck": {
>     "suggestions": [
>       "gopro",
>       {
>         "numFound": 1,
>         "startOffset": 0,
>         "endOffset": 5,
>         "origFreq": 2,
>         "suggestion": [
>           {
>             "word": "giro",
>             "freq": 6
>           }
>         ]
>       },
>       "correctlySpelled",
>       false
>     ]
>   }
> }
>
> ---
>
> I'd like "go pro" as a suggestion for "gopro" too.
>
>
>
> --
> View this message in context:
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182735.html
> Sent from the Solr - User mailing list archive at Nabble.com.
>
>
>
>
> ------------------------------
>  If you reply to this email, your message will be added to the discussion
> below:
>
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4183176.html
>  To unsubscribe from Suggesting broken words with
> solr.WordBreakSolrSpellChecker, click here
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=unsubscribe_by_code&node=4182172&code=Zi5ib3p6b0AzLXcuaXR8NDE4MjE3MnwxODkyODA0NDQy>
> .
> NAML
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=macro_viewer&id=instant_html%21nabble%3Aemail.naml&base=nabble.naml.namespaces.BasicNamespace-nabble.view.web.template.NabbleNamespace-nabble.view.web.template.NodeNamespace&breadcrumbs=notify_subscribers%21nabble%3Aemail.naml-instant_emails%21nabble%3Aemail.naml-send_instant_email%21nabble%3Aemail.naml>
>


-- 
Fabio Bozzo
SW Engineer

3W s.r.l.
Via Luisetti,7
13900-Biella ( BI )
Tel. 015.84.97.804 / 015.89.76.350
Fax 015.84.70.450

Registro imprese Biella n.01965270026
R.E.A. BI 175416

 Questo messaggio di posta elettronica contiene informazioni di carattere
confidenziale rivolte esclusivamente al destinatario sopra indicato.
E' vietato l'uso, la diffusione, distribuzione o riproduzione da parte di
ogni altra persona.
Nel caso aveste ricevuto questo messaggio di posta elettronica per errore,
siete pregati di segnalarlo immediatamente al mittente e distruggere quanto
ricevuto (compresi i file allegati) senza farne copia.
Qualsivoglia utilizzo non autorizzato del contenuto di questo messaggio
costituisce violazione dell'obbligo di non prendere cognizione della
corrispondenza tra altri soggetti, salvo più grave illecito, ed espone il
responsabile alle relative conseguenze.

This e-mail transmission may contain legally privileged and/or confidential
information.
Please do not read it if you are not the intended recipient(s). Any use,
distribution, reproduction or disclosure by any other person is strictly
prohibited. If you have received this e-mail in error, please notify.




--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4183178.html
Sent from the Solr - User mailing list archive at Nabble.com.

Re: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "fabio.bozzo" <f....@3-w.it>.
Nice! It works indeed!
Sorry I didn't notice that before.

But what if I want the same for the iPhone?
I mean suggesting "I phone" for users who searched "iphone". A minBreakLength
of 1 is just too small, isn't it?

Il sabato 31 gennaio 2015, Dyer, James-2 [via Lucene] <
ml-node+s472066n4183176h42@n3.nabble.com> ha scritto:

> You need to decrease this to 2 or less because the length of "go" is <3.
>
> <int name="minBreakLength">3</int>
>
> James Dyer
> Ingram Content Group
>
>
> -----Original Message-----
> From: fabio.bozzo [mailto:[hidden email]
> <http:///user/SendEmail.jtp?type=node&node=4183176&i=0>]
> Sent: Wednesday, January 28, 2015 4:55 PM
> To: [hidden email] <http:///user/SendEmail.jtp?type=node&node=4183176&i=1>
> Subject: RE: Suggesting broken words with solr.WordBreakSolrSpellChecker
>
> I tried increasing my alternativeTermCount to 5 and enabling extended
> results.
> I also added a filter fq parameter to clarify what I mean:
>
> *Querying for "go pro" is good:*
>
> {
>   "responseHeader": {
>     "status": 0,
>     "QTime": 2,
>     "params": {
>       "q": "go pro",
>       "indent": "true",
>       "fq": "marchio:\"GO PRO\"",
>       "rows": "1",
>       "wt": "json",
>       "spellcheck.extendedResults": "true",
>       "_": "1422485581792"
>     }
>   },
>   "response": {
>     "numFound": 27,
>     "start": 0,
>     "docs": [
>       {
>         "codice_produttore_s": "DK00150020",
>         "codice_s": "5.BAT.27407",
>         "id": "27407",
>         "marchio": "GO PRO",
>         "barcode_interno_s": "185323000958",
>         "prezzo_acquisto_d": 16.12,
>         "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
>         "descrizione": "BATTERIA GO PRO HERO ",
>         "prezzo_vendita_d": 39.9,
>         "categoria": "Batterie",
>         "_version_": 1491583424191791000
>       },
>
>      ............
>
>     ]
>   },
>   "spellcheck": {
>     "suggestions": [
>       "go pro",
>       {
>         "numFound": 1,
>         "startOffset": 0,
>         "endOffset": 6,
>         "origFreq": 433,
>         "suggestion": [
>           {
>             "word": "gopro",
>             "freq": 2
>           }
>         ]
>       },
>       "correctlySpelled",
>       false,
>       "collation",
>       [
>         "collationQuery",
>         "gopro",
>         "hits",
>         3,
>         "misspellingsAndCorrections",
>         [
>           "go pro",
>           "gopro"
>         ]
>       ]
>     ]
>   }
> }
>
> While querying for "gopro" is not:
>
> {
>   "responseHeader": {
>     "status": 0,
>     "QTime": 6,
>     "params": {
>       "q": "gopro",
>       "indent": "true",
>       "fq": "marchio:\"GO PRO\"",
>       "rows": "1",
>       "wt": "json",
>       "spellcheck.extendedResults": "true",
>       "_": "1422485629480"
>     }
>   },
>   "response": {
>     "numFound": 3,
>     "start": 0,
>     "docs": [
>       {
>         "codice_produttore_s": "DK0030010",
>         "codice_s": "5.VID.39163",
>         "id": "38814",
>         "marchio": "GO PRO",
>         "barcode_interno_s": "818279012477",
>         "prezzo_acquisto_d": 150.84,
>         "data_aggiornamento_dt": "2014-12-24T00:00:00Z",
>         "descrizione": "VIDEOCAMERA GO-PRO HERO 3 WHITE NUOVO SLIM",
>         "prezzo_vendita_d": 219,
>         "categoria": "Fotografia",
>         "_version_": 1491583425479442400
>       },
> ............
>     ]
>   },
>   "spellcheck": {
>     "suggestions": [
>       "gopro",
>       {
>         "numFound": 1,
>         "startOffset": 0,
>         "endOffset": 5,
>         "origFreq": 2,
>         "suggestion": [
>           {
>             "word": "giro",
>             "freq": 6
>           }
>         ]
>       },
>       "correctlySpelled",
>       false
>     ]
>   }
> }
>
> ---
>
> I'd like "go pro" as a suggestion for "gopro" too.
>
>
>
> --
> View this message in context:
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182735.html
> Sent from the Solr - User mailing list archive at Nabble.com.
>
>
>
>
> ------------------------------
>  If you reply to this email, your message will be added to the discussion
> below:
>
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4183176.html
>  To unsubscribe from Suggesting broken words with
> solr.WordBreakSolrSpellChecker, click here
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=unsubscribe_by_code&node=4182172&code=Zi5ib3p6b0AzLXcuaXR8NDE4MjE3MnwxODkyODA0NDQy>
> .
> NAML
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=macro_viewer&id=instant_html%21nabble%3Aemail.naml&base=nabble.naml.namespaces.BasicNamespace-nabble.view.web.template.NabbleNamespace-nabble.view.web.template.NodeNamespace&breadcrumbs=notify_subscribers%21nabble%3Aemail.naml-instant_emails%21nabble%3Aemail.naml-send_instant_email%21nabble%3Aemail.naml>
>


-- 
Fabio Bozzo
SW Engineer

3W s.r.l.
Via Luisetti,7
13900-Biella ( BI )
Tel. 015.84.97.804 / 015.89.76.350
Fax 015.84.70.450

Registro imprese Biella n.01965270026
R.E.A. BI 175416

 Questo messaggio di posta elettronica contiene informazioni di carattere
confidenziale rivolte esclusivamente al destinatario sopra indicato.
E' vietato l'uso, la diffusione, distribuzione o riproduzione da parte di
ogni altra persona.
Nel caso aveste ricevuto questo messaggio di posta elettronica per errore,
siete pregati di segnalarlo immediatamente al mittente e distruggere quanto
ricevuto (compresi i file allegati) senza farne copia.
Qualsivoglia utilizzo non autorizzato del contenuto di questo messaggio
costituisce violazione dell'obbligo di non prendere cognizione della
corrispondenza tra altri soggetti, salvo più grave illecito, ed espone il
responsabile alle relative conseguenze.

This e-mail transmission may contain legally privileged and/or confidential
information.
Please do not read it if you are not the intended recipient(s). Any use,
distribution, reproduction or disclosure by any other person is strictly
prohibited. If you have received this e-mail in error, please notify.




--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4183178.html
Sent from the Solr - User mailing list archive at Nabble.com.

RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "Dyer, James" <Ja...@ingramcontent.com>.
You need to decrease this to 2 or less because the length of "go" is <3.

<int name="minBreakLength">3</int>

James Dyer
Ingram Content Group


-----Original Message-----
From: fabio.bozzo [mailto:f.bozzo@3-w.it] 
Sent: Wednesday, January 28, 2015 4:55 PM
To: solr-user@lucene.apache.org
Subject: RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

I tried increasing my alternativeTermCount to 5 and enabling extended results.
I also added a filter fq parameter to clarify what I mean:

*Querying for "go pro" is good:*

{
  "responseHeader": {
    "status": 0,
    "QTime": 2,
    "params": {
      "q": "go pro",
      "indent": "true",
      "fq": "marchio:\"GO PRO\"",
      "rows": "1",
      "wt": "json",
      "spellcheck.extendedResults": "true",
      "_": "1422485581792"
    }
  },
  "response": {
    "numFound": 27,
    "start": 0,
    "docs": [
      {
        "codice_produttore_s": "DK00150020",
        "codice_s": "5.BAT.27407",
        "id": "27407",
        "marchio": "GO PRO",
        "barcode_interno_s": "185323000958",
        "prezzo_acquisto_d": 16.12,
        "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
        "descrizione": "BATTERIA GO PRO HERO ",
        "prezzo_vendita_d": 39.9,
        "categoria": "Batterie",
        "_version_": 1491583424191791000
      },

     ............

    ]
  },
  "spellcheck": {
    "suggestions": [
      "go pro",
      {
        "numFound": 1,
        "startOffset": 0,
        "endOffset": 6,
        "origFreq": 433,
        "suggestion": [
          {
            "word": "gopro",
            "freq": 2
          }
        ]
      },
      "correctlySpelled",
      false,
      "collation",
      [
        "collationQuery",
        "gopro",
        "hits",
        3,
        "misspellingsAndCorrections",
        [
          "go pro",
          "gopro"
        ]
      ]
    ]
  }
}

While querying for "gopro" is not:

{
  "responseHeader": {
    "status": 0,
    "QTime": 6,
    "params": {
      "q": "gopro",
      "indent": "true",
      "fq": "marchio:\"GO PRO\"",
      "rows": "1",
      "wt": "json",
      "spellcheck.extendedResults": "true",
      "_": "1422485629480"
    }
  },
  "response": {
    "numFound": 3,
    "start": 0,
    "docs": [
      {
        "codice_produttore_s": "DK0030010",
        "codice_s": "5.VID.39163",
        "id": "38814",
        "marchio": "GO PRO",
        "barcode_interno_s": "818279012477",
        "prezzo_acquisto_d": 150.84,
        "data_aggiornamento_dt": "2014-12-24T00:00:00Z",
        "descrizione": "VIDEOCAMERA GO-PRO HERO 3 WHITE NUOVO SLIM",
        "prezzo_vendita_d": 219,
        "categoria": "Fotografia",
        "_version_": 1491583425479442400
      },
............
    ]
  },
  "spellcheck": {
    "suggestions": [
      "gopro",
      {
        "numFound": 1,
        "startOffset": 0,
        "endOffset": 5,
        "origFreq": 2,
        "suggestion": [
          {
            "word": "giro",
            "freq": 6
          }
        ]
      },
      "correctlySpelled",
      false
    ]
  }
}

---

I'd like "go pro" as a suggestion for "gopro" too.



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182735.html
Sent from the Solr - User mailing list archive at Nabble.com.



RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "fabio.bozzo" <f....@3-w.it>.
I tried increasing my alternativeTermCount to 5 and enabling extended results.
I also added a filter fq parameter to clarify what I mean:

*Querying for "go pro" is good:*

{
  "responseHeader": {
    "status": 0,
    "QTime": 2,
    "params": {
      "q": "go pro",
      "indent": "true",
      "fq": "marchio:\"GO PRO\"",
      "rows": "1",
      "wt": "json",
      "spellcheck.extendedResults": "true",
      "_": "1422485581792"
    }
  },
  "response": {
    "numFound": 27,
    "start": 0,
    "docs": [
      {
        "codice_produttore_s": "DK00150020",
        "codice_s": "5.BAT.27407",
        "id": "27407",
        "marchio": "GO PRO",
        "barcode_interno_s": "185323000958",
        "prezzo_acquisto_d": 16.12,
        "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
        "descrizione": "BATTERIA GO PRO HERO ",
        "prezzo_vendita_d": 39.9,
        "categoria": "Batterie",
        "_version_": 1491583424191791000
      },

     ............

    ]
  },
  "spellcheck": {
    "suggestions": [
      "go pro",
      {
        "numFound": 1,
        "startOffset": 0,
        "endOffset": 6,
        "origFreq": 433,
        "suggestion": [
          {
            "word": "gopro",
            "freq": 2
          }
        ]
      },
      "correctlySpelled",
      false,
      "collation",
      [
        "collationQuery",
        "gopro",
        "hits",
        3,
        "misspellingsAndCorrections",
        [
          "go pro",
          "gopro"
        ]
      ]
    ]
  }
}

While querying for "gopro" is not:

{
  "responseHeader": {
    "status": 0,
    "QTime": 6,
    "params": {
      "q": "gopro",
      "indent": "true",
      "fq": "marchio:\"GO PRO\"",
      "rows": "1",
      "wt": "json",
      "spellcheck.extendedResults": "true",
      "_": "1422485629480"
    }
  },
  "response": {
    "numFound": 3,
    "start": 0,
    "docs": [
      {
        "codice_produttore_s": "DK0030010",
        "codice_s": "5.VID.39163",
        "id": "38814",
        "marchio": "GO PRO",
        "barcode_interno_s": "818279012477",
        "prezzo_acquisto_d": 150.84,
        "data_aggiornamento_dt": "2014-12-24T00:00:00Z",
        "descrizione": "VIDEOCAMERA GO-PRO HERO 3 WHITE NUOVO SLIM",
        "prezzo_vendita_d": 219,
        "categoria": "Fotografia",
        "_version_": 1491583425479442400
      },
............
    ]
  },
  "spellcheck": {
    "suggestions": [
      "gopro",
      {
        "numFound": 1,
        "startOffset": 0,
        "endOffset": 5,
        "origFreq": 2,
        "suggestion": [
          {
            "word": "giro",
            "freq": 6
          }
        ]
      },
      "correctlySpelled",
      false
    ]
  }
}

---

I'd like "go pro" as a suggestion for "gopro" too.



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182735.html
Sent from the Solr - User mailing list archive at Nabble.com.

RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "Dyer, James" <Ja...@ingramcontent.com>.
Try using something larger than 2 for alternativeTermCount.  5 is probably ok here.  If that doesn't work, then post the exact query you are using and the full extended spellcheck results.

James Dyer
Ingram Content Group


-----Original Message-----
From: fabio.bozzo [mailto:f.bozzo@3-w.it] 
Sent: Tuesday, January 27, 2015 3:59 PM
To: solr-user@lucene.apache.org
Subject: RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

I have this in my solrconfig:

<requestHandler name="/select" class="solr.SearchHandler">

		<lst name="defaults">
			<str name="echoParams">explicit</str>
			<int name="rows">10</int>
			<str name="df">catch_all</str>

			<str name="spellcheck">on</str>
			<str name="spellcheck.dictionary">default</str>
			<str name="spellcheck.dictionary">wordbreak</str>
			<str name="spellcheck.extendedResults">false</str>
			<str name="spellcheck.count">5</str>
			<str name="spellcheck.alternativeTermCount">2</str>
			<str name="spellcheck.maxResultsForSuggest">100</str>
			<str name="spellcheck.collate">true</str>
			<str name="spellcheck.collateExtendedResults">true</str>
			<str name="spellcheck.maxCollationTries">5</str>
			<str name="spellcheck.maxCollations">3</str>
		</lst>

		<arr name="last-components">
			<str>spellcheck</str>
		</arr>

	</requestHandler>

Although my spellchecker does work and suggests corrections for misspelled
terms, it doesn't work for the example above.
I mean terms which are both valid ("gopro" = 100 docs; "go pro" = 150 other
docs).
I want to suggest "gopro" for "go pro" search term and vice-versa, even if
they're both perfectly valid terms in the index. Thank you



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182398.html
Sent from the Solr - User mailing list archive at Nabble.com.



RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "fabio.bozzo" <f....@3-w.it>.
I have this in my solrconfig:

<requestHandler name="/select" class="solr.SearchHandler">

		<lst name="defaults">
			<str name="echoParams">explicit</str>
			<int name="rows">10</int>
			<str name="df">catch_all</str>

			<str name="spellcheck">on</str>
			<str name="spellcheck.dictionary">default</str>
			<str name="spellcheck.dictionary">wordbreak</str>
			<str name="spellcheck.extendedResults">false</str>
			<str name="spellcheck.count">5</str>
			<str name="spellcheck.alternativeTermCount">2</str>
			<str name="spellcheck.maxResultsForSuggest">100</str>
			<str name="spellcheck.collate">true</str>
			<str name="spellcheck.collateExtendedResults">true</str>
			<str name="spellcheck.maxCollationTries">5</str>
			<str name="spellcheck.maxCollations">3</str>
		</lst>

		<arr name="last-components">
			<str>spellcheck</str>
		</arr>

	</requestHandler>

Although my spellchecker does work and suggests corrections for misspelled
terms, it doesn't work for the example above.
I mean terms which are both valid ("gopro" = 100 docs; "go pro" = 150 other
docs).
I want to suggest "gopro" for "go pro" search term and vice-versa, even if
they're both perfectly valid terms in the index. Thank you



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182398.html
Sent from the Solr - User mailing list archive at Nabble.com.

RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "Dyer, James" <Ja...@ingramcontent.com>.
You need to set "spellcheck.alternativeTermCount" to a value greater than zero.  Without it, spellcheck will never suggest for something in the index.

See https://cwiki.apache.org/confluence/display/solr/Spell+Checking#SpellChecking-Thespellcheck.alternativeTermCountParameter

James Dyer
Ingram Content Group


-----Original Message-----
From: fabio.bozzo [mailto:f.bozzo@3-w.it] 
Sent: Tuesday, January 27, 2015 9:57 AM
To: solr-user@lucene.apache.org
Subject: Re: Suggesting broken words with solr.WordBreakSolrSpellChecker

Good, I'll try.
But imagine I have 100 documents containing "go pro" and 150 documents
containing "gopro".
Suggestions of the "other" term do not come up in any case.

2015-01-27 16:21 GMT+01:00 Dyer, James-2 [via Lucene] <
ml-node+s472066n4182254h60@n3.nabble.com>:

> I think the word break spellchecker will do what you want.  But, if I were
> you, I'd dial back "maxChanges" to 1 or 2.  You don't want it slicing a
> word into 10 parts or trying to combine 10 adjacent words.  You also need
> the "minBreakLength" to be no more than 2, if you want it to break "go"
> (length=2) off of "gopro".
>
> James Dyer
> Ingram Content Group
>
>
> -----Original Message-----
> From: fabio.bozzo [mailto:[hidden email]
> <http:///user/SendEmail.jtp?type=node&node=4182254&i=0>]
> Sent: Tuesday, January 27, 2015 2:58 AM
> To: [hidden email] <http:///user/SendEmail.jtp?type=node&node=4182254&i=1>
> Subject: Suggesting broken words with solr.WordBreakSolrSpellChecker
>
> I indexed an electronics e-commerce product catalog.
>
> This is a typical document from my collection:
>
>
> "docs": [
>       {
>         "prezzo_vendita_d": 39.9,
>         "codice_produttore_s": "DK00150020",
>         "codice_s": "5.BAT.27407",
>         "descrizione": "BATTERIA GO PRO HERO ",
>         "barcode_interno_s": "185323000958",
>         "categoria": "Batterie",
>         "prezzo_acquisto_d": 16.12,
>         "marchio": "GO PRO",
>         "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
>         "id": "27407",
>         "_version_": 1491274123542790100
>       },
>   {
>     "codice_produttore_s": "DK0052043",
>     "codice_s": "05.SP.42760",
>     "id": "42760",
>     "marchio": "SP GADGETS",
>     "barcode_interno_s": "4028017520430",
>     "prezzo_acquisto_d": 34.4,
>     "data_aggiornamento_dt": "2014-11-04T00:00:00Z",
>     "descrizione": "SP POS CASE GOPRO OLIVE LARGE",
>     "prezzo_vendita_d": 59.95,
>     "_version_": 1491274406746390500
>   }
> ...]
> I want my spellchecker to suggest "go pro" to users searching "gopro"
> (without whitespace).
>
> I also want users searching "go pro" to find "gopro" products, too.
>
> Here's a little bit of my configuration:
>
> *schema.xml*
> <field name="marchio" type="string" indexed="true" stored="true"/>
>         <field name="categoria" type="string" indexed="true"
> stored="true"/>
>         <field name="fornitore" type="string" indexed="true"
> stored="true"/>
>         <field name="descrizione" type="string" indexed="true"
> stored="true"/>
>
>         <field name="catch_all_original" type="text_general"
> indexed="true"
> stored="false" multiValued="true" />
>         <field name="catch_all" type="text_it" indexed="true"
> stored="false"
> multiValued="true" />
>
> <copyField source="marchio" dest="catch_all" />
>     <copyField source="categoria" dest="catch_all" />
>     <copyField source="descrizione" dest="catch_all" />
>     <copyField source="fornitore" dest="catch_all" />
>
>     <copyField source="marchio" dest="catch_all_original" />
>     <copyField source="categoria" dest="catch_all_original" />
>     <copyField source="descrizione" dest="catch_all_original" />
>     <copyField source="fornitore" dest="catch_all_original" />
> ...
>
>         <fieldType name="text_it" class="solr.TextField"
> positionIncrementGap="100">
>             <analyzer type="index">
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
> preserveOriginal="1" />
>
>                 <filter class="solr.ElisionFilterFactory"
> ignoreCase="true"
> articles="lang/contractions_it.txt"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.ASCIIFoldingFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_it.txt" format="snowball" />
>                 <filter class="solr.ItalianLightStemFilterFactory"/>
>             </analyzer>
>             <analyzer type="query">
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
> preserveOriginal="1" />
>
>                 <filter class="solr.ElisionFilterFactory"
> ignoreCase="true"
> articles="lang/contractions_it.txt"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.ASCIIFoldingFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_it.txt" format="snowball" />
>
>                 <filter class="solr.ItalianLightStemFilterFactory"/>
>                 <filter class="solr.SynonymFilterFactory"
> synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>             </analyzer>
>         </fieldType>
>
> <br />
>
> *solr-config.xml*
> <requestHandler name="/select" class="solr.SearchHandler">
>
>         <lst name="defaults">
>             <str name="echoParams">explicit</str>
>             <int name="rows">10</int>
>             <str name="df">catch_all</str>
>
>             <str name="spellcheck">on</str>
>             <str name="spellcheck.dictionary">default</str>
>             <str name="spellcheck.dictionary">wordbreak</str>
>             <str name="spellcheck.extendedResults">false</str>
>             <str name="spellcheck.count">5</str>
>             <str name="spellcheck.alternativeTermCount">2</str>
>             <str name="spellcheck.maxResultsForSuggest">5</str>
>             <str name="spellcheck.collate">true</str>
>             <str name="spellcheck.collateExtendedResults">true</str>
>             <str name="spellcheck.maxCollationTries">5</str>
>             <str name="spellcheck.maxCollations">3</str>
>         </lst>
>
>         <arr name="last-components">
>             <str>spellcheck</str>
>         </arr>
>
>     </requestHandler>
> ...
> <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
>
>         <str name="queryAnalyzerFieldType">text_general</str>
>
>         <lst name="spellchecker">
>             <str name="name">default</str>
>             <str name="field">catch_all_original</str>
>             <str name="classname">solr.DirectSolrSpellChecker</str>
>             <str name="distanceMeasure">internal</str>
>             <float name="accuracy">0.5</float>
>             <int name="maxEdits">2</int>
>             <int name="minPrefix">1</int>
>             <int name="maxInspections">5</int>
>             <int name="minQueryLength">4</int>
>             <float name="maxQueryFrequency">0.01</float>
>         </lst>
>
>         <lst name="spellchecker">
>             <str name="name">wordbreak</str>
>             <str name="classname">solr.WordBreakSolrSpellChecker</str>
>
>             <str name="field">catch_all_original</str>
>             <str name="combineWords">true</str>
>             <str name="breakWords">true</str>
>             <int name="maxChanges">10</int>
>             <int name="minBreakLength">3</int>
>         </lst>
>
>     </searchComponent>
>
>
> *Is the spellchecker the right solution or is this the case for something
> else, like the "more like this" feature?*
>
> Thank you
>
>
>
> --
> View this message in context:
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172.html
> Sent from the Solr - User mailing list archive at Nabble.com.
>
>
>
>
> ------------------------------
>  If you reply to this email, your message will be added to the discussion
> below:
>
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182254.html
>  To unsubscribe from Suggesting broken words with
> solr.WordBreakSolrSpellChecker, click here
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=unsubscribe_by_code&node=4182172&code=Zi5ib3p6b0AzLXcuaXR8NDE4MjE3MnwxODkyODA0NDQy>
> .
> NAML
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=macro_viewer&id=instant_html%21nabble%3Aemail.naml&base=nabble.naml.namespaces.BasicNamespace-nabble.view.web.template.NabbleNamespace-nabble.view.web.template.NodeNamespace&breadcrumbs=notify_subscribers%21nabble%3Aemail.naml-instant_emails%21nabble%3Aemail.naml-send_instant_email%21nabble%3Aemail.naml>
>



-- 
Fabio Bozzo
SW Engineer

3W s.r.l.
Via Luisetti,7
13900-Biella ( BI )
Tel. 015.84.97.804 / 015.89.76.350
Fax 015.84.70.450

Registro imprese Biella n.01965270026
R.E.A. BI 175416

 Questo messaggio di posta elettronica contiene informazioni di carattere
confidenziale rivolte esclusivamente al destinatario sopra indicato.
E' vietato l'uso, la diffusione, distribuzione o riproduzione da parte di
ogni altra persona.
Nel caso aveste ricevuto questo messaggio di posta elettronica per errore,
siete pregati di segnalarlo immediatamente al mittente e distruggere quanto
ricevuto (compresi i file allegati) senza farne copia.
Qualsivoglia utilizzo non autorizzato del contenuto di questo messaggio
costituisce violazione dell'obbligo di non prendere cognizione della
corrispondenza tra altri soggetti, salvo più grave illecito, ed espone il
responsabile alle relative conseguenze.

This e-mail transmission may contain legally privileged and/or confidential
information.
Please do not read it if you are not the intended recipient(s). Any use,
distribution, reproduction or disclosure by any other person is strictly
prohibited. If you have received this e-mail in error, please notify the sender.




--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182263.html
Sent from the Solr - User mailing list archive at Nabble.com.

Re: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "fabio.bozzo" <f....@3-w.it>.
Good, I'll try.
But imagine I have 100 documents containing "go pro" and 150 documents
containing "gopro".
Either way, no suggestion for the "other" term comes up.

2015-01-27 16:21 GMT+01:00 Dyer, James-2 [via Lucene] <
ml-node+s472066n4182254h60@n3.nabble.com>:

> I think the word break spellchecker will do what you want.  But, if I were
> you, I'd dial back "maxChanges" to 1 or 2.  You don't want it slicing a
> word into 10 parts or trying to combine 10 adjacent words.  You also need
> the "minBreakLength" to be no more than 2, if you want it to break "go"
> (length=2) off of "gopro".
>
> James Dyer
> Ingram Content Group
>
>
> -----Original Message-----
> From: fabio.bozzo [mailto:[hidden email]
> <http:///user/SendEmail.jtp?type=node&node=4182254&i=0>]
> Sent: Tuesday, January 27, 2015 2:58 AM
> To: [hidden email] <http:///user/SendEmail.jtp?type=node&node=4182254&i=1>
> Subject: Suggesting broken words with solr.WordBreakSolrSpellChecker
>
> I indexed an electronics e-commerce product catalog.
>
> This is a typical document from my collection:
>
>
> "docs": [
>       {
>         "prezzo_vendita_d": 39.9,
>         "codice_produttore_s": "DK00150020",
>         "codice_s": "5.BAT.27407",
>         "descrizione": "BATTERIA GO PRO HERO ",
>         "barcode_interno_s": "185323000958",
>         "categoria": "Batterie",
>         "prezzo_acquisto_d": 16.12,
>         "marchio": "GO PRO",
>         "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
>         "id": "27407",
>         "_version_": 1491274123542790100
>       },
>   {
>     "codice_produttore_s": "DK0052043",
>     "codice_s": "05.SP.42760",
>     "id": "42760",
>     "marchio": "SP GADGETS",
>     "barcode_interno_s": "4028017520430",
>     "prezzo_acquisto_d": 34.4,
>     "data_aggiornamento_dt": "2014-11-04T00:00:00Z",
>     "descrizione": "SP POS CASE GOPRO OLIVE LARGE",
>     "prezzo_vendita_d": 59.95,
>     "_version_": 1491274406746390500
>   }
> ...]
> I want my spellchecker to suggest "go pro" to users searching "gopro"
> (without whitespace).
>
> I also want users searching "go pro" to find "gopro" products, too.
>
> Here's a little bit of my configuration:
>
> *schema.xml*
> <field name="marchio" type="string" indexed="true" stored="true"/>
>         <field name="categoria" type="string" indexed="true"
> stored="true"/>
>         <field name="fornitore" type="string" indexed="true"
> stored="true"/>
>         <field name="descrizione" type="string" indexed="true"
> stored="true"/>
>
>         <field name="catch_all_original" type="text_general"
> indexed="true"
> stored="false" multiValued="true" />
>         <field name="catch_all" type="text_it" indexed="true"
> stored="false"
> multiValued="true" />
>
> <copyField source="marchio" dest="catch_all" />
>     <copyField source="categoria" dest="catch_all" />
>     <copyField source="descrizione" dest="catch_all" />
>     <copyField source="fornitore" dest="catch_all" />
>
>     <copyField source="marchio" dest="catch_all_original" />
>     <copyField source="categoria" dest="catch_all_original" />
>     <copyField source="descrizione" dest="catch_all_original" />
>     <copyField source="fornitore" dest="catch_all_original" />
> ...
>
>         <fieldType name="text_it" class="solr.TextField"
> positionIncrementGap="100">
>             <analyzer type="index">
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
> preserveOriginal="1" />
>
>                 <filter class="solr.ElisionFilterFactory"
> ignoreCase="true"
> articles="lang/contractions_it.txt"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.ASCIIFoldingFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_it.txt" format="snowball" />
>                 <filter class="solr.ItalianLightStemFilterFactory"/>
>             </analyzer>
>             <analyzer type="query">
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
> generateWordParts="1" generateNumberParts="1" catenateWords="1"
> catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
> preserveOriginal="1" />
>
>                 <filter class="solr.ElisionFilterFactory"
> ignoreCase="true"
> articles="lang/contractions_it.txt"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.ASCIIFoldingFilterFactory"/>
>                 <filter class="solr.StopFilterFactory" ignoreCase="true"
> words="lang/stopwords_it.txt" format="snowball" />
>
>                 <filter class="solr.ItalianLightStemFilterFactory"/>
>                 <filter class="solr.SynonymFilterFactory"
> synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
>             </analyzer>
>         </fieldType>
>
> <br />
>
> *solr-config.xml*
> <requestHandler name="/select" class="solr.SearchHandler">
>
>         <lst name="defaults">
>             <str name="echoParams">explicit</str>
>             <int name="rows">10</int>
>             <str name="df">catch_all</str>
>
>             <str name="spellcheck">on</str>
>             <str name="spellcheck.dictionary">default</str>
>             <str name="spellcheck.dictionary">wordbreak</str>
>             <str name="spellcheck.extendedResults">false</str>
>             <str name="spellcheck.count">5</str>
>             <str name="spellcheck.alternativeTermCount">2</str>
>             <str name="spellcheck.maxResultsForSuggest">5</str>
>             <str name="spellcheck.collate">true</str>
>             <str name="spellcheck.collateExtendedResults">true</str>
>             <str name="spellcheck.maxCollationTries">5</str>
>             <str name="spellcheck.maxCollations">3</str>
>         </lst>
>
>         <arr name="last-components">
>             <str>spellcheck</str>
>         </arr>
>
>     </requestHandler>
> ...
> <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
>
>         <str name="queryAnalyzerFieldType">text_general</str>
>
>         <lst name="spellchecker">
>             <str name="name">default</str>
>             <str name="field">catch_all_original</str>
>             <str name="classname">solr.DirectSolrSpellChecker</str>
>             <str name="distanceMeasure">internal</str>
>             <float name="accuracy">0.5</float>
>             <int name="maxEdits">2</int>
>             <int name="minPrefix">1</int>
>             <int name="maxInspections">5</int>
>             <int name="minQueryLength">4</int>
>             <float name="maxQueryFrequency">0.01</float>
>         </lst>
>
>         <lst name="spellchecker">
>             <str name="name">wordbreak</str>
>             <str name="classname">solr.WordBreakSolrSpellChecker</str>
>
>             <str name="field">catch_all_original</str>
>             <str name="combineWords">true</str>
>             <str name="breakWords">true</str>
>             <int name="maxChanges">10</int>
>             <int name="minBreakLength">3</int>
>         </lst>
>
>     </searchComponent>
>
>
> *Is the spellchecker the right solution or is this the case for something
> else, like the "more like this" feature?*
>
> Thank you
>
>
>
> --
> View this message in context:
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172.html
> Sent from the Solr - User mailing list archive at Nabble.com.
>
>
>
>
> ------------------------------
>  If you reply to this email, your message will be added to the discussion
> below:
>
> http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182254.html
>  To unsubscribe from Suggesting broken words with
> solr.WordBreakSolrSpellChecker, click here
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=unsubscribe_by_code&node=4182172&code=Zi5ib3p6b0AzLXcuaXR8NDE4MjE3MnwxODkyODA0NDQy>
> .
> NAML
> <http://lucene.472066.n3.nabble.com/template/NamlServlet.jtp?macro=macro_viewer&id=instant_html%21nabble%3Aemail.naml&base=nabble.naml.namespaces.BasicNamespace-nabble.view.web.template.NabbleNamespace-nabble.view.web.template.NodeNamespace&breadcrumbs=notify_subscribers%21nabble%3Aemail.naml-instant_emails%21nabble%3Aemail.naml-send_instant_email%21nabble%3Aemail.naml>
>



-- 
Fabio Bozzo
SW Engineer

3W s.r.l.
Via Luisetti,7
13900-Biella ( BI )
Tel. 015.84.97.804 / 015.89.76.350
Fax 015.84.70.450

Registro imprese Biella n.01965270026
R.E.A. BI 175416

 Questo messaggio di posta elettronica contiene informazioni di carattere
confidenziale rivolte esclusivamente al destinatario sopra indicato.
E' vietato l'uso, la diffusione, distribuzione o riproduzione da parte di
ogni altra persona.
Nel caso aveste ricevuto questo messaggio di posta elettronica per errore,
siete pregati di segnalarlo immediatamente al mittente e distruggere quanto
ricevuto (compresi i file allegati) senza farne copia.
Qualsivoglia utilizzo non autorizzato del contenuto di questo messaggio
costituisce violazione dell'obbligo di non prendere cognizione della
corrispondenza tra altri soggetti, salvo più grave illecito, ed espone il
responsabile alle relative conseguenze.

This e-mail transmission may contain legally privileged and/or confidential
information.
Please do not read it if you are not the intended recipient(s). Any use,
distribution, reproduction or disclosure by any other person is strictly
prohibited. If you have received this e-mail in error, please notify the sender.




--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172p4182263.html
Sent from the Solr - User mailing list archive at Nabble.com.

RE: Suggesting broken words with solr.WordBreakSolrSpellChecker

Posted by "Dyer, James" <Ja...@ingramcontent.com>.
I think the word break spellchecker will do what you want.  But, if I were you, I'd dial back "maxChanges" to 1 or 2.  You don't want it slicing a word into 10 parts or trying to combine 10 adjacent words.  You also need the "minBreakLength" to be no more than 2, if you want it to break "go" (length=2) off of "gopro".  

James Dyer
Ingram Content Group


-----Original Message-----
From: fabio.bozzo [mailto:f.bozzo@3-w.it] 
Sent: Tuesday, January 27, 2015 2:58 AM
To: solr-user@lucene.apache.org
Subject: Suggesting broken words with solr.WordBreakSolrSpellChecker

I indexed an electronics e-commerce product catalog.

This is a typical document from my collection:


"docs": [
      {
        "prezzo_vendita_d": 39.9,
        "codice_produttore_s": "DK00150020",
        "codice_s": "5.BAT.27407",
        "descrizione": "BATTERIA GO PRO HERO ",
        "barcode_interno_s": "185323000958",
        "categoria": "Batterie",
        "prezzo_acquisto_d": 16.12,
        "marchio": "GO PRO",
        "data_aggiornamento_dt": "2012-06-21T00:00:00Z",
        "id": "27407",
        "_version_": 1491274123542790100
      },
  {
    "codice_produttore_s": "DK0052043",
    "codice_s": "05.SP.42760",
    "id": "42760",
    "marchio": "SP GADGETS",
    "barcode_interno_s": "4028017520430",
    "prezzo_acquisto_d": 34.4,
    "data_aggiornamento_dt": "2014-11-04T00:00:00Z",
    "descrizione": "SP POS CASE GOPRO OLIVE LARGE",
    "prezzo_vendita_d": 59.95,
    "_version_": 1491274406746390500
  }
...]
I want my spellchecker to suggest "go pro" to users searching "gopro"
(without whitespace).

I also want users searching "go pro" to find "gopro" products, too.

Here's a little bit of my configuration:

*schema.xml*
<field name="marchio" type="string" indexed="true" stored="true"/>
        <field name="categoria" type="string" indexed="true" stored="true"/>
        <field name="fornitore" type="string" indexed="true" stored="true"/>
        <field name="descrizione" type="string" indexed="true"
stored="true"/>

        <field name="catch_all_original" type="text_general" indexed="true"
stored="false" multiValued="true" />
        <field name="catch_all" type="text_it" indexed="true" stored="false"
multiValued="true" />

<copyField source="marchio" dest="catch_all" />
    <copyField source="categoria" dest="catch_all" />
    <copyField source="descrizione" dest="catch_all" />
    <copyField source="fornitore" dest="catch_all" />

    <copyField source="marchio" dest="catch_all_original" />
    <copyField source="categoria" dest="catch_all_original" />
    <copyField source="descrizione" dest="catch_all_original" />
    <copyField source="fornitore" dest="catch_all_original" />
...

        <fieldType name="text_it" class="solr.TextField"
positionIncrementGap="100">
            <analyzer type="index">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
preserveOriginal="1" />

                <filter class="solr.ElisionFilterFactory" ignoreCase="true"
articles="lang/contractions_it.txt"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.ASCIIFoldingFilterFactory"/>
                <filter class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_it.txt" format="snowball" />
                <filter class="solr.ItalianLightStemFilterFactory"/>
            </analyzer>
            <analyzer type="query">
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"
preserveOriginal="1" />

                <filter class="solr.ElisionFilterFactory" ignoreCase="true"
articles="lang/contractions_it.txt"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.ASCIIFoldingFilterFactory"/>
                <filter class="solr.StopFilterFactory" ignoreCase="true"
words="lang/stopwords_it.txt" format="snowball" />

                <filter class="solr.ItalianLightStemFilterFactory"/>
                <filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
            </analyzer>
        </fieldType>

<br />

*solr-config.xml*
<requestHandler name="/select" class="solr.SearchHandler">

        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <int name="rows">10</int>
            <str name="df">catch_all</str>

            <str name="spellcheck">on</str>
            <str name="spellcheck.dictionary">default</str>
            <str name="spellcheck.dictionary">wordbreak</str>
            <str name="spellcheck.extendedResults">false</str>
            <str name="spellcheck.count">5</str>
            <str name="spellcheck.alternativeTermCount">2</str>
            <str name="spellcheck.maxResultsForSuggest">5</str>
            <str name="spellcheck.collate">true</str>
            <str name="spellcheck.collateExtendedResults">true</str>
            <str name="spellcheck.maxCollationTries">5</str>
            <str name="spellcheck.maxCollations">3</str>
        </lst>

        <arr name="last-components">
            <str>spellcheck</str>
        </arr>

    </requestHandler>
...
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">

        <str name="queryAnalyzerFieldType">text_general</str>

        <lst name="spellchecker">
            <str name="name">default</str>
            <str name="field">catch_all_original</str>
            <str name="classname">solr.DirectSolrSpellChecker</str>
            <str name="distanceMeasure">internal</str>
            <float name="accuracy">0.5</float>
            <int name="maxEdits">2</int>  
            <int name="minPrefix">1</int>
            <int name="maxInspections">5</int>
            <int name="minQueryLength">4</int>
            <float name="maxQueryFrequency">0.01</float>
        </lst>

        <lst name="spellchecker">
            <str name="name">wordbreak</str>
            <str name="classname">solr.WordBreakSolrSpellChecker</str>      
            <str name="field">catch_all_original</str>
            <str name="combineWords">true</str>
            <str name="breakWords">true</str>
            <int name="maxChanges">10</int>
            <int name="minBreakLength">3</int>
        </lst>

    </searchComponent>


*Is the spellchecker the right solution or is this the case for something
else, like the "more like this" feature?*

Thank you



--
View this message in context: http://lucene.472066.n3.nabble.com/Suggesting-broken-words-with-solr-WordBreakSolrSpellChecker-tp4182172.html
Sent from the Solr - User mailing list archive at Nabble.com.