You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by kl...@apache.org on 2007/11/05 20:39:15 UTC
svn commit: r592129 - in /lucene/solr/trunk: ./
src/java/org/apache/solr/handler/ src/java/org/apache/solr/util/
src/test/org/apache/solr/handler/ src/test/test-files/solr/conf/
Author: klaas
Date: Mon Nov 5 11:39:14 2007
New Revision: 592129
URL: http://svn.apache.org/viewvc?rev=592129&view=rev
Log:
SOLR-395: spell checker upgrade
Added:
lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java (with props)
lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java (with props)
lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml (with props)
lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml (with props)
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=592129&r1=592128&r2=592129&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Nov 5 11:39:14 2007
@@ -136,12 +136,18 @@
to the detailed field information from the solrj client API.
(Grant Ingersoll via ehatcher)
-26. SOLR-334L Pluggable query parsers. Allows specification of query
+26. SOLR-334: Pluggable query parsers. Allows specification of query
type and arguments as a prefix on a query string. (yonik)
-27. SOLR-351L External Value Source. An external file may be used
+27. SOLR-351: External Value Source. An external file may be used
to specify the values of a field, currently usable as
a ValueSource in a FunctionQuery. (yonik)
+
+28. SOLR-395: Many new features for the spell checker implementation, including
+ an extended response mode with much richer output, multi-word spell checking,
+ and a bevy of new and renamed options (see the wiki).
+ (Mike Krimerman, Scott Taber via klaas).
+
Changes in runtime behavior
Modified: lucene/solr/trunk/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java?rev=592129&r1=592128&r2=592129&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/handler/SpellCheckerRequestHandler.java Mon Nov 5 11:39:14 2007
@@ -18,6 +18,7 @@
package org.apache.solr.handler;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
@@ -30,7 +31,9 @@
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.util.HiFrequencyDictionary;
import java.io.File;
import java.io.IOException;
@@ -42,6 +45,141 @@
* Takes a string (e.g. a query string) as the value of the "q" parameter
* and looks up alternative spelling suggestions in the spellchecker.
* The spellchecker used by this handler is the Lucene contrib SpellChecker.
+ *
+<style>
+pre.code
+{
+ border: 1pt solid #AEBDCC;
+ background-color: #F3F5F7;
+ padding: 5pt;
+ font-family: courier, monospace;
+ white-space: pre;
+ // begin css 3 or browser specific rules - do not remove!
+ //see: http://forums.techguy.org/archive/index.php/t-249849.html
+ white-space: pre-wrap;
+ word-wrap: break-word;
+ white-space: -moz-pre-wrap;
+ white-space: -pre-wrap;
+ white-space: -o-pre-wrap;
+ // end css 3 or browser specific rules
+}
+
+</style>
+ *
+ * <p>The response echoes the original query as an entry with the
+ * name of "words" and the original word(s) as its value. It
+ * also reports whether the requested "words" value is contained in the spell checker index through
+ * the "exist" entry, whose value is true or false. Examples of these output
+ * parameters in the standard output format are as follows:</p>
+ * <pre class="code">
+<str name="words">facial</str>
+<str name="exist">true</str> </pre>
+ *
+ * <p>If the query string parameter "sp.query.extendedResults" is true, then each word within the
+ * "q" parameter (separated by a space or +) will
+ * be run through the spell checker individually and will be wrapped in its own
+ * NamedList. Each word will then get its own set of results: its frequency and its
+ * suggestions.</p>
+ *
+ * <p>Examples of the use of the standard output (XML) without and with the
+ * use of the "sp.query.extendedResults" parameter are as follows.</p>
+ *
+ * <p> The following URL
+ * examples were configured with the solr.SpellCheckerRequestHandler
+ * named as "/spellchecker".</p>
+ *
+ * <p>Without the use of "extendedResults" and one word
+ * spelled correctly: facial </p>
+ * <pre class="code">http://.../spellchecker?indent=on&sp.query.onlyMorePopular=true&sp.query.accuracy=.6&sp.query.suggestionCount=20&q=facial</pre>
+ * <pre class="code">
+<?xml version="1.0" encoding="UTF-8"?>
+<response>
+
+<lst name="responseHeader">
+ <int name="status">0</int>
+ <int name="QTime">6</int>
+</lst>
+<str name="words">facial</str>
+<str name="exist">true</str>
+<arr name="suggestions">
+ <str>faciale</str>
+ <str>faucial</str>
+ <str>fascial</str>
+ <str>facing</str>
+ <str>faciei</str>
+ <str>facialis</str>
+ <str>social</str>
+ <str>facile</str>
+ <str>spacial</str>
+ <str>glacial</str>
+ <str>marcial</str>
+ <str>facies</str>
+ <str>facio</str>
+</arr>
+</response> </pre>
+ *
+ * <p>Without the use of "extendedResults" and two words,
+ * one spelled correctly and one misspelled: facial salophosphoprotein </p>
+ * <pre class="code">http://.../spellchecker?indent=on&sp.query.onlyMorePopular=true&sp.query.accuracy=.6&sp.query.suggestionCount=20&q=facial+salophosphoprotein</pre>
+ * <pre class="code">
+<?xml version="1.0" encoding="UTF-8"?>
+<response>
+
+<lst name="responseHeader">
+ <int name="status">0</int>
+ <int name="QTime">18</int>
+</lst>
+<str name="words">facial salophosphoprotein</str>
+<str name="exist">false</str>
+<arr name="suggestions">
+ <str>sialophosphoprotein</str>
+</arr>
+</response> </pre>
+ *
+ *
+ * <p>With the use of "extendedResults" and two words,
+ * one spelled correctly and one misspelled: facial salophosphoprotein </p>
+ * <pre class="code">http://.../spellchecker?indent=on&sp.query.onlyMorePopular=true&sp.query.accuracy=.6&sp.query.suggestionCount=20&sp.query.extendedResults=true&q=facial+salophosphoprotein</pre>
+ * <pre class="code">
+<?xml version="1.0" encoding="UTF-8"?>
+<response>
+
+<lst name="responseHeader">
+ <int name="status">0</int>
+ <int name="QTime">23</int>
+</lst>
+<lst name="result">
+ <lst name="facial">
+ <int name="frequency">1</int>
+ <lst name="suggestions">
+ <lst name="faciale"><int name="frequency">1</int></lst>
+ <lst name="faucial"><int name="frequency">1</int></lst>
+ <lst name="fascial"><int name="frequency">1</int></lst>
+ <lst name="facing"><int name="frequency">1</int></lst>
+ <lst name="faciei"><int name="frequency">1</int></lst>
+ <lst name="facialis"><int name="frequency">1</int></lst>
+ <lst name="social"><int name="frequency">1</int></lst>
+ <lst name="facile"><int name="frequency">1</int></lst>
+ <lst name="spacial"><int name="frequency">1</int></lst>
+ <lst name="glacial"><int name="frequency">1</int></lst>
+ <lst name="marcial"><int name="frequency">1</int></lst>
+ <lst name="facies"><int name="frequency">1</int></lst>
+ <lst name="facio"><int name="frequency">1</int></lst>
+ </lst>
+ </lst>
+ <lst name="salophosphoprotein">
+ <int name="frequency">0</int>
+ <lst name="suggestions">
+ <lst name="sialophosphoprotein"><int name="frequency">1</int></lst>
+ <lst name="phosphoprotein"><int name="frequency">1</int></lst>
+ <lst name="phosphoproteins"><int name="frequency">1</int></lst>
+ <lst name="alphalipoprotein"><int name="frequency">1</int></lst>
+ </lst>
+ </lst>
+</lst>
+</response> </pre>
+
+ *
* @see <a href="http://wiki.apache.org/jakarta-lucene/SpellChecker">The Lucene Spellchecker documentation</a>
*
*/
@@ -64,22 +202,37 @@
* return only the words more frequent than this.
*
*/
- private boolean onlyMorePopular = false;
private Directory spellcheckerIndexDir = new RAMDirectory();
private String dirDescription = "(ramdir)";
private String termSourceField;
+
+ private static final String PREFIX = "sp.";
+ private static final String QUERY_PREFIX = PREFIX + "query.";
+ private static final String DICTIONARY_PREFIX = PREFIX + "dictionary.";
+
+ private static final String SOURCE_FIELD = DICTIONARY_PREFIX + "termSourceField";
+ private static final String INDEX_DIR = DICTIONARY_PREFIX + "indexDir";
+ private static final String THRESHOLD = DICTIONARY_PREFIX + "threshold";
+
+ private static final String ACCURACY = QUERY_PREFIX + "accuracy";
+ private static final String SUGGESTIONS = QUERY_PREFIX + "suggestionCount";
+ private static final String POPULAR = QUERY_PREFIX + "onlyMorePopular";
+ private static final String EXTENDED = QUERY_PREFIX + "extendedResults";
+
private static final float DEFAULT_ACCURACY = 0.5f;
- private static final int DEFAULT_NUM_SUGGESTIONS = 1;
+ private static final int DEFAULT_SUGGESTION_COUNT = 1;
private static final boolean DEFAULT_MORE_POPULAR = false;
-
+ private static final boolean DEFAULT_EXTENDED_RESULTS = false;
+ private static final float DEFAULT_DICTIONARY_THRESHOLD = 0.0f;
+
public void init(NamedList args) {
super.init(args);
SolrParams p = SolrParams.toSolrParams(args);
- termSourceField = p.get("termSourceField");
+ termSourceField = p.get(SOURCE_FIELD, p.get("termSourceField"));
try {
- String dir = p.get("spellcheckerIndexDir");
+ String dir = p.get(INDEX_DIR, p.get("spellcheckerIndexDir"));
if (null != dir) {
File f = new File(dir);
if ( ! f.isAbsolute() ) {
@@ -97,6 +250,10 @@
}
}
+ /**
+ * Processes the following query string parameters: q, multiWords, cmd rebuild,
+ * cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
+ */
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
throws Exception {
SolrParams p = req.getParams();
@@ -115,47 +272,90 @@
}
}
+ // empty query string
+ if (null == words || "".equals(words.trim())) {
+ return;
+ }
+
IndexReader indexReader = null;
String suggestionField = null;
Float accuracy;
int numSug;
+ boolean onlyMorePopular;
+ boolean extendedResults;
try {
- accuracy = p.getFloat("accuracy", DEFAULT_ACCURACY);
+ accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
spellChecker.setAccuracy(accuracy);
} catch (NumberFormatException e) {
throw new RuntimeException("Accuracy must be a valid positive float", e);
}
try {
- numSug = p.getInt("suggestionCount", DEFAULT_NUM_SUGGESTIONS);
+ numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
} catch (NumberFormatException e) {
throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
}
try {
- onlyMorePopular = p.getBool("onlyMorePopular", DEFAULT_MORE_POPULAR);
- } catch (NumberFormatException e) {
+ onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
+ } catch (SolrException e) {
throw new RuntimeException("'Only more popular' must be a valid boolean", e);
}
+ try {
+ extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
+ } catch (SolrException e) {
+ throw new RuntimeException("'Extended results' must be a valid boolean", e);
+ }
- // when searching for more popular, a non null index-reader and
+ // when searching for more popular or building extended results, a non null index-reader and
// restricted-field are required
- if (onlyMorePopular) {
+ if (onlyMorePopular || extendedResults) {
indexReader = req.getSearcher().getReader();
suggestionField = termSourceField;
}
+ if (extendedResults) {
- if (null != words && !"".equals(words.trim())) {
+ SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
+ String[] wordz = words.split(" ");
+ for (String word : wordz)
+ {
+ SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
+ nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
+ String[] suggestions =
+ spellChecker.suggestSimilar(word, numSug,
+ indexReader, suggestionField, onlyMorePopular);
+
+ // suggestion array
+ NamedList<Object> sa = new NamedList<Object>();
+ for (int i=0; i<suggestions.length; i++) {
+ // suggestion item
+ SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
+ si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
+ sa.add(suggestions[i], si);
+ }
+ nl.add("suggestions", sa);
+ results.add(word, nl);
+ }
+ rsp.add( "result", results );
+
+ } else {
+ rsp.add("words", words);
+ if (spellChecker.exist(words)) {
+ rsp.add("exist","true");
+ } else {
+ rsp.add("exist","false");
+ }
String[] suggestions =
spellChecker.suggestSimilar(words, numSug,
indexReader, suggestionField,
onlyMorePopular);
-
+
rsp.add("suggestions", Arrays.asList(suggestions));
}
}
/** Rebuilds the SpellChecker index using values from the <code>termSourceField</code> from the
* index pointed to by the current {@link IndexSearcher}.
+ * Any word appearing in fewer than (threshold * total number of documents) documents will not be added to the spellcheck index.
*/
private void rebuild(SolrQueryRequest req) throws IOException, SolrException {
if (null == termSourceField) {
@@ -163,8 +363,15 @@
(SolrException.ErrorCode.SERVER_ERROR, "can't rebuild spellchecker index without termSourceField configured");
}
+ Float threshold;
+ try {
+ threshold = req.getParams().getFloat("sp.dictionary.threshold", DEFAULT_DICTIONARY_THRESHOLD);
+ } catch (NumberFormatException e) {
+ throw new RuntimeException("Threshold must be a valid positive float", e);
+ }
+
IndexReader indexReader = req.getSearcher().getReader();
- Dictionary dictionary = new LuceneDictionary(indexReader, termSourceField);
+ Dictionary dictionary = new HiFrequencyDictionary(indexReader, termSourceField, threshold);
spellChecker.clearIndex();
spellChecker.indexDictionary(dictionary);
reopen();
Added: lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java?rev=592129&view=auto
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java (added)
+++ lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java Mon Nov 5 11:39:14 2007
@@ -0,0 +1,140 @@
+package org.apache.solr.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.spell.Dictionary;
+
+/**
+ * Hi Frequency Dictionary: terms taken from the given field
+ * of a Lucene index, which appear in a number of documents
+ * above a given threshold.
+ *
+ * When using IndexReader.terms(Term) the code must not call next() on TermEnum
+ * as the first call to TermEnum, see: http://issues.apache.org/jira/browse/LUCENE-6
+ *
+ * Threshold is a value in [0..1] representing the minimum
+ * fraction of documents (of the total) in which a term should appear.
+ *
+ * @author Mike Krimerman
+ *
+ * Based on LuceneDictionary, by
+ * @author Nicolas Maisonneuve
+ * @author Christian Mallwitz
+ */
+public class HiFrequencyDictionary implements Dictionary {
+ private IndexReader reader; // index whose terms are enumerated and counted
+ private String field; // interned in the constructor; hasNext() compares it by reference
+ private float thresh; // scaled by reader.numDocs() to obtain the minimum document frequency
+ // Stores the reader, the interned field name and the threshold; no index access happens here.
+ public HiFrequencyDictionary(IndexReader reader, String field, float thresh) {
+ this.reader = reader;
+ this.field = field.intern();
+ this.thresh = thresh;
+ }
+ // Creates a new HiFrequencyIterator on every call.
+ public final Iterator getWordsIterator() {
+ return new HiFrequencyIterator();
+ }
+ // Iterator over the text of the terms in the field whose docFreq is >= minNumDocs.
+ // Non-static inner class: it reads reader, field and thresh from the enclosing dictionary.
+ final class HiFrequencyIterator implements Iterator {
+ private TermEnum termEnum; // enumeration obtained from reader.terms(new Term(field, ""))
+ private Term actualTerm; // term found by the last hasNext() scan; null when the scan found none
+ private boolean hasNextCalled; // true while a hasNext() result is pending and not yet consumed by next()
+ private int minNumDocs; // thresh * numDocs, truncated to int
+ // Opens the term enumeration and computes minNumDocs; an IOException is rethrown as RuntimeException.
+ HiFrequencyIterator() {
+ try {
+ termEnum = reader.terms(new Term(field, ""));
+ minNumDocs = (int)(thresh * (float)reader.numDocs());
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ // True when the term's document frequency reaches minNumDocs.
+ private boolean isFrequent(Term term) {
+ try {
+ return reader.docFreq(term) >= minNumDocs;
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ // Returns the text of the next qualifying term, or null (no exception is thrown) when none is left.
+ public Object next() {
+ if (!hasNextCalled) {
+ hasNext(); // make sure actualTerm reflects the current scan position
+ }
+ hasNextCalled = false;
+ // step past the term being returned so the following hasNext() inspects the next one
+ try {
+ termEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return (actualTerm != null) ? actualTerm.text() : null;
+ }
+ // Scans forward to the next term of this field that passes the threshold.
+ public boolean hasNext() {
+ if (hasNextCalled) {
+ return actualTerm != null; // result of the previous scan is still pending
+ }
+ hasNextCalled = true;
+
+ do {
+ actualTerm = termEnum.term(); // inspect the enum's current term before any call to next()
+
+ // enumeration exhausted: there are no more words
+ if (actualTerm == null) {
+ return false;
+ }
+
+ String currentField = actualTerm.field();
+
+ // the term belongs to a different field: stop (reference comparison; NOTE(review): assumes Term.field() returns an interned string - confirm)
+ if (currentField != field) {
+ actualTerm = null;
+ return false;
+ }
+
+ // got a valid term, does it pass the threshold?
+ if (isFrequent(actualTerm)) {
+ return true;
+ }
+
+ // term below the threshold: advance and keep scanning
+ try {
+ termEnum.next();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ } while (true);
+ }
+ // Removal is not supported by this iterator.
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+}
Propchange: lucene/solr/trunk/src/java/org/apache/solr/util/HiFrequencyDictionary.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java?rev=592129&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java (added)
+++ lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java Mon Nov 5 11:39:14 2007
@@ -0,0 +1,473 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler;
+
+import org.apache.solr.util.AbstractSolrTestCase;
+
+/**
+ * This is a test case to test the SpellCheckerRequestHandler class.
+ * It tests:
+ * <ul>
+ * <li>The generation of the spell checker's list with 10 words</li>
+ * <li>The identification of the word that was being spell checked</li>
+ * <li>The confirmation of whether the word exists in the index</li>
+ * <li>The suggested list of correctly and incorrectly spelled words</li>
+ * <li>The suggestions for both correct and incorrect words</li>
+ * <li>The limitation on the number of suggestions with the
+ * suggestionCount parameter</li>
+ * <li>The usage of the parameter sp.query.extendedResults</li>
+ * </ul>
+ *
+ * Notes/Concerns about this Test Case:
+ * <ul>
+ * <li>This is my first test case for a Solr Handler. As such I am not
+ * familiar with the AbstractSolrTestCase and as such I am not
+ * 100% sure these test cases will work the same for each person
+ * who runs the test cases (see next note).</li>
+ * <li>The order of the arrays (arr) may not be consistent on other
+ * systems or different runs, as such these test cases may fail?</li>
+ * <li>Note: I changed //arr/str[1][.='cart'] to //arr/str[.='cart'] and it
+ * appears to work.</li>
+ * <li>The two notations appear to successfully test for the same thing:
+ * "//lst[@name='result']/lst[1][@name='word']/str[@name='words'][.='cat']"
+ * and "//str[@name='words'][.='cat']" which I would think // would indicate
+ * a root node.</li>
+ * </ul>
+ */
+public class SpellCheckerRequestHandlerTest
+ extends AbstractSolrTestCase
+{
+
+ @Override
+ public String getSchemaFile() { return "solr/conf/schema-spellchecker.xml"; }
+
+ @Override
+ public String getSolrConfigFile() { return "solr/conf/solrconfig-spellchecker.xml"; }
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+
+
+ }
+
+ private void buildSpellCheckIndex()
+ {
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertU("Add some words to the Spell Check Index:",
+ adoc("id", "100",
+ "spell", "solr"));
+ assertU(adoc("id", "101",
+ "spell", "cat"));
+ assertU(adoc("id", "102",
+ "spell", "cart"));
+ assertU(adoc("id", "103",
+ "spell", "carp"));
+ assertU(adoc("id", "104",
+ "spell", "cant"));
+ assertU(adoc("id", "105",
+ "spell", "catnip"));
+ assertU(adoc("id", "106",
+ "spell", "cattails"));
+ assertU(adoc("id", "107",
+ "spell", "cod"));
+ assertU(adoc("id", "108",
+ "spell", "corn"));
+ assertU(adoc("id", "109",
+ "spell", "cot"));
+
+ assertU(commit());
+ assertU(optimize());
+
+ lrf.args.put("cmd","rebuild");
+ assertQ("Need to first build the index:",
+ req("cat")
+ ,"//str[@name='cmdExecuted'][.='rebuild']"
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ // ,"//arr[@name='suggestions'][.='']"
+ );
+ lrf.args.clear();
+
+ }
+
+ /**
+ * Test for correct spelling of a single word at various accuracy levels
+ * to see how the suggestions vary.
+ */
+ public void testSpellCheck_01_correctWords() {
+
+ buildSpellCheckIndex();
+
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+
+ lrf.args.put("sp.query.accuracy",".9");
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+ lrf.args.put("sp.query.accuracy",".4");
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ ,"//arr/str[.='cot']"
+ ,"//arr/str[.='cart']"
+// ,"//arr/str[1][.='cot']"
+// ,"//arr/str[2][.='cart']"
+ );
+
+ lrf.args.put("sp.query.accuracy",".0");
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ ,"//arr/str[.='cart']"
+ ,"//arr/str[.='cot']"
+ ,"//arr/str[.='carp']"
+ ,"//arr/str[.='cod']"
+ ,"//arr/str[.='corn']"
+ );
+ }
+
+ /**
+ * Test for correct spelling of a single word at various accuracy levels
+ * to see how the suggestions vary.
+ */
+ public void testSpellCheck_02_incorrectWords() {
+
+ buildSpellCheckIndex();
+
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertQ("Confirm the index is still valid",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//str[@name='words'][.='coat']"
+ ,"//str[@name='exist'][.='false']"
+ ,"//arr[@name='suggestions'][.='']"
+ );
+
+
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//str[@name='words'][.='coat']"
+ ,"//str[@name='exist'][.='false']"
+ ,"//arr/str[.='cot']"
+ ,"//arr/str[.='cat']"
+ ,"//arr/str[.='corn']"
+ ,"//arr/str[.='cart']"
+ ,"//arr/str[.='cod']"
+ ,"//arr/str[.='solr']"
+ ,"//arr/str[.='carp']"
+ );
+
+ lrf.args.put("sp.query.suggestionCount", "2");
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//str[@name='words'][.='coat']"
+ ,"//str[@name='exist'][.='false']"
+ ,"//arr/str[.='cot']"
+ ,"//arr/str[.='cat']"
+ );
+ }
+
+ /**
+ * Test the extended results format for correctly spelled words, single and
+ * multiple, at various accuracy levels and suggestion counts.
+ */
+ public void testSpellCheck_03_multiWords_correctWords() {
+
+ buildSpellCheckIndex();
+
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertQ("Confirm the index is still valid",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+
+ // Enable extendedResults formatting:
+ lrf.args.put("sp.query.extendedResults", "true");
+
+
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//lst[@name='cat']"
+ ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions' and count(lst)=0]"
+ );
+
+
+ // Please note that the following produces the following XML structure.
+ // <response>
+ // <responseHeader>
+ // <status>0</status><QTime>0</QTime>
+ // </responseHeader>
+ // <lst name="result">
+ // <lst name="cat">
+ // <int name="frequency">1</int>
+ // <lst name="suggestions">
+ // <lst name="cart"><int name="frequency">1</int></lst>
+ // <lst name="cot"><int name="frequency">1</int></lst>
+ // <lst name="cod"><int name="frequency">1</int></lst>
+ // <lst name="carp"><int name="frequency">1</int></lst>
+ // </lst>
+ // </lst>
+ // </lst>
+ // </response>
+
+
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//lst[@name='cat']"
+ ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cod']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='carp']/int[@name='frequency'][.>0]"
+ );
+
+ lrf.args.put("sp.query.suggestionCount", "2");
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("cat")
+ ,"//lst[@name='cat']"
+ ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"
+ );
+
+ /* The following is the generated XML response for the next query with three words:
+ <response>
+ <responseHeader><status>0</status><QTime>0</QTime></responseHeader>
+ <lst name="result">
+ <lst name="cat">
+ <int name="frequency">1</int>
+ <lst name="suggestions">
+ <lst name="cart"><int name="frequency">1</int></lst>
+ <lst name="cot"><int name="frequency">1</int></lst>
+ </lst>
+ </lst>
+ <lst name="card">
+ <int name="frequency">1</int>
+ <lst name="suggestions">
+ <lst name="carp"><int name="frequency">1</int></lst>
+ <lst name="cat"><int name="frequency">1</int></lst>
+ </lst>
+ </lst>
+ <lst name="carp">
+ <int name="frequency">1</int>
+ <lst name="suggestions">
+ <lst name="cart"><int name="frequency">1</int></lst>
+ <lst name="corn"><int name="frequency">1</int></lst>
+ </lst>
+ </lst>
+ </lst>
+ </response>
+ */
+
+ lrf.args.put("sp.query.suggestionCount", "2");
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("cat cart carp")
+ ,"//lst[@name='cat']"
+ ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
+ ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"
+
+ ,"//lst[@name='cart']"
+ ,"//lst[@name='cart']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='cart']/lst/lst[1]"
+ ,"//lst[@name='cart']/lst/lst[2]"
+
+ ,"//lst[@name='carp']"
+ ,"//lst[@name='carp']/int[@name='frequency'][.>0]"
+ ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='cart']"
+ ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='corn']"
+
+ );
+
+ }
+
+ /**
+ * Test the extended results format for incorrectly spelled words, single and
+ * multiple, at various accuracy levels and suggestion counts.
+ */
+ public void testSpellCheck_04_multiWords_incorrectWords() {
+
+ buildSpellCheckIndex();
+
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertQ("Confirm the index is still valid",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+
+ // Enable extendedResults formatting:
+ lrf.args.put("sp.query.extendedResults", "true");
+
+
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//lst[@name='coat']"
+ ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='coat']/lst[@name='suggestions' and count(lst)=0]"
+ );
+
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//lst[@name='coat']"
+ ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='corn']"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cart']"
+ );
+
+ lrf.args.put("sp.query.suggestionCount", "2");
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("coat")
+ ,"//lst[@name='coat']"
+ ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
+ ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
+ );
+
+
+
+ lrf.args.put("sp.query.suggestionCount", "2");
+ lrf.args.put("sp.query.accuracy",".2");
+ assertQ("Failed to spell check",
+ req("cet cert corp")
+ ,"//lst[@name='cet']"
+ ,"//lst[@name='cet']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='cet']/lst[@name='suggestions']/lst[1]"
+ ,"//lst[@name='cet']/lst[@name='suggestions']/lst[2]"
+
+ ,"//lst[@name='cert']"
+ ,"//lst[@name='cert']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='cert']/lst[@name='suggestions']/lst[1]"
+ ,"//lst[@name='cert']/lst[@name='suggestions']/lst[2]"
+
+ ,"//lst[@name='corp']"
+ ,"//lst[@name='corp']/int[@name='frequency'][.=0]"
+ ,"//lst[@name='corp']/lst[@name='suggestions']/lst[1]"
+ ,"//lst[@name='corp']/lst[@name='suggestions']/lst[2]"
+
+ );
+
+ }
+
+ public void testSpellCheck_05_buildDictionary() {
+ lrf = h.getRequestFactory("spellchecker", 0, 20 );
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertU("Add some words to the Spell Check Index:",
+ adoc("id", "100",
+ "spell", "solr cat cart"));
+ assertU(adoc("id", "101",
+ "spell", "cat cart"));
+ assertU(adoc("id", "102",
+ "spell", "cat cart"));
+ assertU(adoc("id", "103",
+ "spell", "cat cart carp"));
+ assertU(adoc("id", "104",
+ "spell", "cat car cant"));
+ assertU(adoc("id", "105",
+ "spell", "cat catnip"));
+ assertU(adoc("id", "106",
+ "spell", "cat cattails"));
+ assertU(adoc("id", "107",
+ "spell", "cat cod"));
+ assertU(adoc("id", "108",
+ "spell", "cat corn"));
+ assertU(adoc("id", "109",
+ "spell", "cat cot"));
+ assertU(commit());
+ assertU(optimize());
+
+ lrf.args.put("sp.dictionary.threshold", "0.20");
+ lrf.args.put("cmd","rebuild");
+ assertQ("Need to first build the index:",
+ req("cat")
+ ,"//str[@name='cmdExecuted'][.='rebuild']"
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+ lrf.args.clear();
+ lrf.args.put("version","2.0");
+ lrf.args.put("sp.query.accuracy",".9");
+
+ assertQ("Confirm index contains only words above threshold",
+ req("cat")
+ ,"//str[@name='words'][.='cat']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+ assertQ("Confirm index contains only words above threshold",
+ req("cart")
+ ,"//str[@name='words'][.='cart']"
+ ,"//str[@name='exist'][.='true']"
+ );
+
+ assertQ("Confirm index contains only words above threshold",
+ req("cod")
+ ,"//str[@name='words'][.='cod']"
+ ,"//str[@name='exist'][.='false']"
+ );
+
+ assertQ("Confirm index contains only words above threshold",
+ req("corn")
+ ,"//str[@name='words'][.='corn']"
+ ,"//str[@name='exist'][.='false']"
+ );
+
+ lrf.args.clear();
+ }
+}
Propchange: lucene/solr/trunk/src/test/org/apache/solr/handler/SpellCheckerRequestHandlerTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml?rev=592129&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml Mon Nov 5 11:39:14 2007
@@ -0,0 +1,83 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- This is the Solr schema file. This file should be named "schema.xml" and
+ should be in the conf directory under the solr home
+ (i.e. ./solr/conf/schema.xml by default)
+ or located where the classloader for the Solr webapp can find it.
+
+ For more information on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+-->
+
+<schema name="Solr SpellCheck Test" version="1.1">
+ <!-- attribute "name" is the name of this schema and is only used for display purposes.
+ Applications should change this to reflect the nature of the search collection.
+ version="1.1" is Solr's version number for the schema syntax and semantics. It should
+ not normally be changed by applications.
+ 1.0: multiValued attribute did not exist, all fields are multiValued by nature
+ 1.1: multiValued attribute introduced, false by default -->
+
+ <types>
+ <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+
+ <fieldtype name="text" class="solr.TextField">
+ <analyzer>
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StandardFilterFactory"/>
+ <filter class="solr.LowerCaseFilterFactory"/>
+ <filter class="solr.StopFilterFactory"/>
+ <filter class="solr.EnglishPorterFilterFactory"/>
+ </analyzer>
+ </fieldtype>
+
+ <fieldType name="spellText" class="solr.TextField" positionIncrementGap="100">
+ <analyzer type="index">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+ <filter class="solr.StandardFilterFactory"/>
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ <analyzer type="query">
+ <tokenizer class="solr.StandardTokenizerFactory"/>
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+ <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
+ <filter class="solr.StandardFilterFactory"/>
+ <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ </analyzer>
+ </fieldType>
+
+ </types>
+
+
+ <fields>
+ <field name="id" type="string" indexed="true" stored="true"/>
+ <field name="spell" type="spellText" indexed="true" stored="true" />
+ <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+ </fields>
+
+ <!-- field to use to determine and enforce document uniqueness. -->
+ <uniqueKey>id</uniqueKey>
+
+ <!-- field for the QueryParser to use when an explicit fieldname is absent -->
+ <defaultSearchField>text</defaultSearchField>
+
+ <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
+ <solrQueryParser defaultOperator="OR"/>
+
+</schema>
Propchange: lucene/solr/trunk/src/test/test-files/solr/conf/schema-spellchecker.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml?rev=592129&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml Mon Nov 5 11:39:14 2007
@@ -0,0 +1,103 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<config>
+ <indexDefaults>
+ <useCompoundFile>false</useCompoundFile>
+ <mergeFactor>10</mergeFactor>
+ <maxBufferedDocs>1000</maxBufferedDocs>
+ <maxMergeDocs>2147483647</maxMergeDocs>
+ <maxFieldLength>10000</maxFieldLength>
+ <writeLockTimeout>1000</writeLockTimeout>
+ <commitLockTimeout>10000</commitLockTimeout>
+ </indexDefaults>
+
+ <mainIndex>
+ <useCompoundFile>false</useCompoundFile>
+ <mergeFactor>10</mergeFactor>
+ <maxBufferedDocs>1000</maxBufferedDocs>
+ <maxMergeDocs>2147483647</maxMergeDocs>
+ <maxFieldLength>10000</maxFieldLength>
+ <unlockOnStartup>true</unlockOnStartup>
+ </mainIndex>
+
+
+ <updateHandler class="solr.DirectUpdateHandler2">
+ <commitIntervalLowerBound>0</commitIntervalLowerBound>
+ </updateHandler>
+
+
+ <query>
+ <maxBooleanClauses>1024</maxBooleanClauses>
+ <useFilterForSortedQuery>true</useFilterForSortedQuery>
+ <queryResultWindowSize>10</queryResultWindowSize>
+ <HashDocSet maxSize="3000" loadFactor="0.75"/>
+ <boolTofilterOptimizer enabled="true" cacheSize="32" threshold=".05"/>
+ </query>
+
+
+
+ <requestHandler name="standard" class="solr.StandardRequestHandler" />
+ <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
+
+
+ <!-- SpellCheckerRequestHandler takes in a word (or several words) as the
+ value of the "q" parameter and returns a list of alternative spelling
+ suggestions. If invoked with a ...&cmd=rebuild, it will rebuild the
+ spellchecker index.
+ -->
+ <requestHandler name="spellchecker" class="solr.SpellCheckerRequestHandler" startup="lazy">
+ <!-- default values for query parameters -->
+ <lst name="defaults">
+ <int name="sp.query.suggestionCount">20</int>
+ <float name="sp.query.accuracy">0.60</float>
+ </lst>
+
+ <!-- Main init params for handler -->
+
+ <!-- The directory where your SpellChecker Index should live. -->
+ <!-- May be absolute, or relative to the Solr "dataDir" directory. -->
+ <!-- If this option is not specified, a RAM directory will be used -->
+ <str name="sp.dictionary.spellcheckerIndexDir">spell</str>
+
+ <!-- the field in your schema that you want to be able to build -->
+ <!-- your spell index on. This should be a field that uses a very -->
+ <!-- simple FieldType without a lot of Analysis (i.e. string) -->
+ <str name="sp.dictionary.termSourceField">spell</str>
+
+ <!-- threshold for word to make it into the dictionary -->
+ <!-- a word should appear at minimum in the specified percent of documents -->
+ <str name="sp.dictionary.threshold">0.0</str>
+
+ </requestHandler>
+
+
+
+ <queryResponseWriter name="standard" class="org.apache.solr.request.XMLResponseWriter"/>
+ <queryResponseWriter name="useless" class="org.apache.solr.OutputWriterTest$UselessOutputWriter"/>
+ <queryResponseWriter name="xslt" class="org.apache.solr.request.XSLTResponseWriter"/>
+ <queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/>
+
+
+ <!-- config for the admin interface -->
+ <admin>
+ <defaultQuery>solr</defaultQuery>
+ <gettableFiles>solrconfig.xml schema.xml admin-extra.html</gettableFiles>
+ </admin>
+
+</config>
Propchange: lucene/solr/trunk/src/test/test-files/solr/conf/solrconfig-spellchecker.xml
------------------------------------------------------------------------------
svn:eol-style = native