You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2009/12/10 00:02:35 UTC
svn commit: r889012 - in /lucene/solr/branches/cloud: ./ lib/
src/common/org/apache/solr/common/ src/common/org/apache/solr/common/util/
src/java/org/apache/solr/handler/component/
src/java/org/apache/solr/spelling/ src/maven/ src/solrj/org/ src/solrj/...
Author: markrmiller
Date: Wed Dec 9 23:02:34 2009
New Revision: 889012
URL: http://svn.apache.org/viewvc?rev=889012&view=rev
Log:
merge up to r888806
Added:
lucene/solr/branches/cloud/lib/lucene-analyzers-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-analyzers-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-collation-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-collation-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-core-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-core-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-highlighter-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-highlighter-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-memory-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-memory-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-misc-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-misc-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-queries-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-queries-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-snowball-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-snowball-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-spatial-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-spatial-2.9.1-dev.jar
lucene/solr/branches/cloud/lib/lucene-spellchecker-2.9.1-dev.jar
- copied unchanged from r888806, lucene/solr/trunk/lib/lucene-spellchecker-2.9.1-dev.jar
lucene/solr/branches/cloud/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
- copied unchanged from r888806, lucene/solr/trunk/src/test/org/apache/solr/handler/component/DistributedSpellCheckComponentTest.java
Removed:
lucene/solr/branches/cloud/lib/lucene-analyzers-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-collation-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-core-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-highlighter-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-memory-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-misc-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-queries-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-snowball-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-spatial-2.9.1.jar
lucene/solr/branches/cloud/lib/lucene-spellchecker-2.9.1.jar
Modified:
lucene/solr/branches/cloud/ (props changed)
lucene/solr/branches/cloud/CHANGES.txt
lucene/solr/branches/cloud/common-build.xml
lucene/solr/branches/cloud/lib/commons-httpclient-3.1.jar (props changed)
lucene/solr/branches/cloud/lib/jcl-over-slf4j-1.5.5.jar (props changed)
lucene/solr/branches/cloud/src/common/org/apache/solr/common/ (props changed)
lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/DOMUtil.java
lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/JavaBinCodec.java (props changed)
lucene/solr/branches/cloud/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
lucene/solr/branches/cloud/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template (contents, props changed)
lucene/solr/branches/cloud/src/maven/solr-solrj-pom.xml.template (props changed)
lucene/solr/branches/cloud/src/solrj/org/ (props changed)
lucene/solr/branches/cloud/src/solrj/org/apache/solr/client/solrj/impl/StreamingUpdateSolrServer.java (props changed)
lucene/solr/branches/cloud/src/test/org/apache/solr/client/ (props changed)
lucene/solr/branches/cloud/src/webapp/src/org/apache/solr/client/solrj/embedded/ (props changed)
Propchange: lucene/solr/branches/cloud/
------------------------------------------------------------------------------
svn:mergeinfo = /lucene/solr/trunk:888480-888806
Modified: lucene/solr/branches/cloud/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/CHANGES.txt?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/CHANGES.txt (original)
+++ lucene/solr/branches/cloud/CHANGES.txt Wed Dec 9 23:02:34 2009
@@ -53,6 +53,9 @@
* SOLR-1571: Added unicode collation support through Lucene's CollationKeyFilter
(Robert Muir via shalin)
+* SOLR-785: Distributed Search support for SpellCheckComponent
+ (Matthew Woytowitz, shalin)
+
Optimizations
----------------------
@@ -117,6 +120,9 @@
in a multivalued field when term positions (term vectors) are stored.
(Chris Harris via yonik)
+* SOLR-1635: Fixed error message when numeric values can't be parsed by
+ DOMUtil - notably for plugin init params in solrconfig.xml.
+ (hossman)
Other Changes
----------------------
@@ -136,6 +142,8 @@
* SOLR-1608: Extract base class from TestDistributedSearch to make
it easy to write test cases for other distributed components. (shalin)
+* Upgraded to Lucene 2.9-dev r888785 (shalin)
+
Build
----------------------
Modified: lucene/solr/branches/cloud/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/common-build.xml?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/common-build.xml (original)
+++ lucene/solr/branches/cloud/common-build.xml Wed Dec 9 23:02:34 2009
@@ -114,7 +114,7 @@
The version suffix of the Lucene artifacts checked into "lib"
IF YOU CHANGE THIS, SANITY CHECK "javadoc.link.lucene"
-->
- <property name="lucene_version" value="2.9.1"/>
+ <property name="lucene_version" value="2.9.1-dev"/>
<!-- The version number to assign to the Maven artifacts. -->
<property name="maven_version" value="1.5-SNAPSHOT"/>
Propchange: lucene/solr/branches/cloud/lib/commons-httpclient-3.1.jar
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/lib/jcl-over-slf4j-1.5.5.jar
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/common/org/apache/solr/common/
('svn:mergeinfo' removed)
Modified: lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/DOMUtil.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/DOMUtil.java?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/DOMUtil.java (original)
+++ lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/DOMUtil.java Wed Dec 9 23:02:34 2009
@@ -108,40 +108,60 @@
return lst;
}
-
+ /**
+ * Examines a Node from the DOM representation of a NamedList and adds the
+ * contents of that node to both the specified NamedList and List passed
+ * as arguments.
+ *
+ * @param nd The Node whose type will be used to determine how to parse the
+ * text content. If there is a 'name' attribute it will be used
+ * when adding to the NamedList
+ * @param nlst A NamedList to add the item to with name if applicable.
+ * If this param is null it will be ignored.
+ * @param arr A List to add the item to.
+ * If this param is null it will be ignored.
+ */
@SuppressWarnings("unchecked")
public static void addToNamedList(Node nd, NamedList nlst, List arr) {
// Nodes often include whitespace, etc... so just return if this
// is not an Element.
if (nd.getNodeType() != Node.ELEMENT_NODE) return;
- String type = nd.getNodeName();
+ final String type = nd.getNodeName();
- String name = null;
- if (nd.hasAttributes()) {
- NamedNodeMap attrs = nd.getAttributes();
- Node nameNd = attrs.getNamedItem("name");
- if (nameNd != null) name=nameNd.getNodeValue();
- }
+ final String name = getAttr(nd, "name");
Object val=null;
- if ("str".equals(type)) {
- val = getText(nd);
- } else if ("int".equals(type)) {
- val = Integer.valueOf(getText(nd));
- } else if ("long".equals(type)) {
- val = Long.valueOf(getText(nd));
- } else if ("float".equals(type)) {
- val = Float.valueOf(getText(nd));
- } else if ("double".equals(type)) {
- val = Double.valueOf(getText(nd));
- } else if ("bool".equals(type)) {
- val = StrUtils.parseBool(getText(nd));
- } else if ("lst".equals(type)) {
+ if ("lst".equals(type)) {
val = childNodesToNamedList(nd);
} else if ("arr".equals(type)) {
val = childNodesToList(nd);
+ } else {
+ final String textValue = getText(nd);
+ try {
+ if ("str".equals(type)) {
+ val = textValue;
+ } else if ("int".equals(type)) {
+ val = Integer.valueOf(textValue);
+ } else if ("long".equals(type)) {
+ val = Long.valueOf(textValue);
+ } else if ("float".equals(type)) {
+ val = Float.valueOf(textValue);
+ } else if ("double".equals(type)) {
+ val = Double.valueOf(textValue);
+ } else if ("bool".equals(type)) {
+ val = StrUtils.parseBool(textValue);
+ }
+ // :NOTE: Unexpected Node names are ignored
+ // :TODO: should we generate an error here?
+ } catch (NumberFormatException nfe) {
+ throw new SolrException
+ (SolrException.ErrorCode.SERVER_ERROR,
+ "Value " + (null != name ? ("of '" +name+ "' ") : "") +
+ "can not be parsed as '" +type+ "': \"" + textValue + "\"",
+ nfe);
+ }
}
if (nlst != null) nlst.add(name,val);
Propchange: lucene/solr/branches/cloud/src/common/org/apache/solr/common/util/JavaBinCodec.java
('svn:mergeinfo' removed)
Modified: lucene/solr/branches/cloud/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/src/java/org/apache/solr/handler/component/SpellCheckComponent.java?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/src/java/org/apache/solr/handler/component/SpellCheckComponent.java (original)
+++ lucene/solr/branches/cloud/src/java/org/apache/solr/handler/component/SpellCheckComponent.java Wed Dec 9 23:02:34 2009
@@ -19,14 +19,13 @@
import java.io.IOException;
import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Collections;
+import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.StringDistance;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -152,6 +151,217 @@
}
}
+ static class SuggestWordQueue extends PriorityQueue {
+ SuggestWordQueue(int size) {
+ initialize(size);
+ }
+
+ @Override
+ protected boolean lessThan(Object a, Object b) {
+ SuggestWord wa = (SuggestWord) a;
+ SuggestWord wb = (SuggestWord) b;
+ int val = wa.compareTo(wb);
+ return val < 0;
+ }
+ }
+
+ /**
+ * Borrowed from Lucene SpellChecker
+ */
+ static class SuggestWord {
+ /**
+ * the score of the word
+ */
+ public float score;
+
+ /**
+ * The freq of the word
+ */
+ public int freq;
+
+ /**
+ * the suggested word
+ */
+ public String string;
+
+ public final int compareTo(SuggestWord a) {
+ // first criteria: the edit distance
+ if (score > a.score) {
+ return 1;
+ }
+ if (score < a.score) {
+ return -1;
+ }
+
+ // second criteria (if first criteria is equal): the popularity
+ if (freq > a.freq) {
+ return 1;
+ }
+
+ if (freq < a.freq) {
+ return -1;
+ }
+ return 0;
+ }
+ }
+
+ @Override
+ public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
+ SolrParams params = rb.req.getParams();
+ // Turn on spellcheck only when retrieving fields
+ if (!params.getBool(COMPONENT_NAME, false)) return;
+ if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
+ // fetch at least 5 suggestions from each shard
+ int count = sreq.params.getInt(SPELLCHECK_COUNT, 1);
+ if (count < 5) count = 5;
+ sreq.params.set(SPELLCHECK_COUNT, count);
+ sreq.params.set("spellcheck", "true");
+ } else {
+ sreq.params.set("spellcheck", "false");
+ }
+ }
+
+ @Override
+ @SuppressWarnings({"unchecked", "deprecation"})
+ public void finishStage(ResponseBuilder rb) {
+ SolrParams params = rb.req.getParams();
+ if (!params.getBool(COMPONENT_NAME, false) || rb.stage != ResponseBuilder.STAGE_GET_FIELDS)
+ return;
+
+ boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
+ boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
+
+ String origQuery = params.get(SPELLCHECK_Q);
+ if (origQuery == null) {
+ origQuery = rb.getQueryString();
+ if (origQuery == null) {
+ origQuery = params.get(CommonParams.Q);
+ }
+ }
+
+ int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
+ float min = 0.5f;
+ StringDistance sd = null;
+ int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
+ SolrSpellChecker checker = getSpellChecker(rb.req.getParams());
+ if (checker instanceof AbstractLuceneSpellChecker) {
+ AbstractLuceneSpellChecker spellChecker = (AbstractLuceneSpellChecker) checker;
+ min = spellChecker.getAccuracy();
+ sd = spellChecker.getStringDistance();
+ }
+ if (sd == null)
+ sd = new LevensteinDistance();
+
+ Collection<Token> tokens = null;
+ try {
+ tokens = getTokens(origQuery, checker.getQueryAnalyzer());
+ } catch (IOException e) {
+ LOG.error("Could not get tokens (this should never happen)", e);
+ }
+
+ // original token -> corresponding Suggestion object (keep track of start,end)
+ Map<String, SpellCheckResponse.Suggestion> origVsSuggestion = new HashMap<String, SpellCheckResponse.Suggestion>();
+ // original token string -> summed up frequency
+ Map<String, Integer> origVsFreq = new HashMap<String, Integer>();
+ // original token string -> set of alternatives
+ // must preserve order because collation algorithm can only work in-order
+ Map<String, HashSet<String>> origVsSuggested = new LinkedHashMap<String, HashSet<String>>();
+ // alternative string -> corresponding SuggestWord object
+ Map<String, SuggestWord> suggestedVsWord = new HashMap<String, SuggestWord>();
+
+ for (ShardRequest sreq : rb.finished) {
+ for (ShardResponse srsp : sreq.responses) {
+ NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
+ LOG.info(srsp.getShard() + " " + nl);
+ if (nl != null) {
+ SpellCheckResponse spellCheckResp = new SpellCheckResponse(nl);
+ for (SpellCheckResponse.Suggestion suggestion : spellCheckResp.getSuggestions()) {
+ origVsSuggestion.put(suggestion.getToken(), suggestion);
+ HashSet<String> suggested = origVsSuggested.get(suggestion.getToken());
+ if (suggested == null) {
+ suggested = new HashSet<String>();
+ origVsSuggested.put(suggestion.getToken(), suggested);
+ }
+
+ // sum up original frequency
+ int origFreq = 0;
+ Integer o = origVsFreq.get(suggestion.getToken());
+ if (o != null) origFreq += o;
+ origFreq += suggestion.getOriginalFrequency();
+ origVsFreq.put(suggestion.getToken(), origFreq);
+
+ // find best suggestions
+ for (int i = 0; i < suggestion.getNumFound(); i++) {
+ String alternative = suggestion.getAlternatives().get(i);
+ suggested.add(alternative);
+ SuggestWord sug = suggestedVsWord.get(alternative);
+ if (sug == null) {
+ sug = new SuggestWord();
+ suggestedVsWord.put(alternative, sug);
+ }
+ sug.string = alternative;
+ // alternative frequency is present only for extendedResults=true
+ if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
+ Integer freq = suggestion.getAlternativeFrequencies().get(i);
+ if (freq != null) sug.freq += freq;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // all shard responses have been collected
+ // create token and get top suggestions
+ SpellingResult result = new SpellingResult(tokens); //todo: investigate, why does it need tokens beforehand?
+ for (Map.Entry<String, HashSet<String>> entry : origVsSuggested.entrySet()) {
+ String original = entry.getKey();
+ HashSet<String> suggested = entry.getValue();
+ SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
+ for (String suggestion : suggested) {
+ SuggestWord sug = suggestedVsWord.get(suggestion);
+ sug.score = sd.getDistance(original, sug.string);
+ if (sug.score < min) continue;
+ sugQueue.insertWithOverflow(sug);
+ if (sugQueue.size() == numSug) {
+ // if queue full, maintain the minScore score
+ min = ((SuggestWord) sugQueue.top()).score;
+ }
+ }
+
+ // create token
+ SpellCheckResponse.Suggestion suggestion = origVsSuggestion.get(original);
+ Token token = new Token();
+ token.setTermText(original);
+ token.setStartOffset(suggestion.getStartOffset());
+ token.setEndOffset(suggestion.getEndOffset());
+
+ // get top 'count' suggestions out of 'sugQueue.size()' candidates
+ SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
+ // skip the first sugQueue.size() - count elements
+ for (int k=0; k < sugQueue.size() - count; k++) sugQueue.pop();
+ // now collect the top 'count' responses
+ for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
+ suggestions[k] = ((SuggestWord) sugQueue.pop());
+ }
+
+ if (extendedResults) {
+ Integer o = origVsFreq.get(original);
+ if (o != null) result.add(token, o);
+ for (SuggestWord word : suggestions)
+ result.add(token, word.string, word.freq);
+ } else {
+ List<String> words = new ArrayList<String>(sugQueue.size());
+ for (SuggestWord word : suggestions) words.add(word.string);
+ result.add(token, words);
+ }
+ }
+
+ NamedList response = new SimpleOrderedMap();
+ response.add("suggestions", toNamedList(result, origQuery, extendedResults, collate));
+ rb.rsp.add("spellcheck", response);
+ }
+
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
Collection<Token> result = new ArrayList<Token>();
Token token = null;
Modified: lucene/solr/branches/cloud/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java (original)
+++ lucene/solr/branches/cloud/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java Wed Dec 9 23:02:34 2009
@@ -77,6 +77,8 @@
protected float accuracy = 0.5f;
public static final String FIELD = "field";
+ protected StringDistance sd;
+
public String init(NamedList config, SolrCore core) {
super.init(config, core);
indexDir = (String) config.get(INDEX_DIR);
@@ -90,7 +92,6 @@
sourceLocation = (String) config.get(LOCATION);
field = (String) config.get(FIELD);
String strDistanceName = (String)config.get(STRING_DISTANCE);
- StringDistance sd = null;
if (strDistanceName != null) {
sd = (StringDistance) core.getResourceLoader().newInstance(strDistanceName);
//TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
@@ -226,4 +227,8 @@
public String getSourceLocation() {
return sourceLocation;
}
+
+ public StringDistance getStringDistance() {
+ return sd;
+ }
}
Modified: lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template?rev=889012&r1=889011&r2=889012&view=diff
==============================================================================
--- lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template (original)
+++ lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template Wed Dec 9 23:02:34 2009
@@ -81,6 +81,11 @@
<artifactId>lucene-spellchecker</artifactId>
<version>2.9.1</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-spatial</artifactId>
+ <version>2.9.1</version>
+ </dependency>
<!-- Apache Commons -->
<dependency>
Propchange: lucene/solr/branches/cloud/src/maven/solr-core-pom.xml.template
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/maven/solr-solrj-pom.xml.template
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/solrj/org/
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/solrj/org/apache/solr/client/solrj/impl/StreamingUpdateSolrServer.java
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/test/org/apache/solr/client/
('svn:mergeinfo' removed)
Propchange: lucene/solr/branches/cloud/src/webapp/src/org/apache/solr/client/solrj/embedded/
('svn:mergeinfo' removed)