You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2009/09/16 16:28:40 UTC
svn commit: r815801 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/spelling/SpellingQueryConverter.java
src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
Author: shalin
Date: Wed Sep 16 14:28:40 2009
New Revision: 815801
URL: http://svn.apache.org/viewvc?rev=815801&view=rev
Log:
SOLR-1407 -- SpellingQueryConverter disallows underscores and digits in field names
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
lucene/solr/trunk/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=815801&r1=815800&r2=815801&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Wed Sep 16 14:28:40 2009
@@ -254,7 +254,9 @@
57. SOLR-1152: Snapshoot on ReplicationHandler should accept location as a request parameter (shalin)
-58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only (Michael Ludwig via shalin)
+58. SOLR-1204: Enhance SpellingQueryConverter to handle UTF-8 instead of ASCII only.
+ Use the NMTOKEN syntax for matching field names.
+ (Michael Ludwig, shalin)
59. SOLR-1189: Support providing username and password for basic HTTP authentication in Java replication
(Matthew Gregg, shalin)
Modified: lucene/solr/trunk/src/java/org/apache/solr/spelling/SpellingQueryConverter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/spelling/SpellingQueryConverter.java?rev=815801&r1=815800&r2=815801&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/spelling/SpellingQueryConverter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/spelling/SpellingQueryConverter.java Wed Sep 16 14:28:40 2009
@@ -37,7 +37,49 @@
**/
public class SpellingQueryConverter extends QueryConverter {
- protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\p{L}+:|\\d+)))\\p{L}+");
+ /*
+ * The following builds up a regular expression that matches productions
+ * of the syntax for NMTOKEN as per the W3C XML Recommendation - with one
+ * important exception (see below).
+ *
+ * http://www.w3.org/TR/2008/REC-xml-20081126/ - version used as reference
+ *
+ * http://www.w3.org/TR/REC-xml/#NT-Nmtoken
+ *
+ * An NMTOKEN is a series of one or more NAMECHAR characters, which is an
+ * extension of the NAMESTARTCHAR character class.
+ *
+ * The EXCEPTION referred to above concerns the colon, which is legal in an
+ * NMTOKEN, but cannot currently be used as a valid field name within Solr,
+ * as it is used to delimit the field name from the query string.
+ */
+
+ final static String[] NAMESTARTCHAR_PARTS = {
+ "A-Z_a-z", "\\xc0-\\xd6", "\\xd8-\\xf6", "\\xf8-\\u02ff",
+ "\\u0370-\\u037d", "\\u037f-\\u1fff",
+ "\\u200c-\\u200d", "\\u2070-\\u218f",
+ "\\u2c00-\\u2fef", "\\u2001-\\ud7ff",
+ "\\uf900-\\ufdcf", "\\ufdf0-\\ufffd"
+ };
+ final static String[] ADDITIONAL_NAMECHAR_PARTS = {
+ "\\-.0-9\\xb7", "\\u0300-\\u036f", "\\u203f-\\u2040"
+ };
+ final static String SURROGATE_PAIR = "\\p{Cs}{2}";
+ final static String NMTOKEN;
+
+ static {
+ StringBuilder sb = new StringBuilder();
+ for (String part : NAMESTARTCHAR_PARTS)
+ sb.append(part);
+ for (String part : ADDITIONAL_NAMECHAR_PARTS)
+ sb.append(part);
+ NMTOKEN = "([" + sb.toString() + "]|" + SURROGATE_PAIR + ")+";
+ }
+
+ final static String PATTERN = "(?:(?!(" + NMTOKEN + ":|\\d+)))[^\\s]+";
+ // previous version: Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
+ protected Pattern QUERY_REGEX = Pattern.compile(PATTERN);
+
/**
* Converts the original query string to a collection of Lucene Tokens.
Modified: lucene/solr/trunk/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java?rev=815801&r1=815800&r2=815801&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/spelling/SpellingQueryConverterTest.java Wed Sep 16 14:28:40 2009
@@ -22,6 +22,7 @@
import org.apache.solr.common.util.NamedList;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
+import org.junit.Assert;
import java.util.Collection;
@@ -43,4 +44,64 @@
assertTrue("tokens is null and it shouldn't be", tokens != null);
assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1);
}
+
+ @Test
+ public void testSpecialChars() {
+ SpellingQueryConverter converter = new SpellingQueryConverter();
+ converter.init(new NamedList());
+ converter.setAnalyzer(new WhitespaceAnalyzer());
+ Collection<Token> tokens = converter.convert("field_with_underscore:value_with_underscore");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+ tokens = converter.convert("field_with_digits123:value_with_digits123");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+ tokens = converter.convert("field-with-hyphens:value-with-hyphens");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+ // mix 'em up and add some to the value
+ tokens = converter.convert("field_with-123s:value_,.|with-hyphens");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+ }
+
+ @Test
+ public void testUnicode() {
+ SpellingQueryConverter converter = new SpellingQueryConverter();
+ converter.init(new NamedList());
+ converter.setAnalyzer(new WhitespaceAnalyzer());
+
+ // chinese text value
+ Collection<Token> tokens = converter.convert("text_field:我购买了道具和服装。");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+ tokens = converter.convert("text_è´field:æè´ä¹°äºéå
·åæè£
ã");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+
+ tokens = converter.convert("text_field:æè´xyzä¹°äºéå
·åæè£
ã");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 1", 1, tokens.size());
+ }
+
+ @Test
+ public void testMultipleClauses() {
+ SpellingQueryConverter converter = new SpellingQueryConverter();
+ converter.init(new NamedList());
+ converter.setAnalyzer(new WhitespaceAnalyzer());
+
+ // two field:value pairs should give two tokens
+ Collection<Token> tokens = converter.convert("ä¹°text_field:æè´ä¹°äºéå
·åæè£
ã field2:bar");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
+
+ // a field:value pair and a search term should give two tokens
+ tokens = converter.convert("text_field:我购买了道具和服装。 bar");
+ assertTrue("tokens is null and it shouldn't be", tokens != null);
+ Assert.assertEquals("tokens Size: " + tokens.size() + " is not 2", 2, tokens.size());
+ }
}