You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by yo...@apache.org on 2006/07/05 21:36:09 UTC
svn commit: r419321 - in /incubator/solr/trunk: CHANGES.txt
src/java/org/apache/solr/analysis/WordDelimiterFilter.java
src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
src/test/test-files/solr/conf/schema.xml
Author: yonik
Date: Wed Jul 5 12:36:08 2006
New Revision: 419321
URL: http://svn.apache.org/viewvc?rev=419321&view=rev
Log:
WordDelimiterFilter could lose position info
Added:
incubator/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified:
incubator/solr/trunk/CHANGES.txt
incubator/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
incubator/solr/trunk/src/test/test-files/solr/conf/schema.xml
Modified: incubator/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/solr/trunk/CHANGES.txt?rev=419321&r1=419320&r2=419321&view=diff
==============================================================================
--- incubator/solr/trunk/CHANGES.txt (original)
+++ incubator/solr/trunk/CHANGES.txt Wed Jul 5 12:36:08 2006
@@ -43,6 +43,7 @@
2. Added escaping of attribute values in the XML response (Erik Hatcher)
3. Added empty extractTerms() to FunctionQuery to enable use in
a MultiSearcher (Yonik)
+ 4. WordDelimiterFilter sometimes lost token positionIncrement information
Other Changes
1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,
Modified: incubator/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/incubator/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=419321&r1=419320&r2=419321&view=diff
==============================================================================
--- incubator/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ incubator/solr/trunk/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Wed Jul 5 12:36:08 2006
@@ -123,10 +123,6 @@
}
}
- private int charType(String s, int pos) {
- return charType(s.charAt(pos));
- }
-
// use the type of the first char as the type
// of the token.
private int tokType(Token t) {
@@ -170,16 +166,18 @@
// Would it actually be faster to check for the common form
// of isLetter() isLower()*, and then backtrack if it doesn't match?
+ int origPosIncrement;
while(true) {
Token t = input.next();
if (t == null) return null;
String s = t.termText();
- int off=t.startOffset();
int start=0;
int end=s.length();
if (end==0) continue;
+ origPosIncrement = t.getPositionIncrement();
+
// Avoid calling charType more than once for each char (basically
// avoid any backtracking).
// makes code slightly more difficult, but faster.
@@ -273,6 +271,7 @@
// optimization... if this is the only token,
// return it immediately.
if (queue.size()==0) {
+ newtok.setPositionIncrement(origPosIncrement);
return newtok;
}
@@ -376,7 +375,9 @@
// System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
queuePos=1;
- return queue.get(0);
+ Token tok = queue.get(0);
+ tok.setPositionIncrement(origPosIncrement);
+ return tok;
}
@@ -416,29 +417,10 @@
}
}
- private String str(List<Token> lst) {
- StringBuilder sb = new StringBuilder();
- sb.append('{');
- for (Token t : lst) {
- sb.append('(');
- sb.append('"');
- sb.append(t.termText());
- sb.append("\",increment=");
- sb.append(Integer.toString(t.getPositionIncrement()));
- sb.append(')');
-
- sb.append(',');
- }
- sb.append('}');
- return sb.toString();
- }
-
-
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
// percent sign? 33%
// downsides: if source text is "powershot" then a query of "PowerShot" won't match!
-
}
Added: incubator/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/incubator/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=419321&view=auto
==============================================================================
--- incubator/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (added)
+++ incubator/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Wed Jul 5 12:36:08 2006
@@ -0,0 +1,49 @@
+package org.apache.solr.analysis;
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.util.TestHarness;
+import org.apache.solr.request.SolrQueryRequest;
+
+/**
+ * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ */
+public class TestWordDelimiterFilter extends AbstractSolrTestCase {
+ public String getSchemaFile() { return "solr/conf/schema.xml"; }
+ public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
+
+
+ public void posTst(String v1, String v2, String s1, String s2) {
+ assertU(adoc("id", "42",
+ "subword", v1,
+ "subword", v2));
+ assertU(commit());
+
+ // there is a positionIncrementGap of 100 between field values, so
+ // we test if that was maintained.
+ assertQ("position increment lost",
+ req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
+ ,"//result[@numFound=0]"
+ );
+ assertQ("position increment lost",
+ req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
+ ,"//result[@numFound=1]"
+ );
+ }
+
+
+ public void testRetainPositionIncrement() {
+ posTst("foo","bar","foo","bar");
+ posTst("-foo-","-bar-","foo","bar");
+ posTst("foo","bar","-foo-","-bar-");
+
+ posTst("123","456","123","456");
+ posTst("/123/","/456/","123","456");
+
+ posTst("/123/abc","qwe/456/","abc","qwe");
+
+ posTst("zoo-foo","bar-baz","foo","bar");
+ posTst("zoo-foo-123","456-bar-baz","foo","bar");
+ }
+
+
+}
Modified: incubator/solr/trunk/src/test/test-files/solr/conf/schema.xml
URL: http://svn.apache.org/viewvc/incubator/solr/trunk/src/test/test-files/solr/conf/schema.xml?rev=419321&r1=419320&r2=419321&view=diff
==============================================================================
--- incubator/solr/trunk/src/test/test-files/solr/conf/schema.xml (original)
+++ incubator/solr/trunk/src/test/test-files/solr/conf/schema.xml Wed Jul 5 12:36:08 2006
@@ -174,7 +174,7 @@
</analyzer>
</fieldtype>
- <fieldtype name="subword" class="solr.TextField">
+ <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>