You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2009/12/21 14:53:52 UTC
svn commit: r892821 [3/3] - in /lucene/solr/trunk: ./
src/test/org/apache/solr/analysis/ src/test/test-files/solr/conf/
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java Mon Dec 21 13:53:50 2009
@@ -17,12 +17,19 @@
package org.apache.solr.analysis;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-
-import java.util.List;
-
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* @version $Id:$
@@ -35,46 +42,75 @@
char[] ccc = "cCc".toCharArray();
char[] whitespace = " ".toCharArray();
char[] empty = "".toCharArray();
- TokenStream ts = new TrimFilter
- (new IterTokenStream(new Token(a, 0, a.length, 1, 5),
+ TrimFilterFactory factory = new TrimFilterFactory();
+ Map<String,String> args = new HashMap<String,String>();
+ args.put("updateOffsets", "false");
+ factory.init(args);
+ TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
new Token(b, 0, b.length, 6, 10),
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
- new Token(empty, 0, empty.length, 21, 21)), false);
+ new Token(empty, 0, empty.length, 21, 21)));
- TermAttribute token;
- assertTrue(ts.incrementToken());
- token = (TermAttribute) ts.getAttribute(TermAttribute.class);
- assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
- assertTrue(ts.incrementToken());
- assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
- assertTrue(ts.incrementToken());
- assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
- assertTrue(ts.incrementToken());
- assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
- assertTrue(ts.incrementToken());
- assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
- assertFalse(ts.incrementToken());
+ assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
a = " a".toCharArray();
b = "b ".toCharArray();
ccc = " c ".toCharArray();
whitespace = " ".toCharArray();
- ts = new TrimFilter(new IterTokenStream(
+ factory = new TrimFilterFactory();
+ args = new HashMap<String,String>();
+ args.put("updateOffsets", "true");
+ factory.init(args);
+ ts = factory.create(new IterTokenStream(
new Token(a, 0, a.length, 0, 2),
new Token(b, 0, b.length, 0, 2),
new Token(ccc, 0, ccc.length, 0, 3),
- new Token(whitespace, 0, whitespace.length, 0, 3)), true);
+ new Token(whitespace, 0, whitespace.length, 0, 3)));
+
+ assertTokenStreamContents(ts,
+ new String[] { "a", "b", "c", "" },
+ new int[] { 1, 0, 1, 3 },
+ new int[] { 2, 1, 2, 3 },
+ new int[] { 1, 1, 1, 1 });
+ }
+
+ /**
+ * @deprecated does not support custom attributes
+ */
+ private static class IterTokenStream extends TokenStream {
+ final Token tokens[];
+ int index = 0;
+ TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+ TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+ PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
- List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
- List<Token> real = getTokens(ts);
- for (Token t : expect) {
- System.out.println("TEST:" + t);
+ public IterTokenStream(Token... tokens) {
+ super();
+ this.tokens = tokens;
}
- for (Token t : real) {
- System.out.println("REAL:" + t);
+
+ public IterTokenStream(Collection<Token> tokens) {
+ this(tokens.toArray(new Token[tokens.size()]));
+ }
+
+ public boolean incrementToken() throws IOException {
+ if (index >= tokens.length)
+ return false;
+ else {
+ clearAttributes();
+ Token token = tokens[index++];
+ termAtt.setTermBuffer(token.term());
+ offsetAtt.setOffset(token.startOffset(), token.endOffset());
+ posIncAtt.setPositionIncrement(token.getPositionIncrement());
+ flagsAtt.setFlags(token.getFlags());
+ typeAtt.setType(token.type());
+ payloadAtt.setPayload(token.getPayload());
+ return true;
+ }
}
- assertTokEqualOff(expect, real);
}
-
}
Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Mon Dec 21 13:53:50 2009
@@ -17,14 +17,14 @@
package org.apache.solr.analysis;
-import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -37,7 +37,7 @@
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/
-public class TestWordDelimiterFilter extends AbstractSolrTestCase {
+public class TestWordDelimiterFilter extends BaseTokenTestCase {
public String getSchemaFile() { return "solr/conf/schema.xml"; }
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
@@ -144,148 +144,74 @@
// test that subwords and catenated subwords have
// the correct offsets.
WordDelimiterFilter wdf = new WordDelimiterFilter(
- new TokenStream() {
- Token t;
- public Token next() throws IOException {
- if (t!=null) return null;
- t = new Token("foo-bar", 5, 12); // actual
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
1,1,0,0,1,1,0);
- int i=0;
- for(Token t; (t=wdf.next())!=null;) {
- String termText = new String(t.termBuffer(), 0, t.termLength());
- if (termText.equals("foo")) {
- assertEquals(5, t.startOffset());
- assertEquals(8, t.endOffset());
- i++;
- }
- if (termText.equals("bar")) {
- assertEquals(9, t.startOffset());
- assertEquals(12, t.endOffset());
- i++;
- }
- if (termText.equals("foobar")) {
- assertEquals(5, t.startOffset());
- assertEquals(12, t.endOffset());
- i++;
- }
- }
- assertEquals(3,i); // make sure all 3 tokens were generated
+ assertTokenStreamContents(wdf,
+ new String[] { "foo", "bar", "foobar" },
+ new int[] { 5, 9, 5 },
+ new int[] { 8, 12, 12 });
- // test that if splitting or catenating a synonym, that the offsets
- // are not altered (they would be incorrect).
wdf = new WordDelimiterFilter(
- new TokenStream() {
- Token t;
- public Token next() throws IOException {
- if (t!=null) return null;
- t = new Token("foo-bar", 5, 6); // a synonym
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
1,1,0,0,1,1,0);
- for(Token t; (t=wdf.next())!=null;) {
- assertEquals(5, t.startOffset());
- assertEquals(6, t.endOffset());
- }
+
+ assertTokenStreamContents(wdf,
+ new String[] { "foo", "bar", "foobar" },
+ new int[] { 5, 5, 5 },
+ new int[] { 6, 6, 6 });
}
public void testOffsetChange() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
- new TokenStream() {
- Token t;
- public Token next() {
- if (t != null) return null;
- t = new Token("übelkeit)", 7, 16);
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
1,1,0,0,1,1,0
);
- Token t = wdf.next();
-
- assertNotNull(t);
- assertEquals("übelkeit", t.term());
- assertEquals(7, t.startOffset());
- assertEquals(15, t.endOffset());
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 7 },
+ new int[] { 15 });
}
public void testOffsetChange2() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
- new TokenStream() {
- Token t;
- public Token next() {
- if (t != null) return null;
- t = new Token("(übelkeit", 7, 17);
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
1,1,0,0,1,1,0
);
- Token t = wdf.next();
-
- assertNotNull(t);
- assertEquals("übelkeit", t.term());
- assertEquals(8, t.startOffset());
- assertEquals(17, t.endOffset());
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 8 },
+ new int[] { 17 });
}
public void testOffsetChange3() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
- new TokenStream() {
- Token t;
- public Token next() {
- if (t != null) return null;
- t = new Token("(übelkeit", 7, 16);
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
1,1,0,0,1,1,0
);
- Token t = wdf.next();
-
- assertNotNull(t);
- assertEquals("übelkeit", t.term());
- assertEquals(8, t.startOffset());
- assertEquals(16, t.endOffset());
+ assertTokenStreamContents(wdf,
+ new String[] { "übelkeit" },
+ new int[] { 8 },
+ new int[] { 16 });
}
public void testOffsetChange4() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
- new TokenStream() {
- private Token t;
- public Token next() {
- if (t != null) return null;
- t = new Token("(foo,bar)", 7, 16);
- return t;
- }
- },
+ new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
1,1,0,0,1,1,0
);
- Token t = wdf.next();
-
- assertNotNull(t);
- assertEquals("foo", t.term());
- assertEquals(8, t.startOffset());
- assertEquals(11, t.endOffset());
-
- t = wdf.next();
-
- assertNotNull(t);
- assertEquals("bar", t.term());
- assertEquals(12, t.startOffset());
- assertEquals(15, t.endOffset());
+ assertTokenStreamContents(wdf,
+ new String[] { "foo", "bar", "foobar"},
+ new int[] { 8, 12, 8 },
+ new int[] { 11, 15, 15 });
}
public void testAlphaNumericWords(){
@@ -338,24 +264,10 @@
public void doSplit(final String input, String... output) throws Exception {
- WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
- boolean done=false;
- @Override
- public Token next() throws IOException {
- if (done) return null;
- done = true;
- return new Token(input,0,input.length());
- }
- }
- ,1,1,0,0,0
- );
-
- for(String expected : output) {
- Token t = wdf.next();
- assertEquals(expected, t.term());
- }
-
- assertEquals(null, wdf.next());
+ WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+ new StringReader(input)), 1, 1, 0, 0, 0);
+
+ assertTokenStreamContents(wdf, output);
}
public void testSplits() throws Exception {
@@ -365,29 +277,38 @@
// non-space marking symbol shouldn't cause split
// this is an example in Thai
doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+ // possessive followed by delimiter
+ doSplit("test's'", "test");
+ // some russian upper and lowercase
+ doSplit("Роберт", "Роберт");
+ // now cause a split (russian camelCase)
+ doSplit("РобЕрт", "Роб", "Ерт");
+ // a composed titlecase character, don't split
+ doSplit("aǅungla", "aǅungla");
+
+ // a modifier letter, don't split
+ doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
+
+ // enclosing mark, don't split
+ doSplit("۞test", "۞test");
+
+ // combining spacing mark (the virama), don't split
+ doSplit("हिन्दी", "हिन्दी");
+
+ // don't split non-ascii digits
+ doSplit("١٢٣٤", "١٢٣٤");
+
+ // don't split supplementaries into unpaired surrogates
+ doSplit("𠀀𠀀", "𠀀𠀀");
}
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
- WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
- boolean done=false;
- @Override
- public Token next() throws IOException {
- if (done) return null;
- done = true;
- return new Token(input,0,input.length());
- }
- }
- ,1,1,0,0,0,1,0,1,stemPossessive,null
- );
-
- for(String expected : output) {
- Token t = wdf.next();
- assertEquals(expected, t.term());
- }
+ WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+ new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
- assertEquals(null, wdf.next());
+ assertTokenStreamContents(wdf, output);
}
/*
@@ -485,25 +406,4 @@
new int[] { 6, 14, 19 },
new int[] { 1, 11, 1 });
}
-
- private void assertAnalyzesTo(Analyzer a, String input, String[] output,
- int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
-
- TokenStream ts = a.tokenStream("dummy", new StringReader(input));
- TermAttribute termAtt = (TermAttribute) ts
- .getAttribute(TermAttribute.class);
- OffsetAttribute offsetAtt = (OffsetAttribute) ts
- .getAttribute(OffsetAttribute.class);
- PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
- .getAttribute(PositionIncrementAttribute.class);
- for (int i = 0; i < output.length; i++) {
- assertTrue(ts.incrementToken());
- assertEquals(output[i], termAtt.term());
- assertEquals(startOffsets[i], offsetAtt.startOffset());
- assertEquals(endOffsets[i], offsetAtt.endOffset());
- assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
- }
- assertFalse(ts.incrementToken());
- ts.close();
- }
}
Added: lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the DictionaryCompound factory
+soft
+ball
+team
Added: lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of articles for testing the French Elision filter.
+# Requiring a text file is a bit weird here...
+l
+m
+t
+qu
+n
+s
+j