Posted to solr-commits@lucene.apache.org by ma...@apache.org on 2009/12/21 14:53:52 UTC

svn commit: r892821 [3/3] - in /lucene/solr/trunk: ./ src/test/org/apache/solr/analysis/ src/test/test-files/solr/conf/

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestTrimFilter.java Mon Dec 21 13:53:50 2009
@@ -17,12 +17,19 @@
 
 package org.apache.solr.analysis;
 
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-
-import java.util.List;
-
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * @version $Id:$
@@ -35,46 +42,75 @@
     char[] ccc = "cCc".toCharArray();
     char[] whitespace = "   ".toCharArray();
     char[] empty = "".toCharArray();
-    TokenStream ts = new TrimFilter
-            (new IterTokenStream(new Token(a, 0, a.length, 1, 5),
+    TrimFilterFactory factory = new TrimFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("updateOffsets", "false");
+    factory.init(args);
+    TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
                     new Token(b, 0, b.length, 6, 10),
                     new Token(ccc, 0, ccc.length, 11, 15),
                     new Token(whitespace, 0, whitespace.length, 16, 20),
-                    new Token(empty, 0, empty.length, 21, 21)), false);
+                    new Token(empty, 0, empty.length, 21, 21)));
 
-    TermAttribute token;
-    assertTrue(ts.incrementToken());
-    token = (TermAttribute) ts.getAttribute(TermAttribute.class);
-    assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    assertTrue(ts.incrementToken());
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    assertFalse(ts.incrementToken());
+    assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
 
     a = " a".toCharArray();
     b = "b ".toCharArray();
     ccc = " c ".toCharArray();
     whitespace = "   ".toCharArray();
-    ts = new TrimFilter(new IterTokenStream(
+    factory = new TrimFilterFactory();
+    args = new HashMap<String,String>();
+    args.put("updateOffsets", "true");
+    factory.init(args);
+    ts = factory.create(new IterTokenStream(
             new Token(a, 0, a.length, 0, 2),
             new Token(b, 0, b.length, 0, 2),
             new Token(ccc, 0, ccc.length, 0, 3),
-            new Token(whitespace, 0, whitespace.length, 0, 3)), true);
+            new Token(whitespace, 0, whitespace.length, 0, 3)));
+    
+    assertTokenStreamContents(ts, 
+        new String[] { "a", "b", "c", "" },
+        new int[] { 1, 0, 1, 3 },
+        new int[] { 2, 1, 2, 3 },
+        new int[] { 1, 1, 1, 1 });
+  }
+  
+  /**
+   * @deprecated does not support custom attributes
+   */
+  private static class IterTokenStream extends TokenStream {
+    final Token tokens[];
+    int index = 0;
+    TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     
-    List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
-    List<Token> real = getTokens(ts);
-    for (Token t : expect) {
-      System.out.println("TEST:" + t);
+    public IterTokenStream(Token... tokens) {
+      super();
+      this.tokens = tokens;
     }
-    for (Token t : real) {
-      System.out.println("REAL:" + t);
+    
+    public IterTokenStream(Collection<Token> tokens) {
+      this(tokens.toArray(new Token[tokens.size()]));
+    }
+    
+    public boolean incrementToken() throws IOException {
+      if (index >= tokens.length)
+        return false;
+      else {
+        clearAttributes();
+        Token token = tokens[index++];
+        termAtt.setTermBuffer(token.term());
+        offsetAtt.setOffset(token.startOffset(), token.endOffset());
+        posIncAtt.setPositionIncrement(token.getPositionIncrement());
+        flagsAtt.setFlags(token.getFlags());
+        typeAtt.setType(token.type());
+        payloadAtt.setPayload(token.getPayload());
+        return true;
+      }
     }
-    assertTokEqualOff(expect, real);
   }
-
 }
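
The rewritten TrimFilter test drives the filter through TrimFilterFactory (exercising the updateOffsets argument both ways) and collapses the manual incrementToken()/getAttribute() loops into the assertTokenStreamContents helper inherited from BaseTokenTestCase. A minimal sketch of what such a helper checks, assuming the 2.9-era attribute API; the committed helper may verify more attributes than shown here:

    // Hypothetical outline of an assertTokenStreamContents-style check.
    // The name, signature, and coverage are assumptions, not the committed code.
    static void checkTokenStreamContents(TokenStream ts, String[] terms,
        int[] startOffsets, int[] endOffsets) throws IOException {
      TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
      OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
      for (int i = 0; i < terms.length; i++) {
        // every expected token must be produced, in order
        assertTrue("token " + i + " exists", ts.incrementToken());
        assertEquals(terms[i], termAtt.term());
        if (startOffsets != null) assertEquals(startOffsets[i], offsetAtt.startOffset());
        if (endOffsets != null) assertEquals(endOffsets[i], offsetAtt.endOffset());
      }
      // and nothing beyond the expected tokens
      assertFalse("end of stream", ts.incrementToken());
      ts.close();
    }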

Modified: lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=892821&r1=892820&r2=892821&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Mon Dec 21 13:53:50 2009
@@ -17,14 +17,14 @@
 
 package org.apache.solr.analysis;
 
-import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
@@ -37,7 +37,7 @@
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
  */
-public class TestWordDelimiterFilter extends AbstractSolrTestCase {
+public class TestWordDelimiterFilter extends BaseTokenTestCase {
   public String getSchemaFile() { return "solr/conf/schema.xml"; }
   public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
 
@@ -144,148 +144,74 @@
     // test that subwords and catenated subwords have
     // the correct offsets.
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-            new TokenStream() {
-              Token t;
-              public Token next() throws IOException {
-                if (t!=null) return null;
-                t = new Token("foo-bar", 5, 12);  // actual
-                return t;
-              }
-            },
+            new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
     1,1,0,0,1,1,0);
 
-    int i=0;
-    for(Token t; (t=wdf.next())!=null;) {
-      String termText = new String(t.termBuffer(), 0, t.termLength());
-      if (termText.equals("foo")) {
-        assertEquals(5, t.startOffset());
-        assertEquals(8, t.endOffset());
-        i++;
-      }
-      if (termText.equals("bar")) {
-        assertEquals(9, t.startOffset());
-        assertEquals(12, t.endOffset());
-        i++;
-      }
-      if (termText.equals("foobar")) {
-        assertEquals(5, t.startOffset());
-        assertEquals(12, t.endOffset());
-        i++;
-      }
-    }
-    assertEquals(3,i); // make sure all 3 tokens were generated
+    assertTokenStreamContents(wdf, 
+        new String[] { "foo", "bar", "foobar" },
+        new int[] { 5, 9, 5 }, 
+        new int[] { 8, 12, 12 });
 
-    // test that if splitting or catenating a synonym, that the offsets
-    // are not altered (they would be incorrect).
     wdf = new WordDelimiterFilter(
-            new TokenStream() {
-              Token t;
-              public Token next() throws IOException {
-                if (t!=null) return null;
-                t = new Token("foo-bar", 5, 6);  // a synonym
-                return t;
-              }
-            },
+            new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
     1,1,0,0,1,1,0);
-    for(Token t; (t=wdf.next())!=null;) {
-      assertEquals(5, t.startOffset());
-      assertEquals(6, t.endOffset());
-    }
+    
+    assertTokenStreamContents(wdf,
+        new String[] { "foo", "bar", "foobar" },
+        new int[] { 5, 5, 5 },
+        new int[] { 6, 6, 6 });
   }
   
   public void testOffsetChange() throws Exception
   {
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new TokenStream() {
-        Token t;
-        public Token next() {
-         if (t != null) return null;
-         t = new Token("übelkeit)", 7, 16);
-         return t;
-        }
-      },
+      new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
       1,1,0,0,1,1,0
     );
     
-    Token t = wdf.next();
-    
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(7, t.startOffset());
-    assertEquals(15, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 7 },
+        new int[] { 15 });
   }
   
   public void testOffsetChange2() throws Exception
   {
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new TokenStream() {
-        Token t;
-        public Token next() {
-         if (t != null) return null;
-         t = new Token("(übelkeit", 7, 17);
-         return t;
-        }
-      },
+      new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
       1,1,0,0,1,1,0
     );
     
-    Token t = wdf.next();
-    
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(17, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 8 },
+        new int[] { 17 });
   }
   
   public void testOffsetChange3() throws Exception
   {
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new TokenStream() {
-        Token t;
-        public Token next() {
-         if (t != null) return null;
-         t = new Token("(übelkeit", 7, 16);
-         return t;
-        }
-      },
+      new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
       1,1,0,0,1,1,0
     );
     
-    Token t = wdf.next();
-    
-    assertNotNull(t);
-    assertEquals("übelkeit", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(16, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "übelkeit" },
+        new int[] { 8 },
+        new int[] { 16 });
   }
   
   public void testOffsetChange4() throws Exception
   {
     WordDelimiterFilter wdf = new WordDelimiterFilter(
-      new TokenStream() {
-        private Token t;
-        public Token next() {
-         if (t != null) return null;
-         t = new Token("(foo,bar)", 7, 16);
-         return t;
-        }
-      },
+      new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
       1,1,0,0,1,1,0
     );
     
-    Token t = wdf.next();
-    
-    assertNotNull(t);
-    assertEquals("foo", t.term());
-    assertEquals(8, t.startOffset());
-    assertEquals(11, t.endOffset());
-    
-    t = wdf.next();
-    
-    assertNotNull(t);
-    assertEquals("bar", t.term());
-    assertEquals(12, t.startOffset());
-    assertEquals(15, t.endOffset());
+    assertTokenStreamContents(wdf,
+        new String[] { "foo", "bar", "foobar"},
+        new int[] { 8, 12, 8 },
+        new int[] { 11, 15, 15 });
   }
 
   public void testAlphaNumericWords(){
@@ -338,24 +264,10 @@
 
 
   public void doSplit(final String input, String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
-      boolean done=false;
-      @Override
-      public Token next() throws IOException {
-        if (done) return null;
-        done = true;
-        return new Token(input,0,input.length());
-      }
-    }
-            ,1,1,0,0,0
-    );
-
-    for(String expected : output) {
-      Token t = wdf.next();
-      assertEquals(expected, t.term());
-    }
-
-    assertEquals(null, wdf.next());
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+        new StringReader(input)), 1, 1, 0, 0, 0);
+    
+    assertTokenStreamContents(wdf, output);
   }
 
   public void testSplits() throws Exception {
@@ -365,29 +277,38 @@
     // non-space marking symbol shouldn't cause split
     // this is an example in Thai    
     doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+    // possessive followed by delimiter
+    doSplit("test's'", "test");
 
+    // some russian upper and lowercase
+    doSplit("Роберт", "Роберт");
+    // now cause a split (russian camelCase)
+    doSplit("РобЕрт", "Роб", "Ерт");
 
+    // a composed titlecase character, don't split
+    doSplit("aDžungla", "aDžungla");
+    
+    // a modifier letter, don't split
+    doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
+    
+    // enclosing mark, don't split
+    doSplit("۞test", "۞test");
+    
+    // combining spacing mark (the virama), don't split
+    doSplit("हिन्दी", "हिन्दी");
+    
+    // don't split non-ascii digits
+    doSplit("١٢٣٤", "١٢٣٤");
+    
+    // don't split supplementaries into unpaired surrogates
+    doSplit("𠀀𠀀", "𠀀𠀀");
   }
   
   public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
-      boolean done=false;
-      @Override
-      public Token next() throws IOException {
-        if (done) return null;
-        done = true;
-        return new Token(input,0,input.length());
-      }
-    }
-            ,1,1,0,0,0,1,0,1,stemPossessive,null
-    );
-
-    for(String expected : output) {
-      Token t = wdf.next();
-      assertEquals(expected, t.term());
-    }
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+        new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
 
-    assertEquals(null, wdf.next());
+    assertTokenStreamContents(wdf, output);
   }
   
   /*
@@ -485,25 +406,4 @@
         new int[] { 6, 14, 19 },
         new int[] { 1, 11, 1 });
   }
-
-  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
-      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
-
-    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    TermAttribute termAtt = (TermAttribute) ts
-        .getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = (OffsetAttribute) ts
-        .getAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
-        .getAttribute(PositionIncrementAttribute.class);
-    for (int i = 0; i < output.length; i++) {
-      assertTrue(ts.incrementToken());
-      assertEquals(output[i], termAtt.term());
-      assertEquals(startOffsets[i], offsetAtt.startOffset());
-      assertEquals(endOffsets[i], offsetAtt.endOffset());
-      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
-    }
-    assertFalse(ts.incrementToken());
-    ts.close();
-  }
 }
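
The refactored doSplit and doSplitPossessive helpers feed WordDelimiterFilter from a KeywordTokenizer, which emits the entire reader contents as a single token, matching the behavior of the anonymous one-shot TokenStream implementations they replace. A small usage sketch under the same 2.9-era attribute API:

    // KeywordTokenizer yields exactly one token covering the whole input,
    // so WordDelimiterFilter sees the test string as a single source token.
    TokenStream ts = new KeywordTokenizer(new StringReader("foo-bar"));
    TermAttribute term = (TermAttribute) ts.getAttribute(TermAttribute.class);
    assertTrue(ts.incrementToken());
    assertEquals("foo-bar", term.term());
    assertFalse(ts.incrementToken());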

Added: lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/compoundDictionary.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the DictionaryCompound factory
+soft
+ball
+team
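
A word list like this is typically wired into an analyzer chain through the dictionary-based compound-word filter factory. A hypothetical schema.xml fragment for illustration only; the factory and parameter names are assumptions based on the Solr 1.4 factories and are not part of this commit:

    <fieldType name="text_compound" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <!-- decompounds e.g. "softballteam" using the word list above -->
        <filter class="solr.DictionaryCompoundWordTokenFilterFactory"
                dictionary="compoundDictionary.txt"/>
      </analyzer>
    </fieldType>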

Added: lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt?rev=892821&view=auto
==============================================================================
--- lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt (added)
+++ lucene/solr/trunk/src/test/test-files/solr/conf/frenchArticles.txt Mon Dec 21 13:53:50 2009
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of articles for testing the French Elision filter.
+# Requiring a text file is a bit weird here...
+l
+m
+t
+qu
+n
+s
+j
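
Similarly, an article list like this one would be consumed by the Elision filter, which strips leading French contractions such as l' and qu' before a following word. A hypothetical schema.xml fragment for illustration only; the factory name and the articles parameter are assumptions based on the Solr 1.4 factories and are not part of this commit:

    <fieldType name="text_fr" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- e.g. "l'avion" -> "avion", using the article list above -->
        <filter class="solr.ElisionFilterFactory" articles="frenchArticles.txt"/>
      </analyzer>
    </fieldType>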