You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/10/11 08:02:39 UTC

svn commit: r1531202 - in /lucene/dev/branches/lucene_solr_4_5: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/ lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/ lucene/analysis/comm...

Author: rmuir
Date: Fri Oct 11 06:02:38 2013
New Revision: 1531202

URL: http://svn.apache.org/r1531202
Log:
LUCENE-5269: Fix NGramTokenFilter length filtering

Added:
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
      - copied unchanged from r1531195, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilter.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java
      - copied unchanged from r1531195, lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/CodepointCountFilterFactory.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java
      - copied unchanged from r1531195, lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilter.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java
      - copied unchanged from r1531195, lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestCodepointCountFilterFactory.java
Modified:
    lucene/dev/branches/lucene_solr_4_5/   (props changed)
    lucene/dev/branches/lucene_solr_4_5/lucene/   (props changed)
    lucene/dev/branches/lucene_solr_4_5/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/   (props changed)
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
    lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/CHANGES.txt?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/CHANGES.txt Fri Oct 11 06:02:38 2013
@@ -30,6 +30,50 @@ Bug Fixes
   terms were present in the query and the high-frequent operator was set
   to SHOULD. (Simon Willnauer)
 
+* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
+  unicode characters incorrectly. Adds CodepointCountFilter.
+  (Mike McCandless, Robert Muir)
+
+API Changes:
+
+* LUCENE-5222: Add SortField.needsScores(). Previously it was not possible
+  for a custom Sort that makes use of the relevance score to work correctly
+  with IndexSearcher when an ExecutorService is specified.
+  (Ryan Ernst, Mike McCandless, Robert Muir)
+
+Optimizations
+
+* LUCENE-5225: The ToParentBlockJoinQuery only keeps track of the child
+  doc ids and child scores if the ToParentBlockJoinCollector is used.
+  (Martijn van Groningen)
+
+Documentation
+
+* LUCENE-5211: Better javadocs and error checking of 'format' option in 
+  StopFilterFactory, as well as comments in all snowball formatted files
+  about specifying format option.  (hossman)
+
+Changes in backwards compatibility policy
+
+* LUCENE-5235: Sub classes of Tokenizer have to call super.reset()
+  when implementing reset(). Otherwise the consumer will get an
+  IllegalStateException because the Reader is not correctly assigned.
+  It is important to never change the "input" field on Tokenizer
+  without using setReader(). The "input" field must not be used
+  outside reset(), incrementToken(), or end() - especially not in
+  the constructor.  (Uwe Schindler, Robert Muir)
+
+* LUCENE-5204: Directory doesn't have default implementations for
+  LockFactory-related methods, which have been moved to BaseDirectory. If you
+  had a custom Directory implementation that extended Directory, you need to
+  extend BaseDirectory instead. (Adrien Grand)
+
+Build
+
+* LUCENE-5249, LUCENE-5257: All Lucene/Solr modules should use the same
+  dependency versions. (Steve Rowe)
+
+>>>>>>> .merge-right.r1531195
 ======================= Lucene 4.5.0 =======================
 
 New features

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java Fri Oct 11 06:02:38 2013
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
@@ -81,7 +81,7 @@ public final class NGramTokenFilter exte
    * @param maxGram the largest n-gram to generate
    */
   public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
-    super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
+    super(new CodepointCountFilter(version, input, minGram, Integer.MAX_VALUE));
     this.version = version;
     this.charUtils = version.onOrAfter(Version.LUCENE_44)
         ? CharacterUtils.getInstance(version)

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory Fri Oct 11 06:02:38 2013
@@ -56,6 +56,7 @@ org.apache.lucene.analysis.it.ItalianLig
 org.apache.lucene.analysis.lv.LatvianStemFilterFactory
 org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory
 org.apache.lucene.analysis.miscellaneous.CapitalizationFilterFactory
+org.apache.lucene.analysis.miscellaneous.CodepointCountFilterFactory
 org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory
 org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestBugInSomething.java Fri Oct 11 06:02:38 2013
@@ -1,5 +1,6 @@
 package org.apache.lucene.analysis.core;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.nio.CharBuffer;
@@ -11,10 +12,14 @@ import org.apache.lucene.analysis.MockCh
 import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.MappingCharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 
 /*
@@ -195,4 +200,58 @@ public class TestBugInSomething extends 
       assertEquals("read(char[], int, int)", e.getMessage());
     }
   }
+  
+  // todo: test framework?
+  
+  static final class SopTokenFilter extends TokenFilter {
+
+    SopTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        System.out.println(input.getClass().getSimpleName() + "->" + this.reflectAsString(false));
+        return true;
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      System.out.println(input.getClass().getSimpleName() + ".end()");
+    }
+
+    @Override
+    public void close() throws IOException {
+      super.close();
+      System.out.println(input.getClass().getSimpleName() + ".close()");
+    }
+
+    @Override
+    public void reset() throws IOException {
+      super.reset();
+      System.out.println(input.getClass().getSimpleName() + ".reset()");
+    }
+  }
+  
+  // LUCENE-5269
+  public void testUnicodeShinglesAndNgrams() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 94);
+        //TokenStream stream = new SopTokenFilter(tokenizer);
+        TokenStream stream = new ShingleFilter(tokenizer, 5);
+        //stream = new SopTokenFilter(stream);
+        stream = new NGramTokenFilter(TEST_VERSION_CURRENT, stream, 55, 83);
+        //stream = new SopTokenFilter(stream);
+        return new TokenStreamComponents(tokenizer, stream);
+      }  
+    };
+    checkRandomData(random(), analyzer, 2000);
+  }
 }

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilterTest.java Fri Oct 11 06:02:38 2013
@@ -209,15 +209,20 @@ public class EdgeNGramTokenFilterTest ex
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, 
-            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 4));
-      }    
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+    
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, 
+            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }    
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
+    }
     
     Analyzer b = new Analyzer() {
       @Override

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/EdgeNGramTokenizerTest.java Fri Oct 11 06:02:38 2013
@@ -107,15 +107,20 @@ public class EdgeNGramTokenizerTest exte
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }    
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 8192, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }    
+      };
+      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 8192);
+    }
     
     Analyzer b = new Analyzer() {
       @Override

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenFilterTest.java Fri Oct 11 06:02:38 2013
@@ -144,15 +144,19 @@ public class NGramTokenFilterTest extend
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
-        return new TokenStreamComponents(tokenizer, 
-            new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, 2, 4));
-      }    
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+          return new TokenStreamComponents(tokenizer, 
+              new NGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
+        }    
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+    }
   }
   
   public void testEmptyTerm() throws Exception {

Modified: lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java?rev=1531202&r1=1531201&r2=1531202&view=diff
==============================================================================
--- lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java (original)
+++ lucene/dev/branches/lucene_solr_4_5/lucene/analysis/common/src/test/org/apache/lucene/analysis/ngram/NGramTokenizerTest.java Fri Oct 11 06:02:38 2013
@@ -107,15 +107,19 @@ public class NGramTokenizerTest extends 
   
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
-    Analyzer a = new Analyzer() {
-      @Override
-      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-        Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, 2, 4);
-        return new TokenStreamComponents(tokenizer, tokenizer);
-      }    
-    };
-    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER, 20, false, false);
-    checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
+    for (int i = 0; i < 10; i++) {
+      final int min = _TestUtil.nextInt(random(), 2, 10);
+      final int max = _TestUtil.nextInt(random(), min, 20);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer tokenizer = new NGramTokenizer(TEST_VERSION_CURRENT, reader, min, max);
+          return new TokenStreamComponents(tokenizer, tokenizer);
+        }    
+      };
+      checkRandomData(random(), a, 200*RANDOM_MULTIPLIER, 20);
+      checkRandomData(random(), a, 10*RANDOM_MULTIPLIER, 1027);
+    }
   }
 
   private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {