You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/08/22 20:06:31 UTC

svn commit: r1376162 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/ lucene/analysis/common/src/java/...

Author: rmuir
Date: Wed Aug 22 18:06:30 2012
New Revision: 1376162

URL: http://svn.apache.org/viewvc?rev=1376162&view=rev
Log:
LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers

Added:
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
      - copied unchanged from r1376158, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
    lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Aug 22 18:06:30 2012
@@ -61,6 +61,14 @@ API Changes
   only exists so that this statistic can be accessed for Lucene 3.x 
   segments, which don't support Terms.size().  (Uwe Schindler, Robert Muir)
 
+* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
+  overdelegates (read(), read(char[], int, int), skip, etc). This made it
+  hard to implement CharFilters that were correct. Instead only close() is
+  delegated by default: read(char[], int, int) and correct(int) are abstract
+  so that its obvious which methods you should implement.  The protected 
+  inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.  
+  (Dawid Weiss, Uwe Schindler, Robert Muir)
+
 Bug Fixes
 
 * LUCENE-4297: BooleanScorer2 would multiply the coord() factor

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java Wed Aug 22 18:06:30 2012
@@ -67,8 +67,8 @@ public class MappingCharFilter extends B
 
   @Override
   public void reset() throws IOException {
-    super.reset();
-    buffer.reset(in);
+    input.reset();
+    buffer.reset(input);
     replacement = null;
     inputOff = 0;
   }

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java Wed Aug 22 18:06:30 2012
@@ -34,7 +34,7 @@ public class PersianCharFilter extends C
   
   @Override
   public int read(char[] cbuf, int off, int len) throws IOException {
-    final int charsRead = super.read(cbuf, off, len);
+    final int charsRead = input.read(cbuf, off, len);
     if (charsRead > 0) {
       final int end = off + charsRead;
       while (off < end) {
@@ -45,6 +45,17 @@ public class PersianCharFilter extends C
     }
     return charsRead;
   }
+  
+  // optimized impl: some other charfilters consume with read()
+  @Override
+  public int read() throws IOException {
+    int ch = input.read();
+    if (ch == '\u200C') {
+      return ' ';
+    } else {
+      return ch;
+    }
+  }
 
   @Override
   protected int correct(int currentOff) {

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java Wed Aug 22 18:06:30 2012
@@ -80,7 +80,7 @@ public class PatternReplaceCharFilter ex
   private void fill() throws IOException {
     StringBuilder buffered = new StringBuilder();
     char [] temp = new char [1024];
-    for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
+    for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
       buffered.append(temp, 0, cnt);
     }
     transformedInput = new StringReader(processPattern(buffered).toString());

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Wed Aug 22 18:06:30 2012
@@ -782,31 +782,51 @@ public class TestRandomChains extends Ba
     @Override
     public int read(char[] cbuf, int off, int len) throws IOException {
       readSomething = true;
-      return in.read(cbuf, off, len);
+      return input.read(cbuf, off, len);
     }
 
     @Override
     public int read() throws IOException {
       readSomething = true;
-      return in.read();
+      return input.read();
     }
 
     @Override
     public int read(CharBuffer target) throws IOException {
       readSomething = true;
-      return in.read(target);
+      return input.read(target);
     }
 
     @Override
     public int read(char[] cbuf) throws IOException {
       readSomething = true;
-      return in.read(cbuf);
+      return input.read(cbuf);
     }
 
     @Override
     public long skip(long n) throws IOException {
       readSomething = true;
-      return in.skip(n);
+      return input.skip(n);
+    }
+
+    @Override
+    public void mark(int readAheadLimit) throws IOException {
+      input.mark(readAheadLimit);
+    }
+
+    @Override
+    public boolean markSupported() {
+      return input.markSupported();
+    }
+
+    @Override
+    public boolean ready() throws IOException {
+      return input.ready();
+    }
+
+    @Override
+    public void reset() throws IOException {
+      input.reset();
     }
   }
   

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java Wed Aug 22 18:06:30 2012
@@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
-import java.io.FilterReader;
+import java.io.IOException;
 import java.io.Reader;
 
 /**
@@ -25,17 +25,39 @@ import java.io.Reader;
  * They can be used as {@link java.io.Reader} with additional offset
  * correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
  * if a CharFilter subclass is used.
+ * <p>
+ * This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
+ * transforming the input in some way from {@link #input}, and {@link #correct(int)}
+ * to adjust the offsets to match the originals.
+ * <p>
+ * You can optionally provide more efficient implementations of additional methods 
+ * like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
+ * but this is not required.
  */
-public abstract class CharFilter extends FilterReader {
+// the way java.io.FilterReader should work!
+public abstract class CharFilter extends Reader {
+  /** 
+   * The underlying character-input stream. 
+   */
+  protected final Reader input;
 
   /**
    * Create a new CharFilter wrapping the provided reader.
-   * @param in a Reader, can also be a CharFilter for chaining.
+   * @param input a Reader, can also be a CharFilter for chaining.
    */
-  public CharFilter(Reader in) {
-    super(in);
+  public CharFilter(Reader input) {
+    super(input);
+    this.input = input;
   }
   
+  /** 
+   * Closes the underlying input stream.
+   */
+  @Override
+  public void close() throws IOException {
+    input.close();
+  }
+
   /**
    * Subclasses override to correct the current offset.
    *
@@ -50,6 +72,6 @@ public abstract class CharFilter extends
    */
   public final int correctOffset(int currentOff) {
     final int corrected = correct(currentOff);
-    return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
+    return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
   }
 }

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java Wed Aug 22 18:06:30 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 
@@ -51,6 +52,11 @@ public class TestCharFilter extends Luce
     }
 
     @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      return input.read(cbuf, off, len);
+    }
+
+    @Override
     protected int correct(int currentOff) {
       return currentOff + 1;
     }
@@ -61,6 +67,11 @@ public class TestCharFilter extends Luce
     protected CharFilter2(Reader in) {
       super(in);
     }
+    
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      return input.read(cbuf, off, len);
+    }
 
     @Override
     protected int correct(int currentOff) {

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java Wed Aug 22 18:06:30 2012
@@ -42,11 +42,6 @@ public class MockCharFilter extends Char
   public MockCharFilter(Reader in) {
     this(in, 0);
   }
-
-  @Override
-  public void close() throws IOException {
-    in.close();
-  }
   
   int currentOffset = -1;
   int delta = 0;
@@ -66,7 +61,7 @@ public class MockCharFilter extends Char
     }
     
     // otherwise actually read one    
-    int ch = in.read();
+    int ch = input.read();
     if (ch < 0)
       return ch;
     

Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java Wed Aug 22 18:06:30 2012
@@ -19,12 +19,16 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.Random;
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 
+import com.carrotsearch.randomizedtesting.RandomizedContext;
+
 /**
  * Tokenizer for testing.
  * <p>
@@ -77,6 +81,9 @@ public class MockTokenizer extends Token
   private int lastOffset = 0; // only for asserting
   private boolean enableChecks = true;
   
+  // evil: but we don't change the behavior with this random, we only switch up how we read
+  private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
+  
   public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
     super(factory, input);
     this.runAutomaton = runAutomaton;
@@ -139,14 +146,14 @@ public class MockTokenizer extends Token
   }
 
   protected int readCodePoint() throws IOException {
-    int ch = input.read();
+    int ch = readChar();
     if (ch < 0) {
       return ch;
     } else {
       assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
       off++;
       if (Character.isHighSurrogate((char) ch)) {
-        int ch2 = input.read();
+        int ch2 = readChar();
         if (ch2 >= 0) {
           off++;
           assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
@@ -158,6 +165,33 @@ public class MockTokenizer extends Token
       return ch;
     }
   }
+  
+  protected int readChar() throws IOException {
+    switch(random.nextInt(10)) {
+      case 0: {
+        // read(char[])
+        char c[] = new char[1];
+        int ret = input.read(c);
+        return ret < 0 ? ret : c[0];
+      }
+      case 1: {
+        // read(char[], int, int)
+        char c[] = new char[2];
+        int ret = input.read(c, 1, 1);
+        return ret < 0 ? ret : c[1];
+      }
+      case 2: {
+        // read(CharBuffer)
+        char c[] = new char[1];
+        CharBuffer cb = CharBuffer.wrap(c);
+        int ret = input.read(cb);
+        return ret < 0 ? ret : c[0];
+      }
+      default: 
+        // read()
+        return input.read();
+    }
+  }
 
   protected boolean isTokenChar(int c) {
     state = runAutomaton.step(state, c);

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java Wed Aug 22 18:06:30 2012
@@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter e
       return ch;
     }
     numRead++;
-    return in.read();
+    return input.read();
   }
 
   private int nextSkipWS() throws IOException {
@@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter e
       return pushed.charAt(len-1);
     }
     numRead++;
-    int ch = in.read();
+    int ch = input.read();
     push(ch);
     return ch;
   }
@@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter e
 
   private void saveState() throws IOException {
     lastMark = numRead;
-    in.mark(readAheadLimit);
+    input.mark(readAheadLimit);
   }
 
   private void restoreState() throws IOException {
-    in.reset();
+    input.reset();
     pushed.setLength(0);
   }