You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/08/22 20:06:31 UTC
svn commit: r1376162 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/
lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/
lucene/analysis/common/src/java/...
Author: rmuir
Date: Wed Aug 22 18:06:30 2012
New Revision: 1376162
URL: http://svn.apache.org/viewvc?rev=1376162&view=rev
Log:
LUCENE-4321: Don't extend the trappy/broken FilterReader and simplify CharFilter so subclasses are correct readers
Added:
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
- copied unchanged from r1376158, lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/fa/TestPersianCharFilter.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Aug 22 18:06:30 2012
@@ -61,6 +61,14 @@ API Changes
only exists so that this statistic can be accessed for Lucene 3.x
segments, which don't support Terms.size(). (Uwe Schindler, Robert Muir)
+* LUCENE-4321: Change CharFilter to extend Reader directly, as FilterReader
+ overdelegates (read(), read(char[], int, int), skip, etc). This made it
+ hard to implement CharFilters that were correct. Instead only close() is
+ delegated by default: read(char[], int, int) and correct(int) are abstract
+ so that it's obvious which methods you should implement. The protected
+ inner Reader is 'input' like CharFilter in the 3.x series, instead of 'in'.
+ (Dawid Weiss, Uwe Schindler, Robert Muir)
+
Bug Fixes
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java Wed Aug 22 18:06:30 2012
@@ -67,8 +67,8 @@ public class MappingCharFilter extends B
@Override
public void reset() throws IOException {
- super.reset();
- buffer.reset(in);
+ input.reset();
+ buffer.reset(input);
replacement = null;
inputOff = 0;
}
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java Wed Aug 22 18:06:30 2012
@@ -34,7 +34,7 @@ public class PersianCharFilter extends C
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
- final int charsRead = super.read(cbuf, off, len);
+ final int charsRead = input.read(cbuf, off, len);
if (charsRead > 0) {
final int end = off + charsRead;
while (off < end) {
@@ -45,6 +45,17 @@ public class PersianCharFilter extends C
}
return charsRead;
}
+
+ // optimized impl: some other charfilters consume with read()
+ @Override
+ public int read() throws IOException {
+ int ch = input.read();
+ if (ch == '\u200C') {
+ return ' ';
+ } else {
+ return ch;
+ }
+ }
@Override
protected int correct(int currentOff) {
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java Wed Aug 22 18:06:30 2012
@@ -80,7 +80,7 @@ public class PatternReplaceCharFilter ex
private void fill() throws IOException {
StringBuilder buffered = new StringBuilder();
char [] temp = new char [1024];
- for (int cnt = in.read(temp); cnt > 0; cnt = in.read(temp)) {
+ for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) {
buffered.append(temp, 0, cnt);
}
transformedInput = new StringReader(processPattern(buffered).toString());
Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Wed Aug 22 18:06:30 2012
@@ -782,31 +782,51 @@ public class TestRandomChains extends Ba
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
readSomething = true;
- return in.read(cbuf, off, len);
+ return input.read(cbuf, off, len);
}
@Override
public int read() throws IOException {
readSomething = true;
- return in.read();
+ return input.read();
}
@Override
public int read(CharBuffer target) throws IOException {
readSomething = true;
- return in.read(target);
+ return input.read(target);
}
@Override
public int read(char[] cbuf) throws IOException {
readSomething = true;
- return in.read(cbuf);
+ return input.read(cbuf);
}
@Override
public long skip(long n) throws IOException {
readSomething = true;
- return in.skip(n);
+ return input.skip(n);
+ }
+
+ @Override
+ public void mark(int readAheadLimit) throws IOException {
+ input.mark(readAheadLimit);
+ }
+
+ @Override
+ public boolean markSupported() {
+ return input.markSupported();
+ }
+
+ @Override
+ public boolean ready() throws IOException {
+ return input.ready();
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
}
}
Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/analysis/CharFilter.java Wed Aug 22 18:06:30 2012
@@ -17,7 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
-import java.io.FilterReader;
+import java.io.IOException;
import java.io.Reader;
/**
@@ -25,17 +25,39 @@ import java.io.Reader;
* They can be used as {@link java.io.Reader} with additional offset
* correction. {@link Tokenizer}s will automatically use {@link #correctOffset}
* if a CharFilter subclass is used.
+ * <p>
+ * This class is abstract: at a minimum you must implement {@link #read(char[], int, int)},
+ * transforming the input in some way from {@link #input}, and {@link #correct(int)}
+ * to adjust the offsets to match the originals.
+ * <p>
+ * You can optionally provide more efficient implementations of additional methods
+ * like {@link #read()}, {@link #read(char[])}, {@link #read(java.nio.CharBuffer)},
+ * but this is not required.
*/
-public abstract class CharFilter extends FilterReader {
+// the way java.io.FilterReader should work!
+public abstract class CharFilter extends Reader {
+ /**
+ * The underlying character-input stream.
+ */
+ protected final Reader input;
/**
* Create a new CharFilter wrapping the provided reader.
- * @param in a Reader, can also be a CharFilter for chaining.
+ * @param input a Reader, can also be a CharFilter for chaining.
*/
- public CharFilter(Reader in) {
- super(in);
+ public CharFilter(Reader input) {
+ super(input);
+ this.input = input;
}
+ /**
+ * Closes the underlying input stream.
+ */
+ @Override
+ public void close() throws IOException {
+ input.close();
+ }
+
/**
* Subclasses override to correct the current offset.
*
@@ -50,6 +72,6 @@ public abstract class CharFilter extends
*/
public final int correctOffset(int currentOff) {
final int corrected = correct(currentOff);
- return (in instanceof CharFilter) ? ((CharFilter) in).correctOffset(corrected) : corrected;
+ return (input instanceof CharFilter) ? ((CharFilter) input).correctOffset(corrected) : corrected;
}
}
Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/analysis/TestCharFilter.java Wed Aug 22 18:06:30 2012
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
+import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -51,6 +52,11 @@ public class TestCharFilter extends Luce
}
@Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len);
+ }
+
+ @Override
protected int correct(int currentOff) {
return currentOff + 1;
}
@@ -61,6 +67,11 @@ public class TestCharFilter extends Luce
protected CharFilter2(Reader in) {
super(in);
}
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ return input.read(cbuf, off, len);
+ }
@Override
protected int correct(int currentOff) {
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockCharFilter.java Wed Aug 22 18:06:30 2012
@@ -42,11 +42,6 @@ public class MockCharFilter extends Char
public MockCharFilter(Reader in) {
this(in, 0);
}
-
- @Override
- public void close() throws IOException {
- in.close();
- }
int currentOffset = -1;
int delta = 0;
@@ -66,7 +61,7 @@ public class MockCharFilter extends Char
}
// otherwise actually read one
- int ch = in.read();
+ int ch = input.read();
if (ch < 0)
return ch;
Modified: lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java Wed Aug 22 18:06:30 2012
@@ -19,12 +19,16 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
+import com.carrotsearch.randomizedtesting.RandomizedContext;
+
/**
* Tokenizer for testing.
* <p>
@@ -77,6 +81,9 @@ public class MockTokenizer extends Token
private int lastOffset = 0; // only for asserting
private boolean enableChecks = true;
+ // evil: but we don't change the behavior with this random, we only switch up how we read
+ private final Random random = new Random(RandomizedContext.current().getRandom().nextLong());
+
public MockTokenizer(AttributeFactory factory, Reader input, CharacterRunAutomaton runAutomaton, boolean lowerCase, int maxTokenLength) {
super(factory, input);
this.runAutomaton = runAutomaton;
@@ -139,14 +146,14 @@ public class MockTokenizer extends Token
}
protected int readCodePoint() throws IOException {
- int ch = input.read();
+ int ch = readChar();
if (ch < 0) {
return ch;
} else {
assert !Character.isLowSurrogate((char) ch) : "unpaired low surrogate: " + Integer.toHexString(ch);
off++;
if (Character.isHighSurrogate((char) ch)) {
- int ch2 = input.read();
+ int ch2 = readChar();
if (ch2 >= 0) {
off++;
assert Character.isLowSurrogate((char) ch2) : "unpaired high surrogate: " + Integer.toHexString(ch) + ", followed by: " + Integer.toHexString(ch2);
@@ -158,6 +165,33 @@ public class MockTokenizer extends Token
return ch;
}
}
+
+ protected int readChar() throws IOException {
+ switch(random.nextInt(10)) {
+ case 0: {
+ // read(char[])
+ char c[] = new char[1];
+ int ret = input.read(c);
+ return ret < 0 ? ret : c[0];
+ }
+ case 1: {
+ // read(char[], int, int)
+ char c[] = new char[2];
+ int ret = input.read(c, 1, 1);
+ return ret < 0 ? ret : c[1];
+ }
+ case 2: {
+ // read(CharBuffer)
+ char c[] = new char[1];
+ CharBuffer cb = CharBuffer.wrap(c);
+ int ret = input.read(cb);
+ return ret < 0 ? ret : c[0];
+ }
+ default:
+ // read()
+ return input.read();
+ }
+ }
protected boolean isTokenChar(int c) {
state = runAutomaton.step(state, c);
Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java?rev=1376162&r1=1376161&r2=1376162&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java Wed Aug 22 18:06:30 2012
@@ -103,7 +103,7 @@ public class LegacyHTMLStripCharFilter e
return ch;
}
numRead++;
- return in.read();
+ return input.read();
}
private int nextSkipWS() throws IOException {
@@ -118,7 +118,7 @@ public class LegacyHTMLStripCharFilter e
return pushed.charAt(len-1);
}
numRead++;
- int ch = in.read();
+ int ch = input.read();
push(ch);
return ch;
}
@@ -180,11 +180,11 @@ public class LegacyHTMLStripCharFilter e
private void saveState() throws IOException {
lastMark = numRead;
- in.mark(readAheadLimit);
+ input.mark(readAheadLimit);
}
private void restoreState() throws IOException {
- in.reset();
+ input.reset();
pushed.setLength(0);
}