You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2018/03/04 16:26:10 UTC
lucene-solr:branch_7x: LUCENE-8191: if a tokenstream has broken
offsets, it's broken. IndexWriter always checks,
so a separate whitelist can't work
Repository: lucene-solr
Updated Branches:
refs/heads/branch_7x 9fdbc26ab -> 96cd9c5d6
LUCENE-8191: if a tokenstream has broken offsets, it's broken. IndexWriter always checks, so a separate whitelist can't work
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/96cd9c5d
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/96cd9c5d
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/96cd9c5d
Branch: refs/heads/branch_7x
Commit: 96cd9c5d6279d23da8e86c241310d8aaf69bdf12
Parents: 9fdbc26
Author: Robert Muir <rm...@apache.org>
Authored: Sun Mar 4 11:23:45 2018 -0500
Committer: Robert Muir <rm...@apache.org>
Committed: Sun Mar 4 11:24:25 2018 -0500
----------------------------------------------------------------------
.../lucene/analysis/core/TestRandomChains.java | 84 +++++---------------
.../lucene/analysis/ValidatingTokenFilter.java | 8 +-
2 files changed, 22 insertions(+), 70 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96cd9c5d/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 406addf..7839203 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -56,8 +56,6 @@ import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.CrankyTokenFilter;
-import org.apache.lucene.analysis.MockGraphTokenFilter;
-import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
@@ -147,15 +145,27 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return !((Boolean) args[2]); // args are broken if consumeAllTokens is false
});
for (Class<?> c : Arrays.<Class<?>>asList(
- // TODO: can we promote some of these to be only
- // offsets offenders?
// doesn't actual reset itself!
CachingTokenFilter.class,
+ // LUCENE-8092: doesn't handle graph inputs
+ CJKBigramFilter.class,
+ // TODO: LUCENE-4983
+ CommonGramsFilter.class,
+ // TODO: doesn't handle graph inputs
+ CommonGramsQueryFilter.class,
// Not broken, simulates brokenness:
CrankyTokenFilter.class,
+ // TODO: doesn't handle graph inputs (or even look at positionIncrement)
+ HyphenatedWordsFilter.class,
+ // broken offsets
+ PathHierarchyTokenizer.class,
+ // broken offsets
+ ReversePathHierarchyTokenizer.class,
// Not broken: we forcefully add this, so we shouldn't
// also randomly pick it:
ValidatingTokenFilter.class,
+ // TODO: it seems to mess up offsets!?
+ WikipediaTokenizer.class,
// TODO: needs to be a tokenizer, doesnt handle graph inputs properly (a shingle or similar following will then cause pain)
WordDelimiterFilter.class,
// Cannot correct offsets when a char filter had changed them:
@@ -174,33 +184,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
}
- // TODO: also fix these and remove (maybe):
- // Classes/options that don't produce consistent graph offsets:
- private static final Map<Constructor<?>,Predicate<Object[]>> brokenOffsetsConstructors = new HashMap<>();
- static {
- try {
- for (Class<?> c : Arrays.<Class<?>>asList(
- ReversePathHierarchyTokenizer.class,
- PathHierarchyTokenizer.class,
- // TODO: it seems to mess up offsets!?
- WikipediaTokenizer.class,
- // TODO: doesn't handle graph inputs
- CJKBigramFilter.class,
- // TODO: doesn't handle graph inputs (or even look at positionIncrement)
- HyphenatedWordsFilter.class,
- // TODO: LUCENE-4983
- CommonGramsFilter.class,
- // TODO: doesn't handle graph inputs
- CommonGramsQueryFilter.class)) {
- for (Constructor<?> ctor : c.getConstructors()) {
- brokenOffsetsConstructors.put(ctor, ALWAYS);
- }
- }
- } catch (Exception e) {
- throw new Error(e);
- }
- }
-
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
@@ -584,20 +567,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
this.seed = seed;
}
- public boolean offsetsAreCorrect() {
- // TODO: can we not do the full chain here!?
- Random random = new Random(seed);
- TokenizerSpec tokenizerSpec = newTokenizer(random);
- TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
- return filterSpec.offsetsAreCorrect;
- }
-
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Random random = new Random(seed);
TokenizerSpec tokenizerSpec = newTokenizer(random);
//System.out.println("seed=" + seed + ",create tokenizer=" + tokenizerSpec.toString);
- TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+ TokenFilterSpec filterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
//System.out.println("seed=" + seed + ",create filter=" + filterSpec.toString);
return new TokenStreamComponents(tokenizerSpec.tokenizer, filterSpec.stream);
}
@@ -622,13 +597,10 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
sb.append("\n");
sb.append("tokenizer=");
sb.append(tokenizerSpec.toString);
- TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer, tokenizerSpec.offsetsAreCorrect);
+ TokenFilterSpec tokenFilterSpec = newFilterChain(random, tokenizerSpec.tokenizer);
sb.append("\n");
sb.append("filters=");
sb.append(tokenFilterSpec.toString);
- sb.append("\n");
- sb.append("offsetsAreCorrect=");
- sb.append(tokenFilterSpec.offsetsAreCorrect);
return sb.toString();
}
@@ -669,11 +641,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return pred != null && pred.test(args);
}
- private boolean brokenOffsets(Constructor<?> ctor, Object[] args) {
- final Predicate<Object[]> pred = brokenOffsetsConstructors.get(ctor);
- return pred != null && pred.test(args);
- }
-
// create a new random tokenizer from classpath
private TokenizerSpec newTokenizer(Random random) {
TokenizerSpec spec = new TokenizerSpec();
@@ -686,7 +653,6 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
}
spec.tokenizer = createComponent(ctor, args, descr);
if (spec.tokenizer != null) {
- spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
spec.toString = descr.toString();
}
}
@@ -716,9 +682,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return spec;
}
- private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer, boolean offsetsAreCorrect) {
+ private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) {
TokenFilterSpec spec = new TokenFilterSpec();
- spec.offsetsAreCorrect = offsetsAreCorrect;
spec.stream = tokenizer;
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5);
@@ -727,26 +692,17 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
- spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i, spec.offsetsAreCorrect);
+ spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
- // hack: MockGraph/MockLookahead has assertions that will trip if they follow
- // an offsets violator. so we cant use them after e.g. wikipediatokenizer
- if (!spec.offsetsAreCorrect &&
- (ctor.getDeclaringClass().equals(MockGraphTokenFilter.class)
- || ctor.getDeclaringClass().equals(MockRandomLookaheadTokenFilter.class))) {
- continue;
- }
-
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
if (broken(ctor, args)) {
continue;
}
final TokenFilter flt = createComponent(ctor, args, descr);
if (flt != null) {
- spec.offsetsAreCorrect &= !brokenOffsets(ctor, args);
spec.stream = flt;
break;
}
@@ -756,7 +712,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
// Insert ValidatingTF after each stage so we can
// catch problems right after the TF that "caused"
// them:
- spec.stream = new ValidatingTokenFilter(spec.stream, "last stage", spec.offsetsAreCorrect);
+ spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
spec.toString = descr.toString();
return spec;
@@ -829,13 +785,11 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
static class TokenizerSpec {
Tokenizer tokenizer;
String toString;
- boolean offsetsAreCorrect = true;
}
static class TokenFilterSpec {
TokenStream stream;
String toString;
- boolean offsetsAreCorrect = true;
}
static class CharFilterSpec {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96cd9c5d/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
index b31b604..7901eea 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
@@ -49,17 +49,15 @@ public final class ValidatingTokenFilter extends TokenFilter {
private final PositionLengthAttribute posLenAtt = getAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = getAttribute(OffsetAttribute.class);
private final CharTermAttribute termAtt = getAttribute(CharTermAttribute.class);
- private final boolean offsetsAreCorrect;
private final String name;
/** The name arg is used to identify this stage when
* throwing exceptions (useful if you have more than one
* instance in your chain). */
- public ValidatingTokenFilter(TokenStream in, String name, boolean offsetsAreCorrect) {
+ public ValidatingTokenFilter(TokenStream in, String name) {
super(in);
this.name = name;
- this.offsetsAreCorrect = offsetsAreCorrect;
}
@Override
@@ -85,7 +83,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
startOffset = offsetAtt.startOffset();
endOffset = offsetAtt.endOffset();
- if (offsetsAreCorrect && offsetAtt.startOffset() < lastStartOffset) {
+ if (offsetAtt.startOffset() < lastStartOffset) {
throw new IllegalStateException(name + ": offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset);
}
lastStartOffset = offsetAtt.startOffset();
@@ -93,7 +91,7 @@ public final class ValidatingTokenFilter extends TokenFilter {
posLen = posLenAtt == null ? 1 : posLenAtt.getPositionLength();
- if (offsetAtt != null && posIncAtt != null && offsetsAreCorrect) {
+ if (offsetAtt != null && posIncAtt != null) {
if (!posToStartOffset.containsKey(pos)) {
// First time we've seen a token leaving from this position: