You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/04/09 21:05:47 UTC
svn commit: r1311373 - in /lucene/dev/branches/lucene3969:
lucene/test-framework/src/java/org/apache/lucene/analysis/
modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/
modules/analysis/common/src/test/org/apache/lucene/analysis/core/
Author: mikemccand
Date: Mon Apr 9 19:05:47 2012
New Revision: 1311373
URL: http://svn.apache.org/viewvc?rev=1311373&view=rev
Log:
LUCENE-3969: validate after each analysis stage; tentatively add posLen to ShingleFilter
Added:
lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java (with props)
Modified:
lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Mon Apr 9 19:05:47 2012
@@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTes
assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
}
}
- assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+ assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
ts.end();
if (finalOffset != null) {
assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());
Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java Mon Apr 9 19:05:47 2012
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilt
startPosData.startOffset = startOffset;
} else {
// Make sure our input isn't messing up offsets:
- assert startPosData.startOffset == startOffset;
+ assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
}
final int endOffset = offsetAtt.endOffset();
@@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilt
endPosData.endOffset = endOffset;
} else {
// Make sure our input isn't messing up offsets:
- assert endPosData.endOffset == endOffset;
+ assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
}
tokenPending = true;
Added: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java?rev=1311373&view=auto
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java (added)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java Mon Apr 9 19:05:47 2012
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+// nocommit better name...?
+
+// nocommit BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ * offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+ private int pos;
+
+ // Maps position to the start/end offset:
+ private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+ private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+ // nocommit must be more careful here? check hasAttribute first...?
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+ private final String name;
+
+ /** The name arg is used to identify this stage when
+ * throwing exceptions (useful if you have more than one
+ * instance in your chain). */
+ public ValidatingTokenFilter(TokenStream in, String name) {
+ super(in);
+ this.name = name;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {
+ return false;
+ }
+
+ pos += posIncAtt.getPositionIncrement();
+ if (pos == -1) {
+ throw new IllegalStateException("first posInc must be > 0");
+ }
+
+ final int startOffset = offsetAtt.startOffset();
+ final int endOffset = offsetAtt.endOffset();
+
+ final int posLen = posLenAtt.getPositionLength();
+ if (!posToStartOffset.containsKey(pos)) {
+ // First time we've seen a token leaving from this position:
+ posToStartOffset.put(pos, startOffset);
+ System.out.println(" + s " + pos + " -> " + startOffset);
+ } else {
+ // We've seen a token leaving from this position
+ // before; verify the startOffset is the same:
+ System.out.println(" + vs " + pos + " -> " + startOffset);
+ final int oldStartOffset = posToStartOffset.get(pos);
+ if (oldStartOffset != startOffset) {
+ throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+ }
+ }
+
+ final int endPos = pos + posLen;
+
+ if (!posToEndOffset.containsKey(endPos)) {
+ // First time we've seen a token arriving to this position:
+ posToEndOffset.put(endPos, endOffset);
+ System.out.println(" + e " + endPos + " -> " + endOffset);
+ } else {
+ // We've seen a token arriving to this position
+ // before; verify the endOffset is the same:
+ System.out.println(" + ve " + endPos + " -> " + endOffset);
+ final int oldEndOffset = posToEndOffset.get(endPos);
+ if (oldEndOffset != endOffset) {
+ throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+ }
+ }
+
+ return true;
+ }
+
+ // TODO: end? (what to validate?)
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ pos = -1;
+ posToStartOffset.clear();
+ posToEndOffset.clear();
+ }
+}
Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Mon Apr 9 19:05:47 2012
@@ -23,9 +23,10 @@ import java.util.LinkedList;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
@@ -150,6 +151,7 @@ public final class ShingleFilter extends
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
@@ -319,6 +321,8 @@ public final class ShingleFilter extends
noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+ // nocommit is this right!? i'm just guessing...
+ posLenAtt.setPositionLength(builtGramSize);
isOutputHere = true;
gramSize.advance();
tokenAvailable = true;
Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Mon Apr 9 19:05:47 2012
@@ -34,11 +34,11 @@ import java.util.Collections;
import java.util.Comparator;
import java.util.Enumeration;
import java.util.HashSet;
+import java.util.IdentityHashMap;
import java.util.List;
+import java.util.Map;
import java.util.Random;
import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTo
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
import org.apache.lucene.analysis.charfilter.CharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Rethrow;
import org.apache.lucene.util.Version;
@@ -133,6 +134,12 @@ public class TestRandomChains extends Ba
) {
continue;
}
+
+ if (c == ValidatingTokenFilter.class) {
+ // We insert this one ourselves after each stage...
+ continue;
+ }
+
for (final Constructor<?> ctor : c.getConstructors()) {
// don't test deprecated ctors, they likely have known bugs:
if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@@ -635,6 +642,12 @@ public class TestRandomChains extends Ba
StringBuilder descr = new StringBuilder();
int numFilters = random.nextInt(5);
for (int i = 0; i < numFilters; i++) {
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
while (true) {
final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@@ -645,6 +658,12 @@ public class TestRandomChains extends Ba
}
}
}
+
+ // Insert ValidatingTF after each stage so we can
+ // catch problems right after the TF that "caused"
+ // them:
+ spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
spec.toString = descr.toString();
return spec;
}