You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/04/09 21:05:47 UTC

svn commit: r1311373 - in /lucene/dev/branches/lucene3969: lucene/test-framework/src/java/org/apache/lucene/analysis/ modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ modules/analysis/common/src/test/org/apache/lucene/analysis/core/

Author: mikemccand
Date: Mon Apr  9 19:05:47 2012
New Revision: 1311373

URL: http://svn.apache.org/viewvc?rev=1311373&view=rev
Log:
LUCENE-3969: validate after each analysis stage; tenatively add posLen to ShingleFilter

Added:
    lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java   (with props)
Modified:
    lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
    lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java

Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java Mon Apr  9 19:05:47 2012
@@ -222,7 +222,7 @@ public abstract class BaseTokenStreamTes
         assertTrue("posLength must be >= 1", posLengthAtt.getPositionLength() >= 1);
       }
     }
-    assertFalse("TokenStream has more tokens than expected", ts.incrementToken());
+    assertFalse("TokenStream has more tokens than expected (expected count=" + output.length + ")", ts.incrementToken());
     ts.end();
     if (finalOffset != null) {
       assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset());

Modified: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java (original)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/LookaheadTokenFilter.java Mon Apr  9 19:05:47 2012
@@ -151,7 +151,7 @@ public abstract class LookaheadTokenFilt
         startPosData.startOffset = startOffset;
       } else {
         // Make sure our input isn't messing up offsets:
-        assert startPosData.startOffset == startOffset;
+        assert startPosData.startOffset == startOffset: "prev startOffset=" + startPosData.startOffset + " vs new startOffset=" + startOffset + " inputPos=" + inputPos;
       }
 
       final int endOffset = offsetAtt.endOffset();
@@ -159,7 +159,7 @@ public abstract class LookaheadTokenFilt
         endPosData.endOffset = endOffset;
       } else {
         // Make sure our input isn't messing up offsets:
-        assert endPosData.endOffset == endOffset;
+        assert endPosData.endOffset == endOffset: "prev endOffset=" + endPosData.endOffset + " vs new endOffset=" + endOffset + " inputPos=" + inputPos;
       }
 
       tokenPending = true;

Added: lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java?rev=1311373&view=auto
==============================================================================
--- lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java (added)
+++ lucene/dev/branches/lucene3969/lucene/test-framework/src/java/org/apache/lucene/analysis/ValidatingTokenFilter.java Mon Apr  9 19:05:47 2012
@@ -0,0 +1,117 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+
+// nocommit better name...?
+
+// nocommit BTSTC should just append this to the chain
+// instead of checking itself:
+
+/** A TokenFilter that checks consistency of the tokens (eg
+ *  offsets are consistent with one another). */
+public final class ValidatingTokenFilter extends TokenFilter {
+
+  private int pos;
+
+  // Maps position to the start/end offset:
+  private final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+  private final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
+
+  // nocommit must be more careful here?  check hasAttribute first...?
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  private final String name;
+
+  /** The name arg is used to identify this stage when
+   *  throwing exceptions (useful if you have more than one
+   *  instance in your chain). */
+  public ValidatingTokenFilter(TokenStream in, String name) {
+    super(in);
+    this.name = name;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (!input.incrementToken()) {
+      return false;
+    }
+
+    pos += posIncAtt.getPositionIncrement();
+    if (pos == -1) {
+      throw new IllegalStateException("first posInc must be > 0");
+    }
+
+    final int startOffset = offsetAtt.startOffset();
+    final int endOffset = offsetAtt.endOffset();
+
+    final int posLen = posLenAtt.getPositionLength();
+    if (!posToStartOffset.containsKey(pos)) {
+      // First time we've seen a token leaving from this position:
+      posToStartOffset.put(pos, startOffset);
+      System.out.println("  + s " + pos + " -> " + startOffset);
+    } else {
+      // We've seen a token leaving from this position
+      // before; verify the startOffset is the same:
+      System.out.println("  + vs " + pos + " -> " + startOffset);
+      final int oldStartOffset = posToStartOffset.get(pos);
+      if (oldStartOffset != startOffset) {
+        throw new IllegalStateException(name + ": inconsistent startOffset as pos=" + pos + ": " + oldStartOffset + " vs " + startOffset + "; token=" + termAtt);
+      }
+    }
+
+    final int endPos = pos + posLen;
+
+    if (!posToEndOffset.containsKey(endPos)) {
+      // First time we've seen a token arriving to this position:
+      posToEndOffset.put(endPos, endOffset);
+      System.out.println("  + e " + endPos + " -> " + endOffset);
+    } else {
+      // We've seen a token arriving to this position
+      // before; verify the endOffset is the same:
+      System.out.println("  + ve " + endPos + " -> " + endOffset);
+      final int oldEndOffset = posToEndOffset.get(endPos);
+      if (oldEndOffset != endOffset) {
+        throw new IllegalStateException(name + ": inconsistent endOffset as pos=" + endPos + ": " + oldEndOffset + " vs " + endOffset + "; token=" + termAtt);
+      }
+    }
+
+    return true;
+  }
+
+  // TODO: end?  (what to validate?)
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    pos = -1;
+    posToStartOffset.clear();
+    posToEndOffset.clear();
+  }
+}

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Mon Apr  9 19:05:47 2012
@@ -23,9 +23,10 @@ import java.util.LinkedList;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -150,6 +151,7 @@ public final class ShingleFilter extends
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
 
@@ -319,6 +321,8 @@ public final class ShingleFilter extends
           noShingleOutput = false;
         }
         offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
+        // nocommit is this right!?  i'm just guessing...
+        posLenAtt.setPositionLength(builtGramSize);
         isOutputHere = true;
         gramSize.advance();
         tokenAvailable = true;

Modified: lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java?rev=1311373&r1=1311372&r2=1311373&view=diff
==============================================================================
--- lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java (original)
+++ lucene/dev/branches/lucene3969/modules/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java Mon Apr  9 19:05:47 2012
@@ -34,11 +34,11 @@ import java.util.Collections;
 import java.util.Comparator;
 import java.util.Enumeration;
 import java.util.HashSet;
+import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
-import java.util.Map;
-import java.util.IdentityHashMap;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -52,6 +52,7 @@ import org.apache.lucene.analysis.MockTo
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ValidatingTokenFilter;
 import org.apache.lucene.analysis.charfilter.CharFilter;
 import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
@@ -73,8 +74,8 @@ import org.apache.lucene.analysis.standa
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Rethrow;
 import org.apache.lucene.util.Version;
@@ -133,6 +134,12 @@ public class TestRandomChains extends Ba
       ) {
         continue;
       }
+
+      if (c == ValidatingTokenFilter.class) {
+        // We insert this one ourselves after each stage...
+        continue;
+      }
+
       for (final Constructor<?> ctor : c.getConstructors()) {
         // don't test deprecated ctors, they likely have known bugs:
         if (ctor.isAnnotationPresent(Deprecated.class) || ctor.isSynthetic()) {
@@ -635,6 +642,12 @@ public class TestRandomChains extends Ba
       StringBuilder descr = new StringBuilder();
       int numFilters = random.nextInt(5);
       for (int i = 0; i < numFilters; i++) {
+
+        // Insert ValidatingTF after each stage so we can
+        // catch problems right after the TF that "caused"
+        // them:
+        spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i);
+
         while (true) {
           final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size()));
           final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes());
@@ -645,6 +658,12 @@ public class TestRandomChains extends Ba
           }
         }
       }
+
+      // Insert ValidatingTF after each stage so we can
+      // catch problems right after the TF that "caused"
+      // them:
+      spec.stream = new ValidatingTokenFilter(spec.stream, "last stage");
+
       spec.toString = descr.toString();
       return spec;
     }