Posted to commits@lucene.apache.org by mi...@apache.org on 2012/02/12 15:22:03 UTC

svn commit: r1243257 - in /lucene/dev/branches/lucene3767: lucene/core/src/java/org/apache/lucene/analysis/ lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/ lucene/core/src/test/org/apache/lucene/analysis/ lucene/core/src/test/org/apach...

Author: mikemccand
Date: Sun Feb 12 14:22:02 2012
New Revision: 1243257

URL: http://svn.apache.org/viewvc?rev=1243257&view=rev
Log:
LUCENE-3767: also backtrace when paths converge on single backpointer; output position length for the compound tokens

Modified:
    lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java
    lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
    lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
    lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
    lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
    lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java

Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java Sun Feb 12 14:22:02 2012
@@ -128,6 +128,7 @@ public class Token extends CharTermAttri
   private int flags;
   private Payload payload;
   private int positionIncrement = 1;
+  private int positionLength = 1;
 
   /** Constructs a Token with null text. */
   public Token() {
@@ -270,6 +271,20 @@ public class Token extends CharTermAttri
     return positionIncrement;
   }
 
+  /** @param positionLength how many positions this token
+   *  spans.  NOTE: this is optional, and most analyzers
+   *  don't change the default value (1). */
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+
+  /** Returns the position length of this Token.
+   * @see #setPositionLength    
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
+
   /** Returns this Token's starting offset, the position of the first character
     corresponding to this token in the source text.
 
@@ -360,6 +375,7 @@ public class Token extends CharTermAttri
     super.clear();
     payload = null;
     positionIncrement = 1;
+    positionLength = 1;
     flags = 0;
     startOffset = endOffset = 0;
     type = DEFAULT_TYPE;
@@ -383,6 +399,7 @@ public class Token extends CharTermAttri
   public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
     final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
     t.positionIncrement = positionIncrement;
+    t.positionLength = positionLength;
     t.flags = flags;
     t.type = type;
     if (payload != null)
@@ -401,6 +418,7 @@ public class Token extends CharTermAttri
           endOffset == other.endOffset && 
           flags == other.flags &&
           positionIncrement == other.positionIncrement &&
+          positionLength == other.positionLength &&
           (type == null ? other.type == null : type.equals(other.type)) &&
           (payload == null ? other.payload == null : payload.equals(other.payload)) &&
           super.equals(obj)
@@ -416,6 +434,7 @@ public class Token extends CharTermAttri
     code = code * 31 + endOffset;
     code = code * 31 + flags;
     code = code * 31 + positionIncrement;
+    code = code * 31 + positionLength;
     if (type != null)
       code = code * 31 + type.hashCode();
     if (payload != null)
@@ -427,6 +446,7 @@ public class Token extends CharTermAttri
   private void clearNoTermBuffer() {
     payload = null;
     positionIncrement = 1;
+    positionLength = 1;
     flags = 0;
     startOffset = endOffset = 0;
     type = DEFAULT_TYPE;
@@ -443,6 +463,7 @@ public class Token extends CharTermAttri
     copyBuffer(newTermBuffer, newTermOffset, newTermLength);
     payload = null;
     positionIncrement = 1;
+    positionLength = 1;
     startOffset = newStartOffset;
     endOffset = newEndOffset;
     type = newType;
@@ -531,6 +552,7 @@ public class Token extends CharTermAttri
   public void reinit(Token prototype) {
     copyBuffer(prototype.buffer(), 0, prototype.length());
     positionIncrement = prototype.positionIncrement;
+    positionLength = prototype.positionLength;
     flags = prototype.flags;
     startOffset = prototype.startOffset;
     endOffset = prototype.endOffset;
@@ -546,6 +568,7 @@ public class Token extends CharTermAttri
   public void reinit(Token prototype, String newTerm) {
     setEmpty().append(newTerm);
     positionIncrement = prototype.positionIncrement;
+    positionLength = prototype.positionLength;
     flags = prototype.flags;
     startOffset = prototype.startOffset;
     endOffset = prototype.endOffset;
@@ -563,6 +586,7 @@ public class Token extends CharTermAttri
   public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
     copyBuffer(newTermBuffer, offset, length);
     positionIncrement = prototype.positionIncrement;
+    positionLength = prototype.positionLength;
     flags = prototype.flags;
     startOffset = prototype.startOffset;
     endOffset = prototype.endOffset;
@@ -583,6 +607,7 @@ public class Token extends CharTermAttri
       super.copyTo(target);
       ((OffsetAttribute) target).setOffset(startOffset, endOffset);
       ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
+      ((PositionIncrementAttribute) target).setPositionLength(positionLength);
       ((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
       ((FlagsAttribute) target).setFlags(flags);
       ((TypeAttribute) target).setType(type);
@@ -595,6 +620,7 @@ public class Token extends CharTermAttri
     reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
     reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
     reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
+    reflector.reflect(PositionIncrementAttribute.class, "positionLength", positionLength);
     reflector.reflect(PayloadAttribute.class, "payload", payload);
     reflector.reflect(FlagsAttribute.class, "flags", flags);
     reflector.reflect(TypeAttribute.class, "type", type);
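
For reference, a minimal producer-side sketch of what the new core Token API allows. Only setPositionLength()/getPositionLength() are added in this revision; the char[] constructor and setPositionIncrement() are pre-existing, and the compound term below is purely illustrative:

    import org.apache.lucene.analysis.Token;

    public class CompoundTokenSketch {
      public static void main(String[] args) {
        // Stack a hand-built compound term on top of its first component token:
        char[] term = "京都大学".toCharArray();
        Token compound = new Token(term, 0, term.length, 0, 4);
        compound.setPositionIncrement(0); // same start position as the previous token
        compound.setPositionLength(2);    // spans two positions (京都 + 大学)
        System.out.println(compound + " posLen=" + compound.getPositionLength());
      }
    }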

Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java Sun Feb 12 14:22:02 2012
@@ -56,4 +56,15 @@ public interface PositionIncrementAttrib
    * @see #setPositionIncrement
    */
   public int getPositionIncrement();
+
+  // nocommit better names...?  getLength?  getSpan?
+  /** @param positionLength how many positions this token
+   *  spans.  NOTE: this is optional, and most analyzers
+   *  don't change the default value (1). */
+  public void setPositionLength(int positionLength);
+
+  /** Returns the position length of this Token.
+   * @see #setPositionLength
+   */
+  public int getPositionLength();
 }

Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java Sun Feb 12 14:22:02 2012
@@ -46,6 +46,7 @@ import org.apache.lucene.util.AttributeI
  */
 public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable {
   private int positionIncrement = 1;
+  private int positionLength = 1;
   
   /** Set the position increment. The default value is one.
    *
@@ -54,7 +55,7 @@ public class PositionIncrementAttributeI
   public void setPositionIncrement(int positionIncrement) {
     if (positionIncrement < 0)
       throw new IllegalArgumentException
-        ("Increment must be zero or greater: " + positionIncrement);
+        ("Increment must be zero or greater: got " + positionIncrement);
     this.positionIncrement = positionIncrement;
   }
 
@@ -65,9 +66,27 @@ public class PositionIncrementAttributeI
     return positionIncrement;
   }
 
+  /** @param positionLength how many positions this token
+   *  spans.  NOTE: this is optional, and most analyzers
+   *  don't change the default value (1). */
+  public void setPositionLength(int positionLength) {
+    if (positionLength < 1)
+      throw new IllegalArgumentException
+        ("Position length must be 1 or greater: got " + positionLength);
+    this.positionLength = positionLength;
+  }
+
+  /** Returns the position length of this Token.
+   * @see #setPositionLength    
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
+
   @Override
   public void clear() {
     this.positionIncrement = 1;
+    this.positionLength = 1;
   }
   
   @Override
@@ -77,7 +96,9 @@ public class PositionIncrementAttributeI
     }
     
     if (other instanceof PositionIncrementAttributeImpl) {
-      return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+      PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+      return positionIncrement ==  _other.positionIncrement &&
+        positionLength ==  _other.positionLength;
     }
  
     return false;
@@ -85,13 +106,14 @@ public class PositionIncrementAttributeI
 
   @Override
   public int hashCode() {
-    return positionIncrement;
+    return positionIncrement + positionLength*3947;
   }
   
   @Override
   public void copyTo(AttributeImpl target) {
     PositionIncrementAttribute t = (PositionIncrementAttribute) target;
     t.setPositionIncrement(positionIncrement);
+    t.setPositionLength(positionLength);
   }  
 
 }
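
A quick sketch of the contract established by the impl changes above, using nothing beyond the methods shown in this diff: the default position length is 1, clear() resets it to 1, and values below 1 are rejected:

    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttributeImpl;

    public class PosLengthDefaultsSketch {
      public static void main(String[] args) {
        PositionIncrementAttributeImpl att = new PositionIncrementAttributeImpl();
        System.out.println(att.getPositionLength()); // 1 by default
        att.setPositionLength(3);                    // e.g. a compound spanning three positions
        att.clear();
        System.out.println(att.getPositionLength()); // back to 1
        try {
          att.setPositionLength(0);                  // must be 1 or greater
        } catch (IllegalArgumentException expected) {
          System.out.println(expected.getMessage()); // "Position length must be 1 or greater: got 0"
        }
      }
    }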

Modified: lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java Sun Feb 12 14:22:02 2012
@@ -253,6 +253,7 @@ public class TestToken extends LuceneTes
         put(OffsetAttribute.class.getName() + "#startOffset", 6);
         put(OffsetAttribute.class.getName() + "#endOffset", 22);
         put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 1);
+        put(PositionIncrementAttribute.class.getName() + "#positionLength", 1);
         put(PayloadAttribute.class.getName() + "#payload", null);
         put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
         put(FlagsAttribute.class.getName() + "#flags", 8);

Modified: lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java Sun Feb 12 14:22:02 2012
@@ -27,8 +27,10 @@ public class TestSimpleAttributeImpl ext
 
   // this checks using reflection API if the defaults are correct
   public void testAttributes() {
-    _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
-      Collections.singletonMap(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1));
+    _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(), new HashMap<String,Object>() {{
+      put(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1);
+      put(PositionIncrementAttribute.class.getName()+"#positionLength", 1);
+    }});
     _TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
       Collections.singletonMap(FlagsAttribute.class.getName()+"#flags", 0));
     _TestUtil.assertAttributeReflection(new TypeAttributeImpl(),

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java Sun Feb 12 14:22:02 2012
@@ -9,9 +9,9 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 
-// javac -cp ../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf.java
+// javac -cp ../build/kuromoji/classes/java:../../../lucene/build/core/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf.java
 
-// java -cp .:../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf
+// java -cp .:../build/kuromoji/classes/java:../../../lucene/build/core/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf
 public class Perf {
 
   private final static Analyzer analyzer = new Analyzer() {

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Sun Feb 12 14:22:02 2012
@@ -56,6 +56,7 @@ public final class KuromojiTokenizer ext
   @Override
   protected void setNextSentence(int sentenceStart, int sentenceEnd) {
     this.sentenceStart = sentenceStart;
+    //System.out.println("\nNEXT SENTENCE: " + sentenceStart + " to " + sentenceEnd + ": " + new String(buffer, sentenceStart, sentenceEnd-sentenceStart));
     // TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
     tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
     tokenIndex = 0;

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java Sun Feb 12 14:22:02 2012
@@ -53,6 +53,9 @@ import org.apache.lucene.util.fst.FST;
 
 // nocommit add toDot and look at 1st pass intersection
 
+// nocommit need better testing of compound output... need
+// assertAnalyzesTo to accept posLength too
+
 // nocommit add comment explaining this isn't quite a real viterbi
 
 // nocommit explain how the 2nd best tokenization is
@@ -64,6 +67,10 @@ import org.apache.lucene.util.fst.FST;
 // depending on whether ipadic was "trained" with sentence
 // breaks?
 
+// nocommit -- should we use the sentence breakiterator
+// too..?  we can simply use it to slip an EOS/BOS token
+// in...
+
 /* Uses a rolling Viterbi search to find the least cost
  * segmentation (path) of the incoming characters.
  *
@@ -116,6 +123,7 @@ public final class KuromojiTokenizer2 ex
   private final boolean discardPunctuation;
   private final boolean searchMode;
   private final boolean extendedMode;
+  private int sameLeastIndex;
 
   private int lastBackTracePos;
   private int lastTokenPos;
@@ -325,7 +333,7 @@ public final class KuromojiTokenizer2 ex
     }
 
     if (VERBOSE) {
-      System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " wordCat=" + leftID + " tok=" + new String(buffer.get(posData.pos, endPos-posData.pos)));
+      System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " tok=" + new String(buffer.get(posData.pos, endPos-posData.pos)) + " leastIDX=" + leastIDX + " toPos.idx=" + positions.get(endPos).count);
     }
 
     // nocommit ideally we don't have to do this ... just
@@ -342,6 +350,13 @@ public final class KuromojiTokenizer2 ex
     }
 
     positions.get(endPos).add(leastCost, dict.getRightId(wordID), posData.pos, leastIDX, wordID, type);
+    /*
+    if (sameLeastIndex == -1) {
+      sameLeastIndex = leastIDX;
+    } else if (sameLeastIndex != leastIDX) {
+      sameLeastIndex = Integer.MAX_VALUE;
+    }
+    */
   }
 
   @Override
@@ -373,9 +388,11 @@ public final class KuromojiTokenizer2 ex
     inflectionAtt.setToken(token);
     if (token.getPosition() == lastTokenPos) {
       posIncAtt.setPositionIncrement(0);
+      posIncAtt.setPositionLength(token.getPositionLength());
     } else {
       assert token.getPosition() > lastTokenPos;
       posIncAtt.setPositionIncrement(1);
+      posIncAtt.setPositionLength(1);
     }
     if (VERBOSE) {
       System.out.println("    incToken: return token=" + token);
@@ -549,6 +566,7 @@ public final class KuromojiTokenizer2 ex
         //System.out.println("count=" + count + " vs len=" + positions.length);
         if (count == positions.length) {
           Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          //System.out.println("grow positions " + newPositions.length);
           System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
           System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
           for(int i=positions.length;i<newPositions.length;i++) {
@@ -629,8 +647,11 @@ public final class KuromojiTokenizer2 ex
       }
 
       final Position posData = positions.get(pos);
+      final boolean isFrontier = positions.getNextPos() == pos+1;
 
-      if (pos != lastBackTracePos && posData.count == 1 && positions.getNextPos() == pos+1) {
+      // nocommit if I change 40 below it changes number of
+      // tokens in TestQuality!
+      if ((pos - lastBackTracePos >= 100) && posData.count == 1 && isFrontier) {
         // We are at a "frontier", and only one node is
         // alive, so whatever the eventual best path is must
         // come through this node.  So we can safely commit
@@ -650,9 +671,11 @@ public final class KuromojiTokenizer2 ex
       }
 
       if (VERBOSE) {
-        System.out.println("\n  extend @ pos=" + pos);
+        System.out.println("\n  extend @ pos=" + pos + " char=" + (char) buffer.get(pos));
       }
 
+      sameLeastIndex = -1;
+
       if (posData.count == 0) {
         // No arcs arrive here; move to next position:
         pos++;
@@ -783,6 +806,28 @@ public final class KuromojiTokenizer2 ex
         //unknownWordEndIndex = posData.pos + unknownWordLength;
       }
 
+      // nocommit explainme:
+      if (false && (pos - lastBackTracePos >= 100) && sameLeastIndex != -1 && sameLeastIndex != Integer.MAX_VALUE &&
+          pos != lastBackTracePos && isFrontier) {
+
+        final int sav = sameLeastIndex;
+
+        //System.out.println("**SAME: " + sameLeastIndex);
+        backtrace(posData, sameLeastIndex);
+
+        // Re-base cost so we don't risk int overflow:
+        // nocommit: this does nothing: arcs were already extended
+        posData.costs[sav] = 0;
+
+        if (pending.size() != 0) {
+          return;
+        } else {
+          // nocommit: make punctuation-only testcase
+          // This means the backtrace only produced
+          // punctuation tokens, so we must keep parsing.
+        }
+      }
+
       pos++;
     }
 
@@ -793,9 +838,13 @@ public final class KuromojiTokenizer2 ex
       final Position endPosData = positions.get(pos);
       int leastCost = Integer.MAX_VALUE;
       int leastIDX = 0;
+      if (VERBOSE) {
+        System.out.println("  end: " + endPosData.count + " nodes");
+      }
       for(int idx=0;idx<endPosData.count;idx++) {
         // Add EOS cost:
         final int cost = endPosData.costs[idx] + costs.get(endPosData.nodeID[idx], 0);
+        //System.out.println("    idx=" + idx + " cost=" + cost);
         if (cost < leastCost) {
           leastCost = cost;
           leastIDX = idx;
@@ -939,7 +988,9 @@ public final class KuromojiTokenizer2 ex
     // We trace backwards, so this will be the leftWordID of
     // the token after the one we are now on:
     int lastLeftWordID = -1;
-    
+
+    int backCount = 0;
+
     // nocommit: don't use intermediate Token instance
     // here... change this to just hold raw back trace info,
     // then in incrementToken we pull the necessary char[],
@@ -966,12 +1017,16 @@ public final class KuromojiTokenizer2 ex
         // that it means something very different ("output
         // smaller segmentation"):
         final int penalty = computePenalty(backPos, pos-backPos);
-
+        
         if (penalty > 0) {
           if (VERBOSE) {
             System.out.println("  compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty);
           }
 
+          // Use the penalty to set maxCost on the 2nd best
+          // segmentation:
+          final int maxCost = posData.costs[bestIDX] + penalty;
+
           final int backID = posData.backID[bestIDX];
 
           // Now, prune all too-long tokens from the graph:
@@ -1001,7 +1056,7 @@ public final class KuromojiTokenizer2 ex
 
           // nocommit -- must also check whether 2nd best score falls w/in threshold?
 
-          if (leastIDX != -1) {
+          if (leastIDX != -1 && posData.backPos[leastIDX] <= maxCost) {
             // We should have pruned the altToken from the graph:
             assert posData.backPos[leastIDX] != backPos;
 
@@ -1021,6 +1076,7 @@ public final class KuromojiTokenizer2 ex
             backPos = posData.backPos[bestIDX];
             length = pos - backPos;
             backType = posData.backType[bestIDX];
+            backCount = 0;
             
           } else {
             // I think in theory it's possible there is no
@@ -1031,9 +1087,8 @@ public final class KuromojiTokenizer2 ex
       }
 
       final int offset = backPos - lastBackTracePos;
+      assert offset >= 0;
 
-      // nocommit -- how come TestQuality doesn't change if
-      // i output the altToken!
       // nocommit
       if (altToken != null && altToken.getPosition() >= backPos) {
 
@@ -1049,7 +1104,9 @@ public final class KuromojiTokenizer2 ex
         if (VERBOSE) {
           System.out.println("    add altToken=" + altToken);
         }
-        // nocommit
+        assert backCount >= 1;
+        backCount++;
+        altToken.setPositionLength(backCount);
         pending.add(altToken);
         altToken = null;
       }
@@ -1082,6 +1139,7 @@ public final class KuromojiTokenizer2 ex
         Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
                                             pending.size()));
 
+        backCount += wordIDAndLength.length-1;
       } else {
 
         if (extendedMode && posData.backType[bestIDX] == Type.UNKNOWN) {
@@ -1105,6 +1163,7 @@ public final class KuromojiTokenizer2 ex
                                   unkDictionary));
             unigramTokenCount++;
           }
+          backCount += unigramTokenCount;
           
         } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
           //System.out.println("backPos=" + backPos);
@@ -1118,6 +1177,7 @@ public final class KuromojiTokenizer2 ex
           if (VERBOSE) {
             System.out.println("    add token=" + pending.get(pending.size()-1));
           }
+          backCount++;
         } else {
           if (VERBOSE) {
             System.out.println("    skip punctuation token=" + new String(fragment, offset, length));
@@ -1125,7 +1185,6 @@ public final class KuromojiTokenizer2 ex
         }
       }
 
-      // nocommit -- accuracy drops a bit if we do this.... weird
       lastLeftWordID = dict.getLeftId(posData.backID[bestIDX]);
       pos = backPos;
       bestIDX = posData.backIndex[bestIDX];

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Sun Feb 12 14:22:02 2012
@@ -30,6 +30,7 @@ public class Token {
   private final int length;
   
   private final int position;
+  private int positionLength;
   
   private final Type type;
   
@@ -40,6 +41,7 @@ public class Token {
     this.length = length;
     this.type = type;
     this.position = position;
+    this.positionLength = positionLength;
     this.dictionary = dictionary;
   }
 
@@ -149,4 +151,21 @@ public class Token {
   public int getPosition() {
     return position;
   }
+
+  /**
+   * Set the position length (in tokens) of this token.  For normal
+   * tokens this is 1; for compound tokens it's > 1.
+   */
+  public void setPositionLength(int positionLength) {
+    this.positionLength = positionLength;
+  }
+  
+  /**
+   * Get the length (in tokens) of this token.  For normal
+   * tokens this is 1; for compound tokens it's > 1.
+   * @return position length of token
+   */
+  public int getPositionLength() {
+    return positionLength;
+  }
 }

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Sun Feb 12 14:22:02 2012
@@ -118,12 +118,15 @@ public class Viterbi {
       if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
         continue;
       }
+      //System.out.println("\npos=" + (i-1));
 
       // For each arc leaving...
       for (ViterbiNode node : startIndexArr[i]) {
         if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
           break;
         }
+
+        //System.out.println("  leaving node.wordID=" + node.getWordId() + " leftID=" + node.getLeftId());
         
         int backwardConnectionId = node.getLeftId();
         int wordCost = node.getWordCost();
@@ -134,8 +137,10 @@ public class Viterbi {
             break;
           }
           
+          //System.out.println("    arriving node.wordID=" + leftNode.getWordId() + " rightID=" + leftNode.getRightId());
           int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost;	// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
-          
+
+          //System.out.println("      pathCost=" + pathCost);
           // "Search mode". Add extra costs if it is long node.
           if (searchMode) {
             //						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
@@ -161,8 +166,10 @@ public class Viterbi {
               }
             }
           }
+          //System.out.println("      after penalty pathCost=" + pathCost);
           
           if (pathCost < leastPathCost){	// If total cost is lower than before, set current previous node as best left node (previous means left).
+            //System.out.println("        **");
             leastPathCost = pathCost;
             node.setPathCost(leastPathCost);
             node.setLeftNode(leftNode);

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java Sun Feb 12 14:22:02 2012
@@ -52,10 +52,14 @@ public class TestKuromojiAnalyzer extend
   public void testDecomposition() throws IOException {
 
     // nocommit
+    /*
     TokenStream ts = new KuromojiAnalyzer(TEST_VERSION_CURRENT)
         .tokenStream("foo",
-                     new StringReader("マイケルジャクソン	マイケル ジャクソン"));
+                     //new StringReader("楽器・機材の質は完全にプロ仕様!ですが値段は何処よりも安い!"));
+                     //new StringReader("ですが値段は何処よりも安い!"));
+                     new StringReader("木工芸では、木を、硬木、軟木と、唐木に分類します。"));
     while(ts.incrementToken());
+    */
 
     // Senior software engineer:
     if (KuromojiTokenizer2.DO_OUTPUT_COMPOUND) {
@@ -95,7 +99,6 @@ public class TestKuromojiAnalyzer extend
 
       // Kyoto University Baseball Club
       // nocommit --segments differently but perhaps OK
-      /*
       assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京都大学硬式野球部",
                        new String[] { "京都",
                                       "京都大学硬式野球部",
@@ -104,7 +107,6 @@ public class TestKuromojiAnalyzer extend
                                       "野球",
                                       "部" },
                        new int[] {1, 0, 1, 1, 1, 1});
-      */
     } else {
       assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "シニアソフトウェアエンジニア",
                        new String[] { "シニア",
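
Since assertAnalyzesTo cannot yet check posLength (see the nocommit in KuromojiTokenizer2 above), a manual loop along these lines could be used inside a test method to eyeball the compound output. This is only a sketch, not part of the commit, and the field name is a placeholder:

    TokenStream ts = new KuromojiAnalyzer(TEST_VERSION_CURRENT)
        .tokenStream("foo", new StringReader("京都大学硬式野球部"));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    int pos = -1;
    while (ts.incrementToken()) {
      pos += posIncAtt.getPositionIncrement();
      System.out.println(termAtt + " @ positions [" + pos + ", " + (pos + posIncAtt.getPositionLength()) + ")");
    }

Given the increments asserted above ({1, 0, 1, 1, 1, 1}), the compound 京都大学硬式野球部 shares position 0 with 京都, and its position length is what marks it as spanning the following component tokens rather than being an ordinary synonym-style stacked token.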

Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Sun Feb 12 14:22:02 2012
@@ -32,6 +32,7 @@ import java.util.zip.ZipFile;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -39,7 +40,7 @@ import org.apache.lucene.util.LuceneTest
 // just compares segmentation to some sentences pre-tokenized by mecab
 public class TestQuality extends LuceneTestCase {
 
-  public void test() throws Exception {
+  public void testSingleText() throws Exception {
     File datafile = getDataFile("tanakaseg.zip");
     ZipFile zip = new ZipFile(datafile);
     InputStream is = zip.getInputStream(zip.getEntry("sentences.txt"));
@@ -47,6 +48,9 @@ public class TestQuality extends LuceneT
     InputStream is2 = zip.getInputStream(zip.getEntry("segmented.txt"));
     BufferedReader seg = new BufferedReader(new InputStreamReader(is2, IOUtils.CHARSET_UTF_8));
     Stats stats = new Stats();
+
+    final boolean ONE_TIME = true;
+
     /**
      #words: 1578506
      #chars: 4519246
@@ -56,19 +60,84 @@ public class TestQuality extends LuceneT
      word agreement?: 0.999587584716181
      */
     //final Tokenizer tokenizer = new KuromojiTokenizer(null, null, true, Mode.SEARCH);
+
+    StringBuilder sb = new StringBuilder();
+
+    String line1 = null;
+    String line2 = null;
+    int maxLen = 0;
+    int count = 0;
+    while ((line1 = unseg.readLine()) != null) {
+      seg.readLine();
+      // nocommit also try removing the "easy" period at the
+      // end of each sentence...
+      maxLen = Math.max(line1.length(), maxLen);
+      sb.append(line1);
+      if (ONE_TIME && count++ == 100) {
+        // nocommit;
+        break;
+      }
+    }
+    System.out.println("maxLen=" + maxLen);
+
+    //final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH);
+    final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+    final String all = sb.toString();
+    final int ITERS = 20;
+    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class); 
+    for(int iter=0;iter<ITERS;iter++) {
+      tokenizer.reset(new StringReader(all));
+      count = 0;
+      long t0 = System.currentTimeMillis();
+      while(tokenizer.incrementToken()) {
+        if (false && ONE_TIME) {
+          System.out.println(count + ": " + termAtt.toString());
+        }
+        count++;
+      }
+      long t1 = System.currentTimeMillis();
+      System.out.println(all.length()/(t1-t0) + " bytes/msec; count=" + count);
+
+      if (ONE_TIME) {
+        break;
+      }
+    }
+  }
+
+  public void test() throws Exception {
+    File datafile = getDataFile("tanakaseg.zip");
+    ZipFile zip = new ZipFile(datafile);
+    InputStream is = zip.getInputStream(zip.getEntry("sentences.txt"));
+    BufferedReader unseg = new BufferedReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
+    InputStream is2 = zip.getInputStream(zip.getEntry("segmented.txt"));
+    BufferedReader seg = new BufferedReader(new InputStreamReader(is2, IOUtils.CHARSET_UTF_8));
+    Stats stats = new Stats();
+    /**
+     #words: 1578506
+     #chars: 4519246
+     #edits: 651
+     #sentences: 150122
+     sentence agreement?: 0.998161495317142
+     word agreement?: 0.999587584716181
+     */
+    
+    //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+    //final Tokenizer tokenizer = new KuromojiTokenizer(new Segmenter(Mode.NORMAL), new StringReader(""));
     final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH);
     
     String line1 = null;
     String line2 = null;
+    int count = 0;
     while ((line1 = unseg.readLine()) != null) {
       line2 = seg.readLine();
-      evaluateLine(line1, line2, tokenizer, stats);
+      evaluateLine(count++, line1, line2, tokenizer, stats);
     }
     
     System.out.println("#words: " + stats.numWords);
     System.out.println("#chars: " + stats.numChars);
     System.out.println("#edits: " + stats.numEdits);
     System.out.println("#sentences: " + stats.numSentences);
+    System.out.println("#tokens: " + stats.numTokens);
     System.out.println("sentence agreement?: " + (stats.numSentencesCorrect/(double)stats.numSentences));
     System.out.println("word agreement?: " + (1D - (stats.numEdits / (double)stats.numWords)));
     unseg.close();
@@ -82,38 +151,126 @@ public class TestQuality extends LuceneT
     long numChars = 0;
     long numSentences = 0;
     long numSentencesCorrect = 0;
+    long numTokens = 0;
+  }
+
+  static class Path {
+    public final List<String> tokens;
+    public int pos;
+
+    public Path() {
+      tokens = new ArrayList<String>();
+    }
+
+    public void add(String token, int posLen) {
+      tokens.add(token);
+      pos += posLen;
+    }
   }
   
-  public static void evaluateLine(String unseg, String seg, Tokenizer tokenizer, Stats stats) throws Exception {
-    //System.out.println("\nTEST: " + unseg);
-    List<String> tokens = new ArrayList<String>();
+  public static void evaluateLine(int lineCount, String unseg, String seg, Tokenizer tokenizer, Stats stats) throws Exception {
+    if (VERBOSE) {
+      System.out.println("\nTEST " + lineCount + ": input " + unseg);
+    }
     tokenizer.reset(new StringReader(unseg));
     CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
+    List<Path> paths = new ArrayList<Path>();
+    paths.add(new Path());
+    
+    int pos = -1;
+    int numTokens = 0;
     while(tokenizer.incrementToken()) {
-      tokens.add(termAtt.toString());
+      final int posInc = posIncAtt.getPositionIncrement();
+      final int posLength = posIncAtt.getPositionLength();
+      final String token = termAtt.toString();
+
+      //System.out.println("  tok=" + token + " numPaths=" + paths.size() + " posLen=" + posLength);
+
+      pos += posInc;
+      numTokens++;
+
+      if (VERBOSE) {
+        if (posIncAtt.getPositionIncrement() == 0) {
+          System.out.println("  fork @ token=" + token + " posLength=" + posLength);
+        }
+      }
+
+      assert pos >= 0;
+      final int numPaths = paths.size();
+      for(int pathID=0;pathID<numPaths;pathID++) {
+        final Path path = paths.get(pathID);
+        if (pos == path.pos) {
+          path.add(token, posLength);
+        } else if (pos == path.pos-1 && posInc == 0) {
+
+          // NOTE: this is horribly, horribly inefficient in
+          // general!!  Much better to do graph-to-graph
+          // alignment to get min edits:
+
+          // nocommit this isn't fully general, ie, it
+          // assumes the tokenizer ALWAYS outputs the
+          // posLen=1 token first, at a given position
+          // Fork!
+          assert path.tokens.size() > 0;
+          Path newPath = new Path();
+          newPath.tokens.addAll(path.tokens);
+          newPath.tokens.remove(newPath.tokens.size()-1);
+          newPath.pos = pos;
+          newPath.add(token, posLength);
+          paths.add(newPath);
+
+        } else {
+          assert pos < path.pos: "pos=" + pos + " path.pos=" + path.pos + " pathID=" + pathID;
+        }
+      }
     }
-    
+
+    if (VERBOSE) {
+      System.out.println("  " + paths.size() + " paths");
+    }
+
     List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
-    tokens = normalize(tokens);
     expectedTokens = normalize(expectedTokens);
+
+    int minEdits = Integer.MAX_VALUE;
+    List<String> minPath = null;
+    for(Path path : paths) {
+      List<String> tokens = normalize(path.tokens);
+      if (VERBOSE) {
+        System.out.println("    path: " + path.tokens);
+      }
     
-    HashMap<String,Character> transformation = new HashMap<String,Character>();
-    CharRef charRef = new CharRef();
-    
-    String s1 = transform(tokens, transformation, charRef);
-    String s2 = transform(expectedTokens, transformation, charRef);
+      HashMap<String,Character> transformation = new HashMap<String,Character>();
+      CharRef charRef = new CharRef();
+      String s1 = transform(tokens, transformation, charRef);
+      String s2 = transform(expectedTokens, transformation, charRef);
     
-    int edits = getDistance(s2, s1);
-    if (edits > 0) {
-      System.out.println(edits + " edits; unseg: " + unseg);
-      System.out.println(tokens + " vs " + expectedTokens);
+      int edits = getDistance(s2, s1);
+      if (edits < minEdits) {
+        minEdits = edits;
+        minPath = tokens;
+        if (VERBOSE) {
+          System.out.println("      ** edits=" + edits);
+        }
+      }
+    }
+    assert minPath != null;
+    if (minEdits > 0) {
+      if (!VERBOSE) {
+        System.out.println("\nTEST " + lineCount + ": input " + unseg + "; " + minEdits + " edits");
+      }
+      System.out.println("    expected: " + expectedTokens);
+      System.out.println("      actual: " + minPath);
     }
     stats.numChars += seg.length();
-    stats.numEdits += edits;
+    stats.numEdits += minEdits;
     stats.numWords += expectedTokens.size();
+    stats.numTokens += numTokens;
     stats.numSentences++;
-    if (edits == 0)
+    if (minEdits == 0) {
       stats.numSentencesCorrect++;
+    }
   }
   
   static class CharRef {