You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/02/12 15:22:03 UTC
svn commit: r1243257 - in /lucene/dev/branches/lucene3767:
lucene/core/src/java/org/apache/lucene/analysis/
lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/
lucene/core/src/test/org/apache/lucene/analysis/
lucene/core/src/test/org/apach...
Author: mikemccand
Date: Sun Feb 12 14:22:02 2012
New Revision: 1243257
URL: http://svn.apache.org/viewvc?rev=1243257&view=rev
Log:
LUCENE-3767: also backtrace when paths converge on single backpointer; output position length for the compound tokens
Modified:
lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java
lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/Token.java Sun Feb 12 14:22:02 2012
@@ -128,6 +128,7 @@ public class Token extends CharTermAttri
private int flags;
private Payload payload;
private int positionIncrement = 1;
+ private int positionLength = 1;
/** Constructs a Token will null text. */
public Token() {
@@ -270,6 +271,20 @@ public class Token extends CharTermAttri
return positionIncrement;
}
+ /** @param positionLength how many positions this token
+ * spans. NOTE: this is optional, and most analyzers
+ * don't change the default value (1). */
+ public void setPositionLength(int positionLength) {
+ this.positionLength = positionLength;
+ }
+
+ /** Returns the position length of this Token.
+ * @see #setPositionLength
+ */
+ public int getPositionLength() {
+ return positionLength;
+ }
+
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
@@ -360,6 +375,7 @@ public class Token extends CharTermAttri
super.clear();
payload = null;
positionIncrement = 1;
+ positionLength = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
@@ -383,6 +399,7 @@ public class Token extends CharTermAttri
public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) {
final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset);
t.positionIncrement = positionIncrement;
+ t.positionLength = positionLength;
t.flags = flags;
t.type = type;
if (payload != null)
@@ -401,6 +418,7 @@ public class Token extends CharTermAttri
endOffset == other.endOffset &&
flags == other.flags &&
positionIncrement == other.positionIncrement &&
+ positionLength == other.positionLength &&
(type == null ? other.type == null : type.equals(other.type)) &&
(payload == null ? other.payload == null : payload.equals(other.payload)) &&
super.equals(obj)
@@ -416,6 +434,7 @@ public class Token extends CharTermAttri
code = code * 31 + endOffset;
code = code * 31 + flags;
code = code * 31 + positionIncrement;
+ code = code * 31 + positionLength;
if (type != null)
code = code * 31 + type.hashCode();
if (payload != null)
@@ -427,6 +446,7 @@ public class Token extends CharTermAttri
private void clearNoTermBuffer() {
payload = null;
positionIncrement = 1;
+ positionLength = 1;
flags = 0;
startOffset = endOffset = 0;
type = DEFAULT_TYPE;
@@ -443,6 +463,7 @@ public class Token extends CharTermAttri
copyBuffer(newTermBuffer, newTermOffset, newTermLength);
payload = null;
positionIncrement = 1;
+ positionLength = 1;
startOffset = newStartOffset;
endOffset = newEndOffset;
type = newType;
@@ -531,6 +552,7 @@ public class Token extends CharTermAttri
public void reinit(Token prototype) {
copyBuffer(prototype.buffer(), 0, prototype.length());
positionIncrement = prototype.positionIncrement;
+ positionLength = prototype.positionLength;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
@@ -546,6 +568,7 @@ public class Token extends CharTermAttri
public void reinit(Token prototype, String newTerm) {
setEmpty().append(newTerm);
positionIncrement = prototype.positionIncrement;
+ positionLength = prototype.positionLength;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
@@ -563,6 +586,7 @@ public class Token extends CharTermAttri
public void reinit(Token prototype, char[] newTermBuffer, int offset, int length) {
copyBuffer(newTermBuffer, offset, length);
positionIncrement = prototype.positionIncrement;
+ positionLength = prototype.positionLength;
flags = prototype.flags;
startOffset = prototype.startOffset;
endOffset = prototype.endOffset;
@@ -583,6 +607,7 @@ public class Token extends CharTermAttri
super.copyTo(target);
((OffsetAttribute) target).setOffset(startOffset, endOffset);
((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
+ ((PositionIncrementAttribute) target).setPositionLength(positionLength);
((PayloadAttribute) target).setPayload((payload == null) ? null : (Payload) payload.clone());
((FlagsAttribute) target).setFlags(flags);
((TypeAttribute) target).setType(type);
@@ -595,6 +620,7 @@ public class Token extends CharTermAttri
reflector.reflect(OffsetAttribute.class, "startOffset", startOffset);
reflector.reflect(OffsetAttribute.class, "endOffset", endOffset);
reflector.reflect(PositionIncrementAttribute.class, "positionIncrement", positionIncrement);
+ reflector.reflect(PositionIncrementAttribute.class, "positionLength", positionLength);
reflector.reflect(PayloadAttribute.class, "payload", payload);
reflector.reflect(FlagsAttribute.class, "flags", flags);
reflector.reflect(TypeAttribute.class, "type", type);
Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java Sun Feb 12 14:22:02 2012
@@ -56,4 +56,15 @@ public interface PositionIncrementAttrib
* @see #setPositionIncrement
*/
public int getPositionIncrement();
+
+ // nocommit better names...? getLength? getSpan?
+ /** @param positionLength how many positions this token
+ * spans. NOTE: this is optional, and most analyzers
+ * don't change the default value (1). */
+ public void setPositionLength(int positionLength);
+
+ /** Returns the position length of this Token.
+ * @see #setPositionLength
+ */
+ public int getPositionLength();
}
Modified: lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java Sun Feb 12 14:22:02 2012
@@ -46,6 +46,7 @@ import org.apache.lucene.util.AttributeI
*/
public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable {
private int positionIncrement = 1;
+ private int positionLength = 1;
/** Set the position increment. The default value is one.
*
@@ -54,7 +55,7 @@ public class PositionIncrementAttributeI
public void setPositionIncrement(int positionIncrement) {
if (positionIncrement < 0)
throw new IllegalArgumentException
- ("Increment must be zero or greater: " + positionIncrement);
+ ("Increment must be zero or greater: got " + positionIncrement);
this.positionIncrement = positionIncrement;
}
@@ -65,9 +66,27 @@ public class PositionIncrementAttributeI
return positionIncrement;
}
+ /** @param positionLength how many positions this token
+ * spans. NOTE: this is optional, and most analyzers
+ * don't change the default value (1). */
+ public void setPositionLength(int positionLength) {
+ if (positionLength < 1)
+ throw new IllegalArgumentException
+ ("Position length must be 1 or greater: got " + positionLength);
+ this.positionLength = positionLength;
+ }
+
+ /** Returns the position length of this Token.
+ * @see #setPositionLength
+ */
+ public int getPositionLength() {
+ return positionLength;
+ }
+
@Override
public void clear() {
this.positionIncrement = 1;
+ this.positionLength = 1;
}
@Override
@@ -77,7 +96,9 @@ public class PositionIncrementAttributeI
}
if (other instanceof PositionIncrementAttributeImpl) {
- return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement;
+ PositionIncrementAttributeImpl _other = (PositionIncrementAttributeImpl) other;
+ return positionIncrement == _other.positionIncrement &&
+ positionLength == _other.positionLength;
}
return false;
@@ -85,13 +106,14 @@ public class PositionIncrementAttributeI
@Override
public int hashCode() {
- return positionIncrement;
+ return positionIncrement + positionLength*3947;
}
@Override
public void copyTo(AttributeImpl target) {
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
t.setPositionIncrement(positionIncrement);
+ t.setPositionLength(positionLength);
}
}
Modified: lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/TestToken.java Sun Feb 12 14:22:02 2012
@@ -253,6 +253,7 @@ public class TestToken extends LuceneTes
put(OffsetAttribute.class.getName() + "#startOffset", 6);
put(OffsetAttribute.class.getName() + "#endOffset", 22);
put(PositionIncrementAttribute.class.getName() + "#positionIncrement", 1);
+ put(PositionIncrementAttribute.class.getName() + "#positionLength", 1);
put(PayloadAttribute.class.getName() + "#payload", null);
put(TypeAttribute.class.getName() + "#type", TypeAttribute.DEFAULT_TYPE);
put(FlagsAttribute.class.getName() + "#flags", 8);
Modified: lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3767/lucene/core/src/test/org/apache/lucene/analysis/tokenattributes/TestSimpleAttributeImpl.java Sun Feb 12 14:22:02 2012
@@ -27,8 +27,10 @@ public class TestSimpleAttributeImpl ext
// this checks using reflection API if the defaults are correct
public void testAttributes() {
- _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(),
- Collections.singletonMap(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1));
+ _TestUtil.assertAttributeReflection(new PositionIncrementAttributeImpl(), new HashMap<String,Object>() {{
+ put(PositionIncrementAttribute.class.getName()+"#positionIncrement", 1);
+ put(PositionIncrementAttribute.class.getName()+"#positionLength", 1);
+ }});
_TestUtil.assertAttributeReflection(new FlagsAttributeImpl(),
Collections.singletonMap(FlagsAttribute.class.getName()+"#flags", 0));
_TestUtil.assertAttributeReflection(new TypeAttributeImpl(),
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/Perf.java Sun Feb 12 14:22:02 2012
@@ -9,9 +9,9 @@ import org.apache.lucene.analysis.kuromo
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
-// javac -cp ../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf.java
+// javac -cp ../build/kuromoji/classes/java:../../../lucene/build/core/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf.java
-// java -cp .:../build/kuromoji/classes/java:../../../lucene/build/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf
+// java -cp .:../build/kuromoji/classes/java:../../../lucene/build/core/classes/java:../../analysis/build/common/lucene-analyzers-common-4.0-SNAPSHOT.jar Perf
public class Perf {
private final static Analyzer analyzer = new Analyzer() {
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Sun Feb 12 14:22:02 2012
@@ -56,6 +56,7 @@ public final class KuromojiTokenizer ext
@Override
protected void setNextSentence(int sentenceStart, int sentenceEnd) {
this.sentenceStart = sentenceStart;
+ //System.out.println("\nNEXT SENTENCE: " + sentenceStart + " to " + sentenceEnd + ": " + new String(buffer, sentenceStart, sentenceEnd-sentenceStart));
// TODO: maybe don't pass 0 here, so kuromoji tracks offsets for us?
tokens = segmenter.doTokenize(0, buffer, sentenceStart, sentenceEnd-sentenceStart, true);
tokenIndex = 0;
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer2.java Sun Feb 12 14:22:02 2012
@@ -53,6 +53,9 @@ import org.apache.lucene.util.fst.FST;
// nocommit add toDot and look at 1st pass intersection
+// nocommit need better testing of compound output... need
+// assertAnalyzesTo to accept posLength too
+
// nocommit add comment explaining this isn't quite a real viterbi
// nocomit explain how the 2nd best tokenization is
@@ -64,6 +67,10 @@ import org.apache.lucene.util.fst.FST;
// depending on whether ipadic was "trained" with sentence
// breaks?
+// nocommit -- should we use the sentence breakiterator
+// too..? we can simply use it to slip an EOS/BOS token
+// in...
+
/* Uses a rolling Viterbi search to find the least cost
* segmentation (path) of the incoming characters.
*
@@ -116,6 +123,7 @@ public final class KuromojiTokenizer2 ex
private final boolean discardPunctuation;
private final boolean searchMode;
private final boolean extendedMode;
+ private int sameLeastIndex;
private int lastBackTracePos;
private int lastTokenPos;
@@ -325,7 +333,7 @@ public final class KuromojiTokenizer2 ex
}
if (VERBOSE) {
- System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " wordCat=" + leftID + " tok=" + new String(buffer.get(posData.pos, endPos-posData.pos)));
+ System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " tok=" + new String(buffer.get(posData.pos, endPos-posData.pos)) + " leastIDX=" + leastIDX + " toPos.idx=" + positions.get(endPos).count);
}
// nocommit ideally we don't have to do this ... just
@@ -342,6 +350,13 @@ public final class KuromojiTokenizer2 ex
}
positions.get(endPos).add(leastCost, dict.getRightId(wordID), posData.pos, leastIDX, wordID, type);
+ /*
+ if (sameLeastIndex == -1) {
+ sameLeastIndex = leastIDX;
+ } else if (sameLeastIndex != leastIDX) {
+ sameLeastIndex = Integer.MAX_VALUE;
+ }
+ */
}
@Override
@@ -373,9 +388,11 @@ public final class KuromojiTokenizer2 ex
inflectionAtt.setToken(token);
if (token.getPosition() == lastTokenPos) {
posIncAtt.setPositionIncrement(0);
+ posIncAtt.setPositionLength(token.getPositionLength());
} else {
assert token.getPosition() > lastTokenPos;
posIncAtt.setPositionIncrement(1);
+ posIncAtt.setPositionLength(1);
}
if (VERBOSE) {
System.out.println(" incToken: return token=" + token);
@@ -549,6 +566,7 @@ public final class KuromojiTokenizer2 ex
//System.out.println("count=" + count + " vs len=" + positions.length);
if (count == positions.length) {
Position[] newPositions = new Position[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ //System.out.println("grow positions " + newPositions.length);
System.arraycopy(positions, nextWrite, newPositions, 0, positions.length-nextWrite);
System.arraycopy(positions, 0, newPositions, positions.length-nextWrite, nextWrite);
for(int i=positions.length;i<newPositions.length;i++) {
@@ -629,8 +647,11 @@ public final class KuromojiTokenizer2 ex
}
final Position posData = positions.get(pos);
+ final boolean isFrontier = positions.getNextPos() == pos+1;
- if (pos != lastBackTracePos && posData.count == 1 && positions.getNextPos() == pos+1) {
+ // nocommit if I change 40 below it changes number of
+ // tokens in TestQuality!
+ if ((pos - lastBackTracePos >= 100) && posData.count == 1 && isFrontier) {
// We are at a "frontier", and only one node is
// alive, so whatever the eventual best path is must
// come through this node. So we can safely commit
@@ -650,9 +671,11 @@ public final class KuromojiTokenizer2 ex
}
if (VERBOSE) {
- System.out.println("\n extend @ pos=" + pos);
+ System.out.println("\n extend @ pos=" + pos + " char=" + (char) buffer.get(pos));
}
+ sameLeastIndex = -1;
+
if (posData.count == 0) {
// No arcs arrive here; move to next position:
pos++;
@@ -783,6 +806,28 @@ public final class KuromojiTokenizer2 ex
//unknownWordEndIndex = posData.pos + unknownWordLength;
}
+ // nocommit explainme:
+ if (false && (pos - lastBackTracePos >= 100) && sameLeastIndex != -1 && sameLeastIndex != Integer.MAX_VALUE &&
+ pos != lastBackTracePos && isFrontier) {
+
+ final int sav = sameLeastIndex;
+
+ //System.out.println("**SAME: " + sameLeastIndex);
+ backtrace(posData, sameLeastIndex);
+
+ // Re-base cost so we don't risk int overflow:
+ // nocommit: this does nothing: arcs were already extended
+ posData.costs[sav] = 0;
+
+ if (pending.size() != 0) {
+ return;
+ } else {
+ // nocommit: make punctuation-only testcase
+ // This means the backtrace only produced
+ // punctuation tokens, so we must keep parsing.
+ }
+ }
+
pos++;
}
@@ -793,9 +838,13 @@ public final class KuromojiTokenizer2 ex
final Position endPosData = positions.get(pos);
int leastCost = Integer.MAX_VALUE;
int leastIDX = 0;
+ if (VERBOSE) {
+ System.out.println(" end: " + endPosData.count + " nodes");
+ }
for(int idx=0;idx<endPosData.count;idx++) {
// Add EOS cost:
final int cost = endPosData.costs[idx] + costs.get(endPosData.nodeID[idx], 0);
+ //System.out.println(" idx=" + idx + " cost=" + cost);
if (cost < leastCost) {
leastCost = cost;
leastIDX = idx;
@@ -939,7 +988,9 @@ public final class KuromojiTokenizer2 ex
// We trace backwards, so this will be the leftWordID of
// the token after the one we are now on:
int lastLeftWordID = -1;
-
+
+ int backCount = 0;
+
// nocommit: don't use intermediate Token instance
// here... change this to just hold raw back trace info,
// then in incrementToken we pull the necessary char[],
@@ -966,12 +1017,16 @@ public final class KuromojiTokenizer2 ex
// that it means something very different ("output
// smaller segmentation"):
final int penalty = computePenalty(backPos, pos-backPos);
-
+
if (penalty > 0) {
if (VERBOSE) {
System.out.println(" compound=" + new String(buffer.get(backPos, pos-backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty);
}
+ // Use the penalty to set maxCost on the 2nd best
+ // segmentation:
+ final int maxCost = posData.costs[bestIDX] + penalty;
+
final int backID = posData.backID[bestIDX];
// Now, prune all too-long tokens from the graph:
@@ -1001,7 +1056,7 @@ public final class KuromojiTokenizer2 ex
// nocommit -- must also check whether 2nd best score falls w/in threshold?
- if (leastIDX != -1) {
+ if (leastIDX != -1 && posData.backPos[leastIDX] <= maxCost) {
// We should have pruned the altToken from the graph:
assert posData.backPos[leastIDX] != backPos;
@@ -1021,6 +1076,7 @@ public final class KuromojiTokenizer2 ex
backPos = posData.backPos[bestIDX];
length = pos - backPos;
backType = posData.backType[bestIDX];
+ backCount = 0;
} else {
// I think in theory it's possible there is no
@@ -1031,9 +1087,8 @@ public final class KuromojiTokenizer2 ex
}
final int offset = backPos - lastBackTracePos;
+ assert offset >= 0;
- // nocommit -- how come TestQuality doesn't change if
- // i output the altToken!
// nocommit
if (altToken != null && altToken.getPosition() >= backPos) {
@@ -1049,7 +1104,9 @@ public final class KuromojiTokenizer2 ex
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
- // nocommit
+ assert backCount >= 1;
+ backCount++;
+ altToken.setPositionLength(backCount);
pending.add(altToken);
altToken = null;
}
@@ -1082,6 +1139,7 @@ public final class KuromojiTokenizer2 ex
Collections.reverse(pending.subList(pending.size() - (wordIDAndLength.length - 1),
pending.size()));
+ backCount += wordIDAndLength.length-1;
} else {
if (extendedMode && posData.backType[bestIDX] == Type.UNKNOWN) {
@@ -1105,6 +1163,7 @@ public final class KuromojiTokenizer2 ex
unkDictionary));
unigramTokenCount++;
}
+ backCount += unigramTokenCount;
} else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
//System.out.println("backPos=" + backPos);
@@ -1118,6 +1177,7 @@ public final class KuromojiTokenizer2 ex
if (VERBOSE) {
System.out.println(" add token=" + pending.get(pending.size()-1));
}
+ backCount++;
} else {
if (VERBOSE) {
System.out.println(" skip punctuation token=" + new String(fragment, offset, length));
@@ -1125,7 +1185,6 @@ public final class KuromojiTokenizer2 ex
}
}
- // nocommit -- accuracy drops a bit if we do this.... weird
lastLeftWordID = dict.getLeftId(posData.backID[bestIDX]);
pos = backPos;
bestIDX = posData.backIndex[bestIDX];
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Sun Feb 12 14:22:02 2012
@@ -30,6 +30,7 @@ public class Token {
private final int length;
private final int position;
+ private int positionLength;
private final Type type;
@@ -40,6 +41,7 @@ public class Token {
this.length = length;
this.type = type;
this.position = position;
+ this.positionLength = positionLength;
this.dictionary = dictionary;
}
@@ -149,4 +151,21 @@ public class Token {
public int getPosition() {
return position;
}
+
+ /**
+ * Set the position length (in tokens) of this token. For normal
+ * tokens this is 1; for compound tokens it's > 1.
+ */
+ public void setPositionLength(int positionLength) {
+ this.positionLength = positionLength;
+ }
+
+ /**
+ * Get the length (in tokens) of this token. For normal
+ * tokens this is 1; for compound tokens it's > 1.
+ * @return position length of token
+ */
+ public int getPositionLength() {
+ return positionLength;
+ }
}
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Sun Feb 12 14:22:02 2012
@@ -118,12 +118,15 @@ public class Viterbi {
if (startIndexArr[i] == null || endIndexArr[i] == null){ // continue since no array which contains ViterbiNodes exists. Or no previous node exists.
continue;
}
+ //System.out.println("\npos=" + (i-1));
// For each arc leaving...
for (ViterbiNode node : startIndexArr[i]) {
if (node == null){ // If array doesn't contain ViterbiNode any more, continue to next index
break;
}
+
+ //System.out.println(" leaving node.wordID=" + node.getWordId() + " leftID=" + node.getLeftId());
int backwardConnectionId = node.getLeftId();
int wordCost = node.getWordCost();
@@ -134,8 +137,10 @@ public class Viterbi {
break;
}
+ //System.out.println(" arriving node.wordID=" + leftNode.getWordId() + " rightID=" + leftNode.getRightId());
int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
-
+
+ //System.out.println(" pathCost=" + pathCost);
// "Search mode". Add extra costs if it is long node.
if (searchMode) {
// System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
@@ -161,8 +166,10 @@ public class Viterbi {
}
}
}
+ //System.out.println(" after penalty pathCost=" + pathCost);
if (pathCost < leastPathCost){ // If total cost is lower than before, set current previous node as best left node (previous means left).
+ //System.out.println(" **");
leastPathCost = pathCost;
node.setPathCost(leastPathCost);
node.setLeftNode(leftNode);
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiAnalyzer.java Sun Feb 12 14:22:02 2012
@@ -52,10 +52,14 @@ public class TestKuromojiAnalyzer extend
public void testDecomposition() throws IOException {
// nocommit
+ /*
TokenStream ts = new KuromojiAnalyzer(TEST_VERSION_CURRENT)
.tokenStream("foo",
- new StringReader("ãã¤ã±ã«ã¸ã£ã¯ã½ã³ ãã¤ã±ã« ã¸ã£ã¯ã½ã³"));
+ //new StringReader("楽å¨ã»æ©æã®è³ªã¯å®å
¨ã«ããä»æ§ï¼ã§ããå¤æ®µã¯ä½å¦ãããå®ãï¼"));
+ //new StringReader("ã§ããå¤æ®µã¯ä½å¦ãããå®ãï¼"));
+ new StringReader("æ¨å·¥è¸ã§ã¯ãæ¨ãã硬æ¨ãè»æ¨ã¨ãåæ¨ã«åé¡ãã¾ãã"));
while(ts.incrementToken());
+ */
// Senior software engineer:
if (KuromojiTokenizer2.DO_OUTPUT_COMPOUND) {
@@ -95,7 +99,6 @@ public class TestKuromojiAnalyzer extend
// Kyoto University Baseball Club
// nocommit --segments differently but perhaps OK
- /*
assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "京é½å¤§å¦ç¡¬å¼éçé¨",
new String[] { "京é½",
"京é½å¤§å¦ç¡¬å¼éçé¨",
@@ -104,7 +107,6 @@ public class TestKuromojiAnalyzer extend
"éç",
"é¨" },
new int[] {1, 0, 1, 1, 1, 1});
- */
} else {
assertAnalyzesTo(new KuromojiAnalyzer(TEST_VERSION_CURRENT), "ã·ãã¢ã½ããã¦ã§ã¢ã¨ã³ã¸ãã¢",
new String[] { "ã·ãã¢",
Modified: lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java?rev=1243257&r1=1243256&r2=1243257&view=diff
==============================================================================
--- lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java (original)
+++ lucene/dev/branches/lucene3767/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestQuality.java Sun Feb 12 14:22:02 2012
@@ -32,6 +32,7 @@ import java.util.zip.ZipFile;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.LuceneTest
// just compares segmentation to some sentences pre-tokenized by mecab
public class TestQuality extends LuceneTestCase {
- public void test() throws Exception {
+ public void testSingleText() throws Exception {
File datafile = getDataFile("tanakaseg.zip");
ZipFile zip = new ZipFile(datafile);
InputStream is = zip.getInputStream(zip.getEntry("sentences.txt"));
@@ -47,6 +48,9 @@ public class TestQuality extends LuceneT
InputStream is2 = zip.getInputStream(zip.getEntry("segmented.txt"));
BufferedReader seg = new BufferedReader(new InputStreamReader(is2, IOUtils.CHARSET_UTF_8));
Stats stats = new Stats();
+
+ final boolean ONE_TIME = true;
+
/**
#words: 1578506
#chars: 4519246
@@ -56,19 +60,84 @@ public class TestQuality extends LuceneT
word agreement?: 0.999587584716181
*/
//final Tokenizer tokenizer = new KuromojiTokenizer(null, null, true, Mode.SEARCH);
+
+ StringBuilder sb = new StringBuilder();
+
+ String line1 = null;
+ String line2 = null;
+ int maxLen = 0;
+ int count = 0;
+ while ((line1 = unseg.readLine()) != null) {
+ seg.readLine();
+ // nocommit also try removing the "easy" period at the
+ // end of each sentence...
+ maxLen = Math.max(line1.length(), maxLen);
+ sb.append(line1);
+ if (ONE_TIME && count++ == 100) {
+ // nocommit;
+ break;
+ }
+ }
+ System.out.println("maxLen=" + maxLen);
+
+ //final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH);
+ final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+ final String all = sb.toString();
+ final int ITERS = 20;
+ CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+ for(int iter=0;iter<ITERS;iter++) {
+ tokenizer.reset(new StringReader(all));
+ count = 0;
+ long t0 = System.currentTimeMillis();
+ while(tokenizer.incrementToken()) {
+ if (false && ONE_TIME) {
+ System.out.println(count + ": " + termAtt.toString());
+ }
+ count++;
+ }
+ long t1 = System.currentTimeMillis();
+ System.out.println(all.length()/(t1-t0) + " bytes/msec; count=" + count);
+
+ if (ONE_TIME) {
+ break;
+ }
+ }
+ }
+
+ public void test() throws Exception {
+ File datafile = getDataFile("tanakaseg.zip");
+ ZipFile zip = new ZipFile(datafile);
+ InputStream is = zip.getInputStream(zip.getEntry("sentences.txt"));
+ BufferedReader unseg = new BufferedReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
+ InputStream is2 = zip.getInputStream(zip.getEntry("segmented.txt"));
+ BufferedReader seg = new BufferedReader(new InputStreamReader(is2, IOUtils.CHARSET_UTF_8));
+ Stats stats = new Stats();
+ /**
+ #words: 1578506
+ #chars: 4519246
+ #edits: 651
+ #sentences: 150122
+ sentence agreement?: 0.998161495317142
+ word agreement?: 0.999587584716181
+ */
+
+ //final Tokenizer tokenizer = new KuromojiTokenizer(new StringReader(""));
+ //final Tokenizer tokenizer = new KuromojiTokenizer(new Segmenter(Mode.NORMAL), new StringReader(""));
final Tokenizer tokenizer = new KuromojiTokenizer2(new StringReader(""), null, true, Mode.SEARCH);
String line1 = null;
String line2 = null;
+ int count = 0;
while ((line1 = unseg.readLine()) != null) {
line2 = seg.readLine();
- evaluateLine(line1, line2, tokenizer, stats);
+ evaluateLine(count++, line1, line2, tokenizer, stats);
}
System.out.println("#words: " + stats.numWords);
System.out.println("#chars: " + stats.numChars);
System.out.println("#edits: " + stats.numEdits);
System.out.println("#sentences: " + stats.numSentences);
+ System.out.println("#tokens: " + stats.numTokens);
System.out.println("sentence agreement?: " + (stats.numSentencesCorrect/(double)stats.numSentences));
System.out.println("word agreement?: " + (1D - (stats.numEdits / (double)stats.numWords)));
unseg.close();
@@ -82,38 +151,126 @@ public class TestQuality extends LuceneT
long numChars = 0;
long numSentences = 0;
long numSentencesCorrect = 0;
+ long numTokens = 0;
+ }
+
+ static class Path {
+ public final List<String> tokens;
+ public int pos;
+
+ public Path() {
+ tokens = new ArrayList<String>();
+ }
+
+ public void add(String token, int posLen) {
+ tokens.add(token);
+ pos += posLen;
+ }
}
- public static void evaluateLine(String unseg, String seg, Tokenizer tokenizer, Stats stats) throws Exception {
- //System.out.println("\nTEST: " + unseg);
- List<String> tokens = new ArrayList<String>();
+ public static void evaluateLine(int lineCount, String unseg, String seg, Tokenizer tokenizer, Stats stats) throws Exception {
+ if (VERBOSE) {
+ System.out.println("\nTEST " + lineCount + ": input " + unseg);
+ }
tokenizer.reset(new StringReader(unseg));
CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
+ PositionIncrementAttribute posIncAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
+ List<Path> paths = new ArrayList<Path>();
+ paths.add(new Path());
+
+ int pos = -1;
+ int numTokens = 0;
while(tokenizer.incrementToken()) {
- tokens.add(termAtt.toString());
+ final int posInc = posIncAtt.getPositionIncrement();
+ final int posLength = posIncAtt.getPositionLength();
+ final String token = termAtt.toString();
+
+ //System.out.println(" tok=" + token + " numPaths=" + paths.size() + " posLen=" + posLength);
+
+ pos += posInc;
+ numTokens++;
+
+ if (VERBOSE) {
+ if (posIncAtt.getPositionIncrement() == 0) {
+ System.out.println(" fork @ token=" + token + " posLength=" + posLength);
+ }
+ }
+
+ assert pos >= 0;
+ final int numPaths = paths.size();
+ for(int pathID=0;pathID<numPaths;pathID++) {
+ final Path path = paths.get(pathID);
+ if (pos == path.pos) {
+ path.add(token, posLength);
+ } else if (pos == path.pos-1 && posInc == 0) {
+
+ // NOTE: this is horribly, horribly inefficient in
+ // general!! Much better to do graph-to-graph
+ // alignment to get min edits:
+
+ // nocommit this isn't fully general, ie, it
+ // assumes the tokenizer ALWAYS outputs the
+ // posLen=1 token first, at a given position
+ // Fork!
+ assert path.tokens.size() > 0;
+ Path newPath = new Path();
+ newPath.tokens.addAll(path.tokens);
+ newPath.tokens.remove(newPath.tokens.size()-1);
+ newPath.pos = pos;
+ newPath.add(token, posLength);
+ paths.add(newPath);
+
+ } else {
+ assert pos < path.pos: "pos=" + pos + " path.pos=" + path.pos + " pathID=" + pathID;
+ }
+ }
}
-
+
+ if (VERBOSE) {
+ System.out.println(" " + paths.size() + " paths");
+ }
+
List<String> expectedTokens = Arrays.asList(seg.split("\\s+"));
- tokens = normalize(tokens);
expectedTokens = normalize(expectedTokens);
+
+ int minEdits = Integer.MAX_VALUE;
+ List<String> minPath = null;
+ for(Path path : paths) {
+ List<String> tokens = normalize(path.tokens);
+ if (VERBOSE) {
+ System.out.println(" path: " + path.tokens);
+ }
- HashMap<String,Character> transformation = new HashMap<String,Character>();
- CharRef charRef = new CharRef();
-
- String s1 = transform(tokens, transformation, charRef);
- String s2 = transform(expectedTokens, transformation, charRef);
+ HashMap<String,Character> transformation = new HashMap<String,Character>();
+ CharRef charRef = new CharRef();
+ String s1 = transform(tokens, transformation, charRef);
+ String s2 = transform(expectedTokens, transformation, charRef);
- int edits = getDistance(s2, s1);
- if (edits > 0) {
- System.out.println(edits + " edits; unseg: " + unseg);
- System.out.println(tokens + " vs " + expectedTokens);
+ int edits = getDistance(s2, s1);
+ if (edits < minEdits) {
+ minEdits = edits;
+ minPath = tokens;
+ if (VERBOSE) {
+ System.out.println(" ** edits=" + edits);
+ }
+ }
+ }
+ assert minPath != null;
+ if (minEdits > 0) {
+ if (!VERBOSE) {
+ System.out.println("\nTEST " + lineCount + ": input " + unseg + "; " + minEdits + " edits");
+ }
+ System.out.println(" expected: " + expectedTokens);
+ System.out.println(" actual: " + minPath);
}
stats.numChars += seg.length();
- stats.numEdits += edits;
+ stats.numEdits += minEdits;
stats.numWords += expectedTokens.size();
+ stats.numTokens += numTokens;
stats.numSentences++;
- if (edits == 0)
+ if (minEdits == 0) {
stats.numSentencesCorrect++;
+ }
}
static class CharRef {