You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cm...@apache.org on 2012/03/22 12:41:55 UTC
svn commit: r1303739 - in /lucene/dev/trunk/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
Author: cm
Date: Thu Mar 22 11:41:54 2012
New Revision: 1303739
URL: http://svn.apache.org/viewvc?rev=1303739&view=rev
Log:
Fix for LUCENE-3897 (KuromojiTokenizer fails with large docs)
Modified:
lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1303739&r1=1303738&r2=1303739&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Thu Mar 22 11:41:54 2012
@@ -588,27 +588,71 @@ public final class KuromojiTokenizer ext
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
// Safety: if we've buffered too much, force a
- // backtrace now:
+ // backtrace now. We find the least-cost partial
+ // path, across all paths, backtrace from it, and
+ // then prune all others. Note that this, in
+ // general, can produce the wrong result, if the
+ // total best path did not in fact back trace
+ // through this partial best path. But it's the
+ // best we can do... (short of not having a
+ // safety!).
+
+ // First pass: find least cost partial path so far,
+ // including ending at future positions:
int leastIDX = -1;
int leastCost = Integer.MAX_VALUE;
- for(int idx=0;idx<posData.count;idx++) {
- //System.out.println(" idx=" + idx + " cost=" + cost);
- final int cost = posData.costs[idx];
- if (cost < leastCost) {
- leastCost = cost;
- leastIDX = idx;
+ Position leastPosData = null;
+ for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+ final Position posData2 = positions.get(pos2);
+ for(int idx=0;idx<posData2.count;idx++) {
+ //System.out.println(" idx=" + idx + " cost=" + cost);
+ final int cost = posData.costs[idx];
+ if (cost < leastCost) {
+ leastCost = cost;
+ leastIDX = idx;
+ leastPosData = posData2;
+ }
}
}
- backtrace(posData, leastIDX);
+
+ // We will always have at least one live path:
+ assert leastIDX != -1;
+
+ // Second pass: prune all but the best path:
+ for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+ final Position posData2 = positions.get(pos2);
+ if (posData2 != leastPosData) {
+ posData2.reset();
+ } else {
+ if (leastIDX != 0) {
+ posData2.costs[0] = posData2.costs[leastIDX];
+ posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
+ posData2.backPos[0] = posData2.backPos[leastIDX];
+ posData2.backIndex[0] = posData2.backIndex[leastIDX];
+ posData2.backID[0] = posData2.backID[leastIDX];
+ posData2.backType[0] = posData2.backType[leastIDX];
+ }
+ posData2.count = 1;
+ }
+ }
+
+ backtrace(leastPosData, 0);
// Re-base cost so we don't risk int overflow:
- Arrays.fill(posData.costs, 0, posData.count, 0);
+ Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
if (pending.size() != 0) {
return;
} else {
// This means the backtrace only produced
// punctuation tokens, so we must keep parsing.
+ if (pos != leastPosData.pos) {
+ // We jumped into a future position; continue to
+ // the top of the loop to skip until we get
+ // there:
+ assert pos < leastPosData.pos;
+ continue;
+ }
}
}
Modified: lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java?rev=1303739&r1=1303738&r2=1303739&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java (original)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TestKuromojiTokenizer.java Thu Mar 22 11:41:54 2012
@@ -192,7 +192,6 @@ public class TestKuromojiTokenizer exten
}
/** blast some random large strings through the analyzer */
- @Ignore("FIXME: see LUCENE-3897")
public void testRandomHugeStrings() throws Exception {
checkRandomData(random, analyzer, 200*RANDOM_MULTIPLIER, 8192);
checkRandomData(random, analyzerNoPunct, 200*RANDOM_MULTIPLIER, 8192);