You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2013/01/13 16:49:36 UTC

svn commit: r1432646 - in /lucene/dev/trunk/lucene: analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/ core/src/java/org/apache/lucene/util/fst/ core/src/test/org/apache/lucene/util/fst/

Author: mikemccand
Date: Sun Jan 13 15:49:35 2013
New Revision: 1432646

URL: http://svn.apache.org/viewvc?rev=1432646&view=rev
Log:
LUCENE-4678: add BytesStore.truncate and use it to not double-buffer during pack

Modified:
    lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java

Modified: lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary$fst.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/kuromoji/src/resources/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary%24fst.dat?rev=1432646&r1=1432645&r2=1432646&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java?rev=1432646&r1=1432645&r2=1432646&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java Sun Jan 13 15:49:35 2013
@@ -69,6 +69,14 @@ class BytesStore extends DataOutput {
     nextWrite = blocks.get(blocks.size()-1).length;
   }
 
+  /** Absolute write byte; you must ensure dest is < max
+   *  position written so far. */
+  public void writeByte(int dest, byte b) {
+    int blockIndex = dest >> blockBits;
+    byte[] block = blocks.get(blockIndex);
+    block[dest & blockMask] = b;
+  }
+
   @Override
   public void writeByte(byte b) {
     if (nextWrite == blockSize) {
@@ -237,9 +245,10 @@ class BytesStore extends DataOutput {
     }
   }
 
-  /** Reverse the last numBytes. */
+  /** Reverse from srcPos, inclusive, to destPos, inclusive. */
   public void reverse(int srcPos, int destPos) {
     assert srcPos < destPos;
+    assert destPos < getPosition();
     //System.out.println("reverse src=" + srcPos + " dest=" + destPos);
 
     int srcBlockIndex = srcPos >> blockBits;
@@ -275,7 +284,7 @@ class BytesStore extends DataOutput {
     }
   }
 
-  public void skip(int len) {
+  public void skipBytes(int len) {
     while (len > 0) {
       int chunk = blockSize - nextWrite;
       if (len <= chunk) {
@@ -294,6 +303,26 @@ class BytesStore extends DataOutput {
     return (blocks.size()-1) * blockSize + nextWrite;
   }
 
+  /** Pos must be less than the max position written so far!
+   *  Ie, you cannot "grow" the file with this! */
+  public void truncate(int newLen) {
+    assert newLen <= getPosition();
+    assert newLen >= 0;
+    int blockIndex = newLen >> blockBits;
+    nextWrite = newLen & blockMask;
+    if (nextWrite == 0) {
+      blockIndex--;
+      nextWrite = blockSize;
+    }
+    blocks.subList(blockIndex+1, blocks.size()).clear();
+    if (newLen == 0) {
+      current = null;
+    } else {
+      current = blocks.get(blockIndex);
+    }
+    assert newLen == getPosition();
+  }
+
   public void finish() {
     if (current != null) {
       byte[] lastBuffer = new byte[nextWrite];

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1432646&r1=1432645&r2=1432646&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java Sun Jan 13 15:49:35 2013
@@ -730,7 +730,7 @@ public final class FST<T> {
       int destPos = fixedArrayStart + nodeIn.numArcs*maxBytesPerArc;
       assert destPos >= srcPos;
       if (destPos > srcPos) {
-        bytes.skip(destPos - srcPos);
+        bytes.skipBytes(destPos - srcPos);
         for(int arcIdx=nodeIn.numArcs-1;arcIdx>=0;arcIdx--) {
           destPos -= maxBytesPerArc;
           srcPos -= bytesPerArc[arcIdx];
@@ -1444,9 +1444,6 @@ public final class FST<T> {
       throw new IllegalArgumentException("this FST was not built with willPackFST=true");
     }
 
-    final RAMOutputStream buffer = new RAMOutputStream();
-    byte[] bufferBytes = new byte[64];
-
     Arc<T> arc = new Arc<T>();
 
     final BytesReader r = getBytesReader(0);
@@ -1553,7 +1550,7 @@ public final class FST<T> {
         // this is an array'd node and bytesPerArc changes:
         writeNode:
         while(true) { // retry writing this node
-          assert buffer.getFilePointer() == 0;
+
           //System.out.println("  cycle: retry");
           readFirstRealTargetArc(node, arc, r);
 
@@ -1563,9 +1560,9 @@ public final class FST<T> {
             if (bytesPerArc == 0) {
               bytesPerArc = arc.bytesPerArc;
             }
-            buffer.writeByte(ARCS_AS_FIXED_ARRAY);
-            buffer.writeVInt(arc.numArcs);
-            buffer.writeVInt(bytesPerArc);
+            writer.writeByte(ARCS_AS_FIXED_ARRAY);
+            writer.writeVInt(arc.numArcs);
+            writer.writeVInt(bytesPerArc);
             //System.out.println("node " + node + ": " + arc.numArcs + " arcs");
           }
 
@@ -1574,7 +1571,7 @@ public final class FST<T> {
           while(true) {  // iterate over all arcs for this node
             //System.out.println("    cycle next arc");
 
-            final int arcStartPos = (int) buffer.getFilePointer();
+            final int arcStartPos = writer.getPosition();
             nodeArcCount++;
 
             byte flags = 0;
@@ -1621,7 +1618,7 @@ public final class FST<T> {
                 absPtr = topNodeMap.size() + (int) newNodeAddress.get(arc.target) + addressError;
               }
 
-              int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address - 2);
+              int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition() - 2);
               if (delta < 0) {
                 //System.out.println("neg: " + delta);
                 anyNegDelta = true;
@@ -1637,23 +1634,23 @@ public final class FST<T> {
             }
 
             assert flags != ARCS_AS_FIXED_ARRAY;
-            buffer.writeByte(flags);
+            writer.writeByte(flags);
 
-            fst.writeLabel(buffer, arc.label);
+            fst.writeLabel(writer, arc.label);
 
             if (arc.output != NO_OUTPUT) {
-              outputs.write(arc.output, buffer);
+              outputs.write(arc.output, writer);
               if (!retry) {
                 fst.arcWithOutputCount++;
               }
             }
             if (arc.nextFinalOutput != NO_OUTPUT) {
-              outputs.writeFinalOutput(arc.nextFinalOutput, buffer);
+              outputs.writeFinalOutput(arc.nextFinalOutput, writer);
             }
 
             if (doWriteTarget) {
 
-              int delta = (int) (newNodeAddress.get(arc.target) + addressError - buffer.getFilePointer() - address);
+              int delta = (int) (newNodeAddress.get(arc.target) + addressError - writer.getPosition());
               if (delta < 0) {
                 anyNegDelta = true;
                 //System.out.println("neg: " + delta);
@@ -1662,7 +1659,7 @@ public final class FST<T> {
 
               if (flag(flags, BIT_TARGET_DELTA)) {
                 //System.out.println("        delta");
-                buffer.writeVInt(delta);
+                writer.writeVInt(delta);
                 if (!retry) {
                   deltaCount++;
                 }
@@ -1674,7 +1671,7 @@ public final class FST<T> {
                   System.out.println("        abs");
                 }
                 */
-                buffer.writeVInt(absPtr);
+                writer.writeVInt(absPtr);
                 if (!retry) {
                   if (absPtr >= topNodeMap.size()) {
                     absCount++;
@@ -1686,7 +1683,7 @@ public final class FST<T> {
             }
 
             if (useArcArray) {
-              final int arcBytes = (int) (buffer.getFilePointer() - arcStartPos);
+              final int arcBytes = writer.getPosition() - arcStartPos;
               //System.out.println("  " + arcBytes + " bytes");
               maxBytesPerArc = Math.max(maxBytesPerArc, arcBytes);
               // NOTE: this may in fact go "backwards", if
@@ -1696,11 +1693,7 @@ public final class FST<T> {
               // will retry (below) so it's OK to ovewrite
               // bytes:
               //wasted += bytesPerArc - arcBytes;
-              int skip = (int) (arcStartPos + bytesPerArc - buffer.getFilePointer());
-              while(skip > 0) {
-                buffer.writeByte((byte) 0);
-                skip--;
-              }
+              writer.skipBytes(arcStartPos + bytesPerArc - writer.getPosition());
             }
 
             if (arc.isLast()) {
@@ -1725,19 +1718,12 @@ public final class FST<T> {
 
           // Retry:
           bytesPerArc = maxBytesPerArc;
-          buffer.reset();
+          writer.truncate(address);
           nodeArcCount = 0;
           retry = true;
           anyNegDelta = false;
         }
 
-        if (bufferBytes.length < (int) buffer.getFilePointer()) {
-          bufferBytes = ArrayUtil.grow(bufferBytes, (int) buffer.getFilePointer());
-        }
-        buffer.writeTo(bufferBytes, 0);
-        writer.writeBytes(bufferBytes, 0, (int) buffer.getFilePointer());
-        buffer.reset();
-
         negDelta |= anyNegDelta;
 
         fst.arcCount += nodeArcCount;

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java?rev=1432646&r1=1432645&r2=1432646&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestBytesStore.java Sun Jan 13 15:49:35 2013
@@ -42,7 +42,7 @@ public class TestBytesStore extends Luce
 
       int pos = 0;
       while(pos < numBytes) {
-        int op = random().nextInt(7);
+        int op = random().nextInt(8);
         if (VERBOSE) {
           System.out.println("  cycle pos=" + pos);
         }
@@ -97,21 +97,21 @@ public class TestBytesStore extends Luce
         case 3:
           {
             // reverse bytes
-            if (pos > 0) {
-              int len = _TestUtil.nextInt(random(), 1, Math.min(100, pos));
+            if (pos > 1) {
+              int len = _TestUtil.nextInt(random(), 2, Math.min(100, pos));
               int start;
               if (len == pos) {
                 start = 0;
               } else {
                 start = random().nextInt(pos - len);
               }
-              int end = start + len;
+              int end = start + len - 1;
               if (VERBOSE) {
-                System.out.println("    reverse start=" + start + " end=" + end + " len=" + len);
+                System.out.println("    reverse start=" + start + " end=" + end + " len=" + len + " pos=" + pos);
               }
               bytes.reverse(start, end);
 
-              while(start < end) {
+              while(start <= end) {
                 byte b = expected[end];
                 expected[end] = expected[start];
                 expected[start] = b;
@@ -159,17 +159,49 @@ public class TestBytesStore extends Luce
           {
             // skip
             int len = random().nextInt(Math.min(100, numBytes - pos));
-            pos += len;
-            bytes.skip(len);
+
             if (VERBOSE) {
               System.out.println("    skip len=" + len);
             }
+
+            pos += len;
+            bytes.skipBytes(len);
+
+            // NOTE: must fill in zeros in case truncate was
+            // used, else we get false fails:
+            if (len > 0) {
+              byte[] zeros = new byte[len];
+              bytes.writeBytes(pos-len, zeros, 0, len);
+            }
           }
           break;
+
+        case 7:
+          {
+            // absWriteByte
+            if (pos > 0) {
+              int dest = random().nextInt(pos);
+              byte b = (byte) random().nextInt(256);
+              expected[dest] = b;
+              bytes.writeByte(dest, b);
+            }
+            break;
+          }
         }
 
         assertEquals(pos, bytes.getPosition());
 
+        if (pos > 0 && random().nextInt(50) == 17) {
+          // truncate
+          int len = _TestUtil.nextInt(random(), 1, Math.min(pos, 100));
+          bytes.truncate(pos - len);
+          pos -= len;
+          Arrays.fill(expected, pos, pos+len, (byte) 0);
+          if (VERBOSE) {
+            System.out.println("    truncate len=" + len + " newPos=" + pos);
+          }
+        }
+
         if ((pos > 0 && random().nextInt(200) == 17)) {
           verify(bytes, expected, pos);
         }

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java?rev=1432646&r1=1432645&r2=1432646&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java Sun Jan 13 15:49:35 2013
@@ -483,11 +483,14 @@ public class TestFSTs extends LuceneTest
             break;
           }
         }
-        long t = System.currentTimeMillis() - tStart;
-        System.out.println((t / 1000.0) + " sec to build");
+
+        long tMid = System.currentTimeMillis();
+        System.out.println(((tMid-tStart) / 1000.0) + " sec to add all terms");
 
         assert builder.getTermCount() == ord;
         FST<T> fst = builder.finish();
+        long tEnd = System.currentTimeMillis();
+        System.out.println(((tEnd-tMid) / 1000.0) + " sec to finish/pack");
         if (fst == null) {
           System.out.println("FST was fully pruned!");
           System.exit(0);