You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/09/15 17:00:32 UTC
svn commit: r1703219 - in /lucene/dev/trunk/lucene: ./ core/src/java/org/apache/lucene/codecs/compressing/ core/src/java/org/apache/lucene/util/ core/src/test/org/apache/lucene/codecs/compressing/ core/src/test/org/apache/lucene/util/

Author: shalin
Date: Tue Sep 15 15:00:31 2015
New Revision: 1703219

URL: http://svn.apache.org/r1703219
Log:
LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to string's utf8 size

Added:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
      - copied, changed from r1703218, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java   (with props)
Removed:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
    lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Sep 15 15:00:31 2015
@@ -114,6 +114,10 @@ Optimizations
   GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
   McCandless)
 
+* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
+  strings larger than 64kb by an amount equal to string's utf8 size.
+  (Dawid Weiss, Robert Muir, shalin)
+
 Bug Fixes
 
 * LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Tue Sep 15 15:00:31 2015
@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BitUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.packed.PackedInts;
 
 /**
@@ -245,8 +243,6 @@ public final class CompressingStoredFiel
     numChunks++;
   }
   
-  byte scratchBytes[] = new byte[16];
-
   @Override
   public void writeField(FieldInfo info, StorableField field)
       throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFiel
       bufferedDocs.writeVInt(bytes.length);
       bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
     } else if (string != null) {
-      // this is just an optimized writeString() that re-uses scratchBytes.
-      scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
-      int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
-      bufferedDocs.writeVInt(length);
-      bufferedDocs.writeBytes(scratchBytes, length);
+      bufferedDocs.writeString(string);
     } else {
       if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
         bufferedDocs.writeZInt(number.intValue());

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Tue Sep 15 15:00:31 2015
@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.store.BufferedChecksumIndexInput;
-import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutp
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.packed.BlockPackedWriter;

Copied: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java (from r1703218, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java?p2=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java&p1=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java&r1=1703218&r2=1703219&rev=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java Tue Sep 15 15:00:31 2015
@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.codecs.compressing;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,11 @@ package org.apache.lucene.util;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * A {@link DataOutput} that can be used to build a byte[].
@@ -25,11 +29,17 @@ import org.apache.lucene.store.DataOutpu
  */
 public final class GrowableByteArrayDataOutput extends DataOutput {
 
+  /** Minimum utf8 byte size of a string over which double pass over string is to save memory during encode */
+  static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
+
   /** The bytes */
   public byte[] bytes;
   /** The length */
   public int length;
 
+  // scratch for utf8 encoding of small strings
+  byte[] scratchBytes = new byte[16];
+
   /** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
   public GrowableByteArrayDataOutput(int cp) {
     this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
@@ -52,4 +62,22 @@ public final class GrowableByteArrayData
     length = newLength;
   }
 
+  @Override
+  public void writeString(String string) throws IOException {
+    int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+    if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING)  {
+      // string is small enough that we don't need to save memory by falling back to double-pass approach
+      // this is just an optimized writeString() that re-uses scratchBytes.
+      scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
+      int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+      writeVInt(len);
+      writeBytes(scratchBytes, len);
+    } else  {
+      // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
+      int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
+      writeVInt(numBytes);
+      bytes = ArrayUtil.grow(bytes, length + numBytes);
+      length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
+    }
+  }
 }

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java Tue Sep 15 15:00:31 2015
@@ -179,11 +179,21 @@ public final class UnicodeUtil {
    *  for length characters. It is the responsibility of the
    *  caller to make sure that the destination array is large enough.
    */
-  // TODO: broken if incoming result.offset != 0
   public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
+    return UTF16toUTF8(s, offset, length, out, 0);
+  }
+
+  /** Encode characters from this String, starting at offset
+   *  for length characters. Output to the destination array
+   *  will begin at {@code outOffset}. It is the responsibility of the
+   *  caller to make sure that the destination array is large enough.
+   *  <p>
+   *  note this method returns the final output offset (outOffset + number of bytes written)
+   */
+  public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
     final int end = offset + length;
 
-    int upto = 0;
+    int upto = outOffset;
     for(int i=offset;i<end;i++) {
       final int code = (int) s.charAt(i);
 
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
     return upto;
   }
 
+  /**
+   * Calculates the number of UTF8 bytes necessary to write a UTF16 string.
+   *
+   * @return the number of bytes written
+   */
+  public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
+    final int end = offset + len;
+
+    int res = 0;
+    for (int i = offset; i < end; i++) {
+      final int code = (int) s.charAt(i);
+
+      if (code < 0x80)
+        res++;
+      else if (code < 0x800) {
+        res += 2;
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        res += 3;
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end - 1)) {
+          int utf32 = (int) s.charAt(i + 1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            i++;
+            res += 4;
+            continue;
+          }
+        }
+        res += 3;
+      }
+    }
+
+    return res;
+  }
+
   // Only called from assert
   /*
   private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {

Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java?rev=1703219&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java Tue Sep 15 15:00:31 2015
@@ -0,0 +1,80 @@
+package org.apache.lucene.codecs.compressing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.junit.Test;
+
+/**
+ * Test for {@link GrowableByteArrayDataOutput}
+ */
+public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
+
+  @Test
+  public void testWriteSmallStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    // a simple string encoding test
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      // create a small string such that the single pass approach is used
+      int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
+      String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      //explicitly write utf8 len so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+      dataOutput.writeString(unicode);
+
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+
+  @Test
+  public void testWriteLargeStrings() throws Exception {
+    int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+    int num = atLeast(1000);
+    for (int i = 0; i < num; i++) {
+      String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+      GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+      //explicitly write utf8 len so that we know how many bytes it occupies
+      dataOutput.writeVInt(len);
+      int vintLen = dataOutput.length;
+      // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+      dataOutput.writeString(unicode);
+
+      assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+      for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+        assertEquals(utf8[j], dataOutput.bytes[k]);
+      }
+    }
+  }
+}

Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java Tue Sep 15 15:00:31 2015
@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends Luc
       assertEquals(cRef.toString(), unicode);
     }
   }
+
+  public void testCalcUTF16toUTF8Length() {
+    int num = atLeast(5000);
+    for (int i = 0; i < num; i++) {
+      String unicode = TestUtil.randomUnicodeString(random());
+      byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+      int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+      assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
+    }
+  }
 }