You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/09/15 17:00:32 UTC
svn commit: r1703219 - in /lucene/dev/trunk/lucene: ./
core/src/java/org/apache/lucene/codecs/compressing/
core/src/java/org/apache/lucene/util/
core/src/test/org/apache/lucene/codecs/compressing/
core/src/test/org/apache/lucene/util/
Author: shalin
Date: Tue Sep 15 15:00:31 2015
New Revision: 1703219
URL: http://svn.apache.org/r1703219
Log:
LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to string's utf8 size
Added:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
- copied, changed from r1703218, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java (with props)
Removed:
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Sep 15 15:00:31 2015
@@ -114,6 +114,10 @@ Optimizations
GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
McCandless)
+* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
+ strings larger than 64kb by an amount equal to string's utf8 size.
+ (Dawid Weiss, Robert Muir, shalin)
+
Bug Fixes
* LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Tue Sep 15 15:00:31 2015
@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -245,8 +243,6 @@ public final class CompressingStoredFiel
numChunks++;
}
- byte scratchBytes[] = new byte[16];
-
@Override
public void writeField(FieldInfo info, StorableField field)
throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFiel
bufferedDocs.writeVInt(bytes.length);
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
} else if (string != null) {
- // this is just an optimized writeString() that re-uses scratchBytes.
- scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
- int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
- bufferedDocs.writeVInt(length);
- bufferedDocs.writeBytes(scratchBytes, length);
+ bufferedDocs.writeString(string);
} else {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bufferedDocs.writeZInt(number.intValue());
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Tue Sep 15 15:00:31 2015
@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.store.BufferedChecksumIndexInput;
-import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutp
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
Copied: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java (from r1703218, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java?p2=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java&p1=lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java&r1=1703218&r2=1703219&rev=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java Tue Sep 15 15:00:31 2015
@@ -1,4 +1,4 @@
-package org.apache.lucene.util;
+package org.apache.lucene.codecs.compressing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,11 @@ package org.apache.lucene.util;
* limitations under the License.
*/
+import java.io.IOException;
+
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.UnicodeUtil;
/**
* A {@link DataOutput} that can be used to build a byte[].
@@ -25,11 +29,17 @@ import org.apache.lucene.store.DataOutpu
*/
public final class GrowableByteArrayDataOutput extends DataOutput {
+ /** Worst-case utf8 byte size of a string (length() * MAX_UTF8_BYTES_PER_CHAR) above which a double pass over the string is used to save memory during encoding */
+ static final int MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING = 65536;
+
/** The bytes */
public byte[] bytes;
/** The length */
public int length;
+ // scratch for utf8 encoding of small strings
+ byte[] scratchBytes = new byte[16];
+
/** Create a {@link GrowableByteArrayDataOutput} with the given initial capacity. */
public GrowableByteArrayDataOutput(int cp) {
this.bytes = new byte[ArrayUtil.oversize(cp, 1)];
@@ -52,4 +62,22 @@ public final class GrowableByteArrayData
length = newLength;
}
+ @Override
+ public void writeString(String string) throws IOException {
+ int maxLen = string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR;
+ if (maxLen <= MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING) {
+ // string is small enough that we don't need to save memory by falling back to double-pass approach
+ // this is just an optimized writeString() that re-uses scratchBytes.
+ scratchBytes = ArrayUtil.grow(scratchBytes, maxLen);
+ int len = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
+ writeVInt(len);
+ writeBytes(scratchBytes, len);
+ } else {
+ // use a double pass approach to avoid allocating a large intermediate buffer for string encoding
+ int numBytes = UnicodeUtil.calcUTF16toUTF8Length(string, 0, string.length());
+ writeVInt(numBytes);
+ bytes = ArrayUtil.grow(bytes, length + numBytes);
+ length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), bytes, length);
+ }
+ }
}
Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java Tue Sep 15 15:00:31 2015
@@ -179,11 +179,21 @@ public final class UnicodeUtil {
* for length characters. It is the responsibility of the
* caller to make sure that the destination array is large enough.
*/
- // TODO: broken if incoming result.offset != 0
public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
+ return UTF16toUTF8(s, offset, length, out, 0);
+ }
+
+ /** Encode characters from this String, starting at offset
+ * for length characters. Output to the destination array
+ * will begin at {@code outOffset}. It is the responsibility of the
+ * caller to make sure that the destination array is large enough.
+ * <p>
+ * Note that this method returns the final output offset (outOffset + number of bytes written), not the number of bytes written.
+ */
+ public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
final int end = offset + length;
- int upto = 0;
+ int upto = outOffset;
for(int i=offset;i<end;i++) {
final int code = (int) s.charAt(i);
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
return upto;
}
+ /**
+ * Calculates the number of UTF8 bytes necessary to write a UTF16 string.
+ *
+ * @return the number of UTF8 bytes needed to encode the given range (nothing is written)
+ */
+ public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
+ final int end = offset + len;
+
+ int res = 0;
+ for (int i = offset; i < end; i++) {
+ final int code = (int) s.charAt(i);
+
+ if (code < 0x80)
+ res++;
+ else if (code < 0x800) {
+ res += 2;
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ res += 3;
+ } else {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && (i < end - 1)) {
+ int utf32 = (int) s.charAt(i + 1);
+ // confirm valid low surrogate and count the pair as a single 4-byte sequence
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ i++;
+ res += 4;
+ continue;
+ }
+ }
+ res += 3;
+ }
+ }
+
+ return res;
+ }
+
// Only called from assert
/*
private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
Added: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java?rev=1703219&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java (added)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java Tue Sep 15 15:00:31 2015
@@ -0,0 +1,80 @@
+package org.apache.lucene.codecs.compressing;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.UnicodeUtil;
+import org.junit.Test;
+
+/**
+ * Test for {@link GrowableByteArrayDataOutput}
+ */
+public class TestGrowableByteArrayDataOutput extends LuceneTestCase {
+
+ @Test
+ public void testWriteSmallStrings() throws Exception {
+ int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+ // a simple string encoding test
+ int num = atLeast(1000);
+ for (int i = 0; i < num; i++) {
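+ // create a string below the double pass threshold; NOTE(review): writeString compares length() * MAX_UTF8_BYTES_PER_CHAR to the threshold, so longer samples presumably still take the double pass path - confirm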
+ int length = TestUtil.nextInt(random(), 1, minSizeForDoublePass - 1);
+ String unicode = TestUtil.randomFixedByteLengthUnicodeString(random(), length);
+ byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+ GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+ //explicitly write utf8 len so that we know how many bytes it occupies
+ dataOutput.writeVInt(len);
+ int vintLen = dataOutput.length;
+ // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+ dataOutput.writeString(unicode);
+
+ assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+ for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+ assertEquals(utf8[j], dataOutput.bytes[k]);
+ }
+ }
+ }
+
+ @Test
+ public void testWriteLargeStrings() throws Exception {
+ int minSizeForDoublePass = GrowableByteArrayDataOutput.MIN_UTF8_SIZE_TO_ENABLE_DOUBLE_PASS_ENCODING;
+
+ int num = atLeast(1000);
+ for (int i = 0; i < num; i++) {
+ String unicode = TestUtil.randomRealisticUnicodeString(random(), minSizeForDoublePass, 10 * minSizeForDoublePass);
+ byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+
+ GrowableByteArrayDataOutput dataOutput = new GrowableByteArrayDataOutput(1 << 8);
+ //explicitly write utf8 len so that we know how many bytes it occupies
+ dataOutput.writeVInt(len);
+ int vintLen = dataOutput.length;
+ // now write the string which will internally write number of bytes as a vint and then utf8 bytes
+ dataOutput.writeString(unicode);
+
+ assertEquals("GrowableByteArrayDataOutput wrote the wrong length after encode", len + vintLen * 2, dataOutput.length);
+ for (int j = 0, k = vintLen * 2; j < len; j++, k++) {
+ assertEquals(utf8[j], dataOutput.bytes[k]);
+ }
+ }
+ }
+}
Modified: lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java?rev=1703219&r1=1703218&r2=1703219&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java (original)
+++ lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java Tue Sep 15 15:00:31 2015
@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends Luc
assertEquals(cRef.toString(), unicode);
}
}
+
+ public void testCalcUTF16toUTF8Length() {
+ int num = atLeast(5000);
+ for (int i = 0; i < num; i++) {
+ String unicode = TestUtil.randomUnicodeString(random());
+ byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+ assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
+ }
+ }
}