You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/09/15 17:36:09 UTC
svn commit: r1703231 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/
lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/
lucene/core/ lucene/core/src/java/org/apache/luce...
Author: shalin
Date: Tue Sep 15 15:36:08 2015
New Revision: 1703231
URL: http://svn.apache.org/r1703231
Log:
LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write strings larger than 64kb by an amount equal to string's utf8 size
Added:
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
- copied unchanged from r1703219, lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/compressing/GrowableByteArrayDataOutput.java
lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
- copied unchanged from r1703219, lucene/dev/trunk/lucene/core/src/test/org/apache/lucene/codecs/compressing/TestGrowableByteArrayDataOutput.java
Removed:
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/GrowableByteArrayDataOutput.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsWriter.java
lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsWriter.java
lucene/dev/branches/branch_5x/lucene/core/ (props changed)
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Tue Sep 15 15:36:08 2015
@@ -59,6 +59,10 @@ Optimizations
GeoPointTermsEnum to reduce GC pressure (Nick Knize via Mike
McCandless)
+* LUCENE-6779: Reduce memory allocated by CompressingStoredFieldsWriter to write
+ strings larger than 64kb by an amount equal to string's utf8 size.
+ (Dawid Weiss, Robert Muir, shalin)
+
Bug Fixes
* LUCENE-6730: Hyper-parameter c is ignored in term frequency NormalizationH1.
Modified: lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsWriter.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene41/Lucene41StoredFieldsWriter.java Tue Sep 15 15:36:08 2015
@@ -48,7 +48,7 @@ import org.apache.lucene.store.IOContext
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
+import org.apache.lucene.codecs.compressing.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;
Modified: lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsWriter.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene42/Lucene42TermVectorsWriter.java Tue Sep 15 15:36:08 2015
@@ -51,7 +51,7 @@ import org.apache.lucene.store.IOContext
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
+import org.apache.lucene.codecs.compressing.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingStoredFieldsWriter.java Tue Sep 15 15:36:08 2015
@@ -40,9 +40,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.packed.PackedInts;
/**
@@ -245,8 +243,6 @@ public final class CompressingStoredFiel
numChunks++;
}
- byte scratchBytes[] = new byte[16];
-
@Override
public void writeField(FieldInfo info, IndexableField field)
throws IOException {
@@ -293,11 +289,7 @@ public final class CompressingStoredFiel
bufferedDocs.writeVInt(bytes.length);
bufferedDocs.writeBytes(bytes.bytes, bytes.offset, bytes.length);
} else if (string != null) {
- // this is just an optimized writeString() that re-uses scratchBytes.
- scratchBytes = ArrayUtil.grow(scratchBytes, string.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR);
- int length = UnicodeUtil.UTF16toUTF8(string, 0, string.length(), scratchBytes);
- bufferedDocs.writeVInt(length);
- bufferedDocs.writeBytes(scratchBytes, length);
+ bufferedDocs.writeString(string);
} else {
if (number instanceof Byte || number instanceof Short || number instanceof Integer) {
bufferedDocs.writeZInt(number.intValue());
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/compressing/CompressingTermVectorsWriter.java Tue Sep 15 15:36:08 2015
@@ -35,8 +35,6 @@ import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentInfo;
-import org.apache.lucene.store.BufferedChecksumIndexInput;
-import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -45,7 +43,6 @@ import org.apache.lucene.store.IndexOutp
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.GrowableByteArrayDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java Tue Sep 15 15:36:08 2015
@@ -179,11 +179,21 @@ public final class UnicodeUtil {
* for length characters. It is the responsibility of the
* caller to make sure that the destination array is large enough.
*/
- // TODO: broken if incoming result.offset != 0
public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out) {
+ return UTF16toUTF8(s, offset, length, out, 0);
+ }
+
+ /** Encode characters from this String, starting at offset
+ * for length characters. Output to the destination array
+ * will begin at {@code outOffset}. It is the responsibility of the
+ * caller to make sure that the destination array is large enough.
+ * <p>
+ * note this method returns the final output offset (outOffset + number of bytes written)
+ */
+ public static int UTF16toUTF8(final CharSequence s, final int offset, final int length, byte[] out, int outOffset) {
final int end = offset + length;
- int upto = 0;
+ int upto = outOffset;
for(int i=offset;i<end;i++) {
final int code = (int) s.charAt(i);
@@ -223,6 +233,43 @@ public final class UnicodeUtil {
return upto;
}
+ /**
+ * Calculates the number of UTF8 bytes necessary to write a UTF16 string.
+ *
+ * @return the number of UTF8 bytes required to encode the given range (nothing is written)
+ */
+ public static int calcUTF16toUTF8Length(final CharSequence s, final int offset, final int len) {
+ final int end = offset + len;
+
+ int res = 0;
+ for (int i = offset; i < end; i++) {
+ final int code = (int) s.charAt(i);
+
+ if (code < 0x80)
+ res++;
+ else if (code < 0x800) {
+ res += 2;
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ res += 3;
+ } else {
+ // surrogate pair
+ // confirm valid high surrogate
+ if (code < 0xDC00 && (i < end - 1)) {
+ int utf32 = (int) s.charAt(i + 1);
+ // confirm valid low surrogate and count the pair as 4 bytes
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ i++;
+ res += 4;
+ continue;
+ }
+ }
+ res += 3;
+ }
+ }
+
+ return res;
+ }
+
// Only called from assert
/*
private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) {
Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java?rev=1703231&r1=1703230&r2=1703231&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java Tue Sep 15 15:36:08 2015
@@ -214,4 +214,14 @@ public class TestUnicodeUtil extends Luc
assertEquals(cRef.toString(), unicode);
}
}
+
+ public void testCalcUTF16toUTF8Length() {
+ int num = atLeast(5000);
+ for (int i = 0; i < num; i++) {
+ String unicode = TestUtil.randomUnicodeString(random());
+ byte[] utf8 = new byte[unicode.length() * UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR];
+ int len = UnicodeUtil.UTF16toUTF8(unicode, 0, unicode.length(), utf8);
+ assertEquals(len, UnicodeUtil.calcUTF16toUTF8Length(unicode, 0, unicode.length()));
+ }
+ }
}