You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/09/03 21:42:39 UTC
svn commit: r1701115 - in /lucene/dev/trunk/solr: CHANGES.txt
solrj/src/java/org/apache/solr/common/util/ByteUtils.java
solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
Author: shalin
Date: Thu Sep 3 19:42:38 2015
New Revision: 1701115
URL: http://svn.apache.org/r1701115
Log:
SOLR-7971: JavaBinCodec now uses a double pass approach to write strings larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1701115&r1=1701114&r2=1701115&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Sep 3 19:42:38 2015
@@ -210,9 +210,10 @@ Optimizations
are more efficient especially when cluster has a mix of collections in stateFormat=1
and stateFormat=2. (Scott Blum, shalin)
-* SOLR-7971: Reduce memory allocated by JavaBinCodec to encode large strings by an amount
- equal to the string.length().
- (yonik, Steve Rowe, shalin)
+* SOLR-7971: Reduce memory allocated by JavaBinCodec to encode small strings by an amount
+ equal to the string.length(). JavaBinCodec now uses a double pass approach to write strings
+ larger than 64KB to avoid allocating buffer memory equal to string's UTF8 size.
+ (yonik, Steve Rowe, Mikhail Khludnev, Noble Paul, shalin)
* SOLR-7983: Utils.toUTF8 uses larger buffer than necessary for holding UTF8 data. (shalin)
Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java?rev=1701115&r1=1701114&r2=1701115&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java Thu Sep 3 19:42:38 2015
@@ -17,10 +17,16 @@
package org.apache.solr.common.util;
+import java.io.IOException;
+import java.io.OutputStream;
+
import org.noggit.CharArr;
public class ByteUtils {
+ /** Maximum number of UTF8 bytes per UTF16 character. */
+ public static final int MAX_UTF8_BYTES_PER_CHAR = 3;
+
/** Converts utf8 to utf16 and returns the number of 16 bit Java chars written.
* Full characters are read, even if this reads past the length passed (and can result in
* an ArrayIndexOutOfBoundsException if invalid UTF8 is passed). Explicit checks for valid UTF8 are not performed.
@@ -121,6 +127,100 @@ public class ByteUtils {
return upto - resultOffset;
}
+ /** Encodes a range of UTF16 chars as UTF8 and writes the result to the given OutputStream.
+ * Encoded bytes are staged in the given scratch array, which is flushed to the stream whenever
+ * fewer than 4 free bytes remain in it, and once more after the last character has been encoded.
+ * <p>
+ * A high surrogate followed by a low surrogate is written as a single 4 byte sequence. An unpaired
+ * high surrogate or an out-of-order low surrogate is replaced with the bytes 0xEF 0xBF 0xBD (U+FFFD).
+ * <p>
+ * NOTE: the length of the scratch array is not validated here. It needs room for at least 4 bytes;
+ * with a shorter array a multi-byte character can index past the end of the array.
+ *
+ * @param s the source characters
+ * @param offset index of the first char of s to encode
+ * @param len number of UTF16 chars (not code points) to encode, starting at offset
+ * @param fos the stream that receives the UTF8 bytes
+ * @param scratch reusable staging buffer; its previous contents are overwritten
+ * @return the number of bytes written
+ * @throws IOException if writing to the stream fails
+ */
+ public static int writeUTF16toUTF8(CharSequence s, int offset, int len, OutputStream fos, byte[] scratch) throws IOException {
+ final int end = offset + len;
+
+ int upto = 0, totalBytes = 0;
+ for(int i=offset;i<end;i++) {
+ final int code = (int) s.charAt(i);
+
+ if (upto > scratch.length - 4) {
+ // a code point may take up to 4 bytes and that much space is not guaranteed, so flush the scratch array and start over
+ totalBytes += upto;
+ fos.write(scratch, 0, upto);
+ upto = 0;
+ }
+ if (code < 0x80)
+ scratch[upto++] = (byte) code;
+ else if (code < 0x800) {
+ scratch[upto++] = (byte) (0xC0 | (code >> 6));
+ scratch[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ scratch[upto++] = (byte)(0xE0 | (code >> 12));
+ scratch[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+ scratch[upto++] = (byte)(0x80 | (code & 0x3F));
+ } else {
+ // surrogate range (0xD800-0xDFFF)
+ // confirm valid high surrogate that is not the last char of the range
+ if (code < 0xDC00 && (i < end-1)) {
+ int utf32 = (int) s.charAt(i+1);
+ // confirm valid low surrogate and write pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); // combine both halves into the supplementary code point
+ i++; // the low surrogate has been consumed
+ scratch[upto++] = (byte)(0xF0 | (utf32 >> 18));
+ scratch[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+ scratch[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+ scratch[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+ continue;
+ }
+ }
+ // replace unpaired surrogate or out-of-order low surrogate
+ // with substitution character (U+FFFD)
+ scratch[upto++] = (byte) 0xEF;
+ scratch[upto++] = (byte) 0xBF;
+ scratch[upto++] = (byte) 0xBD;
+ }
+ }
+
+ totalBytes += upto; // account for and flush whatever is still staged in the scratch array
+ fos.write(scratch, 0, upto);
+
+ return totalBytes;
+ }
+
+ /**
+ * Calculates the number of UTF8 bytes necessary to write a UTF16 string. Nothing is written;
+ * the chars are only inspected.
+ * <p>
+ * Chars below 0x80 count as 1 byte, chars below 0x800 as 2 bytes and all other non-surrogate
+ * chars as 3 bytes. A high surrogate followed by a low surrogate counts as 4 bytes for the pair.
+ * Any other surrogate counts as 3 bytes, the size of the substitution sequence that
+ * writeUTF16toUTF8 emits for it.
+ *
+ * @param s the source characters
+ * @param offset index of the first char of s to measure
+ * @param len number of UTF16 chars (not code points) to measure, starting at offset
+ * @return the number of UTF8 bytes required to encode the given range
+ */
+ public static int calcUTF16toUTF8Length(CharSequence s, int offset, int len) {
+ final int end = offset + len;
+
+ int res = 0;
+ for (int i = offset; i < end; i++) {
+ final int code = (int) s.charAt(i);
+
+ if (code < 0x80)
+ res++;
+ else if (code < 0x800) {
+ res += 2;
+ } else if (code < 0xD800 || code > 0xDFFF) {
+ res += 3;
+ } else {
+ // surrogate range (0xD800-0xDFFF)
+ // confirm valid high surrogate that is not the last char of the range
+ if (code < 0xDC00 && (i < end - 1)) {
+ int utf32 = (int) s.charAt(i + 1);
+ // confirm valid low surrogate and count the pair
+ if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+ i++; // the low surrogate has been consumed
+ res += 4;
+ continue;
+ }
+ }
+ res += 3; // unpaired surrogate or out-of-order low surrogate
+ }
+ }
+
+ return res;
+ }
}
Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java?rev=1701115&r1=1701114&r2=1701115&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java Thu Sep 3 19:42:38 2015
@@ -77,6 +77,8 @@ public class JavaBinCodec {
NAMED_LST = (byte) (6 << 5), // NamedList
EXTERN_STRING = (byte) (7 << 5);
+ private static final int MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY = 65536;
+
private static byte VERSION = 2;
private final ObjectResolver resolver;
@@ -614,12 +616,20 @@ public class JavaBinCodec {
return;
}
int end = s.length();
- int maxSize = end * 3; // 3 is enough, see SOLR-7971
- if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
- int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
+ int maxSize = end * ByteUtils.MAX_UTF8_BYTES_PER_CHAR;
- writeTag(STR, sz);
- daos.write(bytes, 0, sz);
+ if (maxSize <= MAX_UTF8_SIZE_FOR_ARRAY_GROW_STRATEGY) {
+ if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
+ int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
+ writeTag(STR, sz);
+ daos.write(bytes, 0, sz);
+ } else {
+ // double pass logic for large strings, see SOLR-7971
+ int sz = ByteUtils.calcUTF16toUTF8Length(s, 0, end);
+ writeTag(STR, sz);
+ if (bytes == null || bytes.length < 8192) bytes = new byte[8192];
+ ByteUtils.writeUTF16toUTF8(s, 0, end, daos, bytes);
+ }
}
byte[] bytes;