You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2012/01/27 20:01:07 UTC
svn commit: r1236828 - in
/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common:
cloud/ZkStateReader.java util/ByteUtils.java util/JavaBinCodec.java
Author: yonik
Date: Fri Jan 27 19:01:06 2012
New Revision: 1236828
URL: http://svn.apache.org/viewvc?rev=1236828&view=rev
Log:
revive ByteUtils to remove lucene dependency in solrj, refactor JavaBinCodec to reuse UTF8 encoding function
Added:
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
Modified:
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java?rev=1236828&r1=1236827&r2=1236828&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java Fri Jan 27 19:01:06 2012
@@ -31,15 +31,14 @@ import java.util.concurrent.ThreadFactor
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.UnicodeUtil;
+
import org.apache.noggit.CharArr;
import org.apache.noggit.JSONParser;
import org.apache.noggit.JSONWriter;
import org.apache.noggit.ObjectBuilder;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.util.ByteUtils;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
@@ -90,17 +89,18 @@ public class ZkStateReader {
}
public static byte[] toUTF8(CharArr out) {
- BytesRef br = new BytesRef(out);
- return Arrays.copyOf(br.bytes, br.length);
+ byte[] arr = new byte[out.size() << 2]; // 4 bytes per char is a safe over-estimate: ByteUtils.UTF16toUTF8 (added in this commit) writes at most 3 bytes per input char
+ int nBytes = ByteUtils.UTF16toUTF8(out, 0, out.size(), arr, 0);
+ return Arrays.copyOf(arr, nBytes);
}
public static Object fromJSON(byte[] utf8) {
// convert directly from bytes to chars
// and parse directly from that instead of going through
// intermediate strings or readers
- CharsRef chars = new CharsRef();
- UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.length, chars); // TODO: this method currently oversizes the array
- JSONParser parser = new JSONParser(chars.chars, chars.offset, chars.length);
+ CharArr chars = new CharArr();
+ ByteUtils.UTF8toUTF16(utf8, 0, utf8.length, chars);
+ JSONParser parser = new JSONParser(chars.getArray(), chars.getStart(), chars.length());
try {
return ObjectBuilder.getVal(parser);
} catch (IOException e) {
Added: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java?rev=1236828&view=auto
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java (added)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java Fri Jan 27 19:01:06 2012
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.common.util;
+
+import org.apache.noggit.CharArr;
+
+public class ByteUtils {
+
+  /** Converts UTF-8 to UTF-16 and returns the number of 16 bit Java chars written.
+   * Full characters are read, even if this reads past the length passed (and can result in
+   * an ArrayIndexOutOfBoundsException if invalid UTF-8 is passed). Explicit checks for valid
+   * UTF-8 are not performed. Well-formed input produces at most one char per input byte, so
+   * out needs room for len chars starting at out_offset.
+   */
+  public static int UTF8toUTF16(byte[] utf8, int offset, int len, char[] out, int out_offset) {
+    int out_start = out_offset;
+    final int limit = offset + len;
+    while (offset < limit) {
+      int b = utf8[offset++]&0xff;
+
+      if (b < 0xc0) {
+        assert b < 0x80;
+        out[out_offset++] = (char)b;
+      } else if (b < 0xe0) {
+        out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
+      } else if (b < 0xf0) {
+        out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
+        offset += 2;
+      } else {
+        assert b < 0xf8;
+        int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
+        offset += 3;
+        if (ch <= 0xffff) {
+          // fits in one char (only reachable via an overlong 4 byte form); U+FFFF included
+          out[out_offset++] = (char)ch;
+        } else {
+          int chHalf = ch - 0x0010000;
+          out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
+          out[out_offset++] = (char) ((chHalf & 0x3FF) + 0xDC00);
+        }
+      }
+    }
+
+    return out_offset - out_start;
+  }
+
+  /** Convert UTF8 bytes into UTF16 characters, appending them at the current end of out. */
+  public static void UTF8toUTF16(byte[] utf8, int offset, int len, CharArr out) {
+    // TODO: do in chunks if the input is large
+    out.reserve(len);
+    int n = UTF8toUTF16(utf8, offset, len, out.getArray(), out.getEnd());
+    out.setEnd(out.getEnd() + n);
+  }
+
+  /** Convert UTF8 bytes into a String */
+  public static String UTF8toUTF16(byte[] utf8, int offset, int len) {
+    char[] out = new char[len];
+    int n = UTF8toUTF16(utf8, offset, len, out, 0);
+    return new String(out,0,n);
+  }
+
+  /** Writes UTF8 into the byte array, starting at resultOffset. The caller should ensure that
+   * there is enough space for the worst-case scenario (at most 3 bytes per char of input).
+   * Unpaired or out-of-order surrogates are written as the replacement character U+FFFD.
+   * @return the number of bytes written
+   */
+  public static int UTF16toUTF8(CharSequence s, int offset, int len, byte[] result, int resultOffset) {
+    final int end = offset + len;
+
+    int upto = resultOffset;
+    for(int i=offset;i<end;i++) {
+      final int code = (int) s.charAt(i);
+
+      if (code < 0x80) {
+        result[upto++] = (byte) code;
+      } else if (code < 0x800) {
+        result[upto++] = (byte) (0xC0 | (code >> 6));
+        result[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        result[upto++] = (byte)(0xE0 | (code >> 12));
+        result[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        result[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end-1)) {
+          int utf32 = (int) s.charAt(i+1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            // 0xD7C0 == 0xD800 - (0x10000 >> 10): folds the 0x10000 bias into the high half
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            result[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            result[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            result[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            result[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        result[upto++] = (byte) 0xEF;
+        result[upto++] = (byte) 0xBF;
+        result[upto++] = (byte) 0xBD;
+      }
+    }
+
+    return upto - resultOffset;
+  }
+
+}
Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java?rev=1236828&r1=1236827&r2=1236828&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java Fri Jan 27 19:01:06 2012
@@ -463,44 +463,10 @@ public class JavaBinCodec {
int end = s.length();
int maxSize = end * 4;
if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
- int upto = 0;
- for(int i=0;i<end;i++) {
- final int code = (int) s.charAt(i);
-
- if (code < 0x80)
- bytes[upto++] = (byte) code;
- else if (code < 0x800) {
- bytes[upto++] = (byte) (0xC0 | (code >> 6));
- bytes[upto++] = (byte)(0x80 | (code & 0x3F));
- } else if (code < 0xD800 || code > 0xDFFF) {
- bytes[upto++] = (byte)(0xE0 | (code >> 12));
- bytes[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
- bytes[upto++] = (byte)(0x80 | (code & 0x3F));
- } else {
- // surrogate pair
- // confirm valid high surrogate
- if (code < 0xDC00 && (i < end-1)) {
- int utf32 = (int) s.charAt(i+1);
- // confirm valid low surrogate and write pair
- if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
- utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
- i++;
- bytes[upto++] = (byte)(0xF0 | (utf32 >> 18));
- bytes[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
- bytes[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
- bytes[upto++] = (byte)(0x80 | (utf32 & 0x3F));
- continue;
- }
- }
- // replace unpaired surrogate or out-of-order low surrogate
- // with substitution character
- bytes[upto++] = (byte) 0xEF;
- bytes[upto++] = (byte) 0xBF;
- bytes[upto++] = (byte) 0xBD;
- }
- }
- writeTag(STR, upto);
- daos.write(bytes, 0, upto);
+ int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
+
+ writeTag(STR, sz);
+ daos.write(bytes, 0, sz);
}
byte[] bytes;
@@ -600,15 +566,32 @@ public class JavaBinCodec {
} else if (val instanceof String) {
writeStr((String) val);
return true;
- } else if (val instanceof Integer) {
- writeInt(((Integer) val).intValue());
- return true;
- } else if (val instanceof Long) {
- writeLong(((Long) val).longValue());
- return true;
- } else if (val instanceof Float) {
- writeFloat(((Float) val).floatValue());
- return true;
+ } else if (val instanceof Number) {
+
+ if (val instanceof Integer) {
+ writeInt(((Integer) val).intValue());
+ return true;
+ } else if (val instanceof Long) {
+ writeLong(((Long) val).longValue());
+ return true;
+ } else if (val instanceof Float) {
+ writeFloat(((Float) val).floatValue());
+ return true;
+ } else if (val instanceof Double) {
+ daos.writeByte(DOUBLE);
+ daos.writeDouble(((Double) val).doubleValue());
+ return true;
+ } else if (val instanceof Byte) {
+ daos.writeByte(BYTE);
+ daos.writeByte(((Byte) val).intValue());
+ return true;
+ } else if (val instanceof Short) {
+ daos.writeByte(SHORT);
+ daos.writeShort(((Short) val).intValue());
+ return true;
+ }
+ return false;
+
} else if (val instanceof Date) {
daos.writeByte(DATE);
daos.writeLong(((Date) val).getTime());
@@ -617,18 +600,6 @@ public class JavaBinCodec {
if ((Boolean) val) daos.writeByte(BOOL_TRUE);
else daos.writeByte(BOOL_FALSE);
return true;
- } else if (val instanceof Double) {
- daos.writeByte(DOUBLE);
- daos.writeDouble(((Double) val).doubleValue());
- return true;
- } else if (val instanceof Byte) {
- daos.writeByte(BYTE);
- daos.writeByte(((Byte) val).intValue());
- return true;
- } else if (val instanceof Short) {
- daos.writeByte(SHORT);
- daos.writeShort(((Short) val).intValue());
- return true;
} else if (val instanceof byte[]) {
writeByteArray((byte[]) val, 0, ((byte[]) val).length);
return true;