You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by yo...@apache.org on 2012/01/27 20:01:07 UTC

svn commit: r1236828 - in /lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common: cloud/ZkStateReader.java util/ByteUtils.java util/JavaBinCodec.java

Author: yonik
Date: Fri Jan 27 19:01:06 2012
New Revision: 1236828

URL: http://svn.apache.org/viewvc?rev=1236828&view=rev
Log:
revive ByteUtils to remove lucene dependency in solrj, refactor JavaBinCodec to reuse UTF8 encoding function

Added:
    lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
Modified:
    lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
    lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java

Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java?rev=1236828&r1=1236827&r2=1236828&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/cloud/ZkStateReader.java Fri Jan 27 19:01:06 2012
@@ -31,15 +31,14 @@ import java.util.concurrent.ThreadFactor
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.UnicodeUtil;
+
 import org.apache.noggit.CharArr;
 import org.apache.noggit.JSONParser;
 import org.apache.noggit.JSONWriter;
 import org.apache.noggit.ObjectBuilder;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.util.ByteUtils;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
@@ -90,17 +89,18 @@ public class ZkStateReader {
   }
 
   public static byte[] toUTF8(CharArr out) {
-    BytesRef br = new BytesRef(out);
-    return Arrays.copyOf(br.bytes, br.length);
+    byte[] arr = new byte[out.size() << 2]; // is 4x the real worst-case upper-bound?
+    int nBytes = ByteUtils.UTF16toUTF8(out, 0, out.size(), arr, 0);
+    return Arrays.copyOf(arr, nBytes);
   }
 
   public static Object fromJSON(byte[] utf8) {
     // convert directly from bytes to chars
     // and parse directly from that instead of going through
     // intermediate strings or readers
-    CharsRef chars = new CharsRef();
-    UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.length, chars);   // TODO: this method currently oversizes the array
-    JSONParser parser = new JSONParser(chars.chars, chars.offset, chars.length);
+    CharArr chars = new CharArr();
+    ByteUtils.UTF8toUTF16(utf8, 0, utf8.length, chars);
+    JSONParser parser = new JSONParser(chars.getArray(), chars.getStart(), chars.length());
     try {
       return ObjectBuilder.getVal(parser);
     } catch (IOException e) {

Added: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java?rev=1236828&view=auto
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java (added)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/ByteUtils.java Fri Jan 27 19:01:06 2012
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.common.util;
+
+import org.apache.noggit.CharArr;
+
+public class ByteUtils {
+
+  /** Converts UTF-8 to UTF-16 and returns the number of 16-bit Java chars written.
+   * Full characters are read, even if this reads past the length passed (and can result in
+   * an ArrayIndexOutOfBoundsException if invalid UTF-8 is passed).  Explicit checks for valid UTF-8 are not performed.
+   * The char[] out must have enough room to hold the worst case of each byte becoming a Java char.
+   */
+  public static int UTF8toUTF16(byte[] utf8, int offset, int len, char[] out, int out_offset) {
+    int out_start = out_offset;
+    final int limit = offset + len;
+    while (offset < limit) {
+      int b = utf8[offset++]&0xff;
+
+      if (b < 0xc0) {
+        assert b < 0x80;
+        out[out_offset++] = (char)b;
+      } else if (b < 0xe0) {
+        out[out_offset++] = (char)(((b&0x1f)<<6) + (utf8[offset++]&0x3f));
+      } else if (b < 0xf0) {
+        out[out_offset++] = (char)(((b&0xf)<<12) + ((utf8[offset]&0x3f)<<6) + (utf8[offset+1]&0x3f));
+        offset += 2;
+      } else {
+        assert b < 0xf8;
+        int ch = ((b&0x7)<<18) + ((utf8[offset]&0x3f)<<12) + ((utf8[offset+1]&0x3f)<<6) + (utf8[offset+2]&0x3f);
+        offset += 3;
+        if (ch < 0xffff) {
+          out[out_offset++] = (char)ch;
+        } else {
+          int chHalf = ch - 0x0010000;
+          out[out_offset++] = (char) ((chHalf >> 10) + 0xD800);
+          out[out_offset++] = (char) ((chHalf & 0x3FFL) + 0xDC00);
+        }
+      }
+    }
+
+    return out_offset - out_start;
+  }
+
+  /** Converts UTF-8 bytes into UTF-16 characters, appending them at the current end of out. */
+  public static void UTF8toUTF16(byte[] utf8, int offset, int len, CharArr out) {
+    // TODO: do in chunks if the input is large
+    out.reserve(len);
+    int n = UTF8toUTF16(utf8, offset, len, out.getArray(), out.getEnd());
+    out.setEnd(out.getEnd() + n);
+  }
+
+  /** Convert UTF8 bytes into a String */
+  public static String UTF8toUTF16(byte[] utf8, int offset, int len) {
+    char[] out = new char[len];
+    int n = UTF8toUTF16(utf8, offset, len, out, 0);
+    return new String(out,0,n);
+  }
+
+
+
+  /** Writes UTF-8 into the byte array, starting at resultOffset.  The caller should ensure that
+   * there is enough space for the worst-case scenario (at most 3 bytes per input char).
+   * @return the number of bytes written
+   */
+  public static int UTF16toUTF8(CharSequence s, int offset, int len, byte[] result, int resultOffset) {
+    final int end = offset + len;
+
+    int upto = resultOffset;
+    for(int i=offset;i<end;i++) {
+      final int code = (int) s.charAt(i);
+
+      if (code < 0x80)
+        result[upto++] = (byte) code;
+      else if (code < 0x800) {
+        result[upto++] = (byte) (0xC0 | (code >> 6));
+        result[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else if (code < 0xD800 || code > 0xDFFF) {
+        result[upto++] = (byte)(0xE0 | (code >> 12));
+        result[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
+        result[upto++] = (byte)(0x80 | (code & 0x3F));
+      } else {
+        // surrogate pair
+        // confirm valid high surrogate
+        if (code < 0xDC00 && (i < end-1)) {
+          int utf32 = (int) s.charAt(i+1);
+          // confirm valid low surrogate and write pair
+          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
+            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
+            i++;
+            result[upto++] = (byte)(0xF0 | (utf32 >> 18));
+            result[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
+            result[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
+            result[upto++] = (byte)(0x80 | (utf32 & 0x3F));
+            continue;
+          }
+        }
+        // replace unpaired surrogate or out-of-order low surrogate
+        // with substitution character
+        result[upto++] = (byte) 0xEF;
+        result[upto++] = (byte) 0xBF;
+        result[upto++] = (byte) 0xBD;
+      }
+    }
+
+    return upto - resultOffset;
+  }
+
+
+
+}

Modified: lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java?rev=1236828&r1=1236827&r2=1236828&view=diff
==============================================================================
--- lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java (original)
+++ lucene/dev/trunk/solr/solrj/src/java/org/apache/solr/common/util/JavaBinCodec.java Fri Jan 27 19:01:06 2012
@@ -463,44 +463,10 @@ public class JavaBinCodec {
     int end = s.length();
     int maxSize = end * 4;
     if (bytes == null || bytes.length < maxSize) bytes = new byte[maxSize];
-    int upto = 0;
-    for(int i=0;i<end;i++) {
-      final int code = (int) s.charAt(i);
-
-      if (code < 0x80)
-        bytes[upto++] = (byte) code;
-      else if (code < 0x800) {
-        bytes[upto++] = (byte) (0xC0 | (code >> 6));
-        bytes[upto++] = (byte)(0x80 | (code & 0x3F));
-      } else if (code < 0xD800 || code > 0xDFFF) {
-        bytes[upto++] = (byte)(0xE0 | (code >> 12));
-        bytes[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
-        bytes[upto++] = (byte)(0x80 | (code & 0x3F));
-      } else {
-        // surrogate pair
-        // confirm valid high surrogate
-        if (code < 0xDC00 && (i < end-1)) {
-          int utf32 = (int) s.charAt(i+1);
-          // confirm valid low surrogate and write pair
-          if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { 
-            utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
-            i++;
-            bytes[upto++] = (byte)(0xF0 | (utf32 >> 18));
-            bytes[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
-            bytes[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
-            bytes[upto++] = (byte)(0x80 | (utf32 & 0x3F));
-            continue;
-          }
-        }
-        // replace unpaired surrogate or out-of-order low surrogate
-        // with substitution character
-        bytes[upto++] = (byte) 0xEF;
-        bytes[upto++] = (byte) 0xBF;
-        bytes[upto++] = (byte) 0xBD;
-      }
-    }
-    writeTag(STR, upto);
-    daos.write(bytes, 0, upto);
+    int sz = ByteUtils.UTF16toUTF8(s, 0, end, bytes, 0);
+
+    writeTag(STR, sz);
+    daos.write(bytes, 0, sz);
   }
 
   byte[] bytes;
@@ -600,15 +566,32 @@ public class JavaBinCodec {
     } else if (val instanceof String) {
       writeStr((String) val);
       return true;
-    } else if (val instanceof Integer) {
-      writeInt(((Integer) val).intValue());
-      return true;
-    } else if (val instanceof Long) {
-      writeLong(((Long) val).longValue());
-      return true;
-    } else if (val instanceof Float) {
-      writeFloat(((Float) val).floatValue());
-      return true;
+    } else if (val instanceof Number) {
+
+      if (val instanceof Integer) {
+        writeInt(((Integer) val).intValue());
+        return true;
+      } else if (val instanceof Long) {
+        writeLong(((Long) val).longValue());
+        return true;
+      } else if (val instanceof Float) {
+        writeFloat(((Float) val).floatValue());
+        return true;
+      } else if (val instanceof Double) {
+        daos.writeByte(DOUBLE);
+        daos.writeDouble(((Double) val).doubleValue());
+        return true;
+      } else if (val instanceof Byte) {
+        daos.writeByte(BYTE);
+        daos.writeByte(((Byte) val).intValue());
+        return true;
+      } else if (val instanceof Short) {
+        daos.writeByte(SHORT);
+        daos.writeShort(((Short) val).intValue());
+        return true;
+      }
+      return false;
+
     } else if (val instanceof Date) {
       daos.writeByte(DATE);
       daos.writeLong(((Date) val).getTime());
@@ -617,18 +600,6 @@ public class JavaBinCodec {
       if ((Boolean) val) daos.writeByte(BOOL_TRUE);
       else daos.writeByte(BOOL_FALSE);
       return true;
-    } else if (val instanceof Double) {
-      daos.writeByte(DOUBLE);
-      daos.writeDouble(((Double) val).doubleValue());
-      return true;
-    } else if (val instanceof Byte) {
-      daos.writeByte(BYTE);
-      daos.writeByte(((Byte) val).intValue());
-      return true;
-    } else if (val instanceof Short) {
-      daos.writeByte(SHORT);
-      daos.writeShort(((Short) val).intValue());
-      return true;
     } else if (val instanceof byte[]) {
       writeByteArray((byte[]) val, 0, ((byte[]) val).length);
       return true;