You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2010/10/15 11:16:21 UTC

svn commit: r1022864 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/persistence/util/ test/java/org/apache/jackrabbit/core/persistence/util/

Author: jukka
Date: Fri Oct 15 09:16:15 2010
New Revision: 1022864

URL: http://svn.apache.org/viewvc?rev=1022864&view=rev
Log:
JCR-2762: Optimize bundle serialization

Use variable-length serialization for LONG values, and leverage that also for DATE serialization together with an encoding that maps dates (millisecond accuracy, with timezone offsets) to 64 bits.

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleReader.java
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleWriter.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/persistence/util/BundleBindingTest.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleReader.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleReader.java?rev=1022864&r1=1022863&r2=1022864&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleReader.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleReader.java Fri Oct 15 09:16:15 2010
@@ -28,8 +28,11 @@ import org.apache.jackrabbit.spi.commons
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.TimeZone;
 import java.math.BigDecimal;
 
 import javax.jcr.PropertyType;
@@ -45,6 +48,44 @@ class BundleReader {
     /** Logger instance */
     private static Logger log = LoggerFactory.getLogger(BundleReader.class);
 
+    /**
+     * Pre-calculated {@link TimeZone} objects for common timezone offsets.
+     */
+    private static final TimeZone[] COMMON_TIMEZONES = {
+        TimeZone.getTimeZone("GMT+00:00"), // 0b00000
+        TimeZone.getTimeZone("GMT+01:00"), // 0b00001
+        TimeZone.getTimeZone("GMT+02:00"), // 0b00010
+        TimeZone.getTimeZone("GMT+03:00"), // 0b00011
+        TimeZone.getTimeZone("GMT+04:00"), // 0b00100
+        TimeZone.getTimeZone("GMT+05:00"), // 0b00101
+        TimeZone.getTimeZone("GMT+06:00"), // 0b00110
+        TimeZone.getTimeZone("GMT+07:00"), // 0b00111
+        TimeZone.getTimeZone("GMT+08:00"), // 0b01000
+        TimeZone.getTimeZone("GMT+09:00"), // 0b01001
+        TimeZone.getTimeZone("GMT+10:00"), // 0b01010
+        TimeZone.getTimeZone("GMT+11:00"), // 0b01011
+        TimeZone.getTimeZone("GMT+12:00"), // 0b01100
+        TimeZone.getTimeZone("GMT+13:00"), // 0b01101
+        TimeZone.getTimeZone("GMT+14:00"), // 0b01110
+        TimeZone.getTimeZone("GMT+15:00"), // 0b01111
+        TimeZone.getTimeZone("GMT-16:00"), // 0b10000
+        TimeZone.getTimeZone("GMT-15:00"), // 0b10001
+        TimeZone.getTimeZone("GMT-14:00"), // 0b10010
+        TimeZone.getTimeZone("GMT-13:00"), // 0b10011
+        TimeZone.getTimeZone("GMT-12:00"), // 0b10100
+        TimeZone.getTimeZone("GMT-11:00"), // 0b10101
+        TimeZone.getTimeZone("GMT-10:00"), // 0b10110
+        TimeZone.getTimeZone("GMT-09:00"), // 0b10111
+        TimeZone.getTimeZone("GMT-08:00"), // 0b11000
+        TimeZone.getTimeZone("GMT-07:00"), // 0b11001
+        TimeZone.getTimeZone("GMT-06:00"), // 0b11010
+        TimeZone.getTimeZone("GMT-05:00"), // 0b11011
+        TimeZone.getTimeZone("GMT-04:00"), // 0b11100
+        TimeZone.getTimeZone("GMT-03:00"), // 0b11101
+        TimeZone.getTimeZone("GMT-02:00"), // 0b11110
+        TimeZone.getTimeZone("GMT-01:00"), // 0b11111
+    };
+
     private final BundleBinding binding;
 
     private final DataInputStream in;
@@ -249,7 +290,11 @@ class BundleReader {
                     val = InternalValue.create(readDecimal());
                     break;
                 case PropertyType.LONG:
-                    val = InternalValue.create(in.readLong());
+                    if (version >= BundleBinding.VERSION_3) {
+                        val = InternalValue.create(readVarLong());
+                    } else {
+                        val = InternalValue.create(in.readLong());
+                    }
                     break;
                 case PropertyType.BOOLEAN:
                     val = InternalValue.create(in.readBoolean());
@@ -263,6 +308,11 @@ class BundleReader {
                 case PropertyType.REFERENCE:
                     val = InternalValue.create(readNodeId(), false);
                     break;
+                case PropertyType.DATE:
+                    if (version >= BundleBinding.VERSION_3) {
+                        val = InternalValue.create(readDate());
+                        break;
+                    } // else fall through
                 default:
                     if (version >= BundleBinding.VERSION_3) {
                         val = InternalValue.valueOf(
@@ -385,7 +435,7 @@ class BundleReader {
      * Deserializes a variable-length integer written using bundle
      * serialization version 3.
      *
-     * @return deserialized name
+     * @return deserialized integer
      * @throws IOException if an I/O error occurs
      */
     private int readVarInt() throws IOException {
@@ -397,6 +447,110 @@ class BundleReader {
         }
     }
 
+    /**
+     * Deserializes a variable-length long written using bundle
+     * serialization version 3.
+     *
+     * @return deserialized long
+     * @throws IOException if an I/O error occurs
+     */
+    private long readVarLong() throws IOException {
+        long value = 0;
+        int bits = 0;
+        long b;
+        do {
+            b = in.readUnsignedByte();
+            value = (b & 0x7f) << 57 | value >>> 7;
+            bits += 7;
+        } while ((b & 0x80) != 0);
+        value = value >>> (64 - bits);
+        if ((value & 1) != 0) {
+            return ~(value >>> 1);
+        } else {
+            return value >>> 1;
+        }
+    }
+
+    /**
+     * Deserializes a specially encoded date written using bundle
+     * serialization version 3.
+     *
+     * @return deserialized date
+     * @throws IOException if an I/O error occurs
+     */
+    private Calendar readDate() throws IOException {
+        long ts = readVarLong();
+
+        TimeZone tz;
+        if ((ts & 1) == 0) {
+            tz = COMMON_TIMEZONES[0];
+            ts >>= 1; 
+        } else if ((ts & 2) == 0) {
+            tz = COMMON_TIMEZONES[((int) ts >> 2) & 0x1f]; // 5 bits;
+            ts >>= 7;
+        } else {
+            int m = ((int) ts << 19) >> 21; // 11 bits, sign-extended
+            int h = m / 60;
+            String s;
+            if (m < 0) {
+                s = String.format("GMT-%02d:%02d", -h, h * 60 - m);
+            } else {
+                s = String.format("GMT+%02d:%02d", h, m - h * 60);
+            }
+            tz = TimeZone.getTimeZone(s);
+            ts >>= 13;
+        }
+
+        int u = 0;
+        int s = 0;
+        int m = 0;
+        int h = 0;
+        int type = (int) ts & 3;
+        ts >>= 2;
+        switch (type) {
+        case 3:
+            u = (int) ts & 0x3fffffff; // 30 bits
+            s = u / 1000;
+            m = s / 60;
+            h = m / 60;
+            m -= h * 60;
+            s -= (h * 60 + m) * 60;
+            u -= ((h * 60 + m) * 60 + s) * 1000;
+            ts >>= 30;
+            break;
+        case 2:
+            m = (int) ts & 0x07ff; // 11 bits
+            h = m / 60;
+            m -= h * 60;
+            ts >>= 11;
+            break;
+        case 1:
+            h = (int) ts & 0x1f; // 5 bits
+            ts >>= 5;
+            break;
+        }
+
+        int d = (int) ts & 0x01ff; // 9 bits;
+        ts >>= 9;
+        int y = (int) (ts + 2010);
+
+        Calendar value = Calendar.getInstance(tz);
+        if (y <= 0) {
+            value.set(Calendar.YEAR, 1 - y);
+            value.set(Calendar.ERA, GregorianCalendar.BC);
+        } else {
+            value.set(Calendar.YEAR, y);
+            value.set(Calendar.ERA, GregorianCalendar.AD);
+        }
+        value.set(Calendar.DAY_OF_YEAR, d);
+        value.set(Calendar.HOUR_OF_DAY, h);
+        value.set(Calendar.MINUTE, m);
+        value.set(Calendar.SECOND, s);
+        value.set(Calendar.MILLISECOND, u);
+
+        return value;
+    }
+
     private String readString() throws IOException {
         if (version >= BundleBinding.VERSION_3) {
             return new String(readBytes(0, 0), "UTF-8");

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleWriter.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleWriter.java?rev=1022864&r1=1022863&r2=1022864&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleWriter.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/persistence/util/BundleWriter.java Fri Oct 15 09:16:15 2010
@@ -22,6 +22,8 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.math.BigDecimal;
+import java.util.Calendar;
+import java.util.GregorianCalendar;
 
 import javax.jcr.PropertyType;
 import javax.jcr.RepositoryException;
@@ -36,7 +38,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Bundle serializater.
+ * Bundle serializer.
  *
  * @see BundleReader
  */
@@ -282,7 +284,7 @@ class BundleWriter {
                     break;
                 case PropertyType.LONG:
                     try {
-                        out.writeLong(val.getLong());
+                        writeVarLong(val.getLong());
                     } catch (RepositoryException e) {
                         // should never occur
                         throw new IOException("Unexpected error while writing LONG value.");
@@ -308,6 +310,14 @@ class BundleWriter {
                 case PropertyType.REFERENCE:
                     writeNodeId(val.getNodeId());
                     break;
+                case PropertyType.DATE:
+                    try {
+                        writeDate(val.getCalendar());
+                    } catch (RepositoryException e) {
+                        // should never occur
+                        throw new IOException("Unexpected error while writing DATE value.");
+                    }
+                    break;
                 default:
                     writeString(val.toString());
                     break;
@@ -494,6 +504,179 @@ class BundleWriter {
     }
 
     /**
+     * Serializes a long value using a variable length encoding like the
+     * one used by {@link #writeVarInt(int)} for integer values. Before
+     * writing out, the value is first normalized to an unsigned value
+     * by moving the sign bit to the low end and inverting the other bits of
+     * a negative value. This normalization step maximizes the number of
+     * zero high order bits for typical small values (positive or negative),
+     * and thus keeps the serialization short.
+     *
+     * @param value long value
+     * @throws IOException if an I/O error occurs
+     */
+    private void writeVarLong(long value) throws IOException {
+        // Normalize to an unsigned value with the sign as the lowest bit
+        if (value < 0) {
+            value = ~value << 1 | 1;
+        } else {
+            value <<= 1;
+        }
+        while (true) {
+            long b = value & 0x7f;
+            if (b != value) {
+                out.writeByte((int) b | 0x80);
+                value >>>= 7; // unsigned shift
+            } else {
+                out.writeByte((int) b);
+                return;
+            }
+        }
+    }
+
+    /**
+     * Serializes a JCR date value using the {@link #writeVarLong(long)}
+     * serialization on a special 64-bit date encoding. This encoding maps
+     * the <code>sYYYY-MM-DDThh:mm:ss.sssTZD</code> date format used by
+     * JCR to as small a 64-bit integer (positive or negative) as possible,
+     * while preserving full accuracy (including time zone offsets) and
+     * favouring common levels of accuracy (per minute, hour and day) over
+     * full millisecond level detail.
+     * <p>
+     * Each date value is mapped to separate timestamp and timezone fields,
+     * both of whose lengths are variable:
+     * <pre>
+     * +----- ... ------- ... --+
+     * |  timestamp  | timezone |
+     * +----- ... ------- ... --+
+     * </pre>
+     * <p>
+     * The type and length of the timezone field can be determined by looking
+     * at the two least significant bits of the value:
+     * <dl>
+     *   <dt><code>?0</code></dt>
+     *   <dd>
+     *     UTC time. The length of the timezone field is just one bit,
+     *     i.e. the second bit is already a part of the timestamp field.
+     *   </dd>
+     *   <dt><code>01</code></dt>
+     *   <dd>
+     *     The offset is counted as hours from UTC, and stored as the number
+     *     of hours (positive or negative) in the next 5 bits (range from
+     *     -16 to +15 hours), making the timezone field 7 bits long in total.
+     *   </dd>
+     *   <dt><code>11</code></dt>
+     *   <dd>
+     *     The offset is counted as hours and minutes from UTC, and stored
+     *     as the total minute offset (positive or negative) in the next
+     *     11 bits (range from -17 to +17 hours), making the timezone field
+     *     13 bits long in total.
+     *   </dd>
+     * </dl>
+     * <p>
+     * The remaining 51-63 bits of the encoded value make up the timestamp
+     * field that also uses the two least significant bits to indicate the
+     * type and length of the field:
+     * <dl>
+     *   <dt><code>00</code></dt>
+     *   <dd>
+     *     <code>sYYYY-MM-DDT00:00:00.000</code>, i.e. midnight of the
+     *     specified date. The next 9 bits encode the day within the year
+     *     (starting from 1, maximum value 366) and the remaining bits are
+     *     used for the year, stored as an offset from year 2010.
+     *   </dd>
+     *   <dt><code>01</code></dt>
+     *   <dd>
+     *     <code>sYYYY-MM-DDThh:00:00.000</code>, i.e. at the hour. The
+     *     next 5 bits encode the hour within the day (starting from 0,
+     *     maximum value 23) and the remaining bits are used as described
+     *     above for the date.
+     *   </dd>
+     *   <dt><code>10</code></dt>
+     *   <dd>
+     *     <code>sYYYY-MM-DDThh:mm:00.000</code>, i.e. at the minute. The
+     *     next 11 bits encode the minute within the day (starting from 0,
+     *     maximum value 1439) and the remaining bits are used as described
+     *     above for the date.
+     *   </dd>
+     *   <dt><code>11</code></dt>
+     *   <dd>
+     *     <code>sYYYY-MM-DDThh:mm:ss.sss</code>, i.e. full millisecond
+     *     accuracy. The next 30 bits encode the millisecond within the
+     * day (starting from 0, maximum value 86399999) and the remaining
+     *     bits are used as described above for the date.
+     *   </dd>
+     * </dl>
+     * <p>
+     * With full timezone and millisecond accuracies, this encoding leaves
+     * 10 bits (64 - 9 - 30 - 2 - 11 - 2) for the year offset, which allows
+     * for representation of all timestamps between years 1498 and 2521.
+     * Timestamps outside this range and with a minute-level timezone offset
+     * are automatically truncated to minute-level accuracy to support the
+     * full range of years -9999 to 9999 specified in JCR.
+     * <p>
+     * Note that the year, day of year, and time of day values are stored
+     * as separate bit sequences to avoid problems with changing leap second
+     * or leap year definitions. Bit fields are used for better encoding and
+     * decoding performance than what would be possible with the slightly more
+     * space efficient mechanism of using multiplication and modulo divisions
+     * to separate the different timestamp fields.
+     *
+     * @param value date value
+     * @throws IOException if an I/O error occurs
+     */
+    private void writeDate(Calendar value) throws IOException {
+        int y = value.get(Calendar.YEAR);
+        if (value.isSet(Calendar.ERA)
+                && value.get(Calendar.ERA) == GregorianCalendar.BC) {
+             y = 1 - y; // convert to an astronomical year
+        }
+        y -= 2010; // use a recent offset NOTE: do not change this!
+
+        int d = value.get(Calendar.DAY_OF_YEAR);
+        int h = value.get(Calendar.HOUR_OF_DAY);
+        int m = value.get(Calendar.MINUTE);
+        int s = value.get(Calendar.SECOND);
+        int u = value.get(Calendar.MILLISECOND);
+        int z = value.getTimeZone().getOffset(value.getTimeInMillis()) / (60 * 1000);
+        int zh = z / 60;
+        int zm = z - zh * 60;
+
+        long ts = y << 9 | d & 0x01ff;
+
+        if ((u != 0 || s != 0) && ((-512 <= y && y < 512) || zm == 0)) {
+            ts <<= 30;
+            ts |= (((h * 60 + m) * 60 + s) * 1000 + u) & 0x3fffffff; // 30 bits
+            ts <<= 2;
+            ts |= 3;
+        } else if (m != 0) {
+            ts <<= 11;
+            ts |= (h * 60 + m) & 0x07ff; // 11 bits
+            ts <<= 2;
+            ts |= 2;
+        } else if (h != 0) {
+            ts <<= 5;
+            ts |= h & 0x1f; // 5 bits
+            ts <<= 2;
+            ts |= 1;
+        } else {
+            ts <<= 2;
+        }
+
+        if (zm != 0) {
+            ts <<= 11;
+            ts |= z & 0x07ff; // 11 bits
+            writeVarLong(ts << 2 | 3);
+        } else if (zh != 0) {
+            ts <<= 5;
+            ts |= zh & 0x1f; // 5 bits
+            writeVarLong(ts << 2 | 1);
+        } else {
+            writeVarLong(ts << 1);
+        }
+    }
+
+    /**
      * Serializes a string in UTF-8. The length of the UTF-8 byte sequence
      * is first written as a variable-length string (see
      * {@link #writeVarInt(int)}), and then the sequence itself is written.

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/persistence/util/BundleBindingTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/persistence/util/BundleBindingTest.java?rev=1022864&r1=1022863&r2=1022864&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/persistence/util/BundleBindingTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/persistence/util/BundleBindingTest.java Fri Oct 15 09:16:15 2010
@@ -44,6 +44,8 @@ import junit.framework.TestCase;
 
 public class BundleBindingTest extends TestCase {
 
+    private static final NameFactory factory = NameFactoryImpl.getInstance();
+
     private BundleBinding binding;
 
     protected void setUp() throws Exception {
@@ -113,7 +115,6 @@ public class BundleBindingTest extends T
         bundle.setSharedSet(new HashSet<NodeId>(Arrays.asList(
                 new NodeId(5, 6), new NodeId(7, 8), new NodeId(9, 10))));
 
-        NameFactory factory = NameFactoryImpl.getInstance();
         PropertyEntry property;
 
         property = new PropertyEntry(
@@ -291,7 +292,6 @@ public class BundleBindingTest extends T
         bundle.setMixinTypeNames(Collections.<Name>emptySet());
         bundle.setSharedSet(Collections.<NodeId>emptySet());
 
-        NameFactory factory = NameFactoryImpl.getInstance();
         bundle.addChildNodeEntry(factory.create("ns1", "test"), new NodeId());
         bundle.addChildNodeEntry(factory.create("ns2", "test"), new NodeId());
         bundle.addChildNodeEntry(factory.create("ns3", "test"), new NodeId());
@@ -311,6 +311,63 @@ public class BundleBindingTest extends T
         assertBundleRoundtrip(bundle);
     }
 
+    /**
+     * Tests serialization of date values.
+     */
+    public void testDateSerialization() throws Exception {
+        assertDateSerialization("2010-10-10T10:10:10.100Z");
+
+        // Different kinds of timezone offsets
+        assertDateSerialization("2010-10-10T10:10:10.100+11:00");
+        assertDateSerialization("2010-10-10T10:10:10.100-14:00");
+        assertDateSerialization("2010-10-10T10:10:10.100+00:12");
+        assertDateSerialization("2010-10-10T10:10:10.100-08:14");
+
+        // Different timestamp accuracies
+        assertDateSerialization("2010-10-10T10:10:00.000Z");
+        assertDateSerialization("2010-10-10T10:00:00.000Z");
+        assertDateSerialization("2010-10-10T00:00:00.000Z");
+
+        // Dates far from today
+        assertDateSerialization("1970-01-01T00:00:00.000Z");
+        assertDateSerialization("1970-01-01T12:34:56.789-13:45");
+        assertDateSerialization("2030-10-10T10:10:10.100+10:10");
+        assertDateSerialization("2345-10-10T10:10:10.100Z");
+        assertDateSerialization("+9876-10-10T10:10:10.100Z");
+        assertDateSerialization("-9876-10-10T10:10:10.100Z");
+    }
+
+    private void assertDateSerialization(String date) throws Exception {
+        assertValueSerialization(
+                InternalValue.valueOf(date, PropertyType.DATE));
+    }
+
+    private void assertValueSerialization(InternalValue value)
+            throws Exception {
+        NodePropBundle bundle = new NodePropBundle(new NodeId());
+        bundle.setParentId(new NodeId());
+        bundle.setNodeTypeName(NameConstants.NT_UNSTRUCTURED);
+        bundle.setMixinTypeNames(Collections.<Name>emptySet());
+        bundle.setSharedSet(Collections.<NodeId>emptySet());
+
+        Name name = factory.create("", "test");
+
+        PropertyEntry property =
+            new PropertyEntry(new PropertyId(bundle.getId(), name));
+        property.setType(value.getType());
+        property.setMultiValued(false);
+        property.setValues(new InternalValue[] { value });
+        bundle.addProperty(property);
+
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        binding.writeBundle(buffer, bundle);
+        byte[] bytes = buffer.toByteArray();
+        NodePropBundle result =
+            binding.readBundle(new ByteArrayInputStream(bytes), bundle.getId());
+
+        assertEquals(value, result.getPropertyEntry(name).getValues()[0]);
+    }
+
     private void assertBundleRoundtrip(NodePropBundle bundle)
             throws Exception {
         ByteArrayOutputStream buffer = new ByteArrayOutputStream();