You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2016/07/04 07:49:54 UTC

[2/4] lucene-solr:branch_6x: LUCENE-7351: Doc id compression for points.

LUCENE-7351: Doc id compression for points.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/01de73bc
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/01de73bc
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/01de73bc

Branch: refs/heads/branch_6x
Commit: 01de73bc0a1b315bbbe4df046b5c0661cec8de2e
Parents: 4eaf66a
Author: Adrien Grand <jp...@gmail.com>
Authored: Mon Jul 4 09:13:41 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Mon Jul 4 09:33:59 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   2 +
 .../org/apache/lucene/util/bkd/BKDReader.java   |  18 +-
 .../org/apache/lucene/util/bkd/BKDWriter.java   |   8 +-
 .../apache/lucene/util/bkd/DocIdsWriter.java    | 170 +++++++++++++++++++
 .../lucene/util/bkd/TestDocIdsWriter.java       | 101 +++++++++++
 5 files changed, 288 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01de73bc/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 53e18a5..ce6e60b2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -64,6 +64,8 @@ Optimizations
 
 * LUCENE-7356: SearchGroup tweaks. (Christine Poerschke)
 
+* LUCENE-7351: Doc id compression for points. (Adrien Grand)
+
 Other
 
 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01de73bc/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 9a123a2..3566bc1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -44,11 +44,12 @@ public class BKDReader implements Accountable {
   final byte[] maxPackedValue;
   final long pointCount;
   final int docCount;
+  final int version;
   protected final int packedBytesLength;
 
   /** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned */
   public BKDReader(IndexInput in) throws IOException {
-    CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_START);
+    version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
     numDims = in.readVInt();
     maxPointsInLeafNode = in.readVInt();
     bytesPerDim = in.readVInt();
@@ -141,6 +142,7 @@ public class BKDReader implements Accountable {
     this.maxPackedValue = maxPackedValue;
     this.pointCount = pointCount;
     this.docCount = docCount;
+    this.version = BKDWriter.VERSION_CURRENT;
     assert minPackedValue.length == packedBytesLength;
     assert maxPackedValue.length == packedBytesLength;
   }
@@ -314,13 +316,15 @@ public class BKDReader implements Accountable {
   protected void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
     // Leaf node
     in.seek(blockFP);
-      
+
     // How many points are stored in this leaf cell:
     int count = in.readVInt();
     visitor.grow(count);
 
-    for(int i=0;i<count;i++) {
-      visitor.visit(in.readInt());
+    if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
+      DocIdsWriter.readInts32(in, count, visitor);
+    } else {
+      DocIdsWriter.readInts(in, count, visitor);
     }
   }
 
@@ -330,8 +334,10 @@ public class BKDReader implements Accountable {
     // How many points are stored in this leaf cell:
     int count = in.readVInt();
 
-    for(int i=0;i<count;i++) {
-      docIDs[i] = in.readInt();
+    if (version < BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
+      DocIdsWriter.readInts32(in, count, docIDs);
+    } else {
+      DocIdsWriter.readInts(in, count, docIDs);
     }
 
     return count;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01de73bc/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index ad9dd5d..6dfdac2 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -77,7 +77,8 @@ public class BKDWriter implements Closeable {
 
   public static final String CODEC_NAME = "BKD";
   public static final int VERSION_START = 0;
-  public static final int VERSION_CURRENT = VERSION_START;
+  public static final int VERSION_COMPRESSED_DOC_IDS = 1;
+  public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS;
 
   /** How many bytes each docs takes in the fixed-width offline format */
   private final int bytesPerDoc;
@@ -892,10 +893,7 @@ public class BKDWriter implements Closeable {
   protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws IOException {
     assert count > 0: "maxPointsInLeafNode=" + maxPointsInLeafNode;
     out.writeVInt(count);
-
-    for (int i=0;i<count;i++) {
-      out.writeInt(docIDs[start + i]);
-    }
+    DocIdsWriter.writeDocIds(docIDs, start, count, out);
   }
 
   protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01de73bc/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
new file mode 100644
index 0000000..9dce5a8
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/DocIdsWriter.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+class DocIdsWriter {
+
+  private DocIdsWriter() {}
+
+  static void writeDocIds(int[] docIds, int start, int count, IndexOutput out) throws IOException {
+    // docs can be sorted either when all docs in a block have the same value
+    // or when a segment is sorted
+    boolean sorted = true;
+    for (int i = 1; i < count; ++i) {
+      if (docIds[start + i - 1] > docIds[start + i]) {
+        sorted = false;
+        break;
+      }
+    }
+    if (sorted) {
+      out.writeByte((byte) 0);
+      int previous = 0;
+      for (int i = 0; i < count; ++i) {
+        int doc = docIds[start + i];
+        out.writeVInt(doc - previous);
+        previous = doc;
+      }
+    } else {
+      long max = 0;
+      for (int i = 0; i < count; ++i) {
+        max |= Integer.toUnsignedLong(docIds[start + i]);
+      }
+      if (max <= 0xffffff) {
+        out.writeByte((byte) 24);
+        for (int i = 0; i < count; ++i) {
+          out.writeShort((short) (docIds[start + i] >>> 8));
+          out.writeByte((byte) docIds[start + i]);
+        }
+      } else {
+        out.writeByte((byte) 32);
+        for (int i = 0; i < count; ++i) {
+          out.writeInt(docIds[start + i]);
+        }
+      }
+    }
+  }
+
+  /** Read {@code count} integers into {@code docIDs}. */
+  static void readInts(IndexInput in, int count, int[] docIDs) throws IOException {
+    final int bpv = in.readByte();
+    switch (bpv) {
+      case 0:
+        readDeltaVInts(in, count, docIDs);
+        break;
+      case 32:
+        readInts32(in, count, docIDs);
+        break;
+      case 24:
+        readInts24(in, count, docIDs);
+        break;
+      default:
+        throw new IOException("Unsupported number of bits per value: " + bpv);
+    }
+  }
+
+  private static void readDeltaVInts(IndexInput in, int count, int[] docIDs) throws IOException {
+    int doc = 0;
+    for (int i = 0; i < count; i++) {
+      doc += in.readVInt();
+      docIDs[i] = doc;
+    }
+  }
+
+  static <T> void readInts32(IndexInput in, int count, int[] docIDs) throws IOException {
+    for (int i = 0; i < count; i++) {
+      docIDs[i] = in.readInt();
+    }
+  }
+
+  private static void readInts24(IndexInput in, int count, int[] docIDs) throws IOException {
+    int i;
+    for (i = 0; i < count - 7; i += 8) {
+      long l1 = in.readLong();
+      long l2 = in.readLong();
+      long l3 = in.readLong();
+      docIDs[i] =  (int) (l1 >>> 40);
+      docIDs[i+1] = (int) (l1 >>> 16) & 0xffffff;
+      docIDs[i+2] = (int) (((l1 & 0xffff) << 8) | (l2 >>> 56));
+      docIDs[i+3] = (int) (l2 >>> 32) & 0xffffff;
+      docIDs[i+4] = (int) (l2 >>> 8) & 0xffffff;
+      docIDs[i+5] = (int) (((l2 & 0xff) << 16) | (l3 >>> 48));
+      docIDs[i+6] = (int) (l3 >>> 24) & 0xffffff;
+      docIDs[i+7] = (int) l3 & 0xffffff;
+    }
+    for (; i < count; ++i) {
+      docIDs[i] = (Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte());
+    }
+  }
+
+  /** Read {@code count} integers and feed the result directly to {@link IntersectVisitor#visit(int)}. */
+  static void readInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
+    final int bpv = in.readByte();
+    switch (bpv) {
+      case 0:
+        readDeltaVInts(in, count, visitor);
+        break;
+      case 32:
+        readInts32(in, count, visitor);
+        break;
+      case 24:
+        readInts24(in, count, visitor);
+        break;
+      default:
+        throw new IOException("Unsupported number of bits per value: " + bpv);
+    }
+  }
+
+  private static void readDeltaVInts(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
+    int doc = 0;
+    for (int i = 0; i < count; i++) {
+      doc += in.readVInt();
+      visitor.visit(doc);
+    }
+  }
+
+  static void readInts32(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
+    for (int i = 0; i < count; i++) {
+      visitor.visit(in.readInt());
+    }
+  }
+
+  private static void readInts24(IndexInput in, int count, IntersectVisitor visitor) throws IOException {
+    int i;
+    for (i = 0; i < count - 7; i += 8) {
+      long l1 = in.readLong();
+      long l2 = in.readLong();
+      long l3 = in.readLong();
+      visitor.visit((int) (l1 >>> 40));
+      visitor.visit((int) (l1 >>> 16) & 0xffffff);
+      visitor.visit((int) (((l1 & 0xffff) << 8) | (l2 >>> 56)));
+      visitor.visit((int) (l2 >>> 32) & 0xffffff);
+      visitor.visit((int) (l2 >>> 8) & 0xffffff);
+      visitor.visit((int) (((l2 & 0xff) << 16) | (l3 >>> 48)));
+      visitor.visit((int) (l3 >>> 24) & 0xffffff);
+      visitor.visit((int) l3 & 0xffffff);
+    }
+    for (; i < count; ++i) {
+      visitor.visit((Short.toUnsignedInt(in.readShort()) << 8) | Byte.toUnsignedInt(in.readByte()));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/01de73bc/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
new file mode 100644
index 0000000..1191ba5
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestDocIdsWriter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.bkd;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+public class TestDocIdsWriter extends LuceneTestCase {
+
+  public void testRandom() throws Exception {
+    try (Directory dir = newDirectory()) {
+      for (int iter = 0; iter < 1000; ++iter) {
+        int[] docIDs = new int[random().nextInt(5000)];
+        final int bpv = TestUtil.nextInt(random(), 1, 32);
+        for (int i = 0; i < docIDs.length; ++i) {
+          docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
+        }
+        test(dir, docIDs);
+      }
+    }
+  }
+
+  public void testSorted() throws Exception {
+    try (Directory dir = newDirectory()) {
+      for (int iter = 0; iter < 1000; ++iter) {
+        int[] docIDs = new int[random().nextInt(5000)];
+        final int bpv = TestUtil.nextInt(random(), 1, 32);
+        for (int i = 0; i < docIDs.length; ++i) {
+          docIDs[i] = TestUtil.nextInt(random(), 0, (1 << bpv) - 1);
+        }
+        Arrays.sort(docIDs);
+        test(dir, docIDs);
+      }
+    }
+  }
+
+  private void test(Directory dir, int[] ints) throws Exception {
+    final long len;
+    try(IndexOutput out = dir.createOutput("tmp", IOContext.DEFAULT)) {
+      DocIdsWriter.writeDocIds(ints, 0, ints.length, out);
+      len = out.getFilePointer();
+      if (random().nextBoolean()) {
+        out.writeLong(0); // garbage
+      }
+    }
+    try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
+      int[] read = new int[ints.length];
+      DocIdsWriter.readInts(in, ints.length, read);
+      assertArrayEquals(ints, read);
+      assertEquals(len, in.getFilePointer());
+    }
+    try (IndexInput in = dir.openInput("tmp", IOContext.READONCE)) {
+      int[] read = new int[ints.length];
+      DocIdsWriter.readInts(in, ints.length, new IntersectVisitor() {
+        int i = 0;
+        @Override
+        public void visit(int docID) throws IOException {
+          read[i++] = docID;
+        }
+
+        @Override
+        public void visit(int docID, byte[] packedValue) throws IOException {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+          throw new UnsupportedOperationException();
+        }
+
+      });
+      assertArrayEquals(ints, read);
+      assertEquals(len, in.getFilePointer());
+    }
+    dir.deleteFile("tmp");
+  }
+
+}