You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ho...@apache.org on 2016/03/12 01:27:05 UTC
[24/50] [abbrv] lucene-solr git commit: LUCENE-7081: prefix-compress compressible fixed-width data (like InetAddress/BigInteger)

LUCENE-7081: prefix-compress compressible fixed-width data (like InetAddress/BigInteger)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/162636bf
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/162636bf
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/162636bf

Branch: refs/heads/jira/SOLR-445
Commit: 162636bf05b5b6b35a79bacd2e7440830b05960f
Parents: 89cc676
Author: Robert Muir <rm...@apache.org>
Authored: Thu Mar 10 07:25:48 2016 -0500
Committer: Robert Muir <rm...@apache.org>
Committed: Thu Mar 10 07:25:48 2016 -0500

----------------------------------------------------------------------
 .../lucene54/Lucene54DocValuesConsumer.java     | 21 +++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/162636bf/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
index 858c54b..96acfd2 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene54/Lucene54DocValuesConsumer.java
@@ -411,17 +411,32 @@ final class Lucene54DocValuesConsumer extends DocValuesConsumer implements Close
   
   /** expert: writes a value dictionary for a sorted/sortedset field */
   private void addTermsDict(FieldInfo field, final Iterable<BytesRef> values) throws IOException {
-    // first check if it's a "fixed-length" terms dict
+    // first check if it's a "fixed-length" terms dict, and compressibility if so
     int minLength = Integer.MAX_VALUE;
     int maxLength = Integer.MIN_VALUE;
     long numValues = 0;
+    BytesRefBuilder previousValue = new BytesRefBuilder();
+    long prefixSum = 0; // only valid for fixed-width data, as we have a choice there
     for (BytesRef v : values) {
       minLength = Math.min(minLength, v.length);
       maxLength = Math.max(maxLength, v.length);
+      if (minLength == maxLength) {
+        int termPosition = (int) (numValues & INTERVAL_MASK);
+        if (termPosition == 0) {
+          // first term in block, save it away to compare against the last term later
+          previousValue.copyBytes(v);
+        } else if (termPosition == INTERVAL_COUNT - 1) {
+          // last term in block, accumulate shared prefix against first term
+          prefixSum += StringHelper.bytesDifference(previousValue.get(), v);
+        }
+      }
       numValues++;
     }
-    if (minLength == maxLength) {
-      // no index needed: direct addressing by mult
+    // for fixed width data, look at the avg(shared prefix) before deciding how to encode:
+    // prefix compression "costs" worst case 2 bytes per term because we must store suffix lengths.
+    // so if we share at least 3 bytes on average, always compress.
+    if (minLength == maxLength && prefixSum <= 3*(numValues >> INTERVAL_SHIFT)) {
+      // no index needed: not very compressible, direct addressing by mult
       addBinaryField(field, values);
     } else if (numValues < REVERSE_INTERVAL_COUNT) {
       // low cardinality: waste a few KB of ram, but can't really use fancy index etc