You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2015/07/21 10:01:25 UTC

svn commit: r1692061 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/ lucene/core/ lucene/core/src/java/org/apache/lucene/codecs/lucene50/ lucene/core/src/test/org/apache/lucene/codecs/...

Author: jpountz
Date: Tue Jul 21 08:01:24 2015
New Revision: 1692061

URL: http://svn.apache.org/r1692061
Log:
LUCENE-6668: Added table encoding to sorted set/numeric doc values.

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java
    lucene/dev/branches/branch_5x/lucene/core/   (props changed)
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java
    lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java
    lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java
    lucene/dev/branches/branch_5x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Tue Jul 21 08:01:24 2015
@@ -313,6 +313,10 @@ Optimizations
   and TermsQuery. This should especially help when there are lots of small
   postings lists. (Adrien Grand, Mike McCandless)
 
+* LUCENE-6668: Optimized storage for sorted set and sorted numeric doc values
+  in the case that there are few unique sets of values.
+  (Adrien Grand, Robert Muir)
+
 Build
 
 * LUCENE-6518: Don't report false thread leaks from IBM J9

Modified: lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene410/TestLucene410DocValuesFormat.java Tue Jul 21 08:01:24 2015
@@ -63,7 +63,7 @@ public class TestLucene410DocValuesForma
   public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16);
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
     }
   }
   
@@ -71,7 +71,7 @@ public class TestLucene410DocValuesForma
   public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16);
+      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
     }
   }
   

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesConsumer.java Tue Jul 21 08:01:24 2015
@@ -23,6 +23,11 @@ import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
@@ -34,6 +39,7 @@ import org.apache.lucene.store.RAMOutput
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LongsRef;
 import org.apache.lucene.util.MathUtil;
 import org.apache.lucene.util.PagedBytes;
 import org.apache.lucene.util.PagedBytes.PagedBytesDataInput;
@@ -463,11 +469,22 @@ class Lucene50DocValuesConsumer extends
       // The field is single-valued, we can encode it as NUMERIC
       addNumericField(field, singletonView(docToValueCount, values, null));
     } else {
-      meta.writeVInt(SORTED_WITH_ADDRESSES);
-      // write the stream of values as a numeric field
-      addNumericField(field, values, true);
-      // write the doc -> ord count as a absolute index to the stream
-      addAddresses(field, docToValueCount);
+      final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToValueCount, values);
+      if (uniqueValueSets != null) {
+        meta.writeVInt(SORTED_SET_TABLE);
+
+        // write the set_id -> values mapping
+        writeDictionary(uniqueValueSets);
+
+        // write the doc -> set_id as a numeric field
+        addNumericField(field, docToSetId(uniqueValueSets, docToValueCount, values), false);
+      } else {
+        meta.writeVInt(SORTED_WITH_ADDRESSES);
+        // write the stream of values as a numeric field
+        addNumericField(field, values, true);
+        // write the doc -> ord count as a absolute index to the stream
+        addAddresses(field, docToValueCount);
+      }
     }
   }
 
@@ -481,20 +498,120 @@ class Lucene50DocValuesConsumer extends
       // The field is single-valued, we can encode it as SORTED
       addSortedField(field, values, singletonView(docToOrdCount, ords, -1L));
     } else {
-      meta.writeVInt(SORTED_WITH_ADDRESSES);
+      final SortedSet<LongsRef> uniqueValueSets = uniqueValueSets(docToOrdCount, ords);
+      if (uniqueValueSets != null) {
+        meta.writeVInt(SORTED_SET_TABLE);
+
+        // write the set_id -> ords mapping
+        writeDictionary(uniqueValueSets);
 
-      // write the ord -> byte[] as a binary field
-      addTermsDict(field, values);
+        // write the ord -> byte[] as a binary field
+        addTermsDict(field, values);
 
-      // write the stream of ords as a numeric field
-      // NOTE: we could return an iterator that delta-encodes these within a doc
-      addNumericField(field, ords, false);
+        // write the doc -> set_id as a numeric field
+        addNumericField(field, docToSetId(uniqueValueSets, docToOrdCount, ords), false);
+      } else {
+        meta.writeVInt(SORTED_WITH_ADDRESSES);
+
+        // write the ord -> byte[] as a binary field
+        addTermsDict(field, values);
+
+        // write the stream of ords as a numeric field
+        // NOTE: we could return an iterator that delta-encodes these within a doc
+        addNumericField(field, ords, false);
 
-      // write the doc -> ord count as a absolute index to the stream
-      addAddresses(field, docToOrdCount);
+        // write the doc -> ord count as a absolute index to the stream
+        addAddresses(field, docToOrdCount);
+      }
     }
   }
-  
+
+  private SortedSet<LongsRef> uniqueValueSets(Iterable<Number> docToValueCount, Iterable<Number> values) {
+    Set<LongsRef> uniqueValueSet = new HashSet<>();
+    LongsRef docValues = new LongsRef(256);
+
+    Iterator<Number> valueCountIterator = docToValueCount.iterator();
+    Iterator<Number> valueIterator = values.iterator();
+    int totalDictSize = 0;
+    while (valueCountIterator.hasNext()) {
+      docValues.length = valueCountIterator.next().intValue();
+      if (docValues.length > 256) {
+        return null;
+      }
+      for (int i = 0; i < docValues.length; ++i) {
+        docValues.longs[i] = valueIterator.next().longValue();
+      }
+      if (uniqueValueSet.contains(docValues)) {
+        continue;
+      }
+      totalDictSize += docValues.length;
+      if (totalDictSize > 256) {
+        return null;
+      }
+      uniqueValueSet.add(new LongsRef(Arrays.copyOf(docValues.longs, docValues.length), 0, docValues.length));
+    }
+    assert valueIterator.hasNext() == false;
+    return new TreeSet<>(uniqueValueSet);
+  }
+
+  private void writeDictionary(SortedSet<LongsRef> uniqueValueSets) throws IOException {
+    int lengthSum = 0;
+    for (LongsRef longs : uniqueValueSets) {
+      lengthSum += longs.length;
+    }
+
+    meta.writeInt(lengthSum);
+    for (LongsRef valueSet : uniqueValueSets) {
+      for (int  i = 0; i < valueSet.length; ++i) {
+        meta.writeLong(valueSet.longs[valueSet.offset + i]);
+      }
+    }
+
+    meta.writeInt(uniqueValueSets.size());
+    for (LongsRef valueSet : uniqueValueSets) {
+      meta.writeInt(valueSet.length);
+    }
+  }
+
+  private Iterable<Number> docToSetId(SortedSet<LongsRef> uniqueValueSets, final Iterable<Number> docToValueCount, final Iterable<Number> values) {
+    final Map<LongsRef, Integer> setIds = new HashMap<>();
+    int i = 0;
+    for (LongsRef set : uniqueValueSets) {
+      setIds.put(set, i++);
+    }
+    assert i == uniqueValueSets.size();
+
+    return new Iterable<Number>() {
+
+      @Override
+      public Iterator<Number> iterator() {
+        final Iterator<Number> valueCountIterator = docToValueCount.iterator();
+        final Iterator<Number> valueIterator = values.iterator();
+        final LongsRef docValues = new LongsRef(256);
+        return new Iterator<Number>() {
+
+          @Override
+          public boolean hasNext() {
+            return valueCountIterator.hasNext();
+          }
+
+          @Override
+          public Number next() {
+            docValues.length = valueCountIterator.next().intValue();
+            for (int i = 0; i < docValues.length; ++i) {
+              docValues.longs[i] = valueIterator.next().longValue();
+            }
+            final Integer id = setIds.get(docValues);
+            assert id != null;
+            return id;
+          }
+
+        };
+
+      }
+    };
+  }
+
   // writes addressing information as MONOTONIC_COMPRESSED integer
   private void addAddresses(FieldInfo field, Iterable<Number> values) throws IOException {
     meta.writeVInt(field.number);

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesFormat.java Tue Jul 21 08:01:24 2015
@@ -72,13 +72,21 @@ import org.apache.lucene.util.packed.Mon
  * <p>
  * {@link DocValuesType#SORTED_SET SORTED_SET}:
  * <ul>
+ *    <li>Single: if all documents have 0 or 1 value, then data are written like SORTED.
+ *    <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
+ *        an id, a lookup table is written and the mapping from document to set id is written using the
+ *        numeric strategies above.
  *    <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary, 
  *        an ordinal list and per-document index into this list are written using the numeric strategies 
- *        above. 
+ *        above.
  * </ul>
  * <p>
  * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}:
  * <ul>
+ *    <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC.
+ *    <li>SortedSet table: when there are few unique sets of values (&lt; 256) then each set is assigned
+ *        an id, a lookup table is written and the mapping from document to set id is written using the
+ *        numeric strategies above.
  *    <li>SortedNumeric: a value list and per-document index into this list are written using the numeric
  *        strategies above.
  * </ul>
@@ -108,21 +116,24 @@ import org.apache.lucene.util.packed.Mon
  *     <li>PrefixBinaryEntry --&gt; BinaryHeader,AddressInterval,AddressOffset,PackedVersion,BlockSize</li>
  *     <li>BinaryHeader --&gt; FieldNumber,EntryType,BinaryType,MissingOffset,MinLength,MaxLength,DataOffset</li>
  *     <li>SortedEntry --&gt; FieldNumber,EntryType,BinaryEntry,NumericEntry</li>
- *     <li>SortedSetEntry --&gt; EntryType,BinaryEntry,NumericEntry,NumericEntry</li>
- *     <li>SortedNumericEntry --&gt; EntryType,NumericEntry,NumericEntry</li>
+ *     <li>SortedSetEntry --&gt; SingleSortedSetEntry | AddressesSortedSetEntry | TableSortedSetEntry</li>
+ *     <li>SingleSortedSetEntry --&gt; SetHeader,SortedEntry</li>
+ *     <li>AddressesSortedSetEntry --&gt; SetHeader,BinaryEntry,NumericEntry,NumericEntry</li>
+ *     <li>TableSortedSetEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,BinaryEntry,NumericEntry</li>
+ *     <li>SetHeader --&gt; FieldNumber,EntryType,SetType</li>
+ *     <li>SortedNumericEntry --&gt; SingleSortedNumericEntry | AddressesSortedNumericEntry | TableSortedNumericEntry</li>
+ *     <li>SingleNumericEntry --&gt; SetHeader,NumericEntry</li>
+ *     <li>AddressesSortedNumericEntry --&gt; SetHeader,NumericEntry,NumericEntry</li>
+ *     <li>TableSortedNumericEntry --&gt; SetHeader,TotalTableLength,{@link DataOutput#writeLong Int64}<sup>TotalTableLength</sup>,TableSize,{@link DataOutput#writeInt Int32}<sup>TableSize</sup>,NumericEntry</li>
  *     <li>FieldNumber,PackedVersion,MinLength,MaxLength,BlockSize,ValueCount --&gt; {@link DataOutput#writeVInt VInt}</li>
  *     <li>EntryType,CompressionType --&gt; {@link DataOutput#writeByte Byte}</li>
  *     <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
  *     <li>MinValue,GCD,MissingOffset,AddressOffset,DataOffset,EndOffset --&gt; {@link DataOutput#writeLong Int64}</li>
- *     <li>TableSize,BitsPerValue --&gt; {@link DataOutput#writeVInt vInt}</li>
+ *     <li>TableSize,BitsPerValue,TotalTableLength --&gt; {@link DataOutput#writeVInt vInt}</li>
  *     <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
  *   </ul>
  *   <p>Sorted fields have two entries: a BinaryEntry with the value metadata,
  *      and an ordinary NumericEntry for the document-to-ord metadata.</p>
- *   <p>SortedSet fields have three entries: a BinaryEntry with the value metadata,
- *      and two NumericEntries for the document-to-ord-index and ordinal list metadata.</p>
- *   <p>SortedNumeric fields have two entries: A NumericEntry with the value metadata,
- *      and a numeric entry with the document-to-value index.</p>
  *   <p>FieldNumber of -1 indicates the end of metadata.</p>
  *   <p>EntryType is a 0 (NumericEntry) or 1 (BinaryEntry)</p>
  *   <p>DataOffset is the pointer to the start of the data in the DocValues data (.dvd)</p>
@@ -144,6 +155,15 @@ import org.apache.lucene.util.packed.Mon
  *         <li>1 --&gt; variable-width. An address for each value is stored.
  *         <li>2 --&gt; prefix-compressed. An address to the start of every interval'th value is stored.
  *      </ul>
+ *   <p>SetType indicates how SortedSet and SortedNumeric values will be stored:
+ *       <ul>
+ *         <li>0 --&gt; with addresses. There are two numeric entries: a first one from document to start
+ *             offset, and a second one from offset to ord/value.
+ *         <li>1 --&gt; single-valued. Used when all documents have at most one value and is encoded like
+ *             a regular Sorted/Numeric entry.
+ *         <li>2 --&gt; table-encoded. A lookup table of unique sets of values is written, followed by a
+ *             numeric entry that maps each document to an ordinal in this table.
+ *       </ul>
  *   <p>MinLength and MaxLength represent the min and max byte[] value lengths for Binary values.
  *      If they are equal, then all values are of a fixed size, and can be addressed as DataOffset + (docID * length).
  *      Otherwise, the binary values are of variable size, and packed integer metadata (PackedVersion,BlockSize)
@@ -187,7 +207,8 @@ public final class Lucene50DocValuesForm
   static final String META_CODEC = "Lucene50DocValuesMetadata";
   static final String META_EXTENSION = "dvm";
   static final int VERSION_START = 0;
-  static final int VERSION_CURRENT = VERSION_START;
+  static final int VERSION_SORTEDSET_TABLE = 1;
+  static final int VERSION_CURRENT = VERSION_SORTEDSET_TABLE;
   
   // indicates docvalues type
   static final byte NUMERIC = 0;
@@ -235,6 +256,9 @@ public final class Lucene50DocValuesForm
   /** Single-valued sorted set values, encoded as sorted values, so no level
    *  of indirection: {@code docId -> ord}. */
   static final int SORTED_SINGLE_VALUED = 1;
+  /** Compressed giving IDs to unique sets of values:
+   * {@code docId -> setId -> ords} */
+  static final int SORTED_SET_TABLE = 2;
   
   /** placeholder for missing offset that means there are no missing values */
   static final int ALL_LIVE = -1;

Modified: lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50DocValuesProducer.java Tue Jul 21 08:01:24 2015
@@ -206,6 +206,28 @@ class Lucene50DocValuesProducer extends
     ordIndexes.put(info.name, n2);
   }
 
+  private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException {
+    // sortedset table = binary + ordset table + ordset index
+    if (meta.readVInt() != info.number) {
+      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+    }
+    if (meta.readByte() != Lucene50DocValuesFormat.BINARY) {
+      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+    }
+
+    BinaryEntry b = readBinaryEntry(meta);
+    binaries.put(info.name, b);
+
+    if (meta.readVInt() != info.number) {
+      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+    }
+    if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+      throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
+    }
+    NumericEntry n = readNumericEntry(meta);
+    ords.put(info.name, n);
+  }
+
   private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
     int numFields = 0;
     int fieldNumber = meta.readVInt();
@@ -229,6 +251,8 @@ class Lucene50DocValuesProducer extends
         sortedSets.put(info.name, ss);
         if (ss.format == SORTED_WITH_ADDRESSES) {
           readSortedSetFieldWithAddresses(info, meta);
+        } else if (ss.format == SORTED_SET_TABLE) {
+          readSortedSetFieldWithTable(info, meta);
         } else if (ss.format == SORTED_SINGLE_VALUED) {
           if (meta.readVInt() != fieldNumber) {
             throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
@@ -243,13 +267,6 @@ class Lucene50DocValuesProducer extends
       } else if (type == Lucene50DocValuesFormat.SORTED_NUMERIC) {
         SortedSetEntry ss = readSortedSetEntry(meta);
         sortedNumerics.put(info.name, ss);
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
-        }
-        if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
-          throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
-        }
-        numerics.put(info.name, readNumericEntry(meta));
         if (ss.format == SORTED_WITH_ADDRESSES) {
           if (meta.readVInt() != fieldNumber) {
             throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
@@ -257,9 +274,33 @@ class Lucene50DocValuesProducer extends
           if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
             throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
           }
+          numerics.put(info.name, readNumericEntry(meta));
+          if (meta.readVInt() != fieldNumber) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
+          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
           NumericEntry ordIndex = readNumericEntry(meta);
           ordIndexes.put(info.name, ordIndex);
-        } else if (ss.format != SORTED_SINGLE_VALUED) {
+        } else if (ss.format == SORTED_SET_TABLE) {
+          if (meta.readVInt() != info.number) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
+          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
+          NumericEntry n = readNumericEntry(meta);
+          ords.put(info.name, n);
+        } else if (ss.format == SORTED_SINGLE_VALUED) {
+          if (meta.readVInt() != fieldNumber) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
+          if (meta.readByte() != Lucene50DocValuesFormat.NUMERIC) {
+            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
+          }
+          numerics.put(info.name, readNumericEntry(meta));
+        } else {
           throw new AssertionError();
         }
       } else {
@@ -346,7 +387,24 @@ class Lucene50DocValuesProducer extends
   SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
     SortedSetEntry entry = new SortedSetEntry();
     entry.format = meta.readVInt();
-    if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
+    if (entry.format == SORTED_SET_TABLE) {
+      final int totalTableLength = meta.readInt();
+      if (totalTableLength > 256) {
+        throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta);
+      }
+      entry.table = new long[totalTableLength];
+      for (int i = 0; i < totalTableLength; ++i) {
+        entry.table[i] = meta.readLong();
+      }
+      final int tableSize = meta.readInt();
+      if (tableSize > totalTableLength + 1) { // +1 because of the empty set
+        throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta);
+      }
+      entry.tableOffsets = new int[tableSize + 1];
+      for (int i = 1; i < entry.tableOffsets.length; ++i) {
+        entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt();
+      }
+    } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
       throw new CorruptIndexException("Unknown format: " + entry.format, meta);
     }
     return entry;
@@ -611,12 +669,14 @@ class Lucene50DocValuesProducer extends
   @Override
   public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
     SortedSetEntry ss = sortedNumerics.get(field.name);
-    NumericEntry numericEntry = numerics.get(field.name);
-    final LongValues values = getNumeric(numericEntry);
     if (ss.format == SORTED_SINGLE_VALUED) {
+      NumericEntry numericEntry = numerics.get(field.name);
+      final LongValues values = getNumeric(numericEntry);
       final Bits docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
       return DocValues.singleton(values, docsWithField);
     } else if (ss.format == SORTED_WITH_ADDRESSES) {
+      NumericEntry numericEntry = numerics.get(field.name);
+      final LongValues values = getNumeric(numericEntry);
       final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(field, ordIndexes.get(field.name));
       
       return new SortedNumericDocValues() {
@@ -639,6 +699,33 @@ class Lucene50DocValuesProducer extends
           return (int) (endOffset - startOffset);
         }
       };
+    } else if (ss.format == SORTED_SET_TABLE) {
+      NumericEntry entry = ords.get(field.name);
+      final LongValues ordinals = getNumeric(entry);
+
+      final long[] table = ss.table;
+      final int[] offsets = ss.tableOffsets;
+      return new SortedNumericDocValues() {
+        int startOffset;
+        int endOffset;
+        
+        @Override
+        public void setDocument(int doc) {
+          final int ord = (int) ordinals.get(doc);
+          startOffset = offsets[ord];
+          endOffset = offsets[ord + 1];
+        }
+
+        @Override
+        public long valueAt(int index) {
+          return table[startOffset + index];
+        }
+
+        @Override
+        public int count() {
+          return endOffset - startOffset;
+        }
+      };
     } else {
       throw new AssertionError();
     }
@@ -647,13 +734,20 @@ class Lucene50DocValuesProducer extends
   @Override
   public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
     SortedSetEntry ss = sortedSets.get(field.name);
-    if (ss.format == SORTED_SINGLE_VALUED) {
-      final SortedDocValues values = getSorted(field);
-      return DocValues.singleton(values);
-    } else if (ss.format != SORTED_WITH_ADDRESSES) {
-      throw new AssertionError();
+    switch (ss.format) {
+      case SORTED_SINGLE_VALUED:
+        final SortedDocValues values = getSorted(field);
+        return DocValues.singleton(values);
+      case SORTED_WITH_ADDRESSES:
+        return getSortedSetWithAddresses(field);
+      case SORTED_SET_TABLE:
+        return getSortedSetTable(field, ss);
+      default:
+        throw new AssertionError();
     }
+  }
 
+  private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException {
     final long valueCount = binaries.get(field.name).count;
     // we keep the byte[]s and list of ords on disk, these could be large
     final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
@@ -722,7 +816,76 @@ class Lucene50DocValuesProducer extends
       }
     };
   }
-  
+
+  private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException {
+    final long valueCount = binaries.get(field.name).count;
+    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+    final LongValues ordinals = getNumeric(ords.get(field.name));
+
+    final long[] table = ss.table;
+    final int[] offsets = ss.tableOffsets;
+
+    return new RandomAccessOrds() {
+
+      int offset, startOffset, endOffset;
+
+      @Override
+      public void setDocument(int docID) {
+        final int ord = (int) ordinals.get(docID);
+        offset = startOffset = offsets[ord];
+        endOffset = offsets[ord + 1];
+      }
+
+      @Override
+      public long ordAt(int index) {
+        return table[startOffset + index];
+      }
+
+      @Override
+      public long nextOrd() {
+        if (offset == endOffset) {
+          return NO_MORE_ORDS;
+        } else {
+          return table[offset++];
+        }
+      }
+
+      @Override
+      public int cardinality() {
+        return endOffset - startOffset;
+      }
+
+      @Override
+      public BytesRef lookupOrd(long ord) {
+        return binary.get(ord);
+      }
+
+      @Override
+      public long getValueCount() {
+        return valueCount;
+      }
+
+      @Override
+      public long lookupTerm(BytesRef key) {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues) binary).lookupTerm(key);
+        } else {
+          return super.lookupTerm(key);
+        }
+      }
+
+      @Override
+      public TermsEnum termsEnum() {
+        if (binary instanceof CompressedBinaryDocValues) {
+          return ((CompressedBinaryDocValues) binary).getTermsEnum();
+        } else {
+          return super.termsEnum();
+        }
+      }
+
+    };
+  }
+
   private Bits getLiveBits(final long offset, final int count) throws IOException {
     if (offset == ALL_MISSING) {
       return new Bits.MatchNoBits(count);
@@ -831,6 +994,9 @@ class Lucene50DocValuesProducer extends
   static class SortedSetEntry {
     private SortedSetEntry() {}
     int format;
+
+    long[] table;
+    int[] tableOffsets;
   }
 
   // internally we compose complex dv (sorted/sortedset) from other ones

Modified: lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java (original)
+++ lucene/dev/branches/branch_5x/lucene/core/src/test/org/apache/lucene/codecs/lucene50/TestLucene50DocValuesFormat.java Tue Jul 21 08:01:24 2015
@@ -64,7 +64,7 @@ public class TestLucene50DocValuesFormat
   public void testSortedSetVariableLengthBigVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16);
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 32766, 16, 100);
     }
   }
   
@@ -72,7 +72,7 @@ public class TestLucene50DocValuesFormat
   public void testSortedSetVariableLengthManyVsStoredFields() throws Exception {
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16);
+      doTestSortedSetVsStoredFields(TestUtil.nextInt(random(), 1024, 2049), 1, 500, 16, 100);
     }
   }
   

Modified: lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java?rev=1692061&r1=1692060&r2=1692061&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (original)
+++ lucene/dev/branches/branch_5x/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java Tue Jul 21 08:01:24 2015
@@ -24,6 +24,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -62,6 +63,8 @@ import org.apache.lucene.util.BytesRefHa
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.TestUtil;
 
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
 import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
 
 /**
@@ -1940,29 +1943,30 @@ public abstract class BaseDocValuesForma
     directory.close();
   }
 
-  protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc) throws Exception {
+  protected void doTestSortedSetVsStoredFields(int numDocs, int minLength, int maxLength, int maxValuesPerDoc, int maxUniqueValues) throws Exception {
     Directory dir = newFSDirectory(createTempDir("dvduel"));
     IndexWriterConfig conf = newIndexWriterConfig(new MockAnalyzer(random()));
     RandomIndexWriter writer = new RandomIndexWriter(random(), dir, conf);
-    
+
+    Set<String> valueSet = new HashSet<String>();
+    for (int i = 0; i < 10000 && valueSet.size() < maxUniqueValues; ++i) {
+      final int length = TestUtil.nextInt(random(), minLength, maxLength);
+      valueSet.add(TestUtil.randomSimpleString(random(), length));
+    }
+    String[] uniqueValues = valueSet.toArray(new String[0]);
+
     // index some docs
     for (int i = 0; i < numDocs; i++) {
       Document doc = new Document();
       Field idField = new StringField("id", Integer.toString(i), Field.Store.NO);
       doc.add(idField);
-      final int length;
-      if (minLength == maxLength) {
-        length = minLength; // fixed length
-      } else {
-        length = TestUtil.nextInt(random(), minLength, maxLength);
-      }
       int numValues = TestUtil.nextInt(random(), 0, maxValuesPerDoc);
       // create a random set of strings
       Set<String> values = new TreeSet<>();
       for (int v = 0; v < numValues; v++) {
-        values.add(TestUtil.randomSimpleString(random(), length));
+        values.add(RandomPicks.randomFrom(random(), uniqueValues));
       }
-      
+
       // add ordered to the stored field
       for (String v : values) {
         doc.add(new StoredField("stored", v));
@@ -2041,7 +2045,7 @@ public abstract class BaseDocValuesForma
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
       int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16);
+      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 16, 100);
     }
   }
   
@@ -2107,12 +2111,37 @@ public abstract class BaseDocValuesForma
       );
     }
   }
-  
+
+  public void testSortedNumericsFewUniqueSetsVsStoredFields() throws Exception {
+    assumeTrue("Codec does not support SORTED_NUMERIC", codecSupportsSortedNumeric());
+    final long[] values = new long[TestUtil.nextInt(random(), 2, 6)];
+    for (int i = 0; i < values.length; ++i) {
+      values[i] = random().nextLong();
+    }
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedNumericsVsStoredFields(
+          new LongProducer() {
+            @Override
+            long next() {
+              return TestUtil.nextLong(random(), 0, 6);
+            }
+          },
+          new LongProducer() {
+            @Override
+            long next() {
+              return values[random().nextInt(values.length)];
+            }
+          }
+      );
+    }
+  }
+
   public void testSortedSetVariableLengthVsStoredFields() throws Exception {
     assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16);
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 16, 100);
     }
   }
 
@@ -2121,7 +2150,7 @@ public abstract class BaseDocValuesForma
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
       int fixedLength = TestUtil.nextInt(random(), 1, 10);
-      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1);
+      doTestSortedSetVsStoredFields(atLeast(300), fixedLength, fixedLength, 1, 100);
     }
   }
   
@@ -2129,7 +2158,39 @@ public abstract class BaseDocValuesForma
     assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
     int numIterations = atLeast(1);
     for (int i = 0; i < numIterations; i++) {
-      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1);
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 1, 100);
+    }
+  }
+
+  public void testSortedSetFixedLengthFewUniqueSetsVsStoredFields() throws Exception {
+    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 10, 10, 6, 6);
+    }
+  }
+
+  public void testSortedSetVariableLengthFewUniqueSetsVsStoredFields() throws Exception {
+    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(300), 1, 10, 6, 6);
+    }
+  }
+
+  public void testSortedSetVariableLengthManyValuesPerDocVsStoredFields() throws Exception {
+    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(20), 1, 10, 500, 1000);
+    }
+  }
+
+  public void testSortedSetFixedLengthManyValuesPerDocVsStoredFields() throws Exception {
+    assumeTrue("Codec does not support SORTED_SET", codecSupportsSortedSet());
+    int numIterations = atLeast(1);
+    for (int i = 0; i < numIterations; i++) {
+      doTestSortedSetVsStoredFields(atLeast(20), 10, 10, 500, 1000);
     }
   }