You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2013/08/19 18:28:03 UTC

svn commit: r1515496 - in /lucene/dev/branches/lucene5178/lucene: codecs/src/java/org/apache/lucene/codecs/diskdv/ codecs/src/java/org/apache/lucene/codecs/simpletext/ core/src/java/org/apache/lucene/codecs/lucene45/ core/src/java/org/apache/lucene/ind...

Author: rmuir
Date: Mon Aug 19 16:28:03 2013
New Revision: 1515496

URL: http://svn.apache.org/r1515496
Log:
support missing for 4.5 and disk dv

Modified:
    lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
    lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
    lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
    lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
    lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
    lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java
    lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java
    lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
    lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java

Modified: lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/diskdv/DiskDocValuesProducer.java Mon Aug 19 16:28:03 2013
@@ -17,427 +17,34 @@ package org.apache.lucene.codecs.diskdv;
  * limitations under the License.
  */
 
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.DELTA_COMPRESSED;
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.GCD_COMPRESSED;
-import static org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer.TABLE_COMPRESSED;
-
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
 
-import org.apache.lucene.codecs.CodecUtil;
-import org.apache.lucene.codecs.DocValuesProducer;
-import org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer;
-import org.apache.lucene.codecs.lucene45.Lucene45DocValuesFormat;
-import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.codecs.lucene45.Lucene45DocValuesProducer;
 import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.IndexFileNames;
-import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.index.SegmentReadState;
-import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.packed.BlockPackedReader;
 import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
-import org.apache.lucene.util.packed.PackedInts;
-
-class DiskDocValuesProducer extends DocValuesProducer {
-  private final Map<Integer,NumericEntry> numerics;
-  private final Map<Integer,NumericEntry> ords;
-  private final Map<Integer,NumericEntry> ordIndexes;
-  private final Map<Integer,BinaryEntry> binaries;
-  private final IndexInput data;
-  private final int maxDoc;
-  
-  DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
-    String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
-    this.maxDoc = state.segmentInfo.getDocCount();
-    // read in the entries from the metadata file.
-    IndexInput in = state.directory.openInput(metaName, state.context);
-    boolean success = false;
-    final int version;
-    try {
-      version = CodecUtil.checkHeader(in, metaCodec, 
-                                      Lucene45DocValuesFormat.VERSION_CURRENT,
-                                      Lucene45DocValuesFormat.VERSION_CURRENT);
-      numerics = new HashMap<Integer,NumericEntry>();
-      ords = new HashMap<Integer,NumericEntry>();
-      ordIndexes = new HashMap<Integer,NumericEntry>();
-      binaries = new HashMap<Integer,BinaryEntry>();
-      readFields(in);
-
-      success = true;
-    } finally {
-      if (success) {
-        IOUtils.close(in);
-      } else {
-        IOUtils.closeWhileHandlingException(in);
-      }
-    }
-
-    success = false;
-    try {
-      String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
-      data = state.directory.openInput(dataName, state.context);
-      final int version2 = CodecUtil.checkHeader(data, dataCodec, 
-                                                 Lucene45DocValuesFormat.VERSION_CURRENT,
-                                                 Lucene45DocValuesFormat.VERSION_CURRENT);
-      if (version != version2) {
-        throw new CorruptIndexException("Versions mismatch");
-      }
-
-      success = true;
-    } finally {
-      if (!success) {
-        IOUtils.closeWhileHandlingException(this.data);
-      }
-    }
-
-  }
-  
-  private void readFields(IndexInput meta) throws IOException {
-    int fieldNumber = meta.readVInt();
-    while (fieldNumber != -1) {
-      byte type = meta.readByte();
-      if (type == Lucene45DocValuesFormat.NUMERIC) {
-        numerics.put(fieldNumber, readNumericEntry(meta));
-      } else if (type == Lucene45DocValuesFormat.BINARY) {
-        BinaryEntry b = readBinaryEntry(meta);
-        binaries.put(fieldNumber, b);
-      } else if (type == Lucene45DocValuesFormat.SORTED) {
-        // sorted = binary + numeric
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
-        }
-        if (meta.readByte() != Lucene45DocValuesFormat.BINARY) {
-          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
-        }
-        BinaryEntry b = readBinaryEntry(meta);
-        binaries.put(fieldNumber, b);
-        
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
-        }
-        if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
-          throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt");
-        }
-        NumericEntry n = readNumericEntry(meta);
-        ords.put(fieldNumber, n);
-      } else if (type == Lucene45DocValuesFormat.SORTED_SET) {
-        // sortedset = binary + numeric + ordIndex
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        if (meta.readByte() != Lucene45DocValuesFormat.BINARY) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        BinaryEntry b = readBinaryEntry(meta);
-        binaries.put(fieldNumber, b);
-        
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        NumericEntry n1 = readNumericEntry(meta);
-        ords.put(fieldNumber, n1);
-        
-        if (meta.readVInt() != fieldNumber) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) {
-          throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt");
-        }
-        NumericEntry n2 = readNumericEntry(meta);
-        ordIndexes.put(fieldNumber, n2);
-      } else {
-        throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta);
-      }
-      fieldNumber = meta.readVInt();
-    }
-  }
-  
-  static NumericEntry readNumericEntry(IndexInput meta) throws IOException {
-    NumericEntry entry = new NumericEntry();
-    entry.format = meta.readVInt();
-    entry.packedIntsVersion = meta.readVInt();
-    entry.offset = meta.readLong();
-    entry.count = meta.readVLong();
-    entry.blockSize = meta.readVInt();
-    switch(entry.format) {
-      case GCD_COMPRESSED:
-        entry.minValue = meta.readLong();
-        entry.gcd = meta.readLong();
-        break;
-      case TABLE_COMPRESSED:
-        if (entry.count > Integer.MAX_VALUE) {
-          throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta);
-        }
-        final int uniqueValues = meta.readVInt();
-        if (uniqueValues > 256) {
-          throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta);
-        }
-        entry.table = new long[uniqueValues];
-        for (int i = 0; i < uniqueValues; ++i) {
-          entry.table[i] = meta.readLong();
-        }
-        break;
-      case DELTA_COMPRESSED:
-        break;
-      default:
-        throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta);
-    }
-    return entry;
-  }
-  
-  static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
-    BinaryEntry entry = new BinaryEntry();
-    int format = meta.readVInt();
-    if (format != Lucene45DocValuesConsumer.BINARY_FIXED_UNCOMPRESSED && format != Lucene45DocValuesConsumer.BINARY_VARIABLE_UNCOMPRESSED) {
-      throw new CorruptIndexException("Unexpected format for binary entry: " + format + ", input=" + meta);
-    }
-    entry.minLength = meta.readVInt();
-    entry.maxLength = meta.readVInt();
-    entry.count = meta.readVLong();
-    entry.offset = meta.readLong();
-    if (entry.minLength != entry.maxLength) {
-      entry.addressesOffset = meta.readLong();
-      entry.packedIntsVersion = meta.readVInt();
-      entry.blockSize = meta.readVInt();
-    }
-    return entry;
-  }
 
-  @Override
-  public NumericDocValues getNumeric(FieldInfo field) throws IOException {
-    NumericEntry entry = numerics.get(field.number);
-    return getNumeric(field, entry);
-  }
-  
-  private LongNumericDocValues getNumeric(FieldInfo field, final NumericEntry entry) throws IOException {
-    final IndexInput data = this.data.clone();
-    data.seek(entry.offset);
+class DiskDocValuesProducer extends Lucene45DocValuesProducer {
 
-    switch (entry.format) {
-      case DELTA_COMPRESSED:
-        final BlockPackedReader reader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-        return new LongNumericDocValues() {
-          @Override
-          public long get(long id) {
-            return reader.get(id);
-          }
-        };
-      case GCD_COMPRESSED:
-        final long min = entry.minValue;
-        final long mult = entry.gcd;
-        final BlockPackedReader quotientReader = new BlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-        return new LongNumericDocValues() {
-          @Override
-          public long get(long id) {
-            return min + mult * quotientReader.get(id);
-          }
-        };
-      case TABLE_COMPRESSED:
-        final long[] table = entry.table;
-        final int bitsRequired = PackedInts.bitsRequired(table.length - 1);
-        final PackedInts.Reader ords = PackedInts.getDirectReaderNoHeader(data, PackedInts.Format.PACKED, entry.packedIntsVersion, (int) entry.count, bitsRequired);
-        return new LongNumericDocValues() {
-          @Override
-          long get(long id) {
-            return table[(int) ords.get((int) id)];
-          }
-        };
-      default:
-        throw new AssertionError();
-    }
+  DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+    super(state, dataCodec, dataExtension, metaCodec, metaExtension);
   }
 
   @Override
-  public BinaryDocValues getBinary(FieldInfo field) throws IOException {
-    BinaryEntry bytes = binaries.get(field.number);
-    if (bytes.minLength == bytes.maxLength) {
-      return getFixedBinary(field, bytes);
-    } else {
-      return getVariableBinary(field, bytes);
-    }
-  }
-  
-  private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) {
-    final IndexInput data = this.data.clone();
-
-    return new LongBinaryDocValues() {
-      @Override
-      public void get(long id, BytesRef result) {
-        long address = bytes.offset + id * bytes.maxLength;
-        try {
-          data.seek(address);
-          // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) 
-          // assume "they" own the bytes after calling this!
-          final byte[] buffer = new byte[bytes.maxLength];
-          data.readBytes(buffer, 0, buffer.length);
-          result.bytes = buffer;
-          result.offset = 0;
-          result.length = buffer.length;
-        } catch (IOException e) {
-          throw new RuntimeException(e);
-        }
-      }
-    };
-  }
-  
-  private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
-    final IndexInput data = this.data.clone();
+  protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
     data.seek(bytes.addressesOffset);
-
-    final MonotonicBlockPackedReader addresses = new MonotonicBlockPackedReader(data, bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
-    return new LongBinaryDocValues() {
-      @Override
-      public void get(long id, BytesRef result) {
-        long startAddress = bytes.offset + (id == 0 ? 0 : + addresses.get(id-1));
-        long endAddress = bytes.offset + addresses.get(id);
-        int length = (int) (endAddress - startAddress);
-        try {
-          data.seek(startAddress);
-          // NOTE: we could have one buffer, but various consumers (e.g. FieldComparatorSource) 
-          // assume "they" own the bytes after calling this!
-          final byte[] buffer = new byte[length];
-          data.readBytes(buffer, 0, buffer.length);
-          result.bytes = buffer;
-          result.offset = 0;
-          result.length = length;
-        } catch (IOException e) {
-          throw new RuntimeException(e);
-        }
-      }
-    };
+    return new MonotonicBlockPackedReader(data.clone(), bytes.packedIntsVersion, bytes.blockSize, bytes.count, true);
   }
 
   @Override
-  public SortedDocValues getSorted(FieldInfo field) throws IOException {
-    final int valueCount = (int) binaries.get(field.number).count;
-    final BinaryDocValues binary = getBinary(field);
-    final NumericDocValues ordinals = getNumeric(field, ords.get(field.number));
-    return new SortedDocValues() {
-
-      @Override
-      public int getOrd(int docID) {
-        return (int) ordinals.get(docID);
-      }
-
-      @Override
-      public void lookupOrd(int ord, BytesRef result) {
-        binary.get(ord, result);
-      }
-
-      @Override
-      public int getValueCount() {
-        return valueCount;
-      }
-    };
+  protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
+    throw new AssertionError();
   }
 
   @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
-    final long valueCount = binaries.get(field.number).count;
-    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
-    final LongNumericDocValues ordinals = getNumeric(field, ords.get(field.number));
-    NumericEntry entry = ordIndexes.get(field.number);
-    IndexInput data = this.data.clone();
+  protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
     data.seek(entry.offset);
-    final MonotonicBlockPackedReader ordIndex = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, true);
-    
-    return new SortedSetDocValues() {
-      long offset;
-      long endOffset;
-      
-      @Override
-      public long nextOrd() {
-        if (offset == endOffset) {
-          return NO_MORE_ORDS;
-        } else {
-          long ord = ordinals.get(offset);
-          offset++;
-          return ord;
-        }
-      }
-
-      @Override
-      public void setDocument(int docID) {
-        offset = (docID == 0 ? 0 : ordIndex.get(docID-1));
-        endOffset = ordIndex.get(docID);
-      }
-
-      @Override
-      public void lookupOrd(long ord, BytesRef result) {
-        binary.get(ord, result);
-      }
-
-      @Override
-      public long getValueCount() {
-        return valueCount;
-      }
-    };
-  }
-  
-  @Override
-  public Bits getDocsWithField(FieldInfo field) throws IOException {
-    if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) {
-      return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
-    } else {
-      return new Bits.MatchAllBits(maxDoc);
-    }
-  }
-
-  @Override
-  public void close() throws IOException {
-    data.close();
-  }
-  
-  static class NumericEntry {
-    long offset;
-
-    int format;
-    int packedIntsVersion;
-    long count;
-    int blockSize;
-    
-    long minValue;
-    long gcd;
-    long table[];
-  }
-  
-  static class BinaryEntry {
-    long offset;
-
-    long count;
-    int minLength;
-    int maxLength;
-    long addressesOffset;
-    int packedIntsVersion;
-    int blockSize;
-  }
-  
-  // internally we compose complex dv (sorted/sortedset) from other ones
-  static abstract class LongNumericDocValues extends NumericDocValues {
-    @Override
-    public final long get(int docID) {
-      return get((long) docID);
-    }
-    
-    abstract long get(long id);
-  }
-  
-  static abstract class LongBinaryDocValues extends BinaryDocValues {
-    @Override
-    public final void get(int docID, BytesRef result) {
-      get((long)docID, result);
-    }
-    
-    abstract void get(long id, BytesRef Result);
+    return new MonotonicBlockPackedReader(data.clone(), entry.packedIntsVersion, entry.blockSize, entry.count, true);
   }
 }

Modified: lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java (original)
+++ lucene/dev/branches/lucene5178/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesReader.java Mon Aug 19 16:28:03 2013
@@ -321,12 +321,8 @@ class SimpleTextDocValuesReader extends 
       @Override
       public void lookupOrd(int ord, BytesRef result) {
         try {
-          if (ord == -1) {
-            result.length = 0;
-            return;
-          }
-          if (ord < -1 || ord >= field.numValues) {
-            throw new IndexOutOfBoundsException("ord must be -1 .. " + (field.numValues-1) + "; got " + ord);
+          if (ord < 0 || ord >= field.numValues) {
+            throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
           }
           in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
           SimpleTextUtil.readLine(in, scratch);

Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesConsumer.java Mon Aug 19 16:28:03 2013
@@ -23,7 +23,6 @@ import java.util.HashSet;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.DocValuesConsumer;
-import org.apache.lucene.codecs.MissingOrdRemapper;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentWriteState;
@@ -88,14 +87,20 @@ public class Lucene45DocValuesConsumer e
     long minValue = Long.MAX_VALUE;
     long maxValue = Long.MIN_VALUE;
     long gcd = 0;
+    boolean missing = false;
     // TODO: more efficient?
     HashSet<Long> uniqueValues = null;
     if (optimizeStorage) {
       uniqueValues = new HashSet<>();
 
-      // nocommit: impl null values (ideally smartly)
       for (Number nv : values) {
-        final long v = nv == null ? 0 : nv.longValue();
+        final long v;
+        if (nv == null) {
+          v = 0;
+          missing = true;
+        } else {
+          v = nv.longValue();
+        }
 
         if (gcd != 1) {
           if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
@@ -142,6 +147,12 @@ public class Lucene45DocValuesConsumer e
     meta.writeVInt(field.number);
     meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
     meta.writeVInt(format);
+    if (missing) {
+      meta.writeLong(data.getFilePointer());
+      writeMissingBitset(values);
+    } else {
+      meta.writeLong(-1L);
+    }
     meta.writeVInt(PackedInts.VERSION_CURRENT);
     meta.writeLong(data.getFilePointer());
     meta.writeVLong(count);
@@ -184,6 +195,27 @@ public class Lucene45DocValuesConsumer e
         throw new AssertionError();
     }
   }
+  
+  // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
+  // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
+  void writeMissingBitset(Iterable<?> values) throws IOException {
+    byte bits = 0;
+    int count = 0;
+    for (Object v : values) {
+      if (count == 8) {
+        data.writeByte(bits);
+        count = 0;
+        bits = 0;
+      }
+      if (v != null) {
+        bits |= 1 << (count & 7);
+      }
+      count++;
+    }
+    if (count > 0) {
+      data.writeByte(bits);
+    }
+  }
 
   @Override
   public void addBinaryField(FieldInfo field, Iterable<BytesRef> values) throws IOException {
@@ -194,8 +226,15 @@ public class Lucene45DocValuesConsumer e
     int maxLength = Integer.MIN_VALUE;
     final long startFP = data.getFilePointer();
     long count = 0;
+    boolean missing = false;
     for(BytesRef v : values) {
-      final int length = v == null ? 0 : v.length;
+      final int length;
+      if (v == null) {
+        length = 0;
+        missing = true;
+      } else {
+        length = v.length;
+      }
       minLength = Math.min(minLength, length);
       maxLength = Math.max(maxLength, length);
       if (v != null) {
@@ -204,6 +243,12 @@ public class Lucene45DocValuesConsumer e
       count++;
     }
     meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
+    if (missing) {
+      meta.writeLong(data.getFilePointer());
+      writeMissingBitset(values);
+    } else {
+      meta.writeLong(-1L);
+    }
     meta.writeVInt(minLength);
     meta.writeVInt(maxLength);
     meta.writeVLong(count);
@@ -244,6 +289,7 @@ public class Lucene45DocValuesConsumer e
       meta.writeVInt(field.number);
       meta.writeByte(Lucene45DocValuesFormat.BINARY);
       meta.writeVInt(BINARY_PREFIX_COMPRESSED);
+      meta.writeLong(-1L);
       // now write the bytes: sharing prefixes within a block
       final long startFP = data.getFilePointer();
       // currently, we have to store the delta from expected for every 1/nth term
@@ -286,34 +332,6 @@ public class Lucene45DocValuesConsumer e
 
   @Override
   public void addSortedField(FieldInfo field, Iterable<BytesRef> values, Iterable<Number> docToOrd) throws IOException {
-    // nocommit: remove this hack and support missing!
-
-    // three cases for simulating the old writer:
-    // 1. no missing
-    // 2. missing (and empty string in use): remap ord=-1 -> ord=0
-    // 3. missing (and empty string not in use): remap all ords +1, insert empty string into values
-    boolean anyMissing = false;
-    for (Number n : docToOrd) {
-      if (n.longValue() == -1) {
-        anyMissing = true;
-        break;
-      }
-    }
-    
-    boolean hasEmptyString = false;
-    for (BytesRef b : values) {
-      hasEmptyString = b.length == 0;
-      break;
-    }
-    
-    if (!anyMissing) {
-      // nothing to do
-    } else if (hasEmptyString) {
-      docToOrd = MissingOrdRemapper.mapMissingToOrd0(docToOrd);
-    } else {
-      docToOrd = MissingOrdRemapper.mapAllOrds(docToOrd);
-      values = MissingOrdRemapper.insertEmptyValue(values);
-    }
     meta.writeVInt(field.number);
     meta.writeByte(Lucene45DocValuesFormat.SORTED);
     addTermsDict(field, values);
@@ -334,6 +352,7 @@ public class Lucene45DocValuesConsumer e
     meta.writeVInt(field.number);
     meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
     meta.writeVInt(DELTA_COMPRESSED);
+    meta.writeLong(-1L);
     meta.writeVInt(PackedInts.VERSION_CURRENT);
     meta.writeLong(data.getFilePointer());
     meta.writeVLong(maxDoc);

Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/codecs/lucene45/Lucene45DocValuesProducer.java Mon Aug 19 16:28:03 2013
@@ -53,7 +53,7 @@ import org.apache.lucene.util.packed.Blo
 import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
 import org.apache.lucene.util.packed.PackedInts;
 
-class Lucene45DocValuesProducer extends DocValuesProducer {
+public class Lucene45DocValuesProducer extends DocValuesProducer {
   private final Map<Integer,NumericEntry> numerics;
   private final Map<Integer,BinaryEntry> binaries;
   private final Map<Integer,NumericEntry> ords;
@@ -65,7 +65,7 @@ class Lucene45DocValuesProducer extends 
   private final Map<Integer,MonotonicBlockPackedReader> addressInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
   private final Map<Integer,MonotonicBlockPackedReader> ordIndexInstances = new HashMap<Integer,MonotonicBlockPackedReader>();
   
-  Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
+  protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
     String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
     // read in the entries from the metadata file.
     IndexInput in = state.directory.openInput(metaName, state.context);
@@ -176,6 +176,7 @@ class Lucene45DocValuesProducer extends 
   static NumericEntry readNumericEntry(IndexInput meta) throws IOException {
     NumericEntry entry = new NumericEntry();
     entry.format = meta.readVInt();
+    entry.missingOffset = meta.readLong();
     entry.packedIntsVersion = meta.readVInt();
     entry.offset = meta.readLong();
     entry.count = meta.readVLong();
@@ -209,6 +210,7 @@ class Lucene45DocValuesProducer extends 
   static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
     BinaryEntry entry = new BinaryEntry();
     entry.format = meta.readVInt();
+    entry.missingOffset = meta.readLong();
     entry.minLength = meta.readVInt();
     entry.maxLength = meta.readVInt();
     entry.count = meta.readVLong();
@@ -315,9 +317,7 @@ class Lucene45DocValuesProducer extends 
     };
   }
   
-  private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
-    final IndexInput data = this.data.clone();
-    
+  protected MonotonicBlockPackedReader getAddressInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
     final MonotonicBlockPackedReader addresses;
     synchronized (addressInstances) {
       MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
@@ -328,6 +328,13 @@ class Lucene45DocValuesProducer extends 
       }
       addresses = addrInstance;
     }
+    return addresses;
+  }
+  
+  private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+    final IndexInput data = this.data.clone();
+    
+    final MonotonicBlockPackedReader addresses = getAddressInstance(data, field, bytes);
 
     return new LongBinaryDocValues() {
       @Override
@@ -350,12 +357,10 @@ class Lucene45DocValuesProducer extends 
       }
     };
   }
-
-  private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
-    final IndexInput data = this.data.clone();
-    final long interval = bytes.addressInterval;
-
+  
+  protected MonotonicBlockPackedReader getIntervalInstance(IndexInput data, FieldInfo field, BinaryEntry bytes) throws IOException {
     final MonotonicBlockPackedReader addresses;
+    final long interval = bytes.addressInterval;
     synchronized (addressInstances) {
       MonotonicBlockPackedReader addrInstance = addressInstances.get(field.number);
       if (addrInstance == null) {
@@ -371,6 +376,14 @@ class Lucene45DocValuesProducer extends 
       }
       addresses = addrInstance;
     }
+    return addresses;
+  }
+
+
+  private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
+    final IndexInput data = this.data.clone();
+
+    final MonotonicBlockPackedReader addresses = getIntervalInstance(data, field, bytes);
     
     return new CompressedBinaryDocValues(bytes, addresses, data);
   }
@@ -420,26 +433,30 @@ class Lucene45DocValuesProducer extends 
       }
     };
   }
-
-  @Override
-  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
-    final long valueCount = binaries.get(field.number).count;
-    // we keep the byte[]s and list of ords on disk, these could be large
-    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
-    final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
-    // but the addresses to the ord stream are in RAM
+  
+  protected MonotonicBlockPackedReader getOrdIndexInstance(IndexInput data, FieldInfo field, NumericEntry entry) throws IOException {
     final MonotonicBlockPackedReader ordIndex;
     synchronized (ordIndexInstances) {
       MonotonicBlockPackedReader ordIndexInstance = ordIndexInstances.get(field.number);
       if (ordIndexInstance == null) {
-        NumericEntry entry = ordIndexes.get(field.number);
-        IndexInput data = this.data.clone();
         data.seek(entry.offset);
         ordIndexInstance = new MonotonicBlockPackedReader(data, entry.packedIntsVersion, entry.blockSize, entry.count, false);
         ordIndexInstances.put(field.number, ordIndexInstance);
       }
       ordIndex = ordIndexInstance;
     }
+    return ordIndex;
+  }
+
+  @Override
+  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
+    final IndexInput data = this.data.clone();
+    final long valueCount = binaries.get(field.number).count;
+    // we keep the byte[]s and list of ords on disk, these could be large
+    final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
+    final LongNumericDocValues ordinals = getNumeric(ords.get(field.number));
+    // but the addresses to the ord stream are in RAM
+    final MonotonicBlockPackedReader ordIndex = getOrdIndexInstance(data, field, ordIndexes.get(field.number));
     
     return new SortedSetDocValues() {
       long offset;
@@ -491,15 +508,47 @@ class Lucene45DocValuesProducer extends 
       }
     };
   }
+  
+  public Bits getMissingBits(final long offset) throws IOException {
+    if (offset == -1) {
+      return new Bits.MatchAllBits(maxDoc);
+    } else {
+      final IndexInput in = data.clone();
+      return new Bits() {
+
+        @Override
+        public boolean get(int index) {
+          try {
+            in.seek(offset + (index >> 3));
+            return (in.readByte() & (1 << (index & 7))) != 0;
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+
+        @Override
+        public int length() {
+          return maxDoc;
+        }
+      };
+    }
+  }
 
   @Override
   public Bits getDocsWithField(FieldInfo field) throws IOException {
-    // nocommit: only use this if the field's entry has missing values (write that),
-    // otherwise return MatchAllBits
-    if (field.getDocValuesType() == FieldInfo.DocValuesType.SORTED_SET) {
-      return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
-    } else {
-      return new Bits.MatchAllBits(maxDoc);
+    switch(field.getDocValuesType()) {
+      case SORTED_SET:
+        return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
+      case SORTED:
+        return new SortedDocsWithField(getSorted(field), maxDoc);
+      case BINARY:
+        BinaryEntry be = binaries.get(field.number);
+        return getMissingBits(be.missingOffset);
+      case NUMERIC:
+        NumericEntry ne = numerics.get(field.number);
+        return getMissingBits(ne.missingOffset);
+      default:
+        throw new AssertionError();
     }
   }
 
@@ -508,30 +557,32 @@ class Lucene45DocValuesProducer extends 
     data.close();
   }
   
-  static class NumericEntry {
-    long offset;
-
-    int format;
-    int packedIntsVersion;
-    long count;
-    int blockSize;
+  protected static class NumericEntry {
+    long missingOffset;
+    public long offset;
+
+    public int format;
+    public int packedIntsVersion;
+    public long count;
+    public int blockSize;
     
     long minValue;
     long gcd;
     long table[];
   }
   
-  static class BinaryEntry {
+  protected static class BinaryEntry {
+    long missingOffset;
     long offset;
 
     int format;
-    long count;
+    public long count;
     int minLength;
     int maxLength;
-    long addressesOffset;
-    long addressInterval;
-    int packedIntsVersion;
-    int blockSize;
+    public long addressesOffset;
+    public long addressInterval;
+    public int packedIntsVersion;
+    public int blockSize;
   }
   
   // internally we compose complex dv (sorted/sortedset) from other ones

Modified: lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/java/org/apache/lucene/index/SortedDocValues.java Mon Aug 19 16:28:03 2013
@@ -43,7 +43,7 @@ public abstract class SortedDocValues ex
   public abstract int getOrd(int docID);
 
   /** Retrieves the value for the specified ordinal.
-   * @param ord ordinal to lookup
+   * @param ord ordinal to lookup (must be &gt;= 0 and &lt {@link #getValueCount()})
    * @param result will be populated with the ordinal's value
    * @see #getOrd(int) 
    */

Modified: lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java (original)
+++ lucene/dev/branches/lucene5178/lucene/core/src/test/org/apache/lucene/search/TestFieldCache.java Mon Aug 19 16:28:03 2013
@@ -480,7 +480,7 @@ public class TestFieldCache extends Luce
     } catch (IllegalStateException expected) {}
     
     Bits bits = FieldCache.DEFAULT.getDocsWithField(ar, "binary");
-    assertTrue(bits instanceof Bits.MatchAllBits);
+    assertTrue(bits.get(0));
     
     // Sorted type: can be retrieved via getTerms(), getTermsIndex(), getDocTermOrds()
     try {
@@ -510,7 +510,7 @@ public class TestFieldCache extends Luce
     assertEquals(1, sortedSet.getValueCount());
     
     bits = FieldCache.DEFAULT.getDocsWithField(ar, "sorted");
-    assertTrue(bits instanceof Bits.MatchAllBits);
+    assertTrue(bits.get(0));
     
     // Numeric type: can be retrieved via getInts() and so on
     Ints numeric = FieldCache.DEFAULT.getInts(ar, "numeric", false);
@@ -537,7 +537,7 @@ public class TestFieldCache extends Luce
     } catch (IllegalStateException expected) {}
     
     bits = FieldCache.DEFAULT.getDocsWithField(ar, "numeric");
-    assertTrue(bits instanceof Bits.MatchAllBits);
+    assertTrue(bits.get(0));
     
     // SortedSet type: can be retrieved via getDocTermOrds() 
     if (defaultCodecSupportsSortedSet()) {

Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/AssertingAtomicReader.java Mon Aug 19 16:28:03 2013
@@ -438,14 +438,14 @@ public class AssertingAtomicReader exten
       this.in = in;
       this.maxDoc = maxDoc;
       this.valueCount = in.getValueCount();
-      assert valueCount >= 1 && valueCount <= maxDoc;
+      assert valueCount >= 0 && valueCount <= maxDoc;
     }
 
     @Override
     public int getOrd(int docID) {
       assert docID >= 0 && docID < maxDoc;
       int ord = in.getOrd(docID);
-      assert ord >= 0 && ord < valueCount;
+      assert ord >= -1 && ord < valueCount;
       return ord;
     }
 

Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/index/BaseDocValuesFormatTestCase.java Mon Aug 19 16:28:03 2013
@@ -696,7 +696,10 @@ public abstract class BaseDocValuesForma
     BytesRef scratch = new BytesRef();
     dv.lookupOrd(dv.getOrd(0), scratch);
     assertEquals(new BytesRef("hello world 2"), scratch);
-    dv.lookupOrd(dv.getOrd(1), scratch);
+    if (codecSupportsDocsWithField("dv")) {
+      assertEquals(-1, dv.getOrd(1));
+    }
+    dv.get(1, scratch);
     assertEquals(new BytesRef(""), scratch);
     ireader.close();
     directory.close();

Modified: lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java?rev=1515496&r1=1515495&r2=1515496&view=diff
==============================================================================
--- lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java (original)
+++ lucene/dev/branches/lucene5178/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java Mon Aug 19 16:28:03 2013
@@ -1372,8 +1372,11 @@ public abstract class LuceneTestCase ext
   /** Returns true if the codec for the field "supports" docsWithField 
    * (other codecs return MatchAllBits, because you couldnt write missing values before) */
   public static boolean codecSupportsDocsWithField(String field) {
-    // currently only one codec!
-    return _TestUtil.getDocValuesFormat(Codec.getDefault(), field).equals("SimpleText");
+    String name = _TestUtil.getDocValuesFormat(Codec.getDefault(), field);
+    if (name.equals("Lucene40") || name.equals("Lucene42")) {
+      return false;
+    }
+    return true;
   }
 
   public void assertReaderEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {