You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/09 17:50:49 UTC

svn commit: r1407538 - in /lucene/dev/branches/lucene4547/lucene: codecs/src/java/org/apache/lucene/codecs/simpletext/ core/src/java/org/apache/lucene/index/

Author: mikemccand
Date: Fri Nov  9 16:50:48 2012
New Revision: 1407538

URL: http://svn.apache.org/viewvc?rev=1407538&view=rev
Log:
dirt path @ sorted dv flushing

Added:
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java   (with props)
Modified:
    lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
    lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java

Modified: lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java Fri Nov  9 16:50:48 2012
@@ -131,7 +131,7 @@ public class SimpleTextSimpleDocValuesFo
     // nocommit
     @Override
     public SortedDocValuesConsumer addSortedField(FieldInfo field, int valueCount, boolean fixedLength, int maxLength) throws IOException {
-      return null;
+      return null; // nocommit
     }
 
     /** write the header for this field */

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java Fri Nov  9 16:50:48 2012
@@ -59,7 +59,7 @@ class BytesDVWriter {
     // Fill in any holes:
     while(pending.size() < docID) {
       pending.add(BytesRef.EMPTY_BYTES);
-      bytesUsed += RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+      bytesUsed += (int) (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 1.25);
       mergeLength(0);
     }
     byte[] bytes = new byte[value.length];

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java Fri Nov  9 16:50:48 2012
@@ -91,7 +91,7 @@ final class DocFieldProcessor extends Do
       while(field != null) {
         // nocommit maybe we should sort by .... somethign?
         // field name?  field number?  else this is hash order!!
-        if (field.bytesDVWriter != null || field.numberDVWriter != null) {
+        if (field.bytesDVWriter != null || field.numberDVWriter != null || field.sortedBytesDVWriter != null) {
 
           if (dvConsumer == null) {
             SimpleDocValuesFormat fmt =  state.segmentInfo.getCodec().simpleDocValuesFormat();
@@ -111,12 +111,25 @@ final class DocFieldProcessor extends Do
                                       dvConsumer.addBinaryField(field.fieldInfo,
                                                                 field.bytesDVWriter.fixedLength >= 0,
                                                                 field.bytesDVWriter.maxLength));
+            // nocommit must null it out now else next seg
+            // will flush even if no docs had DV...?
+          }
+          if (field.sortedBytesDVWriter != null) {
+            field.sortedBytesDVWriter.flush(field.fieldInfo, state,
+                                            dvConsumer.addSortedField(field.fieldInfo,
+                                                                      field.sortedBytesDVWriter.hash.size(),
+                                                                      field.sortedBytesDVWriter.fixedLength >= 0,
+                                                                      field.sortedBytesDVWriter.maxLength));
+            // nocommit must null it out now else next seg
+            // will flush even if no docs had DV...?
           }
           if (field.numberDVWriter != null) {
             field.numberDVWriter.flush(field.fieldInfo, state,
                                        dvConsumer.addNumericField(field.fieldInfo,
                                                                   field.numberDVWriter.minValue,
                                                                   field.numberDVWriter.maxValue));
+            // nocommit must null it out now else next seg
+            // will flush even if no docs had DV...?
           }
         }
         field = field.next;
@@ -290,8 +303,15 @@ final class DocFieldProcessor extends Do
       if (dvType != null) {
         switch(dvType) {
         case BYTES_VAR_STRAIGHT:
+        case BYTES_FIXED_STRAIGHT:
           fp.addBytesDVField(docState.docID, field.binaryValue());
           break;
+        case BYTES_VAR_SORTED:
+        case BYTES_FIXED_SORTED:
+        case BYTES_VAR_DEREF:
+        case BYTES_FIXED_DEREF:
+          fp.addSortedBytesDVField(docState.docID, field.binaryValue());
+          break;
         case VAR_INTS:
         case FIXED_INTS_8:
         case FIXED_INTS_16:

Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java Fri Nov  9 16:50:48 2012
@@ -35,6 +35,7 @@ final class DocFieldProcessorPerField {
   // nocommit after flush we should null these out?  then we
   // don't need reset() impl'd in each...
   BytesDVWriter bytesDVWriter;
+  SortedBytesDVWriter sortedBytesDVWriter;
   NumberDVWriter numberDVWriter;
 
   DocFieldProcessorPerField next;
@@ -58,6 +59,14 @@ final class DocFieldProcessorPerField {
   }
 
   // nocommit make this generic chain through consumer?
+  public void addSortedBytesDVField(int docID, BytesRef value) { // buffers one sorted-bytes doc value for this field
+    if (sortedBytesDVWriter == null) { // writer is created lazily, on the first value
+      sortedBytesDVWriter = new SortedBytesDVWriter(fieldInfo, bytesUsed);
+    }
+    sortedBytesDVWriter.addValue(docID, value);
+  }
+
+  // nocommit make this generic chain through consumer?
   public void addNumberDVField(int docID, Number value) {
     if (numberDVWriter == null) {
       numberDVWriter = new NumberDVWriter(fieldInfo, bytesUsed);

Added: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java?rev=1407538&view=auto
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java (added)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java Fri Nov  9 16:50:48 2012
@@ -0,0 +1,145 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.BinaryDocValuesConsumer;
+import org.apache.lucene.codecs.SortedDocValuesConsumer;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+
+/** Buffers up pending byte[] per doc, deref and sorting via
+ *  int ord, then flushes when segment flushes. */
+// nocommit name?
+// nocommit make this a consumer in the chain?
+class SortedBytesDVWriter {
+
+  // nocommit more ram efficient?
+  // nocommit pass allocator that counts RAM used!
+  final BytesRefHash hash = new BytesRefHash();
+  private final ArrayList<Integer> pending = new ArrayList<Integer>();
+  private final Counter iwBytesUsed;
+  private int bytesUsed;
+  private final FieldInfo fieldInfo;
+
+  private static final BytesRef EMPTY = new BytesRef(BytesRef.EMPTY_BYTES);
+
+  // -2 means not set yet; -1 means length isn't fixed;
+  // -otherwise it's the fixed length seen so far:
+  int fixedLength = -2;
+  int maxLength;
+
+  public SortedBytesDVWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+    this.fieldInfo = fieldInfo;
+    this.iwBytesUsed = iwBytesUsed;
+  }
+
+  public void addValue(int docID, BytesRef value) {
+    final int oldBytesUsed = bytesUsed;
+    if (value == null) {
+      // nocommit improve message
+      throw new IllegalArgumentException("null sortedValue not allowed (field=" + fieldInfo.name + ")");
+    }
+
+    // Fill in any holes:
+    while(pending.size() < docID) {
+      addOneValue(EMPTY);
+    }
+
+    addOneValue(value);
+    iwBytesUsed.addAndGet(bytesUsed - oldBytesUsed);
+  }
+
+  private void addOneValue(BytesRef value) {
+    mergeLength(value.length);
+
+    int ord = hash.add(value);
+    if (ord < 0) {
+      ord = -ord-1;
+    } else {
+      // nocommit this is undercounting!
+      bytesUsed += value.length;
+    }
+    pending.add(ord);
+    // estimate 25% overhead for ArrayList:
+    bytesUsed += (int) (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 1.25) + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
+  }
+
+  private void mergeLength(int length) {
+    if (fixedLength == -2) {
+      fixedLength = length;
+    } else if (fixedLength != length) {
+      fixedLength = -1;
+    }
+    maxLength = Math.max(maxLength, length);
+  }
+
+  public void flush(FieldInfo fieldInfo, SegmentWriteState state, SortedDocValuesConsumer consumer) throws IOException {
+    int valueCount = hash.size();
+
+    final int maxDoc = state.segmentInfo.getDocCount();
+    int emptyOrd = -1;
+    if (pending.size() < maxDoc) {
+      // Make sure we added EMPTY value before sorting:
+      emptyOrd = hash.add(EMPTY);
+      if (emptyOrd < 0) {
+        emptyOrd = -emptyOrd-1;
+      }
+    }
+
+    int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+    // nocommit must budget this into RAM consumption up front!
+    int[] ordMap = new int[valueCount];
+
+    // Write values, in sorted order:
+    BytesRef scratch = new BytesRef();
+    for(int ord=0;ord<valueCount;ord++) {
+      consumer.addValue(hash.get(sortedValues[ord], scratch));
+      ordMap[sortedValues[ord]] = ord;
+    }
+    final int bufferedDocCount = pending.size();
+
+    for(int docID=0;docID<bufferedDocCount;docID++) {
+      consumer.addDoc(ordMap[pending.get(docID)]);
+    }
+    for(int docID=bufferedDocCount;docID<maxDoc;docID++) {
+      consumer.addDoc(ordMap[emptyOrd]);
+    }
+    reset();
+  }
+
+  public void abort() {
+    reset();
+  }
+
+  private void reset() {
+    pending.clear();
+    pending.trimToSize();
+    hash.clear();
+    iwBytesUsed.addAndGet(-bytesUsed);
+    bytesUsed = 0;
+    fixedLength = -2;
+    maxLength = 0;
+  }
+}
\ No newline at end of file