You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2012/11/09 17:50:49 UTC
svn commit: r1407538 - in /lucene/dev/branches/lucene4547/lucene:
codecs/src/java/org/apache/lucene/codecs/simpletext/
core/src/java/org/apache/lucene/index/
Author: mikemccand
Date: Fri Nov 9 16:50:48 2012
New Revision: 1407538
URL: http://svn.apache.org/viewvc?rev=1407538&view=rev
Log:
dirt path @ sorted dv flushing
Added:
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java (with props)
Modified:
lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java
Modified: lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java (original)
+++ lucene/dev/branches/lucene4547/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSimpleDocValuesFormat.java Fri Nov 9 16:50:48 2012
@@ -131,7 +131,7 @@ public class SimpleTextSimpleDocValuesFo
// nocommit
@Override
public SortedDocValuesConsumer addSortedField(FieldInfo field, int valueCount, boolean fixedLength, int maxLength) throws IOException {
- return null;
+ return null; // nocommit
}
/** write the header for this field */
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/BytesDVWriter.java Fri Nov 9 16:50:48 2012
@@ -59,7 +59,7 @@ class BytesDVWriter {
// Fill in any holes:
while(pending.size() < docID) {
pending.add(BytesRef.EMPTY_BYTES);
- bytesUsed += RamUsageEstimator.NUM_BYTES_OBJECT_REF;
+ bytesUsed += (int) (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 1.25);
mergeLength(0);
}
byte[] bytes = new byte[value.length];
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessor.java Fri Nov 9 16:50:48 2012
@@ -91,7 +91,7 @@ final class DocFieldProcessor extends Do
while(field != null) {
// nocommit maybe we should sort by .... somethign?
// field name? field number? else this is hash order!!
- if (field.bytesDVWriter != null || field.numberDVWriter != null) {
+ if (field.bytesDVWriter != null || field.numberDVWriter != null || field.sortedBytesDVWriter != null) {
if (dvConsumer == null) {
SimpleDocValuesFormat fmt = state.segmentInfo.getCodec().simpleDocValuesFormat();
@@ -111,12 +111,25 @@ final class DocFieldProcessor extends Do
dvConsumer.addBinaryField(field.fieldInfo,
field.bytesDVWriter.fixedLength >= 0,
field.bytesDVWriter.maxLength));
+ // nocommit must null it out now else next seg
+ // will flush even if no docs had DV...?
+ }
+ if (field.sortedBytesDVWriter != null) {
+ field.sortedBytesDVWriter.flush(field.fieldInfo, state,
+ dvConsumer.addSortedField(field.fieldInfo,
+ field.sortedBytesDVWriter.hash.size(),
+ field.sortedBytesDVWriter.fixedLength >= 0,
+ field.sortedBytesDVWriter.maxLength));
+ // nocommit must null it out now else next seg
+ // will flush even if no docs had DV...?
}
if (field.numberDVWriter != null) {
field.numberDVWriter.flush(field.fieldInfo, state,
dvConsumer.addNumericField(field.fieldInfo,
field.numberDVWriter.minValue,
field.numberDVWriter.maxValue));
+ // nocommit must null it out now else next seg
+ // will flush even if no docs had DV...?
}
}
field = field.next;
@@ -290,8 +303,15 @@ final class DocFieldProcessor extends Do
if (dvType != null) {
switch(dvType) {
case BYTES_VAR_STRAIGHT:
+ case BYTES_FIXED_STRAIGHT:
fp.addBytesDVField(docState.docID, field.binaryValue());
break;
+ case BYTES_VAR_SORTED:
+ case BYTES_FIXED_SORTED:
+ case BYTES_VAR_DEREF:
+ case BYTES_FIXED_DEREF:
+ fp.addSortedBytesDVField(docState.docID, field.binaryValue());
+ break;
case VAR_INTS:
case FIXED_INTS_8:
case FIXED_INTS_16:
Modified: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java?rev=1407538&r1=1407537&r2=1407538&view=diff
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java (original)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java Fri Nov 9 16:50:48 2012
@@ -35,6 +35,7 @@ final class DocFieldProcessorPerField {
// nocommit after flush we should null these out? then we
// don't need reset() impl'd in each...
BytesDVWriter bytesDVWriter;
+ SortedBytesDVWriter sortedBytesDVWriter;
NumberDVWriter numberDVWriter;
DocFieldProcessorPerField next;
@@ -58,6 +59,14 @@ final class DocFieldProcessorPerField {
}
// nocommit make this generic chain through consumer?
+ public void addSortedBytesDVField(int docID, BytesRef value) {
+ if (sortedBytesDVWriter == null) {
+ sortedBytesDVWriter = new SortedBytesDVWriter(fieldInfo, bytesUsed);
+ }
+ sortedBytesDVWriter.addValue(docID, value);
+ }
+
+ // nocommit make this generic chain through consumer?
public void addNumberDVField(int docID, Number value) {
if (numberDVWriter == null) {
numberDVWriter = new NumberDVWriter(fieldInfo, bytesUsed);
Added: lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java?rev=1407538&view=auto
==============================================================================
--- lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java (added)
+++ lucene/dev/branches/lucene4547/lucene/core/src/java/org/apache/lucene/index/SortedBytesDVWriter.java Fri Nov 9 16:50:48 2012
@@ -0,0 +1,145 @@
+package org.apache.lucene.index;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.BinaryDocValuesConsumer;
+import org.apache.lucene.codecs.SortedDocValuesConsumer;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+
+/** Buffers up pending byte[] per doc, deref and sorting via
+ * int ord, then flushes when segment flushes. */
+// nocommit name?
+// nocommit make this a consumer in the chain?
+class SortedBytesDVWriter {
+
+ // nocommit more ram efficient?
+ // nocommit pass allocator that counts RAM used!
+ final BytesRefHash hash = new BytesRefHash();
+ private final ArrayList<Integer> pending = new ArrayList<Integer>();
+ private final Counter iwBytesUsed;
+ private int bytesUsed;
+ private final FieldInfo fieldInfo;
+
+ private static final BytesRef EMPTY = new BytesRef(BytesRef.EMPTY_BYTES);
+
+ // -2 means not set yet; -1 means length isn't fixed;
+ // -otherwise it's the fixed length seen so far:
+ int fixedLength = -2;
+ int maxLength;
+
+ public SortedBytesDVWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ this.fieldInfo = fieldInfo;
+ this.iwBytesUsed = iwBytesUsed;
+ }
+
+ public void addValue(int docID, BytesRef value) {
+ final int oldBytesUsed = bytesUsed;
+ if (value == null) {
+ // nocommit improve message
+ throw new IllegalArgumentException("null sortedValue not allowed (field=" + fieldInfo.name + ")");
+ }
+
+ // Fill in any holes:
+ while(pending.size() < docID) {
+ addOneValue(EMPTY);
+ }
+
+ addOneValue(value);
+ iwBytesUsed.addAndGet(bytesUsed - oldBytesUsed);
+ }
+
+ private void addOneValue(BytesRef value) {
+ mergeLength(value.length);
+
+ int ord = hash.add(value);
+ if (ord < 0) {
+ ord = -ord-1;
+ } else {
+ // nocommit this is undercounting!
+ bytesUsed += value.length;
+ }
+ pending.add(ord);
+ // estimate 25% overhead for ArrayList:
+ bytesUsed += (int) (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 1.25) + RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + RamUsageEstimator.NUM_BYTES_INT;
+ }
+
+ private void mergeLength(int length) {
+ if (fixedLength == -2) {
+ fixedLength = length;
+ } else if (fixedLength != length) {
+ fixedLength = -1;
+ }
+ maxLength = Math.max(maxLength, length);
+ }
+
+ public void flush(FieldInfo fieldInfo, SegmentWriteState state, SortedDocValuesConsumer consumer) throws IOException {
+ int valueCount = hash.size();
+
+ final int maxDoc = state.segmentInfo.getDocCount();
+ int emptyOrd = -1;
+ if (pending.size() < maxDoc) {
+ // Make sure we added EMPTY value before sorting:
+ emptyOrd = hash.add(EMPTY);
+ if (emptyOrd < 0) {
+ emptyOrd = -emptyOrd-1;
+ }
+ }
+
+ int[] sortedValues = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
+ // nocommit must budget this into RAM consumption up front!
+ int[] ordMap = new int[valueCount];
+
+ // Write values, in sorted order:
+ BytesRef scratch = new BytesRef();
+ for(int ord=0;ord<valueCount;ord++) {
+ consumer.addValue(hash.get(sortedValues[ord], scratch));
+ ordMap[sortedValues[ord]] = ord;
+ }
+ final int bufferedDocCount = pending.size();
+
+ for(int docID=0;docID<bufferedDocCount;docID++) {
+ consumer.addDoc(ordMap[pending.get(docID)]);
+ }
+ for(int docID=bufferedDocCount;docID<maxDoc;docID++) {
+ consumer.addDoc(ordMap[emptyOrd]);
+ }
+ reset();
+ }
+
+ public void abort() {
+ reset();
+ }
+
+ private void reset() {
+ pending.clear();
+ pending.trimToSize();
+ hash.clear();
+ iwBytesUsed.addAndGet(-bytesUsed);
+ bytesUsed = 0;
+ fixedLength = -2;
+ maxLength = 0;
+ }
+}
\ No newline at end of file