Posted to commits@lucene.apache.org by mi...@apache.org on 2014/07/20 14:08:33 UTC
svn commit: r1612080 [1/3] - in /lucene/dev/trunk/lucene: ./
codecs/src/java/org/apache/lucene/codecs/blocktreeords/
codecs/src/resources/META-INF/services/
codecs/src/test/org/apache/lucene/codecs/blocktreeords/
core/src/java/org/apache/lucene/codecs/...
Author: mikemccand
Date: Sun Jul 20 12:08:32 2014
New Revision: 1612080
URL: http://svn.apache.org/r1612080
Log:
LUCENE-5819: add terms dict and postings format that implement term ordinals
Added:
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnum.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java (with props)
lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/package.html (with props)
lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/
lucene/dev/trunk/lucene/codecs/src/test/org/apache/lucene/codecs/blocktreeords/TestOrdsBlockTree.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/ByteSequenceOutputs.java
lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/BasePostingsFormatTestCase.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/store/MockDirectoryWrapper.java
lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1612080&r1=1612079&r2=1612080&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sun Jul 20 12:08:32 2014
@@ -118,6 +118,11 @@ New Features
"any" term) are allowed. This is a generalization of
MultiPhraseQuery and span queries, and enables "correct" (including
position) length search-time graph synonyms. (Mike McCandless)
+
+* LUCENE-5819: Add OrdsLucene41 block tree terms dict and postings
+ format, to include term ordinals in the index so the optional
+ TermsEnum.ord() and TermsEnum.seekExact(long ord) APIs work. (Mike
+ McCandless)
API Changes
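
To make the new APIs concrete, here is a minimal usage sketch (not part of this patch): it assumes an index whose postings were written with the OrdsLucene41 format; the directory path and the field name "body" are placeholders.

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class OrdsDemo {
  public static void main(String[] args) throws Exception {
    // Placeholder path; the index must have been written with the OrdsLucene41
    // postings format, else ord() throws UnsupportedOperationException:
    Directory dir = FSDirectory.open(new File("/tmp/ordsindex"));
    IndexReader reader = DirectoryReader.open(dir);
    Terms terms = MultiFields.getTerms(reader, "body");  // "body" is illustrative
    if (terms != null) {
      TermsEnum te = terms.iterator(null);
      if (te.seekExact(new BytesRef("lucene"))) {
        long ord = te.ord();   // term -> ordinal, now supported
        te.seekExact(ord);     // ordinal -> term
        assert te.term().equals(new BytesRef("lucene"));
      }
    }
    reader.close();
    dir.close();
  }
}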
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/FSTOrdsOutputs.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,233 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.fst.Outputs;
+
+/** A custom FST outputs implementation that stores block data
+ * (BytesRef), long startOrd, long endOrd (both inclusive). */
+
+final class FSTOrdsOutputs extends Outputs<FSTOrdsOutputs.Output> {
+
+ public static final Output NO_OUTPUT = new Output(new BytesRef(), 0, 0);
+
+ private static final BytesRef NO_BYTES = new BytesRef();
+
+ public static final class Output {
+ public final BytesRef bytes;
+ // Inclusive:
+ public final long startOrd;
+ // Inclusive:
+ public final long endOrd;
+
+ public Output(BytesRef bytes, long startOrd, long endOrd) {
+ assert startOrd >= 0: "startOrd=" + startOrd;
+ assert endOrd >= 0: "endOrd=" + endOrd;
+ this.bytes = bytes;
+ this.startOrd = startOrd;
+ this.endOrd = endOrd;
+ }
+
+ @Override
+ public String toString() {
+ long x;
+ if (endOrd > Long.MAX_VALUE/2) {
+ x = Long.MAX_VALUE-endOrd;
+ } else {
+ assert endOrd >= 0;
+ x = -endOrd;
+ }
+ return startOrd + " to " + x;
+ }
+
+ @Override
+ public int hashCode() {
+ int hash = bytes.hashCode();
+ hash = (int) (hash ^ startOrd);
+ hash = (int) (hash ^ endOrd);
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object _other) {
+ if (_other instanceof Output) {
+ Output other = (Output) _other;
+ return bytes.equals(other.bytes) && startOrd == other.startOrd && endOrd == other.endOrd;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ @Override
+ public Output common(Output output1, Output output2) {
+ BytesRef bytes1 = output1.bytes;
+ BytesRef bytes2 = output2.bytes;
+
+ assert bytes1 != null;
+ assert bytes2 != null;
+
+ int pos1 = bytes1.offset;
+ int pos2 = bytes2.offset;
+ int stopAt1 = pos1 + Math.min(bytes1.length, bytes2.length);
+ while(pos1 < stopAt1) {
+ if (bytes1.bytes[pos1] != bytes2.bytes[pos2]) {
+ break;
+ }
+ pos1++;
+ pos2++;
+ }
+
+ BytesRef prefixBytes;
+
+ if (pos1 == bytes1.offset) {
+ // no common prefix
+ prefixBytes = NO_BYTES;
+ } else if (pos1 == bytes1.offset + bytes1.length) {
+ // bytes1 is a prefix of bytes2
+ prefixBytes = bytes1;
+ } else if (pos2 == bytes2.offset + bytes2.length) {
+ // bytes2 is a prefix of bytes1
+ prefixBytes = bytes2;
+ } else {
+ prefixBytes = new BytesRef(bytes1.bytes, bytes1.offset, pos1-bytes1.offset);
+ }
+
+ return newOutput(prefixBytes,
+ Math.min(output1.startOrd, output2.startOrd),
+ Math.min(output1.endOrd, output2.endOrd));
+ }
+
+ @Override
+ public Output subtract(Output output, Output inc) {
+ assert output != null;
+ assert inc != null;
+ if (inc == NO_OUTPUT) {
+ // no prefix removed
+ return output;
+ } else {
+ assert StringHelper.startsWith(output.bytes, inc.bytes);
+ BytesRef suffix;
+ if (inc.bytes.length == output.bytes.length) {
+ // entire output removed
+ suffix = NO_BYTES;
+ } else if (inc.bytes.length == 0) {
+ suffix = output.bytes;
+ } else {
+ assert inc.bytes.length < output.bytes.length: "inc.length=" + inc.bytes.length + " vs output.length=" + output.bytes.length;
+ assert inc.bytes.length > 0;
+ suffix = new BytesRef(output.bytes.bytes, output.bytes.offset + inc.bytes.length, output.bytes.length-inc.bytes.length);
+ }
+ assert output.startOrd >= inc.startOrd;
+ assert output.endOrd >= inc.endOrd;
+ return newOutput(suffix, output.startOrd-inc.startOrd, output.endOrd - inc.endOrd);
+ }
+ }
+
+ @Override
+ public Output add(Output prefix, Output output) {
+ assert prefix != null;
+ assert output != null;
+ if (prefix == NO_OUTPUT) {
+ return output;
+ } else if (output == NO_OUTPUT) {
+ return prefix;
+ } else {
+ BytesRef bytes = new BytesRef(prefix.bytes.length + output.bytes.length);
+ System.arraycopy(prefix.bytes.bytes, prefix.bytes.offset, bytes.bytes, 0, prefix.bytes.length);
+ System.arraycopy(output.bytes.bytes, output.bytes.offset, bytes.bytes, prefix.bytes.length, output.bytes.length);
+ bytes.length = prefix.bytes.length + output.bytes.length;
+ return newOutput(bytes, prefix.startOrd + output.startOrd, prefix.endOrd + output.endOrd);
+ }
+ }
+
+ @Override
+ public void write(Output prefix, DataOutput out) throws IOException {
+ out.writeVInt(prefix.bytes.length);
+ out.writeBytes(prefix.bytes.bytes, prefix.bytes.offset, prefix.bytes.length);
+ out.writeVLong(prefix.startOrd);
+ out.writeVLong(prefix.endOrd);
+ }
+
+ @Override
+ public Output read(DataInput in) throws IOException {
+ int len = in.readVInt();
+ BytesRef bytes;
+ if (len == 0) {
+ bytes = NO_BYTES;
+ } else {
+ bytes = new BytesRef(len);
+ in.readBytes(bytes.bytes, 0, len);
+ bytes.length = len;
+ }
+
+ long startOrd = in.readVLong();
+ long endOrd = in.readVLong();
+
+ Output result = newOutput(bytes, startOrd, endOrd);
+
+ return result;
+ }
+
+ @Override
+ public void skipOutput(DataInput in) throws IOException {
+ int len = in.readVInt();
+ in.skipBytes(len);
+ in.readVLong();
+ in.readVLong();
+ }
+
+ @Override
+ public void skipFinalOutput(DataInput in) throws IOException {
+ skipOutput(in);
+ }
+
+ @Override
+ public Output getNoOutput() {
+ return NO_OUTPUT;
+ }
+
+ @Override
+ public String outputToString(Output output) {
+ if ((output.endOrd == 0 || output.endOrd == Long.MAX_VALUE) && output.startOrd == 0) {
+ return "";
+ } else {
+ return output.toString();
+ }
+ }
+
+ public Output newOutput(BytesRef bytes, long startOrd, long endOrd) {
+ if (bytes.length == 0 && startOrd == 0 && endOrd == 0) {
+ return NO_OUTPUT;
+ } else {
+ return new Output(bytes, startOrd, endOrd);
+ }
+ }
+
+ @Override
+ public long ramBytesUsed(Output output) {
+ return 2 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2 * RamUsageEstimator.NUM_BYTES_LONG + 2 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER + 2 * RamUsageEstimator.NUM_BYTES_INT + output.bytes.length;
+ }
+}
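
To see the output algebra above in action, here is a small sketch (not in the patch; run with -ea so the asserts fire). It assumes same-package access, since FSTOrdsOutputs is package-private:

package org.apache.lucene.codecs.blocktreeords;

import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
import org.apache.lucene.util.BytesRef;

public class FSTOrdsOutputsDemo {
  public static void main(String[] args) {
    FSTOrdsOutputs outputs = new FSTOrdsOutputs();
    Output a = outputs.newOutput(new BytesRef("foobar"), 10, 20);
    Output b = outputs.newOutput(new BytesRef("foobaz"), 12, 18);

    // common() keeps the shared byte prefix plus the component-wise min ords:
    Output shared = outputs.common(a, b);  // ("fooba", startOrd=10, endOrd=18)

    // subtract() strips that shared prefix; add() recomposes the original:
    Output rest = outputs.subtract(a, shared);  // ("r", 0, 2)
    assert outputs.add(shared, rest).equals(a);
  }
}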
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/Ords41PostingsFormat.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,111 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene41PostingsWriter}. */
+public class Ords41PostingsFormat extends PostingsFormat {
+
+ private final int minTermBlockSize;
+ private final int maxTermBlockSize;
+
+ /**
+ * Fixed packed block size, number of integers encoded in
+ * a single packed block.
+ */
+ // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
+ public final static int BLOCK_SIZE = 128;
+
+  /** Creates {@code Ords41PostingsFormat} with default
+   * settings. */
+ public Ords41PostingsFormat() {
+ this(OrdsBlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, OrdsBlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
+ }
+
+  /** Creates {@code Ords41PostingsFormat} with custom
+   * values for {@code minBlockSize} and {@code
+   * maxBlockSize} passed to the block terms dictionary.
+ * @see OrdsBlockTreeTermsWriter#OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
+ public Ords41PostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
+ super("OrdsLucene41");
+ this.minTermBlockSize = minTermBlockSize;
+ assert minTermBlockSize > 1;
+ this.maxTermBlockSize = maxTermBlockSize;
+ assert minTermBlockSize <= maxTermBlockSize;
+ }
+
+ @Override
+ public String toString() {
+ return getName() + "(blocksize=" + BLOCK_SIZE + ")";
+ }
+
+ @Override
+ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+ PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
+
+ boolean success = false;
+ try {
+ FieldsConsumer ret = new OrdsBlockTreeTermsWriter(state,
+ postingsWriter,
+ minTermBlockSize,
+ maxTermBlockSize);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(postingsWriter);
+ }
+ }
+ }
+
+ @Override
+ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+ PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
+ state.fieldInfos,
+ state.segmentInfo,
+ state.context,
+ state.segmentSuffix);
+ boolean success = false;
+ try {
+ FieldsProducer ret = new OrdsBlockTreeTermsReader(state.directory,
+ state.fieldInfos,
+ state.segmentInfo,
+ postingsReader,
+ state.context,
+ state.segmentSuffix);
+ success = true;
+ return ret;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(postingsReader);
+ }
+ }
+ }
+}
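
One way to select this format at indexing time, sketched here (not in the patch): the SPI name "OrdsLucene41" comes from the constructor above plus the services file this commit modifies; using Lucene49Codec as the base codec is an assumption, so substitute whatever the current default codec is.

import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene49.Lucene49Codec;  // assumed trunk default at this revision

public class OrdsCodecSketch {
  public static Codec ordsCodec() {
    // SPI lookup works because lucene-codecs registers the format in
    // META-INF/services/org.apache.lucene.codecs.PostingsFormat:
    final PostingsFormat ords = PostingsFormat.forName("OrdsLucene41");
    // Route every field's postings through it; pass the result to
    // IndexWriterConfig.setCodec(...):
    return new Lucene49Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return ords;
      }
    };
  }
}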
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsReader.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,246 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * See {@link OrdsBlockTreeTermsWriter}.
+ *
+ * @lucene.experimental
+ */
+
+public final class OrdsBlockTreeTermsReader extends FieldsProducer {
+
+  // Open input to the main terms dict file (_X.tio)
+ final IndexInput in;
+
+ //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG;
+
+ // Reads the terms dict entries, to gather state to
+ // produce DocsEnum on demand
+ final PostingsReaderBase postingsReader;
+
+ private final TreeMap<String,OrdsFieldReader> fields = new TreeMap<>();
+
+ /** File offset where the directory starts in the terms file. */
+ private long dirOffset;
+
+ /** File offset where the directory starts in the index file. */
+ private long indexDirOffset;
+
+ final String segment;
+
+ private final int version;
+
+ /** Sole constructor. */
+ public OrdsBlockTreeTermsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo info,
+ PostingsReaderBase postingsReader, IOContext ioContext,
+ String segmentSuffix)
+ throws IOException {
+
+ this.postingsReader = postingsReader;
+
+ this.segment = info.name;
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, OrdsBlockTreeTermsWriter.TERMS_EXTENSION),
+ ioContext);
+
+ boolean success = false;
+ IndexInput indexIn = null;
+
+ try {
+ version = CodecUtil.checkHeader(in,
+ OrdsBlockTreeTermsWriter.TERMS_CODEC_NAME,
+ OrdsBlockTreeTermsWriter.VERSION_START,
+ OrdsBlockTreeTermsWriter.VERSION_CURRENT);
+ indexIn = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, OrdsBlockTreeTermsWriter.TERMS_INDEX_EXTENSION),
+ ioContext);
+ int indexVersion = CodecUtil.checkHeader(indexIn,
+ OrdsBlockTreeTermsWriter.TERMS_INDEX_CODEC_NAME,
+ OrdsBlockTreeTermsWriter.VERSION_START,
+ OrdsBlockTreeTermsWriter.VERSION_CURRENT);
+ if (indexVersion != version) {
+ throw new CorruptIndexException("mixmatched version files: " + in + "=" + version + "," + indexIn + "=" + indexVersion);
+ }
+
+ // verify
+ CodecUtil.checksumEntireFile(indexIn);
+
+ // Have PostingsReader init itself
+ postingsReader.init(in);
+
+ // Read per-field details
+ seekDir(in, dirOffset);
+ seekDir(indexIn, indexDirOffset);
+
+ final int numFields = in.readVInt();
+ if (numFields < 0) {
+ throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")");
+ }
+
+ for(int i=0;i<numFields;i++) {
+ final int field = in.readVInt();
+ final long numTerms = in.readVLong();
+ assert numTerms >= 0;
+ // System.out.println("read field=" + field + " numTerms=" + numTerms + " i=" + i);
+ final int numBytes = in.readVInt();
+ final BytesRef code = new BytesRef(new byte[numBytes]);
+ in.readBytes(code.bytes, 0, numBytes);
+ code.length = numBytes;
+ final Output rootCode = OrdsBlockTreeTermsWriter.FST_OUTPUTS.newOutput(code, 0, numTerms);
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ assert fieldInfo != null: "field=" + field;
+ assert numTerms <= Integer.MAX_VALUE;
+ final long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
+ final long sumDocFreq = in.readVLong();
+ final int docCount = in.readVInt();
+ final int longsSize = in.readVInt();
+ // System.out.println(" longsSize=" + longsSize);
+
+ BytesRef minTerm = readBytesRef(in);
+ BytesRef maxTerm = readBytesRef(in);
+ if (docCount < 0 || docCount > info.getDocCount()) { // #docs with field must be <= #docs
+ throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
+ }
+ if (sumDocFreq < docCount) { // #postings must be >= #docs with field
+ throw new CorruptIndexException("invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount + " (resource=" + in + ")");
+ }
+ if (sumTotalTermFreq != -1 && sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings
+ throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq + " (resource=" + in + ")");
+ }
+ final long indexStartFP = indexIn.readVLong();
+ OrdsFieldReader previous = fields.put(fieldInfo.name,
+ new OrdsFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount,
+ indexStartFP, longsSize, indexIn, minTerm, maxTerm));
+ if (previous != null) {
+ throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
+ }
+ }
+ indexIn.close();
+
+ success = true;
+ } finally {
+ if (!success) {
+ // this.close() will close in:
+ IOUtils.closeWhileHandlingException(indexIn, this);
+ }
+ }
+ }
+
+ private static BytesRef readBytesRef(IndexInput in) throws IOException {
+ BytesRef bytes = new BytesRef();
+ bytes.length = in.readVInt();
+ bytes.bytes = new byte[bytes.length];
+ in.readBytes(bytes.bytes, 0, bytes.length);
+ return bytes;
+ }
+
+ /** Seek {@code input} to the directory offset. */
+ private void seekDir(IndexInput input, long dirOffset)
+ throws IOException {
+ input.seek(input.length() - CodecUtil.footerLength() - 8);
+ dirOffset = input.readLong();
+ input.seek(dirOffset);
+ }
+
+ // for debugging
+ // private static String toHex(int v) {
+ // return "0x" + Integer.toHexString(v);
+ // }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ IOUtils.close(in, postingsReader);
+ } finally {
+      // Clear so refs to terms index are GCable even if
+      // app hangs onto us:
+ fields.clear();
+ }
+ }
+
+ @Override
+ public Iterator<String> iterator() {
+ return Collections.unmodifiableSet(fields.keySet()).iterator();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ assert field != null;
+ return fields.get(field);
+ }
+
+ @Override
+ public int size() {
+ return fields.size();
+ }
+
+ // for debugging
+ String brToString(BytesRef b) {
+ if (b == null) {
+ return "null";
+ } else {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+ }
+
+ @Override
+ public long ramBytesUsed() {
+    long sizeInBytes = ((postingsReader!=null) ? postingsReader.ramBytesUsed() : 0);
+    for (OrdsFieldReader reader : fields.values()) {
+      sizeInBytes += reader.ramBytesUsed();
+    }
+    return sizeInBytes;
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ // term dictionary
+ CodecUtil.checksumEntireFile(in);
+
+ // postings
+ postingsReader.checkIntegrity();
+ }
+}
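
seekDir() above implies a specific trailer layout for both files: a fixed-width 8-byte directory offset written immediately before the codec footer. Here is a sketch of the write side it expects (illustrative only; the actual writer appears later in this patch):

import java.io.IOException;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.IndexOutput;

class TrailerSketch {
  // Layout seekDir() assumes: [ field summaries ][ dirOffset: long ][ codec footer ]
  static void finishTrailer(IndexOutput out, long dirStart) throws IOException {
    out.writeLong(dirStart);     // fixed width, so the reader can seek to
                                 // length() - footerLength() - 8 and read it back
    CodecUtil.writeFooter(out);  // checksum footer verified by checksumEntireFile()
  }
}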
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,1101 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
+import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;
+
+/*
+ TODO:
+
+ - Currently there is a one-to-one mapping of indexed
+ term to term block, but we could decouple the two, ie,
+ put more terms into the index than there are blocks.
+ The index would take up more RAM but then it'd be able
+ to avoid seeking more often and could make PK/FuzzyQ
+ faster if the additional indexed terms could store
+ the offset into the terms block.
+
+ - The blocks are not written in true depth-first
+ order, meaning if you just next() the file pointer will
+ sometimes jump backwards. For example, block foo* will
+ be written before block f* because it finished before.
+ This could possibly hurt performance if the terms dict is
+ not hot, since OSs anticipate sequential file access. We
+ could fix the writer to re-order the blocks as a 2nd
+ pass.
+
+ - Each block encodes the term suffixes packed
+ sequentially using a separate vInt per term, which is
+ 1) wasteful and 2) slow (must linear scan to find a
+ particular suffix). We should instead 1) make
+ random-access array so we can directly access the Nth
+ suffix, and 2) bulk-encode this array using bulk int[]
+ codecs; then at search time we can binary search when
+ we seek a particular term.
+*/
+
+/**
+ * This is just like {@link BlockTreeTermsWriter}, except it also computes and stores the ordinal (ord) of each term,
+ * so the optional {@link TermsEnum#ord} and {@link TermsEnum#seekExact(long)} APIs work. The ordinals are folded into
+ * the FST outputs of the terms index (see {@link FSTOrdsOutputs}), so a seek by ord can walk directly to the block
+ * containing the target term. The term blocks file extension is .tio and the terms index extension is .tipo.
+ *
+ * @lucene.experimental
+ */
+
+public final class OrdsBlockTreeTermsWriter extends FieldsConsumer {
+
+ // private static boolean DEBUG = IDOrdsSegmentTermsEnum.DEBUG;
+
+ static final FSTOrdsOutputs FST_OUTPUTS = new FSTOrdsOutputs();
+
+ static final Output NO_OUTPUT = FST_OUTPUTS.getNoOutput();
+
+ /** Suggested default value for the {@code
+ * minItemsInBlock} parameter to {@link
+ * #OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
+
+ /** Suggested default value for the {@code
+ * maxItemsInBlock} parameter to {@link
+ * #OrdsBlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. */
+ public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
+
+ //public final static boolean DEBUG = false;
+ //private final static boolean SAVE_DOT_FILES = false;
+
+ static final int OUTPUT_FLAGS_NUM_BITS = 2;
+ static final int OUTPUT_FLAGS_MASK = 0x3;
+ static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
+ static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
+
+ /** Extension of terms file */
+ static final String TERMS_EXTENSION = "tio";
+ final static String TERMS_CODEC_NAME = "BLOCK_TREE_ORDS_TERMS_DICT";
+
+ /** Initial terms format. */
+ public static final int VERSION_START = 0;
+
+ /** Current terms format. */
+ public static final int VERSION_CURRENT = VERSION_START;
+
+ /** Extension of terms index file */
+ static final String TERMS_INDEX_EXTENSION = "tipo";
+ final static String TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_ORDS_TERMS_INDEX";
+
+ private final IndexOutput out;
+ private final IndexOutput indexOut;
+ final int maxDoc;
+ final int minItemsInBlock;
+ final int maxItemsInBlock;
+
+ final PostingsWriterBase postingsWriter;
+ final FieldInfos fieldInfos;
+
+ private static class FieldMetaData {
+ public final FieldInfo fieldInfo;
+ public final Output rootCode;
+ public final long numTerms;
+ public final long indexStartFP;
+ public final long sumTotalTermFreq;
+ public final long sumDocFreq;
+ public final int docCount;
+ private final int longsSize;
+ public final BytesRef minTerm;
+ public final BytesRef maxTerm;
+
+ public FieldMetaData(FieldInfo fieldInfo, Output rootCode, long numTerms, long indexStartFP,
+ long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize,
+ BytesRef minTerm, BytesRef maxTerm) {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
+ this.rootCode = rootCode;
+ this.indexStartFP = indexStartFP;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.longsSize = longsSize;
+ this.minTerm = minTerm;
+ this.maxTerm = maxTerm;
+ }
+ }
+
+ private final List<FieldMetaData> fields = new ArrayList<>();
+
+ // private final String segment;
+
+ /** Create a new writer. The number of items (terms or
+ * sub-blocks) per block will aim to be between
+ * minItemsPerBlock and maxItemsPerBlock, though in some
+ * cases the blocks may be smaller than the min. */
+ public OrdsBlockTreeTermsWriter(
+ SegmentWriteState state,
+ PostingsWriterBase postingsWriter,
+ int minItemsInBlock,
+ int maxItemsInBlock)
+ throws IOException
+ {
+ if (minItemsInBlock <= 1) {
+ throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock);
+ }
+ if (maxItemsInBlock <= 0) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= 1; got " + maxItemsInBlock);
+ }
+ if (minItemsInBlock > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
+ if (2*(minItemsInBlock-1) > maxItemsInBlock) {
+ throw new IllegalArgumentException("maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + maxItemsInBlock + " minItemsInBlock=" + minItemsInBlock);
+ }
+
+ maxDoc = state.segmentInfo.getDocCount();
+
+ final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+ out = state.directory.createOutput(termsFileName, state.context);
+ boolean success = false;
+ IndexOutput indexOut = null;
+ try {
+ fieldInfos = state.fieldInfos;
+ this.minItemsInBlock = minItemsInBlock;
+ this.maxItemsInBlock = maxItemsInBlock;
+ CodecUtil.writeHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT);
+
+ final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+ indexOut = state.directory.createOutput(termsIndexFileName, state.context);
+ CodecUtil.writeHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT);
+
+ this.postingsWriter = postingsWriter;
+ // segment = state.segmentInfo.name;
+
+ // System.out.println("BTW.init seg=" + state.segmentName);
+
+ postingsWriter.init(out); // have consumer write its format/header
+ success = true;
+ } finally {
+ if (!success) {
+ IOUtils.closeWhileHandlingException(out, indexOut);
+ }
+ }
+ this.indexOut = indexOut;
+ }
+
+ @Override
+ public void write(Fields fields) throws IOException {
+
+ String lastField = null;
+ for(String field : fields) {
+ assert lastField == null || lastField.compareTo(field) < 0;
+ lastField = field;
+
+ Terms terms = fields.terms(field);
+ if (terms == null) {
+ continue;
+ }
+
+ TermsEnum termsEnum = terms.iterator(null);
+
+ TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
+ BytesRef minTerm = null;
+ BytesRef maxTerm = new BytesRef();
+ while (true) {
+ BytesRef term = termsEnum.next();
+ if (term == null) {
+ break;
+ }
+ if (minTerm == null) {
+ minTerm = BytesRef.deepCopyOf(term);
+ }
+ maxTerm.copyBytes(term);
+ termsWriter.write(term, termsEnum);
+ }
+
+ termsWriter.finish(minTerm, minTerm == null ? null : maxTerm);
+ }
+ }
+
+ static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
+ assert fp < (1L << 62);
+ return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
+ }
+
+ private static class PendingEntry {
+ public final boolean isTerm;
+
+ protected PendingEntry(boolean isTerm) {
+ this.isTerm = isTerm;
+ }
+ }
+
+ private static final class PendingTerm extends PendingEntry {
+ public final BytesRef term;
+ // stats + metadata
+ public final BlockTermState state;
+
+ public PendingTerm(BytesRef term, BlockTermState state) {
+ super(true);
+ this.term = term;
+ this.state = state;
+ }
+
+ @Override
+ public String toString() {
+ return term.utf8ToString();
+ }
+ }
+
+ private static final class SubIndex {
+ public final FST<Output> index;
+ public final long termOrdStart;
+
+ public SubIndex(FST<Output> index, long termOrdStart) {
+ this.index = index;
+ this.termOrdStart = termOrdStart;
+ }
+ }
+
+ private static final class PendingBlock extends PendingEntry {
+ public final BytesRef prefix;
+ public final long fp;
+ public FST<Output> index;
+ public List<SubIndex> subIndices;
+ public final boolean hasTerms;
+ public final boolean isFloor;
+ public final int floorLeadByte;
+ public long totFloorTermCount;
+ private final long totalTermCount;
+ private final IntsRef scratchIntsRef = new IntsRef();
+
+ public PendingBlock(BytesRef prefix, long fp, boolean hasTerms, long totalTermCount,
+ boolean isFloor, int floorLeadByte, List<SubIndex> subIndices) {
+ super(false);
+ this.prefix = prefix;
+ this.fp = fp;
+ this.hasTerms = hasTerms;
+ this.totalTermCount = totalTermCount;
+ this.isFloor = isFloor;
+ this.floorLeadByte = floorLeadByte;
+ this.subIndices = subIndices;
+ }
+
+ @Override
+ public String toString() {
+ return "BLOCK: " + prefix.utf8ToString();
+ }
+
+ public void compileIndex(List<PendingBlock> floorBlocks, RAMOutputStream scratchBytes) throws IOException {
+
+ assert (isFloor && floorBlocks != null && floorBlocks.size() != 0) || (!isFloor && floorBlocks == null): "isFloor=" + isFloor + " floorBlocks=" + floorBlocks;
+
+ assert scratchBytes.getFilePointer() == 0;
+
+ // TODO: try writing the leading vLong in MSB order
+ // (opposite of what Lucene does today), for better
+ // outputs sharing in the FST
+ //System.out.println("\ncompileIndex isFloor=" + isFloor + " numTerms=" + totalTermCount);
+ long lastSumTotalTermCount = 0;
+ long sumTotalTermCount = totalTermCount;
+ scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
+ if (isFloor) {
+ scratchBytes.writeVInt(floorBlocks.size());
+ for (PendingBlock sub : floorBlocks) {
+ assert sub.floorLeadByte != -1;
+ //if (DEBUG) {
+ // System.out.println(" write floorLeadByte=" + Integer.toHexString(sub.floorLeadByte&0xff));
+ //}
+ scratchBytes.writeByte((byte) sub.floorLeadByte);
+ // System.out.println(" write floor byte=" + (byte) sub.floorLeadByte + " ordShift=" + sumTotalTermCount);
+ scratchBytes.writeVLong(sumTotalTermCount - lastSumTotalTermCount);
+ lastSumTotalTermCount = sumTotalTermCount;
+ sumTotalTermCount += sub.totalTermCount;
+ assert sub.fp > fp;
+ scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
+ }
+ }
+
+ final Builder<Output> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true, false, Integer.MAX_VALUE,
+ FST_OUTPUTS, null, false,
+ PackedInts.COMPACT, true, 15);
+ //if (DEBUG) {
+ // System.out.println(" compile index for prefix=" + prefix);
+ //}
+ //indexBuilder.DEBUG = false;
+ final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
+ assert bytes.length > 0;
+ // System.out.println(" bytes=" + bytes.length);
+ scratchBytes.writeTo(bytes, 0);
+ indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef),
+ FST_OUTPUTS.newOutput(new BytesRef(bytes, 0, bytes.length),
+ 0, Long.MAX_VALUE-(sumTotalTermCount-1)));
+ scratchBytes.reset();
+
+ // Copy over index for all sub-blocks
+
+ if (subIndices != null) {
+ for(SubIndex subIndex : subIndices) {
+ //System.out.println(" append subIndex: termOrdStart=" + subIndex.termOrdStart);
+ append(indexBuilder, subIndex.index, subIndex.termOrdStart);
+ }
+ }
+
+ if (floorBlocks != null) {
+ long termOrdOffset = totalTermCount;
+ for (PendingBlock sub : floorBlocks) {
+ if (sub.subIndices != null) {
+ for(SubIndex subIndex : sub.subIndices) {
+ append(indexBuilder, subIndex.index, termOrdOffset + subIndex.termOrdStart);
+ }
+ }
+ sub.subIndices = null;
+ termOrdOffset += sub.totalTermCount;
+ }
+ totFloorTermCount = termOrdOffset;
+ } else {
+ totFloorTermCount = sumTotalTermCount;
+ }
+
+ index = indexBuilder.finish();
+ subIndices = null;
+
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"));
+ Util.toDot(index, w, false, false);
+ System.out.println("SAVED to out.dot");
+ w.close();
+ */
+ }
+
+ // TODO: maybe we could add bulk-add method to
+ // Builder? Takes FST and unions it w/ current
+ // FST.
+ private void append(Builder<Output> builder, FST<Output> subIndex, long termOrdOffset) throws IOException {
+ final BytesRefFSTEnum<Output> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
+ BytesRefFSTEnum.InputOutput<Output> indexEnt;
+ while ((indexEnt = subIndexEnum.next()) != null) {
+ //if (DEBUG) {
+ // System.out.println(" add sub=" + indexEnt.input + " " + indexEnt.input + " output=" + indexEnt.output);
+ //}
+ Output output = indexEnt.output;
+ long blockTermCount = output.endOrd - output.startOrd + 1;
+ Output newOutput = FST_OUTPUTS.newOutput(output.bytes, termOrdOffset+output.startOrd, output.endOrd-termOrdOffset);
+ //System.out.println(" append sub=" + indexEnt.input + " output=" + indexEnt.output + " termOrdOffset=" + termOrdOffset + " blockTermCount=" + blockTermCount + " newOutput=" + newOutput + " endOrd=" + (termOrdOffset+Long.MAX_VALUE-output.endOrd));
+ builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), newOutput);
+ }
+ }
+ }
+
+ final RAMOutputStream scratchBytes = new RAMOutputStream();
+
+ class TermsWriter {
+ private final FieldInfo fieldInfo;
+ private final int longsSize;
+ private long numTerms;
+ final FixedBitSet docsSeen;
+ long sumTotalTermFreq;
+ long sumDocFreq;
+ long indexStartFP;
+
+ // Used only to partition terms into the block tree; we
+ // don't pull an FST from this builder:
+ private final NoOutputs noOutputs;
+ private final Builder<Object> blockBuilder;
+
+ // PendingTerm or PendingBlock:
+ private final List<PendingEntry> pending = new ArrayList<>();
+
+ // Index into pending of most recently written block
+ private int lastBlockIndex = -1;
+
+ // Re-used when segmenting a too-large block into floor
+ // blocks:
+ private int[] subBytes = new int[10];
+ private int[] subTermCounts = new int[10];
+ private int[] subTermCountSums = new int[10];
+ private int[] subSubCounts = new int[10];
+
+ // This class assigns terms to blocks "naturally", ie,
+ // according to the number of terms under a given prefix
+ // that we encounter:
+ private class FindBlocks extends Builder.FreezeTail<Object> {
+
+ @Override
+ public void freeze(final Builder.UnCompiledNode<Object>[] frontier, int prefixLenPlus1, final IntsRef lastInput) throws IOException {
+
+ //if (DEBUG) System.out.println(" freeze prefixLenPlus1=" + prefixLenPlus1);
+
+ for(int idx=lastInput.length; idx >= prefixLenPlus1; idx--) {
+ final Builder.UnCompiledNode<Object> node = frontier[idx];
+
+ long totCount = 0;
+
+ if (node.isFinal) {
+ totCount++;
+ }
+
+ for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
+ @SuppressWarnings("unchecked") final Builder.UnCompiledNode<Object> target = (Builder.UnCompiledNode<Object>) node.arcs[arcIdx].target;
+ totCount += target.inputCount;
+ target.clear();
+ node.arcs[arcIdx].target = null;
+ }
+ node.numArcs = 0;
+
+ if (totCount >= minItemsInBlock || idx == 0) {
+ // We are on a prefix node that has enough
+ // entries (terms or sub-blocks) under it to let
+ // us write a new block or multiple blocks (main
+ // block + follow on floor blocks):
+ //if (DEBUG) {
+ // if (totCount < minItemsInBlock && idx != 0) {
+ // System.out.println(" force block has terms");
+ // }
+ //}
+ writeBlocks(lastInput, idx, (int) totCount);
+ node.inputCount = 1;
+ } else {
+ // stragglers! carry count upwards
+ node.inputCount = totCount;
+ }
+ frontier[idx] = new Builder.UnCompiledNode<>(blockBuilder, idx);
+ }
+ }
+ }
+
+ // Write the top count entries on the pending stack as
+ // one or more blocks. Returns how many blocks were
+ // written. If the entry count is <= maxItemsPerBlock
+ // we just write a single block; else we break into
+ // primary (initial) block and then one or more
+ // following floor blocks:
+
+ void writeBlocks(IntsRef prevTerm, int prefixLength, int count) throws IOException {
+ if (count <= maxItemsInBlock) {
+ // Easy case: not floor block. Eg, prefix is "foo",
+ // and we found 30 terms/sub-blocks starting w/ that
+ // prefix, and minItemsInBlock <= 30 <=
+ // maxItemsInBlock.
+ final PendingBlock nonFloorBlock = writeBlock(prevTerm, prefixLength, prefixLength, count, count, 0, false, -1, true);
+ nonFloorBlock.compileIndex(null, scratchBytes);
+ pending.add(nonFloorBlock);
+ } else {
+ // Floor block case. Eg, prefix is "foo" but we
+ // have 100 terms/sub-blocks starting w/ that
+ // prefix. We segment the entries into a primary
+ // block and following floor blocks using the first
+ // label in the suffix to assign to floor blocks.
+
+ // TODO: we could store min & max suffix start byte
+ // in each block, to make floor blocks authoritative
+
+ /*
+ if (DEBUG) {
+ final BytesRef prefix = new BytesRef(prefixLength);
+ for(int m=0;m<prefixLength;m++) {
+ prefix.bytes[m] = (byte) prevTerm.ints[m];
+ }
+ prefix.length = prefixLength;
+ //System.out.println("\nWBS count=" + count + " prefix=" + prefix.utf8ToString() + " " + prefix);
+ System.out.println("writeBlocks: prefix=" + toString(prefix) + " " + prefix + " count=" + count + " pending.size()=" + pending.size());
+ }
+ */
+ //System.out.println("\nwbs count=" + count);
+
+ final int savLabel = prevTerm.ints[prevTerm.offset + prefixLength];
+
+ // Count up how many items fall under
+ // each unique label after the prefix.
+
+ // TODO: this is wasteful since the builder had
+ // already done this (partitioned these sub-terms
+ // according to their leading prefix byte)
+
+ final List<PendingEntry> slice = pending.subList(pending.size()-count, pending.size());
+ int lastSuffixLeadLabel = -1;
+ int termCount = 0;
+ int subCount = 0;
+ int numSubs = 0;
+
+ for(PendingEntry ent : slice) {
+
+ // First byte in the suffix of this term
+ final int suffixLeadLabel;
+ if (ent.isTerm) {
+ PendingTerm term = (PendingTerm) ent;
+ if (term.term.length == prefixLength) {
+ // Suffix is 0, ie prefix 'foo' and term is
+ // 'foo' so the term has empty string suffix
+ // in this block
+ assert lastSuffixLeadLabel == -1;
+ assert numSubs == 0;
+ suffixLeadLabel = -1;
+ } else {
+ suffixLeadLabel = term.term.bytes[term.term.offset + prefixLength] & 0xff;
+ }
+ } else {
+ PendingBlock block = (PendingBlock) ent;
+ assert block.prefix.length > prefixLength;
+ suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
+ }
+
+ if (suffixLeadLabel != lastSuffixLeadLabel && (termCount + subCount) != 0) {
+ if (subBytes.length == numSubs) {
+ subBytes = ArrayUtil.grow(subBytes);
+ subTermCounts = ArrayUtil.grow(subTermCounts);
+ subSubCounts = ArrayUtil.grow(subSubCounts);
+ }
+ subBytes[numSubs] = lastSuffixLeadLabel;
+ lastSuffixLeadLabel = suffixLeadLabel;
+ subTermCounts[numSubs] = termCount;
+ subSubCounts[numSubs] = subCount;
+ /*
+ if (suffixLeadLabel == -1) {
+ System.out.println(" sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+ } else {
+ System.out.println(" sub " + Integer.toHexString(suffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+ }
+ */
+ termCount = subCount = 0;
+ numSubs++;
+ }
+
+ if (ent.isTerm) {
+ termCount++;
+ } else {
+ subCount++;
+ }
+ }
+
+ if (subBytes.length == numSubs) {
+ subBytes = ArrayUtil.grow(subBytes);
+ subTermCounts = ArrayUtil.grow(subTermCounts);
+ subSubCounts = ArrayUtil.grow(subSubCounts);
+ }
+
+ subBytes[numSubs] = lastSuffixLeadLabel;
+ subTermCounts[numSubs] = termCount;
+ subSubCounts[numSubs] = subCount;
+ numSubs++;
+ /*
+ if (lastSuffixLeadLabel == -1) {
+ System.out.println(" sub " + -1 + " termCount=" + termCount + " subCount=" + subCount);
+ } else {
+ System.out.println(" sub " + Integer.toHexString(lastSuffixLeadLabel) + " termCount=" + termCount + " subCount=" + subCount);
+ }
+ */
+
+ if (subTermCountSums.length < numSubs) {
+ subTermCountSums = ArrayUtil.grow(subTermCountSums, numSubs);
+ }
+
+ // Roll up (backwards) the termCounts; postings impl
+ // needs this to know where to pull the term slice
+ // from its pending terms stack:
+ int sum = 0;
+ for(int idx=numSubs-1;idx>=0;idx--) {
+ sum += subTermCounts[idx];
+ subTermCountSums[idx] = sum;
+ }
+
+ // TODO: make a better segmenter? It'd have to
+ // absorb the too-small end blocks backwards into
+ // the previous blocks
+
+ // Naive greedy segmentation; this is not always
+ // best (it can produce a too-small block as the
+ // last block):
+ int pendingCount = 0;
+ int startLabel = subBytes[0];
+ int curStart = count;
+ subCount = 0;
+
+ final List<PendingBlock> floorBlocks = new ArrayList<>();
+ PendingBlock firstBlock = null;
+
+ for(int sub=0;sub<numSubs;sub++) {
+ pendingCount += subTermCounts[sub] + subSubCounts[sub];
+ //System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub]));
+ subCount++;
+
+ // Greedily make a floor block as soon as we've
+ // crossed the min count
+ if (pendingCount >= minItemsInBlock) {
+ final int curPrefixLength;
+ if (startLabel == -1) {
+ curPrefixLength = prefixLength;
+ } else {
+ curPrefixLength = 1+prefixLength;
+ // floor term:
+ prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+ }
+ //System.out.println(" " + subCount + " subs");
+ final PendingBlock floorBlock = writeBlock(prevTerm, prefixLength, curPrefixLength, curStart, pendingCount, subTermCountSums[1+sub], true, startLabel, curStart == pendingCount);
+ if (firstBlock == null) {
+ firstBlock = floorBlock;
+ } else {
+ floorBlocks.add(floorBlock);
+ }
+ curStart -= pendingCount;
+ //System.out.println(" = " + pendingCount);
+ pendingCount = 0;
+
+ assert minItemsInBlock == 1 || subCount > 1: "minItemsInBlock=" + minItemsInBlock + " subCount=" + subCount + " sub=" + sub + " of " + numSubs + " subTermCount=" + subTermCountSums[sub] + " subSubCount=" + subSubCounts[sub] + " depth=" + prefixLength;
+ subCount = 0;
+ startLabel = subBytes[sub+1];
+
+ if (curStart == 0) {
+ break;
+ }
+
+ if (curStart <= maxItemsInBlock) {
+ // remainder is small enough to fit into a
+ // block. NOTE that this may be too small (<
+ // minItemsInBlock); need a true segmenter
+ // here
+ assert startLabel != -1;
+ assert firstBlock != null;
+ prevTerm.ints[prevTerm.offset + prefixLength] = startLabel;
+ //System.out.println(" final " + (numSubs-sub-1) + " subs");
+ /*
+ for(sub++;sub < numSubs;sub++) {
+ System.out.println(" " + (subTermCounts[sub] + subSubCounts[sub]));
+ }
+ System.out.println(" = " + curStart);
+ if (curStart < minItemsInBlock) {
+ System.out.println(" **");
+ }
+ */
+ floorBlocks.add(writeBlock(prevTerm, prefixLength, prefixLength+1, curStart, curStart, 0, true, startLabel, true));
+ break;
+ }
+ }
+ }
+
+ prevTerm.ints[prevTerm.offset + prefixLength] = savLabel;
+
+ assert firstBlock != null;
+ firstBlock.compileIndex(floorBlocks, scratchBytes);
+
+ pending.add(firstBlock);
+ //if (DEBUG) System.out.println(" done pending.size()=" + pending.size());
+ }
+ lastBlockIndex = pending.size()-1;
+ }
+
+ // BytesRef prefix;
+
+ // for debugging
+ @SuppressWarnings("unused")
+ private String toString(BytesRef b) {
+ try {
+ return b.utf8ToString() + " " + b;
+ } catch (Throwable t) {
+ // If BytesRef isn't actually UTF8, or it's eg a
+ // prefix of UTF8 that ends mid-unicode-char, we
+ // fallback to hex:
+ return b.toString();
+ }
+ }
+
+ // Writes all entries in the pending slice as a single
+ // block:
+ private PendingBlock writeBlock(IntsRef prevTerm, int prefixLength, int indexPrefixLength, int startBackwards, int length,
+ int futureTermCount, boolean isFloor, int floorLeadByte, boolean isLastInFloor) throws IOException {
+
+ assert length > 0;
+
+ final int start = pending.size()-startBackwards;
+
+ assert start >= 0: "pending.size()=" + pending.size() + " startBackwards=" + startBackwards + " length=" + length;
+
+ final List<PendingEntry> slice = pending.subList(start, start + length);
+
+ final long startFP = out.getFilePointer();
+
+ // System.out.println("\nwriteBlock field=" + fieldInfo.name + " seg=" + segment + " prefixLength=" + prefixLength + " floorLeadByte=" + floorLeadByte + " isLastInFloor=" + isLastInFloor + " length=" + length + " startFP=" + startFP);
+
+ final BytesRef prefix = new BytesRef(indexPrefixLength);
+ for(int m=0;m<indexPrefixLength;m++) {
+ prefix.bytes[m] = (byte) prevTerm.ints[m];
+ }
+ prefix.length = indexPrefixLength;
+ // System.out.println(" prefix=" + toString(prefix));
+ // this.prefix = prefix;
+
+ // Write block header:
+ out.writeVInt((length<<1)|(isLastInFloor ? 1:0));
+
+ // if (DEBUG) {
+ // System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + toString(prefix) + " entCount=" + length + " startFP=" + startFP + " futureTermCount=" + futureTermCount + (isFloor ? (" floorLeadByte=" + Integer.toHexString(floorLeadByte&0xff)) : "") + " isLastInFloor=" + isLastInFloor);
+ // }
+
+ // 1st pass: pack term suffix bytes into byte[] blob
+ // TODO: cutover to bulk int codec... simple64?
+
+ final boolean isLeafBlock;
+ if (lastBlockIndex < start) {
+ // This block definitely does not contain sub-blocks:
+ isLeafBlock = true;
+ //System.out.println("no scan true isFloor=" + isFloor);
+ } else if (!isFloor) {
+ // This block definitely does contain at least one sub-block:
+ isLeafBlock = false;
+ //System.out.println("no scan false " + lastBlockIndex + " vs start=" + start + " len=" + length);
+ } else {
+ // Must scan up-front to see if there is a sub-block
+ boolean v = true;
+ //System.out.println("scan " + lastBlockIndex + " vs start=" + start + " len=" + length);
+ for (PendingEntry ent : slice) {
+ if (!ent.isTerm) {
+ v = false;
+ break;
+ }
+ }
+ isLeafBlock = v;
+ }
+ // System.out.println(" isLeaf=" + isLeafBlock);
+
+ final List<SubIndex> subIndices;
+
+ // Number of terms in this block
+ int termCount;
+
+ // Number of terms in this block and all sub-blocks (recursively)
+ long totalTermCount;
+
+ long[] longs = new long[longsSize];
+ boolean absolute = true;
+
+ int countx = 0;
+ if (isLeafBlock) {
+ subIndices = null;
+ for (PendingEntry ent : slice) {
+ assert ent.isTerm;
+ PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
+ final int suffix = term.term.length - prefixLength;
+ /*
+ if (DEBUG) {
+ BytesRef suffixBytes = new BytesRef(suffix);
+ System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ suffixBytes.length = suffix;
+ System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes));
+ }
+ */
+ // For leaf block we write suffix straight
+ suffixWriter.writeVInt(suffix);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+ // Write term stats, to separate byte[] blob:
+ statsWriter.writeVInt(state.docFreq);
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ assert state.totalTermFreq >= state.docFreq: state.totalTermFreq + " vs " + state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+ }
+ // System.out.println(" dF=" + state.docFreq + " tTF=" + state.totalTermFreq);
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
+ }
+ termCount = length;
+ totalTermCount = length;
+ } else {
+ subIndices = new ArrayList<>();
+ termCount = 0;
+ totalTermCount = 0;
+ for (PendingEntry ent : slice) {
+ if (ent.isTerm) {
+ PendingTerm term = (PendingTerm) ent;
+ BlockTermState state = term.state;
+ final int suffix = term.term.length - prefixLength;
+ /*
+ if (DEBUG) {
+ BytesRef suffixBytes = new BytesRef(suffix);
+ System.arraycopy(term.term.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ suffixBytes.length = suffix;
+ System.out.println(" " + (countx++) + ": write term suffix=" + toString(suffixBytes) + " termOrd=" + totalTermCount);
+ }
+ */
+ // For non-leaf block we borrow 1 bit to record
+ // if entry is term or sub-block
+ suffixWriter.writeVInt(suffix<<1);
+ suffixWriter.writeBytes(term.term.bytes, prefixLength, suffix);
+
+ // Write term stats, to separate byte[] blob:
+ statsWriter.writeVInt(state.docFreq);
+ if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ assert state.totalTermFreq >= state.docFreq;
+ statsWriter.writeVLong(state.totalTermFreq - state.docFreq);
+ }
+
+ // TODO: now that terms dict "sees" these longs,
+ // we can explore better column-stride encodings
+ // to encode all long[0]s for this block at
+ // once, all long[1]s, etc., e.g. using
+ // Simple64. Alternatively, we could interleave
+ // stats + meta ... no reason to have them
+ // separate anymore:
+
+ // Write term meta data
+ postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute);
+ for (int pos = 0; pos < longsSize; pos++) {
+ assert longs[pos] >= 0;
+ metaWriter.writeVLong(longs[pos]);
+ }
+ bytesWriter.writeTo(metaWriter);
+ bytesWriter.reset();
+ absolute = false;
+
+ termCount++;
+ totalTermCount++;
+ } else {
+ PendingBlock block = (PendingBlock) ent;
+ final int suffix = block.prefix.length - prefixLength;
+
+ assert suffix > 0;
+
+ // For non-leaf block we steal 1 bit to record
+ // if entry is term or sub-block
+ suffixWriter.writeVInt((suffix<<1)|1);
+ suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
+ assert block.fp < startFP;
+
+ suffixWriter.writeVLong(startFP - block.fp);
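+ // e.g. (illustrative) if this block starts at fp=1000 and the
+ // sub-block was written at fp=400, we write the backwards delta
+ // 600; sub-blocks are always written before their parent, hence
+ // the assert above.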
+
+ /*
+ if (DEBUG) {
+ BytesRef suffixBytes = new BytesRef(suffix);
+ System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix);
+ suffixBytes.length = suffix;
+ System.out.println(" " + (countx++) + ": write sub-block suffix=" + toString(suffixBytes) + " subFP=" + block.fp + " subCode=" + (startFP-block.fp) + " floor=" + block.isFloor + " totFloorTermCount=" + block.totFloorTermCount);
+ }
+ */
+
+ suffixWriter.writeVLong(block.totFloorTermCount);
+ subIndices.add(new SubIndex(block.index, totalTermCount));
+ totalTermCount += block.totFloorTermCount;
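+ // e.g. (illustrative) if the first sub-block spans 10 terms
+ // (totFloorTermCount=10), its SubIndex is registered at ord 0 and
+ // the next entry's ordinal space starts at 10.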
+ }
+ }
+
+ assert subIndices.size() != 0;
+ }
+
+ // TODO: we could block-write the term suffix pointers;
+ // this would take more space but would enable binary
+ // search on lookup
+
+ // Write suffixes byte[] blob to terms dict output:
+ out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1 : 0));
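+ // e.g. (illustrative) a 17-byte suffix blob in a leaf block is
+ // written as VInt (17<<1)|1 = 35; in a non-leaf block, as 34.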
+ suffixWriter.writeTo(out);
+ suffixWriter.reset();
+
+ // Write term stats byte[] blob
+ out.writeVInt((int) statsWriter.getFilePointer());
+ //System.out.println("write stats @ fp=" + out.getFilePointer());
+ statsWriter.writeTo(out);
+ statsWriter.reset();
+
+ // Write term metadata byte[] blob
+ out.writeVInt((int) metaWriter.getFilePointer());
+ metaWriter.writeTo(out);
+ metaWriter.reset();
+
+ // Remove slice replaced by block:
+ slice.clear();
+
+ if (lastBlockIndex >= start) {
+ if (lastBlockIndex < start+length) {
+ lastBlockIndex = start;
+ } else {
+ lastBlockIndex -= length;
+ }
+ }
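+ // i.e. pointers into the removed [start, start+length) window
+ // collapse to start, and pointers past it shift down by length.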
+
+ // if (DEBUG) {
+ // System.out.println(" fpEnd=" + out.getFilePointer());
+ // }
+
+ return new PendingBlock(prefix, startFP, termCount != 0, totalTermCount, isFloor, floorLeadByte, subIndices);
+ }
+
+ TermsWriter(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ docsSeen = new FixedBitSet(maxDoc);
+
+ noOutputs = NoOutputs.getSingleton();
+
+ // This Builder is just used transiently to fragment
+ // terms into "good" blocks; we don't save the
+ // resulting FST:
+ blockBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
+ 0, 0, true,
+ true, Integer.MAX_VALUE,
+ noOutputs,
+ new FindBlocks(), false,
+ PackedInts.COMPACT,
+ true, 15);
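+ // (Assuming the FST Builder signature of this era, the arguments
+ // map to: minSuffixCount1=0, minSuffixCount2=0, doShareSuffix=true,
+ // doShareNonSingletonNodes=true, shareMaxTailLength=Integer.MAX_VALUE,
+ // outputs=noOutputs, freezeTail=FindBlocks (which is what actually
+ // carves the blocks), doPackFST=false,
+ // acceptableOverheadRatio=PackedInts.COMPACT, allowArrayArcs=true,
+ // bytesPageBits=15.)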
+
+ this.longsSize = postingsWriter.setField(fieldInfo);
+ }
+
+ private final IntsRef scratchIntsRef = new IntsRef();
+
+ /** Writes one term's worth of postings. */
+ public void write(BytesRef text, TermsEnum termsEnum) throws IOException {
+ BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen);
+ if (state != null) {
+ assert state.docFreq != 0;
+ assert fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
+ sumDocFreq += state.docFreq;
+ sumTotalTermFreq += state.totalTermFreq;
+ blockBuilder.add(Util.toIntsRef(text, scratchIntsRef), noOutputs.getNoOutput());
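+ // The transient FST gets NO_OUTPUT for every term; its only
+ // purpose is to drive the FindBlocks freeze-tail hook, which
+ // decides where block boundaries fall.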
+
+ PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
+ pending.add(term);
+ numTerms++;
+ }
+ }
+
+ /** Finishes all terms in this field. */
+ public void finish(BytesRef minTerm, BytesRef maxTerm) throws IOException {
+ if (numTerms > 0) {
+ blockBuilder.finish();
+
+ // We'd better have exactly one final "root" block:
+ assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
+ final PendingBlock root = (PendingBlock) pending.get(0);
+ assert root.prefix.length == 0;
+ assert root.index.getEmptyOutput() != null;
+
+ // Write FST to index
+ indexStartFP = indexOut.getFilePointer();
+ root.index.save(indexOut);
+ //System.out.println(" write FST " + indexStartFP + " field=" + fieldInfo.name);
+
+ // if (SAVE_DOT_FILES || DEBUG) {
+ // final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
+ // Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+ // Util.toDot(root.index, w, false, false);
+ // System.out.println("SAVED to " + dotFileName);
+ // w.close();
+ // }
+
+ fields.add(new FieldMetaData(fieldInfo,
+ root.index.getEmptyOutput(),
+ numTerms,
+ indexStartFP,
+ sumTotalTermFreq,
+ sumDocFreq,
+ docsSeen.cardinality(),
+ longsSize,
+ minTerm, maxTerm));
+ } else {
+ assert docsSeen.cardinality() == 0;
+ }
+ }
+
+ private final RAMOutputStream suffixWriter = new RAMOutputStream();
+ private final RAMOutputStream statsWriter = new RAMOutputStream();
+ private final RAMOutputStream metaWriter = new RAMOutputStream();
+ private final RAMOutputStream bytesWriter = new RAMOutputStream();
+ }
+
+ @Override
+ public void close() throws IOException {
+
+ boolean success = false;
+ try {
+
+ final long dirStart = out.getFilePointer();
+ final long indexDirStart = indexOut.getFilePointer();
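+ // Capture where each field directory will start; these fps are
+ // written as fixed-width longs just before each footer, so the
+ // reader can seek straight to the directory.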
+
+ out.writeVInt(fields.size());
+
+ for(FieldMetaData field : fields) {
+ // System.out.println(" field " + field.fieldInfo.name + " " + field.numTerms + " terms longsSize=" + field.longsSize);
+ out.writeVInt(field.fieldInfo.number);
+ assert field.numTerms > 0;
+ out.writeVLong(field.numTerms);
+ out.writeVInt(field.rootCode.bytes.length);
+ out.writeBytes(field.rootCode.bytes.bytes, field.rootCode.bytes.offset, field.rootCode.bytes.length);
+ if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+ out.writeVLong(field.sumTotalTermFreq);
+ }
+ out.writeVLong(field.sumDocFreq);
+ out.writeVInt(field.docCount);
+ out.writeVInt(field.longsSize);
+ indexOut.writeVLong(field.indexStartFP);
+ writeBytesRef(out, field.minTerm);
+ writeBytesRef(out, field.maxTerm);
+ }
+ out.writeLong(dirStart);
+ CodecUtil.writeFooter(out);
+ indexOut.writeLong(indexDirStart);
+ CodecUtil.writeFooter(indexOut);
+ success = true;
+ } finally {
+ if (success) {
+ IOUtils.close(out, indexOut, postingsWriter);
+ } else {
+ IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter);
+ }
+ }
+ }
+
+ private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
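+ // e.g. a 3-byte term such as "foo" is written as VInt 3 followed
+ // by its 3 bytes.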
+ out.writeVInt(bytes.length);
+ out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
+ }
+}
Added: lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java?rev=1612080&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java (added)
+++ lucene/dev/trunk/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java Sun Jul 20 12:08:32 2014
@@ -0,0 +1,173 @@
+package org.apache.lucene.codecs.blocktreeords;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.fst.FST;
+
+ /** The ords block-tree implementation of {@link Terms}. */
+final class OrdsFieldReader extends Terms implements Accountable {
+ final long numTerms;
+ final FieldInfo fieldInfo;
+ final long sumTotalTermFreq;
+ final long sumDocFreq;
+ final int docCount;
+ final long indexStartFP;
+ final long rootBlockFP;
+ final Output rootCode;
+ final BytesRef minTerm;
+ final BytesRef maxTerm;
+ final int longsSize;
+ final OrdsBlockTreeTermsReader parent;
+
+ final FST<Output> index;
+ //private boolean DEBUG;
+
+ OrdsFieldReader(OrdsBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms,
+ Output rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount,
+ long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException {
+ assert numTerms > 0;
+ this.fieldInfo = fieldInfo;
+ //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id");
+ this.parent = parent;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.indexStartFP = indexStartFP;
+ this.rootCode = rootCode;
+ this.longsSize = longsSize;
+ this.minTerm = minTerm;
+ this.maxTerm = maxTerm;
+ // if (DEBUG) {
+ // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + rootCode + " divisor=" + indexDivisor);
+ // }
+
+ rootBlockFP = (new ByteArrayDataInput(rootCode.bytes.bytes,
+ rootCode.bytes.offset,
+ rootCode.bytes.length)).readVLong() >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
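+ // The root code packs the root block's fp above
+ // OUTPUT_FLAGS_NUM_BITS low flag bits (hasTerms/isFloor).
+ // Illustrative decode, assuming 2 flag bits as in the core block
+ // tree: a stored VLong of (52<<2)|1 = 209 yields rootBlockFP = 52.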
+
+ if (indexIn != null) {
+ final IndexInput clone = indexIn.clone();
+ //System.out.println("start=" + indexStartFP + " field=" + fieldInfo.name);
+ clone.seek(indexStartFP);
+ index = new FST<>(clone, OrdsBlockTreeTermsWriter.FST_OUTPUTS);
+
+ /*
+ if (true) {
+ final String dotFileName = "/tmp/" + parent.segment + "_" + fieldInfo.name + ".dot";
+ Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+ Util.toDot(index, w, false, false);
+ System.out.println("FST INDEX: SAVED to " + dotFileName);
+ w.close();
+ }
+ */
+ } else {
+ index = null;
+ }
+ }
+
+ @Override
+ public BytesRef getMin() throws IOException {
+ if (minTerm == null) {
+ // Older index that didn't store min/maxTerm
+ return super.getMin();
+ } else {
+ return minTerm;
+ }
+ }
+
+ @Override
+ public BytesRef getMax() throws IOException {
+ if (maxTerm == null) {
+ // Older index that didn't store min/maxTerm
+ return super.getMax();
+ } else {
+ return maxTerm;
+ }
+ }
+
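+ // The has*() checks below rely on the declaration order of the
+ // IndexOptions enum: DOCS_ONLY < DOCS_AND_FREQS <
+ // DOCS_AND_FREQS_AND_POSITIONS < DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
+ // so compareTo() tells us whether the field indexed at least the
+ // queried level.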
+ @Override
+ public boolean hasFreqs() {
+ return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ }
+
+ @Override
+ public boolean hasOffsets() {
+ return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ }
+
+ @Override
+ public boolean hasPositions() {
+ return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ }
+
+ @Override
+ public boolean hasPayloads() {
+ return fieldInfo.hasPayloads();
+ }
+
+ @Override
+ public TermsEnum iterator(TermsEnum reuse) throws IOException {
+ return new OrdsSegmentTermsEnum(this); // the reuse hint is not used here
+ }
+
+ @Override
+ public long size() {
+ return numTerms;
+ }
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return sumTotalTermFreq;
+ }
+
+ @Override
+ public long getSumDocFreq() {
+ return sumDocFreq;
+ }
+
+ @Override
+ public int getDocCount() {
+ return docCount;
+ }
+
+ @Override
+ public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
+ if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
+ throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
+ }
+ return new OrdsIntersectTermsEnum(this, compiled, startTerm);
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return (index != null) ? index.ramBytesUsed() : 0;
+ }
+}