Posted to commits@lucenenet.apache.org by pn...@apache.org on 2014/09/28 10:50:13 UTC
[09/10] Lucene.Net.Codecs/Sep fully ported; work done on SimpleText and Memory as well
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d852d5b0/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs
index c28a147..310a771 100644
--- a/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs
+++ b/src/Lucene.Net.Codecs/Memory/DirectPostingsFormat.cs
@@ -1,2302 +1,2936 @@
-package codecs.memory;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.TreeMap;
-
-import codecs.FieldsConsumer;
-import codecs.FieldsProducer;
-import codecs.PostingsFormat;
-import codecs.lucene41.Lucene41PostingsFormat; // javadocs
-import index.DocsAndPositionsEnum;
-import index.DocsEnum;
-import index.FieldInfo.IndexOptions;
-import index.FieldInfo;
-import index.Fields;
-import index.OrdTermState;
-import index.SegmentReadState;
-import index.SegmentWriteState;
-import index.TermState;
-import index.Terms;
-import index.TermsEnum;
-import store.IOContext;
-import store.RAMOutputStream;
-import util.ArrayUtil;
-import util.Bits;
-import util.BytesRef;
-import util.RamUsageEstimator;
-import util.automaton.CompiledAutomaton;
-import util.automaton.RunAutomaton;
-import util.automaton.Transition;
-
-// TODO:
-// - build depth-N prefix hash?
-// - or: longer dense skip lists than just next byte?
-
-/** Wraps {@link Lucene41PostingsFormat} format for on-disk
- * storage, but then at read time loads and stores all
- * terms & postings directly in RAM as byte[], int[].
- *
- * <p><b><font color=red>WARNING</font></b>: This is
- * exceptionally RAM intensive: it makes no effort to
- * compress the postings data, storing terms as separate
- * byte[] and postings as separate int[], but as a result it
- * gives substantial increase in search performance.
- *
- * <p>This postings format supports {@link TermsEnum#ord}
- * and {@link TermsEnum#seekExact(long)}.
-
- * <p>Because this holds all term bytes as a single
- * byte[], you cannot have more than 2.1GB worth of term
- * bytes in a single segment.
- *
- * @lucene.experimental */
-
-public final class DirectPostingsFormat extends PostingsFormat {
-
- private final int minSkipCount;
- private final int lowFreqCutoff;
-
- private final static int DEFAULT_MIN_SKIP_COUNT = 8;
- private final static int DEFAULT_LOW_FREQ_CUTOFF = 32;
-
- //private static final bool DEBUG = true;
-
- // TODO: allow passing/wrapping arbitrary postings format?
-
- public DirectPostingsFormat() {
- this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF);
- }
-
- /** minSkipCount is how many terms in a row must have the
- * same prefix before we put a skip pointer down. Terms
- * with docFreq <= lowFreqCutoff will use a single int[]
- * to hold all docs, freqs, position and offsets; terms
- * with higher docFreq will use separate arrays. */
- public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) {
- super("Direct");
- this.minSkipCount = minSkipCount;
- this.lowFreqCutoff = lowFreqCutoff;
- }
-
- @Override
- public FieldsConsumer fieldsConsumer(SegmentWriteState state) {
- return PostingsFormat.forName("Lucene41").fieldsConsumer(state);
- }
-
- @Override
- public FieldsProducer fieldsProducer(SegmentReadState state) {
- FieldsProducer postings = PostingsFormat.forName("Lucene41").fieldsProducer(state);
- if (state.context.context != IOContext.Context.MERGE) {
- FieldsProducer loadedPostings;
- try {
- postings.checkIntegrity();
- loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff);
- } finally {
- postings.close();
- }
- return loadedPostings;
- } else {
- // Don't load postings for merge:
- return postings;
- }
- }
-
- private static final class DirectFields extends FieldsProducer {
- private final Map<String,DirectField> fields = new TreeMap<>();
-
- public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) {
- for (String field : fields) {
- this.fields.put(field, new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff));
- }
- }
-
- @Override
- public Iterator<String> iterator() {
- return Collections.unmodifiableSet(fields.keySet()).iterator();
- }
-
- @Override
- public Terms terms(String field) {
- return fields.get(field);
- }
-
- @Override
- public int size() {
- return fields.size();
- }
-
- @Override
- public long getUniqueTermCount() {
- long numTerms = 0;
- for(DirectField field : fields.values()) {
- numTerms += field.terms.length;
- }
- return numTerms;
- }
-
- @Override
- public void close() {
- }
-
- @Override
- public long ramBytesUsed() {
- long sizeInBytes = 0;
- for(Map.Entry<String,DirectField> entry: fields.entrySet()) {
- sizeInBytes += entry.getKey().length() * RamUsageEstimator.NUM_BYTES_CHAR;
- sizeInBytes += entry.getValue().ramBytesUsed();
- }
- return sizeInBytes;
- }
-
- @Override
- public void checkIntegrity() {
- // if we read entirely into ram, we already validated.
- // otherwise returned the raw postings reader
- }
- }
-
- private final static class DirectField extends Terms {
-
- private static abstract class TermAndSkip {
- public int[] skips;
-
- /** Returns the approximate number of RAM bytes used */
- public abstract long ramBytesUsed();
- }
-
- private static final class LowFreqTerm extends TermAndSkip {
- public final int[] postings;
- public final byte[] payloads;
- public final int docFreq;
- public final int totalTermFreq;
-
- public LowFreqTerm(int[] postings, byte[] payloads, int docFreq, int totalTermFreq) {
- this.postings = postings;
- this.payloads = payloads;
- this.docFreq = docFreq;
- this.totalTermFreq = totalTermFreq;
- }
-
- @Override
- public long ramBytesUsed() {
- return ((postings!=null) ? RamUsageEstimator.sizeOf(postings) : 0) +
- ((payloads!=null) ? RamUsageEstimator.sizeOf(payloads) : 0);
- }
- }
-
- // TODO: maybe specialize into prx/no-prx/no-frq cases?
- private static final class HighFreqTerm extends TermAndSkip {
- public final long totalTermFreq;
- public final int[] docIDs;
- public final int[] freqs;
- public final int[][] positions;
- public final byte[][][] payloads;
-
- public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads, long totalTermFreq) {
- this.docIDs = docIDs;
- this.freqs = freqs;
- this.positions = positions;
- this.payloads = payloads;
- this.totalTermFreq = totalTermFreq;
- }
-
- @Override
- public long ramBytesUsed() {
- long sizeInBytes = 0;
- sizeInBytes += (docIDs!=null)? RamUsageEstimator.sizeOf(docIDs) : 0;
- sizeInBytes += (freqs!=null)? RamUsageEstimator.sizeOf(freqs) : 0;
-
- if(positions != null) {
- for(int[] position : positions) {
- sizeInBytes += (position!=null) ? RamUsageEstimator.sizeOf(position) : 0;
- }
- }
-
- if (payloads != null) {
- for(byte[][] payload : payloads) {
- if(payload != null) {
- for(byte[] pload : payload) {
- sizeInBytes += (pload!=null) ? RamUsageEstimator.sizeOf(pload) : 0;
- }
- }
- }
- }
-
- return sizeInBytes;
- }
- }
-
- private final byte[] termBytes;
- private final int[] termOffsets;
-
- private final int[] skips;
- private final int[] skipOffsets;
-
- private final TermAndSkip[] terms;
- private final bool hasFreq;
- private final bool hasPos;
- private final bool hasOffsets;
- private final bool hasPayloads;
- private final long sumTotalTermFreq;
- private final int docCount;
- private final long sumDocFreq;
- private int skipCount;
-
- // TODO: maybe make a separate builder? These are only
- // used during load:
- private int count;
- private int[] sameCounts = new int[10];
- private final int minSkipCount;
-
- private final static class IntArrayWriter {
- private int[] ints = new int[10];
- private int upto;
-
- public void add(int value) {
- if (ints.length == upto) {
- ints = ArrayUtil.grow(ints);
- }
- ints[upto++] = value;
- }
-
- public int[] get() {
- final int[] arr = new int[upto];
- System.arraycopy(ints, 0, arr, 0, upto);
- upto = 0;
- return arr;
- }
- }
-
- public DirectField(SegmentReadState state, String field, Terms termsIn, int minSkipCount, int lowFreqCutoff) {
- final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
-
- sumTotalTermFreq = termsIn.getSumTotalTermFreq();
- sumDocFreq = termsIn.getSumDocFreq();
- docCount = termsIn.getDocCount();
-
- final int numTerms = (int) termsIn.size();
- if (numTerms == -1) {
- throw new IllegalArgumentException("codec does not provide Terms.size()");
- }
- terms = new TermAndSkip[numTerms];
- termOffsets = new int[1+numTerms];
-
- byte[] termBytes = new byte[1024];
-
- this.minSkipCount = minSkipCount;
-
- hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_ONLY) > 0;
- hasPos = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) > 0;
- hasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
- hasPayloads = fieldInfo.hasPayloads();
-
- BytesRef term;
- DocsEnum docsEnum = null;
- DocsAndPositionsEnum docsAndPositionsEnum = null;
- final TermsEnum termsEnum = termsIn.iterator(null);
- int termOffset = 0;
-
- final IntArrayWriter scratch = new IntArrayWriter();
-
- // Used for payloads, if any:
- final RAMOutputStream ros = new RAMOutputStream();
-
- // if (DEBUG) {
- // System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads);
- // }
-
- while ((term = termsEnum.next()) != null) {
- final int docFreq = termsEnum.docFreq();
- final long totalTermFreq = termsEnum.totalTermFreq();
-
- // if (DEBUG) {
- // System.out.println(" term=" + term.utf8ToString());
- // }
-
- termOffsets[count] = termOffset;
-
- if (termBytes.length < (termOffset + term.length)) {
- termBytes = ArrayUtil.grow(termBytes, termOffset + term.length);
- }
- System.arraycopy(term.bytes, term.offset, termBytes, termOffset, term.length);
- termOffset += term.length;
- termOffsets[count+1] = termOffset;
-
- if (hasPos) {
- docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
- } else {
- docsEnum = termsEnum.docs(null, docsEnum);
- }
-
- final TermAndSkip ent;
-
- final DocsEnum docsEnum2;
- if (hasPos) {
- docsEnum2 = docsAndPositionsEnum;
- } else {
- docsEnum2 = docsEnum;
- }
-
- int docID;
-
- if (docFreq <= lowFreqCutoff) {
-
- ros.reset();
-
- // Pack postings for low-freq terms into a single int[]:
- while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
- scratch.add(docID);
- if (hasFreq) {
- final int freq = docsEnum2.freq();
- scratch.add(freq);
- if (hasPos) {
- for(int pos=0;pos<freq;pos++) {
- scratch.add(docsAndPositionsEnum.nextPosition());
- if (hasOffsets) {
- scratch.add(docsAndPositionsEnum.startOffset());
- scratch.add(docsAndPositionsEnum.endOffset());
- }
- if (hasPayloads) {
- final BytesRef payload = docsAndPositionsEnum.getPayload();
- if (payload != null) {
- scratch.add(payload.length);
- ros.writeBytes(payload.bytes, payload.offset, payload.length);
- } else {
- scratch.add(0);
- }
- }
- }
- }
- }
- }
-
- final byte[] payloads;
- if (hasPayloads) {
- ros.flush();
- payloads = new byte[(int) ros.length()];
- ros.writeTo(payloads, 0);
- } else {
- payloads = null;
- }
-
- final int[] postings = scratch.get();
-
- ent = new LowFreqTerm(postings, payloads, docFreq, (int) totalTermFreq);
- } else {
- final int[] docs = new int[docFreq];
- final int[] freqs;
- final int[][] positions;
- final byte[][][] payloads;
- if (hasFreq) {
- freqs = new int[docFreq];
- if (hasPos) {
- positions = new int[docFreq][];
- if (hasPayloads) {
- payloads = new byte[docFreq][][];
- } else {
- payloads = null;
- }
- } else {
- positions = null;
- payloads = null;
- }
- } else {
- freqs = null;
- positions = null;
- payloads = null;
- }
-
- // Use separate int[] for the postings for high-freq
- // terms:
- int upto = 0;
- while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
- docs[upto] = docID;
- if (hasFreq) {
- final int freq = docsEnum2.freq();
- freqs[upto] = freq;
- if (hasPos) {
- final int mult;
- if (hasOffsets) {
- mult = 3;
- } else {
- mult = 1;
- }
- if (hasPayloads) {
- payloads[upto] = new byte[freq][];
- }
- positions[upto] = new int[mult*freq];
- int posUpto = 0;
- for(int pos=0;pos<freq;pos++) {
- positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
- if (hasPayloads) {
- BytesRef payload = docsAndPositionsEnum.getPayload();
- if (payload != null) {
- byte[] payloadBytes = new byte[payload.length];
- System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length);
- payloads[upto][pos] = payloadBytes;
- }
- }
- posUpto++;
- if (hasOffsets) {
- positions[upto][posUpto++] = docsAndPositionsEnum.startOffset();
- positions[upto][posUpto++] = docsAndPositionsEnum.endOffset();
- }
- }
- }
- }
-
- upto++;
- }
- Debug.Assert( upto == docFreq;
- ent = new HighFreqTerm(docs, freqs, positions, payloads, totalTermFreq);
- }
-
- terms[count] = ent;
- setSkips(count, termBytes);
- count++;
- }
-
- // End sentinel:
- termOffsets[count] = termOffset;
-
- finishSkips();
-
- //System.out.println(skipCount + " skips: " + field);
-
- this.termBytes = new byte[termOffset];
- System.arraycopy(termBytes, 0, this.termBytes, 0, termOffset);
-
- // Pack skips:
- this.skips = new int[skipCount];
- this.skipOffsets = new int[1+numTerms];
-
- int skipOffset = 0;
- for(int i=0;i<numTerms;i++) {
- final int[] termSkips = terms[i].skips;
- skipOffsets[i] = skipOffset;
- if (termSkips != null) {
- System.arraycopy(termSkips, 0, skips, skipOffset, termSkips.length);
- skipOffset += termSkips.length;
- terms[i].skips = null;
- }
- }
- this.skipOffsets[numTerms] = skipOffset;
- Debug.Assert( skipOffset == skipCount;
- }
-
- /** Returns approximate RAM bytes used */
- public long ramBytesUsed() {
- long sizeInBytes = 0;
- sizeInBytes += ((termBytes!=null) ? RamUsageEstimator.sizeOf(termBytes) : 0);
- sizeInBytes += ((termOffsets!=null) ? RamUsageEstimator.sizeOf(termOffsets) : 0);
- sizeInBytes += ((skips!=null) ? RamUsageEstimator.sizeOf(skips) : 0);
- sizeInBytes += ((skipOffsets!=null) ? RamUsageEstimator.sizeOf(skipOffsets) : 0);
- sizeInBytes += ((sameCounts!=null) ? RamUsageEstimator.sizeOf(sameCounts) : 0);
-
- if(terms!=null) {
- for(TermAndSkip termAndSkip : terms) {
- sizeInBytes += (termAndSkip!=null) ? termAndSkip.ramBytesUsed() : 0;
- }
- }
-
- return sizeInBytes;
- }
-
- // Compares in unicode (UTF8) order:
- int compare(int ord, BytesRef other) {
- final byte[] otherBytes = other.bytes;
-
- int upto = termOffsets[ord];
- final int termLen = termOffsets[1+ord] - upto;
- int otherUpto = other.offset;
-
- final int stop = upto + Math.min(termLen, other.length);
- while (upto < stop) {
- int diff = (termBytes[upto++] & 0xFF) - (otherBytes[otherUpto++] & 0xFF);
- if (diff != 0) {
- return diff;
- }
- }
-
- // One is a prefix of the other, or, they are equal:
- return termLen - other.length;
- }
-
- private void setSkips(int termOrd, byte[] termBytes) {
-
- final int termLength = termOffsets[termOrd+1] - termOffsets[termOrd];
-
- if (sameCounts.length < termLength) {
- sameCounts = ArrayUtil.grow(sameCounts, termLength);
- }
-
- // Update skip pointers:
- if (termOrd > 0) {
- final int lastTermLength = termOffsets[termOrd] - termOffsets[termOrd-1];
- final int limit = Math.min(termLength, lastTermLength);
-
- int lastTermOffset = termOffsets[termOrd-1];
- int termOffset = termOffsets[termOrd];
-
- int i = 0;
- for(;i<limit;i++) {
- if (termBytes[lastTermOffset++] == termBytes[termOffset++]) {
- sameCounts[i]++;
- } else {
- for(;i<limit;i++) {
- if (sameCounts[i] >= minSkipCount) {
- // Go back and add a skip pointer:
- saveSkip(termOrd, sameCounts[i]);
- }
- sameCounts[i] = 1;
- }
- break;
- }
- }
-
- for(;i<lastTermLength;i++) {
- if (sameCounts[i] >= minSkipCount) {
- // Go back and add a skip pointer:
- saveSkip(termOrd, sameCounts[i]);
- }
- sameCounts[i] = 0;
- }
- for(int j=limit;j<termLength;j++) {
- sameCounts[j] = 1;
- }
- } else {
- for(int i=0;i<termLength;i++) {
- sameCounts[i]++;
- }
- }
- }
-
- private void finishSkips() {
- Debug.Assert( count == terms.length;
- int lastTermOffset = termOffsets[count-1];
- int lastTermLength = termOffsets[count] - lastTermOffset;
-
- for(int i=0;i<lastTermLength;i++) {
- if (sameCounts[i] >= minSkipCount) {
- // Go back and add a skip pointer:
- saveSkip(count, sameCounts[i]);
- }
- }
-
- // Reverse the skip pointers so they are "nested":
- for(int termID=0;termID<terms.length;termID++) {
- TermAndSkip term = terms[termID];
- if (term.skips != null && term.skips.length > 1) {
- for(int pos=0;pos<term.skips.length/2;pos++) {
- final int otherPos = term.skips.length-pos-1;
-
- final int temp = term.skips[pos];
- term.skips[pos] = term.skips[otherPos];
- term.skips[otherPos] = temp;
- }
- }
- }
- }
-
- private void saveSkip(int ord, int backCount) {
- final TermAndSkip term = terms[ord - backCount];
- skipCount++;
- if (term.skips == null) {
- term.skips = new int[] {ord};
- } else {
- // Normally we'd grow at a slight exponential... but
- // given that the skips themselves are already log(N)
- // we can grow by only 1 and still have amortized
- // linear time:
- final int[] newSkips = new int[term.skips.length+1];
- System.arraycopy(term.skips, 0, newSkips, 0, term.skips.length);
- term.skips = newSkips;
- term.skips[term.skips.length-1] = ord;
- }
- }
-
- @Override
- public TermsEnum iterator(TermsEnum reuse) {
- DirectTermsEnum termsEnum;
- if (reuse != null && reuse instanceof DirectTermsEnum) {
- termsEnum = (DirectTermsEnum) reuse;
- if (!termsEnum.canReuse(terms)) {
- termsEnum = new DirectTermsEnum();
- }
- } else {
- termsEnum = new DirectTermsEnum();
- }
- termsEnum.reset();
- return termsEnum;
- }
-
- @Override
- public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
- return new DirectIntersectTermsEnum(compiled, startTerm);
- }
-
- @Override
- public long size() {
- return terms.length;
- }
-
- @Override
- public long getSumTotalTermFreq() {
- return sumTotalTermFreq;
- }
-
- @Override
- public long getSumDocFreq() {
- return sumDocFreq;
- }
-
- @Override
- public int getDocCount() {
- return docCount;
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public bool hasFreqs() {
- return hasFreq;
- }
-
- @Override
- public bool hasOffsets() {
- return hasOffsets;
- }
-
- @Override
- public bool hasPositions() {
- return hasPos;
- }
-
- @Override
- public bool hasPayloads() {
- return hasPayloads;
- }
-
- private final class DirectTermsEnum extends TermsEnum {
-
- private final BytesRef scratch = new BytesRef();
- private int termOrd;
-
- bool canReuse(TermAndSkip[] other) {
- return DirectField.this.terms == other;
- }
-
- private BytesRef setTerm() {
- scratch.bytes = termBytes;
- scratch.offset = termOffsets[termOrd];
- scratch.length = termOffsets[termOrd+1] - termOffsets[termOrd];
- return scratch;
- }
-
- public void reset() {
- termOrd = -1;
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public BytesRef next() {
- termOrd++;
- if (termOrd < terms.length) {
- return setTerm();
- } else {
- return null;
- }
- }
-
- @Override
- public TermState termState() {
- OrdTermState state = new OrdTermState();
- state.ord = termOrd;
- return state;
- }
-
- // If non-negative, exact match; else, -ord-1, where ord
- // is where you would insert the term.
- private int findTerm(BytesRef term) {
-
- // Just do binary search: should be (constant factor)
- // faster than using the skip list:
- int low = 0;
- int high = terms.length-1;
-
- while (low <= high) {
- int mid = (low + high) >>> 1;
- int cmp = compare(mid, term);
- if (cmp < 0) {
- low = mid + 1;
- } else if (cmp > 0) {
- high = mid - 1;
- } else {
- return mid; // key found
- }
- }
-
- return -(low + 1); // key not found.
- }
-
- @Override
- public SeekStatus seekCeil(BytesRef term) {
- // TODO: we should use the skip pointers; should be
- // faster than bin search; we should also hold
- // & reuse current state so seeking forwards is
- // faster
- final int ord = findTerm(term);
- // if (DEBUG) {
- // System.out.println(" find term=" + term.utf8ToString() + " ord=" + ord);
- // }
- if (ord >= 0) {
- termOrd = ord;
- setTerm();
- return SeekStatus.FOUND;
- } else if (ord == -terms.length-1) {
- return SeekStatus.END;
- } else {
- termOrd = -ord - 1;
- setTerm();
- return SeekStatus.NOT_FOUND;
- }
- }
-
- @Override
- public bool seekExact(BytesRef term) {
- // TODO: we should use the skip pointers; should be
- // faster than bin search; we should also hold
- // & reuse current state so seeking forwards is
- // faster
- final int ord = findTerm(term);
- if (ord >= 0) {
- termOrd = ord;
- setTerm();
- return true;
- } else {
- return false;
- }
- }
-
- @Override
- public void seekExact(long ord) {
- termOrd = (int) ord;
- setTerm();
- }
-
- @Override
- public void seekExact(BytesRef term, TermState state) {
- termOrd = (int) ((OrdTermState) state).ord;
- setTerm();
- Debug.Assert( term.equals(scratch);
- }
-
- @Override
- public BytesRef term() {
- return scratch;
- }
-
- @Override
- public long ord() {
- return termOrd;
- }
-
- @Override
- public int docFreq() {
- if (terms[termOrd] instanceof LowFreqTerm) {
- return ((LowFreqTerm) terms[termOrd]).docFreq;
- } else {
- return ((HighFreqTerm) terms[termOrd]).docIDs.length;
- }
- }
-
- @Override
- public long totalTermFreq() {
- if (terms[termOrd] instanceof LowFreqTerm) {
- return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
- } else {
- return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
- }
- }
-
- @Override
- public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
- // TODO: implement reuse, something like Pulsing:
- // it's hairy!
-
- if (terms[termOrd] instanceof LowFreqTerm) {
- final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
- if (hasFreq) {
- if (hasPos) {
- int posLen;
- if (hasOffsets) {
- posLen = 3;
- } else {
- posLen = 1;
- }
- if (hasPayloads) {
- posLen++;
- }
- LowFreqDocsEnum docsEnum;
- if (reuse instanceof LowFreqDocsEnum) {
- docsEnum = (LowFreqDocsEnum) reuse;
- if (!docsEnum.canReuse(liveDocs, posLen)) {
- docsEnum = new LowFreqDocsEnum(liveDocs, posLen);
- }
- } else {
- docsEnum = new LowFreqDocsEnum(liveDocs, posLen);
- }
-
- return docsEnum.reset(postings);
- } else {
- LowFreqDocsEnumNoPos docsEnum;
- if (reuse instanceof LowFreqDocsEnumNoPos) {
- docsEnum = (LowFreqDocsEnumNoPos) reuse;
- if (!docsEnum.canReuse(liveDocs)) {
- docsEnum = new LowFreqDocsEnumNoPos(liveDocs);
- }
- } else {
- docsEnum = new LowFreqDocsEnumNoPos(liveDocs);
- }
-
- return docsEnum.reset(postings);
- }
- } else {
- LowFreqDocsEnumNoTF docsEnum;
- if (reuse instanceof LowFreqDocsEnumNoTF) {
- docsEnum = (LowFreqDocsEnumNoTF) reuse;
- if (!docsEnum.canReuse(liveDocs)) {
- docsEnum = new LowFreqDocsEnumNoTF(liveDocs);
- }
- } else {
- docsEnum = new LowFreqDocsEnumNoTF(liveDocs);
- }
-
- return docsEnum.reset(postings);
- }
- } else {
- final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
-
- HighFreqDocsEnum docsEnum;
- if (reuse instanceof HighFreqDocsEnum) {
- docsEnum = (HighFreqDocsEnum) reuse;
- if (!docsEnum.canReuse(liveDocs)) {
- docsEnum = new HighFreqDocsEnum(liveDocs);
- }
- } else {
- docsEnum = new HighFreqDocsEnum(liveDocs);
- }
-
- //System.out.println(" DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs");
- return docsEnum.reset(term.docIDs, term.freqs);
- }
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
- if (!hasPos) {
- return null;
- }
-
- // TODO: implement reuse, something like Pulsing:
- // it's hairy!
-
- if (terms[termOrd] instanceof LowFreqTerm) {
- final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
- final int[] postings = term.postings;
- final byte[] payloads = term.payloads;
- return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads);
- } else {
- final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
- return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads);
- }
- }
- }
-
- private final class DirectIntersectTermsEnum extends TermsEnum {
- private final RunAutomaton runAutomaton;
- private final CompiledAutomaton compiledAutomaton;
- private int termOrd;
- private final BytesRef scratch = new BytesRef();
-
- private final class State {
- int changeOrd;
- int state;
- Transition[] transitions;
- int transitionUpto;
- int transitionMax;
- int transitionMin;
- }
-
- private State[] states;
- private int stateUpto;
-
- public DirectIntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) {
- runAutomaton = compiled.runAutomaton;
- compiledAutomaton = compiled;
- termOrd = -1;
- states = new State[1];
- states[0] = new State();
- states[0].changeOrd = terms.length;
- states[0].state = runAutomaton.getInitialState();
- states[0].transitions = compiledAutomaton.sortedTransitions[states[0].state];
- states[0].transitionUpto = -1;
- states[0].transitionMax = -1;
-
- //System.out.println("IE.init startTerm=" + startTerm);
-
- if (startTerm != null) {
- int skipUpto = 0;
- if (startTerm.length == 0) {
- if (terms.length > 0 && termOffsets[1] == 0) {
- termOrd = 0;
- }
- } else {
- termOrd++;
-
- nextLabel:
- for(int i=0;i<startTerm.length;i++) {
- final int label = startTerm.bytes[startTerm.offset+i] & 0xFF;
-
- while (label > states[i].transitionMax) {
- states[i].transitionUpto++;
- Debug.Assert( states[i].transitionUpto < states[i].transitions.length;
- states[i].transitionMin = states[i].transitions[states[i].transitionUpto].getMin();
- states[i].transitionMax = states[i].transitions[states[i].transitionUpto].getMax();
- Debug.Assert( states[i].transitionMin >= 0;
- Debug.Assert( states[i].transitionMin <= 255;
- Debug.Assert( states[i].transitionMax >= 0;
- Debug.Assert( states[i].transitionMax <= 255;
- }
-
- // Skip forwards until we find a term matching
- // the label at this position:
- while (termOrd < terms.length) {
- final int skipOffset = skipOffsets[termOrd];
- final int numSkips = skipOffsets[termOrd+1] - skipOffset;
- final int termOffset = termOffsets[termOrd];
- final int termLength = termOffsets[1+termOrd] - termOffset;
-
- // if (DEBUG) {
- // System.out.println(" check termOrd=" + termOrd + " term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips) + " i=" + i);
- // }
-
- if (termOrd == states[stateUpto].changeOrd) {
- // if (DEBUG) {
- // System.out.println(" end push return");
- // }
- stateUpto--;
- termOrd--;
- return;
- }
-
- if (termLength == i) {
- termOrd++;
- skipUpto = 0;
- // if (DEBUG) {
- // System.out.println(" term too short; next term");
- // }
- } else if (label < (termBytes[termOffset+i] & 0xFF)) {
- termOrd--;
- // if (DEBUG) {
- // System.out.println(" no match; already beyond; return termOrd=" + termOrd);
- // }
- stateUpto -= skipUpto;
- Debug.Assert( stateUpto >= 0;
- return;
- } else if (label == (termBytes[termOffset+i] & 0xFF)) {
- // if (DEBUG) {
- // System.out.println(" label[" + i + "] matches");
- // }
- if (skipUpto < numSkips) {
- grow();
-
- final int nextState = runAutomaton.step(states[stateUpto].state, label);
-
- // Automaton is required to accept startTerm:
- Debug.Assert( nextState != -1;
-
- stateUpto++;
- states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
- states[stateUpto].state = nextState;
- states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
- states[stateUpto].transitionUpto = -1;
- states[stateUpto].transitionMax = -1;
- //System.out.println(" push " + states[stateUpto].transitions.length + " trans");
-
- // if (DEBUG) {
- // System.out.println(" push skip; changeOrd=" + states[stateUpto].changeOrd);
- // }
-
- // Match next label at this same term:
- continue nextLabel;
- } else {
- // if (DEBUG) {
- // System.out.println(" linear scan");
- // }
- // Index exhausted: just scan now (the
- // number of scans required will be less
- // than the minSkipCount):
- final int startTermOrd = termOrd;
- while (termOrd < terms.length && compare(termOrd, startTerm) <= 0) {
- Debug.Assert( termOrd == startTermOrd || skipOffsets[termOrd] == skipOffsets[termOrd+1];
- termOrd++;
- }
- Debug.Assert( termOrd - startTermOrd < minSkipCount;
- termOrd--;
- stateUpto -= skipUpto;
- // if (DEBUG) {
- // System.out.println(" end termOrd=" + termOrd);
- // }
- return;
- }
- } else {
- if (skipUpto < numSkips) {
- termOrd = skips[skipOffset + skipUpto];
- // if (DEBUG) {
- // System.out.println(" no match; skip to termOrd=" + termOrd);
- // }
- } else {
- // if (DEBUG) {
- // System.out.println(" no match; next term");
- // }
- termOrd++;
- }
- skipUpto = 0;
- }
- }
-
- // startTerm is >= last term so enum will not
- // return any terms:
- termOrd--;
- // if (DEBUG) {
- // System.out.println(" beyond end; no terms will match");
- // }
- return;
- }
- }
-
- final int termOffset = termOffsets[termOrd];
- final int termLen = termOffsets[1+termOrd] - termOffset;
-
- if (termOrd >= 0 && !startTerm.equals(new BytesRef(termBytes, termOffset, termLen))) {
- stateUpto -= skipUpto;
- termOrd--;
- }
- // if (DEBUG) {
- // System.out.println(" loop end; return termOrd=" + termOrd + " stateUpto=" + stateUpto);
- // }
- }
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- private void grow() {
- if (states.length == 1+stateUpto) {
- final State[] newStates = new State[states.length+1];
- System.arraycopy(states, 0, newStates, 0, states.length);
- newStates[states.length] = new State();
- states = newStates;
- }
- }
-
- @Override
- public BytesRef next() {
- // if (DEBUG) {
- // System.out.println("\nIE.next");
- // }
-
- termOrd++;
- int skipUpto = 0;
-
- if (termOrd == 0 && termOffsets[1] == 0) {
- // Special-case empty string:
- Debug.Assert( stateUpto == 0;
- // if (DEBUG) {
- // System.out.println(" visit empty string");
- // }
- if (runAutomaton.isAccept(states[0].state)) {
- scratch.bytes = termBytes;
- scratch.offset = 0;
- scratch.length = 0;
- return scratch;
- }
- termOrd++;
- }
-
- nextTerm:
-
- while (true) {
- // if (DEBUG) {
- // System.out.println(" cycle termOrd=" + termOrd + " stateUpto=" + stateUpto + " skipUpto=" + skipUpto);
- // }
- if (termOrd == terms.length) {
- // if (DEBUG) {
- // System.out.println(" return END");
- // }
- return null;
- }
-
- final State state = states[stateUpto];
- if (termOrd == state.changeOrd) {
- // Pop:
- // if (DEBUG) {
- // System.out.println(" pop stateUpto=" + stateUpto);
- // }
- stateUpto--;
- /*
- if (DEBUG) {
- try {
- //System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)).utf8ToString());
- System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
- } catch (ArrayIndexOutOfBoundsException aioobe) {
- System.out.println(" prefix pop " + new BytesRef(terms[termOrd].term, 0, Math.min(stateUpto, terms[termOrd].term.length)));
- }
- }
- */
-
- continue;
- }
-
- final int termOffset = termOffsets[termOrd];
- final int termLength = termOffsets[termOrd+1] - termOffset;
- final int skipOffset = skipOffsets[termOrd];
- final int numSkips = skipOffsets[termOrd+1] - skipOffset;
-
- // if (DEBUG) {
- // System.out.println(" term=" + new BytesRef(termBytes, termOffset, termLength).utf8ToString() + " skips=" + Arrays.toString(skips));
- // }
-
- Debug.Assert( termOrd < state.changeOrd;
-
- Debug.Assert( stateUpto <= termLength: "term.length=" + termLength + "; stateUpto=" + stateUpto;
- final int label = termBytes[termOffset+stateUpto] & 0xFF;
-
- while (label > state.transitionMax) {
- //System.out.println(" label=" + label + " vs max=" + state.transitionMax + " transUpto=" + state.transitionUpto + " vs " + state.transitions.length);
- state.transitionUpto++;
- if (state.transitionUpto == state.transitions.length) {
- // We've exhausted transitions leaving this
- // state; force pop+next/skip now:
- //System.out.println("forcepop: stateUpto=" + stateUpto);
- if (stateUpto == 0) {
- termOrd = terms.length;
- return null;
- } else {
- Debug.Assert( state.changeOrd > termOrd;
- // if (DEBUG) {
- // System.out.println(" jumpend " + (state.changeOrd - termOrd));
- // }
- //System.out.println(" jump to termOrd=" + states[stateUpto].changeOrd + " vs " + termOrd);
- termOrd = states[stateUpto].changeOrd;
- skipUpto = 0;
- stateUpto--;
- }
- continue nextTerm;
- }
- Debug.Assert( state.transitionUpto < state.transitions.length: " state.transitionUpto=" + state.transitionUpto + " vs " + state.transitions.length;
- state.transitionMin = state.transitions[state.transitionUpto].getMin();
- state.transitionMax = state.transitions[state.transitionUpto].getMax();
- Debug.Assert( state.transitionMin >= 0;
- Debug.Assert( state.transitionMin <= 255;
- Debug.Assert( state.transitionMax >= 0;
- Debug.Assert( state.transitionMax <= 255;
- }
-
- /*
- if (DEBUG) {
- System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + (char) label + "(" + label + ") term=" + new BytesRef(terms[termOrd].term).utf8ToString() + " trans " +
- (char) state.transitionMin + "(" + state.transitionMin + ")" + "-" + (char) state.transitionMax + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
- System.out.println(" check ord=" + termOrd + " term[" + stateUpto + "]=" + Integer.toHexString(label) + "(" + label + ") term=" + new BytesRef(termBytes, termOffset, termLength) + " trans " +
- Integer.toHexString(state.transitionMin) + "(" + state.transitionMin + ")" + "-" + Integer.toHexString(state.transitionMax) + "(" + state.transitionMax + ") nextChange=+" + (state.changeOrd - termOrd) + " skips=" + (skips == null ? "null" : Arrays.toString(skips)));
- }
- */
-
- final int targetLabel = state.transitionMin;
-
- if ((termBytes[termOffset+stateUpto] & 0xFF) < targetLabel) {
- // if (DEBUG) {
- // System.out.println(" do bin search");
- // }
- //int startTermOrd = termOrd;
- int low = termOrd+1;
- int high = state.changeOrd-1;
- while (true) {
- if (low > high) {
- // Label not found
- termOrd = low;
- // if (DEBUG) {
- // System.out.println(" advanced by " + (termOrd - startTermOrd));
- // }
- //System.out.println(" jump " + (termOrd - startTermOrd));
- skipUpto = 0;
- continue nextTerm;
- }
- int mid = (low + high) >>> 1;
- int cmp = (termBytes[termOffsets[mid] + stateUpto] & 0xFF) - targetLabel;
- // if (DEBUG) {
- // System.out.println(" bin: check label=" + (char) (termBytes[termOffsets[low] + stateUpto] & 0xFF) + " ord=" + mid);
- // }
- if (cmp < 0) {
- low = mid+1;
- } else if (cmp > 0) {
- high = mid - 1;
- } else {
- // Label found; walk backwards to first
- // occurrence:
- while (mid > termOrd && (termBytes[termOffsets[mid-1] + stateUpto] & 0xFF) == targetLabel) {
- mid--;
- }
- termOrd = mid;
- // if (DEBUG) {
- // System.out.println(" advanced by " + (termOrd - startTermOrd));
- // }
- //System.out.println(" jump " + (termOrd - startTermOrd));
- skipUpto = 0;
- continue nextTerm;
- }
- }
- }
-
- int nextState = runAutomaton.step(states[stateUpto].state, label);
-
- if (nextState == -1) {
- // Skip
- // if (DEBUG) {
- // System.out.println(" automaton doesn't accept; skip");
- // }
- if (skipUpto < numSkips) {
- // if (DEBUG) {
- // System.out.println(" jump " + (skips[skipOffset+skipUpto]-1 - termOrd));
- // }
- termOrd = skips[skipOffset+skipUpto];
- } else {
- termOrd++;
- }
- skipUpto = 0;
- } else if (skipUpto < numSkips) {
- // Push:
- // if (DEBUG) {
- // System.out.println(" push");
- // }
- /*
- if (DEBUG) {
- try {
- //System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1).utf8ToString());
- System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
- } catch (ArrayIndexOutOfBoundsException aioobe) {
- System.out.println(" prefix push " + new BytesRef(term, 0, stateUpto+1));
- }
- }
- */
-
- grow();
- stateUpto++;
- states[stateUpto].state = nextState;
- states[stateUpto].changeOrd = skips[skipOffset + skipUpto++];
- states[stateUpto].transitions = compiledAutomaton.sortedTransitions[nextState];
- states[stateUpto].transitionUpto = -1;
- states[stateUpto].transitionMax = -1;
-
- if (stateUpto == termLength) {
- // if (DEBUG) {
- // System.out.println(" term ends after push");
- // }
- if (runAutomaton.isAccept(nextState)) {
- // if (DEBUG) {
- // System.out.println(" automaton accepts: return");
- // }
- scratch.bytes = termBytes;
- scratch.offset = termOffsets[termOrd];
- scratch.length = termOffsets[1+termOrd] - scratch.offset;
- // if (DEBUG) {
- // System.out.println(" ret " + scratch.utf8ToString());
- // }
- return scratch;
- } else {
- // if (DEBUG) {
- // System.out.println(" automaton rejects: nextTerm");
- // }
- termOrd++;
- skipUpto = 0;
- }
- }
- } else {
- // Run the non-indexed tail of this term:
-
- // TODO: add Debug.Assert( that we don't inc too many times
-
- if (compiledAutomaton.commonSuffixRef != null) {
- //System.out.println("suffix " + compiledAutomaton.commonSuffixRef.utf8ToString());
- Debug.Assert( compiledAutomaton.commonSuffixRef.offset == 0;
- if (termLength < compiledAutomaton.commonSuffixRef.length) {
- termOrd++;
- skipUpto = 0;
- continue nextTerm;
- }
- int offset = termOffset + termLength - compiledAutomaton.commonSuffixRef.length;
- for(int suffix=0;suffix<compiledAutomaton.commonSuffixRef.length;suffix++) {
- if (termBytes[offset + suffix] != compiledAutomaton.commonSuffixRef.bytes[suffix]) {
- termOrd++;
- skipUpto = 0;
- continue nextTerm;
- }
- }
- }
-
- int upto = stateUpto+1;
- while (upto < termLength) {
- nextState = runAutomaton.step(nextState, termBytes[termOffset+upto] & 0xFF);
- if (nextState == -1) {
- termOrd++;
- skipUpto = 0;
- // if (DEBUG) {
- // System.out.println(" nomatch tail; next term");
- // }
- continue nextTerm;
- }
- upto++;
- }
-
- if (runAutomaton.isAccept(nextState)) {
- scratch.bytes = termBytes;
- scratch.offset = termOffsets[termOrd];
- scratch.length = termOffsets[1+termOrd] - scratch.offset;
- // if (DEBUG) {
- // System.out.println(" match tail; return " + scratch.utf8ToString());
- // System.out.println(" ret2 " + scratch.utf8ToString());
- // }
- return scratch;
- } else {
- termOrd++;
- skipUpto = 0;
- // if (DEBUG) {
- // System.out.println(" nomatch tail; next term");
- // }
- }
- }
- }
- }
-
- @Override
- public TermState termState() {
- OrdTermState state = new OrdTermState();
- state.ord = termOrd;
- return state;
- }
-
- @Override
- public BytesRef term() {
- return scratch;
- }
-
- @Override
- public long ord() {
- return termOrd;
- }
-
- @Override
- public int docFreq() {
- if (terms[termOrd] instanceof LowFreqTerm) {
- return ((LowFreqTerm) terms[termOrd]).docFreq;
- } else {
- return ((HighFreqTerm) terms[termOrd]).docIDs.length;
- }
- }
-
- @Override
- public long totalTermFreq() {
- if (terms[termOrd] instanceof LowFreqTerm) {
- return ((LowFreqTerm) terms[termOrd]).totalTermFreq;
- } else {
- return ((HighFreqTerm) terms[termOrd]).totalTermFreq;
- }
- }
-
- @Override
- public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
- // TODO: implement reuse, something like Pulsing:
- // it's hairy!
-
- if (terms[termOrd] instanceof LowFreqTerm) {
- final int[] postings = ((LowFreqTerm) terms[termOrd]).postings;
- if (hasFreq) {
- if (hasPos) {
- int posLen;
- if (hasOffsets) {
- posLen = 3;
- } else {
- posLen = 1;
- }
- if (hasPayloads) {
- posLen++;
- }
- return new LowFreqDocsEnum(liveDocs, posLen).reset(postings);
- } else {
- return new LowFreqDocsEnumNoPos(liveDocs).reset(postings);
- }
- } else {
- return new LowFreqDocsEnumNoTF(liveDocs).reset(postings);
- }
- } else {
- final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
- // System.out.println("DE for term=" + new BytesRef(terms[termOrd].term).utf8ToString() + ": " + term.docIDs.length + " docs");
- return new HighFreqDocsEnum(liveDocs).reset(term.docIDs, term.freqs);
- }
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
- if (!hasPos) {
- return null;
- }
-
- // TODO: implement reuse, something like Pulsing:
- // it's hairy!
-
- if (terms[termOrd] instanceof LowFreqTerm) {
- final LowFreqTerm term = ((LowFreqTerm) terms[termOrd]);
- final int[] postings = term.postings;
- final byte[] payloads = term.payloads;
- return new LowFreqDocsAndPositionsEnum(liveDocs, hasOffsets, hasPayloads).reset(postings, payloads);
- } else {
- final HighFreqTerm term = (HighFreqTerm) terms[termOrd];
- return new HighFreqDocsAndPositionsEnum(liveDocs, hasOffsets).reset(term.docIDs, term.freqs, term.positions, term.payloads);
- }
- }
-
- @Override
- public SeekStatus seekCeil(BytesRef term) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public void seekExact(long ord) {
- throw new UnsupportedOperationException();
- }
- }
- }
-
- // Docs only:
- private final static class LowFreqDocsEnumNoTF extends DocsEnum {
- private int[] postings;
- private final Bits liveDocs;
- private int upto;
-
- public LowFreqDocsEnumNoTF(Bits liveDocs) {
- this.liveDocs = liveDocs;
- }
-
- public bool canReuse(Bits liveDocs) {
- return liveDocs == this.liveDocs;
- }
-
- public DocsEnum reset(int[] postings) {
- this.postings = postings;
- upto = -1;
- return this;
- }
-
- // TODO: can do this w/o setting members?
-
- @Override
- public int nextDoc() {
- upto++;
- if (liveDocs == null) {
- if (upto < postings.length) {
- return postings[upto];
- }
- } else {
- while (upto < postings.length) {
- if (liveDocs.get(postings[upto])) {
- return postings[upto];
- }
- upto++;
- }
- }
- return NO_MORE_DOCS;
- }
-
- @Override
- public int docID() {
- if (upto < 0) {
- return -1;
- } else if (upto < postings.length) {
- return postings[upto];
- } else {
- return NO_MORE_DOCS;
- }
- }
-
- @Override
- public int freq() {
- return 1;
- }
-
- @Override
- public int advance(int target) {
- // Linear scan, but this is low-freq term so it won't
- // be costly:
- return slowAdvance(target);
- }
-
- @Override
- public long cost() {
- return postings.length;
- }
- }
-
- // Docs + freqs:
- private final static class LowFreqDocsEnumNoPos extends DocsEnum {
- private int[] postings;
- private final Bits liveDocs;
- private int upto;
-
- public LowFreqDocsEnumNoPos(Bits liveDocs) {
- this.liveDocs = liveDocs;
- }
-
- public bool canReuse(Bits liveDocs) {
- return liveDocs == this.liveDocs;
- }
-
- public DocsEnum reset(int[] postings) {
- this.postings = postings;
- upto = -2;
- return this;
- }
-
- // TODO: can do this w/o setting members?
- @Override
- public int nextDoc() {
- upto += 2;
- if (liveDocs == null) {
- if (upto < postings.length) {
- return postings[upto];
- }
- } else {
- while (upto < postings.length) {
- if (liveDocs.get(postings[upto])) {
- return postings[upto];
- }
- upto += 2;
- }
- }
- return NO_MORE_DOCS;
- }
-
- @Override
- public int docID() {
- if (upto < 0) {
- return -1;
- } else if (upto < postings.length) {
- return postings[upto];
- } else {
- return NO_MORE_DOCS;
- }
- }
-
- @Override
- public int freq() {
- return postings[upto+1];
- }
-
- @Override
- public int advance(int target) {
- // Linear scan, but this is low-freq term so it won't
- // be costly:
- return slowAdvance(target);
- }
-
- @Override
- public long cost() {
- return postings.length / 2;
- }
- }
-
- // Docs + freqs + positions/offets:
- private final static class LowFreqDocsEnum extends DocsEnum {
- private int[] postings;
- private final Bits liveDocs;
- private final int posMult;
- private int upto;
- private int freq;
-
- public LowFreqDocsEnum(Bits liveDocs, int posMult) {
- this.liveDocs = liveDocs;
- this.posMult = posMult;
- // if (DEBUG) {
- // System.out.println("LowFreqDE: posMult=" + posMult);
- // }
- }
-
- public bool canReuse(Bits liveDocs, int posMult) {
- return liveDocs == this.liveDocs && posMult == this.posMult;
- }
-
- public DocsEnum reset(int[] postings) {
- this.postings = postings;
- upto = -2;
- freq = 0;
- return this;
- }
-
- // TODO: can do this w/o setting members?
- @Override
- public int nextDoc() {
- upto += 2 + freq*posMult;
- // if (DEBUG) {
- // System.out.println(" nextDoc freq=" + freq + " upto=" + upto + " vs " + postings.length);
- // }
- if (liveDocs == null) {
- if (upto < postings.length) {
- freq = postings[upto+1];
- Debug.Assert( freq > 0;
- return postings[upto];
- }
- } else {
- while (upto < postings.length) {
- freq = postings[upto+1];
- Debug.Assert( freq > 0;
- if (liveDocs.get(postings[upto])) {
- return postings[upto];
- }
- upto += 2 + freq*posMult;
- }
- }
- return NO_MORE_DOCS;
- }
-
- @Override
- public int docID() {
- // TODO: store docID member?
- if (upto < 0) {
- return -1;
- } else if (upto < postings.length) {
- return postings[upto];
- } else {
- return NO_MORE_DOCS;
- }
- }
-
- @Override
- public int freq() {
- // TODO: can I do postings[upto+1]?
- return freq;
- }
-
- @Override
- public int advance(int target) {
- // Linear scan, but this is low-freq term so it won't
- // be costly:
- return slowAdvance(target);
- }
-
- @Override
- public long cost() {
- // TODO: could do a better estimate
- return postings.length / 2;
- }
- }
-
- private final static class LowFreqDocsAndPositionsEnum extends DocsAndPositionsEnum {
- private int[] postings;
- private final Bits liveDocs;
- private final int posMult;
- private final bool hasOffsets;
- private final bool hasPayloads;
- private final BytesRef payload = new BytesRef();
- private int upto;
- private int docID;
- private int freq;
- private int skipPositions;
- private int startOffset;
- private int endOffset;
- private int lastPayloadOffset;
- private int payloadOffset;
- private int payloadLength;
- private byte[] payloadBytes;
-
- public LowFreqDocsAndPositionsEnum(Bits liveDocs, bool hasOffsets, bool hasPayloads) {
- this.liveDocs = liveDocs;
- this.hasOffsets = hasOffsets;
- this.hasPayloads = hasPayloads;
- if (hasOffsets) {
- if (hasPayloads) {
- posMult = 4;
- } else {
- posMult = 3;
- }
- } else if (hasPayloads) {
- posMult = 2;
- } else {
- posMult = 1;
- }
- }
-
- public DocsAndPositionsEnum reset(int[] postings, byte[] payloadBytes) {
- this.postings = postings;
- upto = 0;
- skipPositions = 0;
- startOffset = -1;
- endOffset = -1;
- docID = -1;
- payloadLength = 0;
- this.payloadBytes = payloadBytes;
- return this;
- }
-
- @Override
- public int nextDoc() {
- if (hasPayloads) {
- for(int i=0;i<skipPositions;i++) {
- upto++;
- if (hasOffsets) {
- upto += 2;
- }
- payloadOffset += postings[upto++];
- }
- } else {
- upto += posMult * skipPositions;
- }
-
- if (liveDocs == null) {
- if (upto < postings.length) {
- docID = postings[upto++];
- freq = postings[upto++];
- skipPositions = freq;
- return docID;
- }
- } else {
- while(upto < postings.length) {
- docID = postings[upto++];
- freq = postings[upto++];
- if (liveDocs.get(docID)) {
- skipPositions = freq;
- return docID;
- }
- if (hasPayloads) {
- for(int i=0;i<freq;i++) {
- upto++;
- if (hasOffsets) {
- upto += 2;
- }
- payloadOffset += postings[upto++];
- }
- } else {
- upto += posMult * freq;
- }
- }
- }
-
- return docID = NO_MORE_DOCS;
- }
-
- @Override
- public int docID() {
- return docID;
- }
-
- @Override
- public int freq() {
- return freq;
- }
-
- @Override
- public int nextPosition() {
- Debug.Assert( skipPositions > 0;
- skipPositions--;
- final int pos = postings[upto++];
- if (hasOffsets) {
- startOffset = postings[upto++];
- endOffset = postings[upto++];
- }
- if (hasPayloads) {
- payloadLength = postings[upto++];
- lastPayloadOffset = payloadOffset;
- payloadOffset += payloadLength;
- }
- return pos;
- }
-
- @Override
- public int startOffset() {
- return startOffset;
- }
-
- @Override
- public int endOffset() {
- return endOffset;
- }
-
- @Override
- public int advance(int target) {
- return slowAdvance(target);
- }
-
- @Override
- public BytesRef getPayload() {
- if (payloadLength > 0) {
- payload.bytes = payloadBytes;
- payload.offset = lastPayloadOffset;
- payload.length = payloadLength;
- return payload;
- } else {
- return null;
- }
- }
-
- @Override
- public long cost() {
- // TODO: could do a better estimate
- return postings.length / 2;
- }
- }
-
- // Docs + freqs:
- private final static class HighFreqDocsEnum extends DocsEnum {
- private int[] docIDs;
- private int[] freqs;
- private final Bits liveDocs;
- private int upto;
- private int docID = -1;
-
- public HighFreqDocsEnum(Bits liveDocs) {
- this.liveDocs = liveDocs;
- }
-
- public bool canReuse(Bits liveDocs) {
- return liveDocs == this.liveDocs;
- }
-
- public int[] getDocIDs() {
- return docIDs;
- }
-
- public int[] getFreqs() {
- return freqs;
- }
-
- public DocsEnum reset(int[] docIDs, int[] freqs) {
- this.docIDs = docIDs;
- this.freqs = freqs;
- docID = upto = -1;
- return this;
- }
-
- @Override
- public int nextDoc() {
- upto++;
- if (liveDocs == null) {
- try {
- return docID = docIDs[upto];
- } catch (ArrayIndexOutOfBoundsException e) {
- }
- } else {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- return docID = docIDs[upto];
- }
- upto++;
- }
- }
- return docID = NO_MORE_DOCS;
- }
-
- @Override
- public int docID() {
- return docID;
- }
-
- @Override
- public int freq() {
- if (freqs == null) {
- return 1;
- } else {
- return freqs[upto];
- }
- }
-
- @Override
- public int advance(int target) {
- /*
- upto++;
- if (upto == docIDs.length) {
- return docID = NO_MORE_DOCS;
- }
- final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target);
- if (index < 0) {
- upto = -index - 1;
- } else {
- upto = index;
- }
- if (liveDocs != null) {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- break;
- }
- upto++;
- }
- }
- if (upto == docIDs.length) {
- return NO_MORE_DOCS;
- } else {
- return docID = docIDs[upto];
- }
- */
-
- //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length);
- // if (DEBUG) {
- // System.out.println("advance target=" + target + " len=" + docIDs.length);
- // }
- upto++;
- if (upto == docIDs.length) {
- return docID = NO_MORE_DOCS;
- }
-
- // First "grow" outwards, since most advances are to
- // nearby docs:
- int inc = 10;
- int nextUpto = upto+10;
- int low;
- int high;
- while (true) {
- //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
- if (nextUpto >= docIDs.length) {
- low = nextUpto-inc;
- high = docIDs.length-1;
- break;
- }
- //System.out.println(" docID=" + docIDs[nextUpto]);
-
- if (target <= docIDs[nextUpto]) {
- low = nextUpto-inc;
- high = nextUpto;
- break;
- }
- inc *= 2;
- nextUpto += inc;
- }
-
- // Now do normal binary search
- //System.out.println(" after fwd: low=" + low + " high=" + high);
-
- while (true) {
-
- if (low > high) {
- // Not exactly found
- //System.out.println(" break: no match");
- upto = low;
- break;
- }
-
- int mid = (low + high) >>> 1;
- int cmp = docIDs[mid] - target;
- //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]);
-
- if (cmp < 0) {
- low = mid + 1;
- } else if (cmp > 0) {
- high = mid - 1;
- } else {
- // Found target
- upto = mid;
- //System.out.println(" break: match");
- break;
- }
- }
-
- //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto]));
-
- if (liveDocs != null) {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- break;
- }
- upto++;
- }
- }
- if (upto == docIDs.length) {
- //System.out.println(" return END");
- return docID = NO_MORE_DOCS;
- } else {
- //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
- return docID = docIDs[upto];
- }
- }
-
- @Override
- public long cost() {
- return docIDs.length;
- }
- }
-
- // TODO: specialize offsets and not
- private final static class HighFreqDocsAndPositionsEnum extends DocsAndPositionsEnum {
- private int[] docIDs;
- private int[] freqs;
- private int[][] positions;
- private byte[][][] payloads;
- private final Bits liveDocs;
- private final bool hasOffsets;
- private final int posJump;
- private int upto;
- private int docID = -1;
- private int posUpto;
- private int[] curPositions;
-
- public HighFreqDocsAndPositionsEnum(Bits liveDocs, bool hasOffsets) {
- this.liveDocs = liveDocs;
- this.hasOffsets = hasOffsets;
- posJump = hasOffsets ? 3 : 1;
- }
-
- public int[] getDocIDs() {
- return docIDs;
- }
-
- public int[][] getPositions() {
- return positions;
- }
-
- public int getPosJump() {
- return posJump;
- }
-
- public Bits getLiveDocs() {
- return liveDocs;
- }
-
- public DocsAndPositionsEnum reset(int[] docIDs, int[] freqs, int[][] positions, byte[][][] payloads) {
- this.docIDs = docIDs;
- this.freqs = freqs;
- this.positions = positions;
- this.payloads = payloads;
- upto = -1;
- return this;
- }
-
- @Override
- public int nextDoc() {
- upto++;
- if (liveDocs == null) {
- if (upto < docIDs.length) {
- posUpto = -posJump;
- curPositions = positions[upto];
- return docID = docIDs[upto];
- }
- } else {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- posUpto = -posJump;
- curPositions = positions[upto];
- return docID = docIDs[upto];
- }
- upto++;
- }
- }
-
- return docID = NO_MORE_DOCS;
- }
-
- @Override
- public int freq() {
- return freqs[upto];
- }
-
- @Override
- public int docID() {
- return docID;
- }
-
- @Override
- public int nextPosition() {
- posUpto += posJump;
- return curPositions[posUpto];
- }
-
- @Override
- public int startOffset() {
- if (hasOffsets) {
- return curPositions[posUpto+1];
- } else {
- return -1;
- }
- }
-
- @Override
- public int endOffset() {
- if (hasOffsets) {
- return curPositions[posUpto+2];
- } else {
- return -1;
- }
- }
-
- @Override
- public int advance(int target) {
-
- /*
- upto++;
- if (upto == docIDs.length) {
- return NO_MORE_DOCS;
- }
- final int index = Arrays.binarySearch(docIDs, upto, docIDs.length, target);
- if (index < 0) {
- upto = -index - 1;
- } else {
- upto = index;
- }
- if (liveDocs != null) {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- break;
- }
- upto++;
- }
- }
- posUpto = hasOffsets ? -3 : -1;
- if (upto == docIDs.length) {
- return NO_MORE_DOCS;
- } else {
- return docID();
- }
- */
-
- //System.out.println(" advance target=" + target + " cur=" + docID() + " upto=" + upto + " of " + docIDs.length);
- // if (DEBUG) {
- // System.out.println("advance target=" + target + " len=" + docIDs.length);
- // }
- upto++;
- if (upto == docIDs.length) {
- return docID = NO_MORE_DOCS;
- }
-
- // First "grow" outwards, since most advances are to
- // nearby docs:
- int inc = 10;
- int nextUpto = upto+10;
- int low;
- int high;
- while (true) {
- //System.out.println(" grow nextUpto=" + nextUpto + " inc=" + inc);
- if (nextUpto >= docIDs.length) {
- low = nextUpto-inc;
- high = docIDs.length-1;
- break;
- }
- //System.out.println(" docID=" + docIDs[nextUpto]);
-
- if (target <= docIDs[nextUpto]) {
- low = nextUpto-inc;
- high = nextUpto;
- break;
- }
- inc *= 2;
- nextUpto += inc;
- }
-
- // Now do normal binary search
- //System.out.println(" after fwd: low=" + low + " high=" + high);
-
- while (true) {
-
- if (low > high) {
- // Not exactly found
- //System.out.println(" break: no match");
- upto = low;
- break;
- }
-
- int mid = (low + high) >>> 1;
- int cmp = docIDs[mid] - target;
- //System.out.println(" bsearch low=" + low + " high=" + high+ ": docIDs[" + mid + "]=" + docIDs[mid]);
-
- if (cmp < 0) {
- low = mid + 1;
- } else if (cmp > 0) {
- high = mid - 1;
- } else {
- // Found target
- upto = mid;
- //System.out.println(" break: match");
- break;
- }
- }
-
- //System.out.println(" end upto=" + upto + " docID=" + (upto >= docIDs.length ? NO_MORE_DOCS : docIDs[upto]));
-
- if (liveDocs != null) {
- while (upto < docIDs.length) {
- if (liveDocs.get(docIDs[upto])) {
- break;
- }
- upto++;
- }
- }
- if (upto == docIDs.length) {
- //System.out.println(" return END");
- return docID = NO_MORE_DOCS;
- } else {
- //System.out.println(" return docID=" + docIDs[upto] + " upto=" + upto);
- posUpto = -posJump;
- curPositions = positions[upto];
- return docID = docIDs[upto];
- }
- }
-
- private final BytesRef payload = new BytesRef();
-
- @Override
- public BytesRef getPayload() {
- if (payloads == null) {
- return null;
- } else {
- final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
- if (payloadBytes == null) {
- return null;
- }
- payload.bytes = payloadBytes;
- payload.length = payloadBytes.length;
- payload.offset = 0;
- return payload;
- }
- }
-
- @Override
- public long cost() {
- return docIDs.length;
- }
- }
-}
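
The advance() above first gallops forward in exponentially growing steps (increments of 10, 20, 40, ...) to bracket the target docID, then falls back to a plain binary search inside that bracket; the same logic is carried over into the ported enum. A minimal standalone sketch of that pattern in C#, assuming a sorted docIDs array and ignoring liveDocs (hypothetical helper, not part of the port):

    // Returns the index of the first entry >= target, starting the gallop at 'from',
    // or docIDs.Length if every remaining docID is smaller than target.
    static int GallopSearch(int[] docIDs, int from, int target)
    {
        int inc = 10;
        int next = from + inc;
        int low, high;
        while (true)
        {
            if (next >= docIDs.Length)
            {
                low = next - inc;
                high = docIDs.Length - 1;
                break;
            }
            if (target <= docIDs[next])
            {
                low = next - inc;
                high = next;
                break;
            }
            inc *= 2;       // exponential growth: most advances land on nearby docs
            next += inc;
        }
        while (low <= high) // ordinary binary search within the bracket
        {
            int mid = low + ((high - low) >> 1);
            if (docIDs[mid] < target) low = mid + 1;
            else if (docIDs[mid] > target) high = mid - 1;
            else return mid;
        }
        return low;
    }

In the real enum the resulting index is then advanced past deleted documents via liveDocs before docID, curPositions and posUpto are updated.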
+using System;
+using System.Diagnostics;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.codecs.memory
+{
+
+ /// <summary>
+ /// Licensed to the Apache Software Foundation (ASF) under one or more
+ /// contributor license agreements. See the NOTICE file distributed with
+ /// this work for additional information regarding copyright ownership.
+ /// The ASF licenses this file to You under the Apache License, Version 2.0
+ /// (the "License"); you may not use this file except in compliance with
+ /// the License. You may obtain a copy of the License at
+ ///
+ /// http://www.apache.org/licenses/LICENSE-2.0
+ ///
+ /// Unless required by applicable law or agreed to in writing, software
+ /// distributed under the License is distributed on an "AS IS" BASIS,
+ /// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ /// See the License for the specific language governing permissions and
+ /// limitations under the License.
+ /// </summary>
+
+
+ using Lucene41PostingsFormat = org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; // javadocs
+ using DocsAndPositionsEnum = org.apache.lucene.index.DocsAndPositionsEnum;
+ using DocsEnum = org.apache.lucene.index.DocsEnum;
+ using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
+ using FieldInfo = org.apache.lucene.index.FieldInfo;
+ using Fields = org.apache.lucene.index.Fields;
+ using OrdTermState = org.apache.lucene.index.OrdTermState;
+ using SegmentReadState = org.apache.lucene.index.SegmentReadState;
+ using SegmentWriteState = org.apache.lucene.index.SegmentWriteState;
+ using TermState = org.apache.lucene.index.TermState;
+ using Terms = org.apache.lucene.index.Terms;
+ using TermsEnum = org.apache.lucene.index.TermsEnum;
+ using IOContext = org.apache.lucene.store.IOContext;
+ using RAMOutputStream = org.apache.lucene.store.RAMOutputStream;
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+ using Bits = org.apache.lucene.util.Bits;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+ using RamUsageEstimator = org.apache.lucene.util.RamUsageEstimator;
+ using CompiledAutomaton = org.apache.lucene.util.automaton.CompiledAutomaton;
+ using RunAutomaton = org.apache.lucene.util.automaton.RunAutomaton;
+ using Transition = org.apache.lucene.util.automaton.Transition;
+
+ // TODO:
+ // - build depth-N prefix hash?
+ // - or: longer dense skip lists than just next byte?
+
+ /// <summary>
+ /// Wraps <seealso cref="Lucene41PostingsFormat"/> format for on-disk
+ /// storage, but then at read time loads and stores all
+ /// terms & postings directly in RAM as byte[], int[].
+ ///
+ /// <para><b><font color=red>WARNING</font></b>: This is
+ /// exceptionally RAM intensive: it makes no effort to
+ /// compress the postings data, storing terms as separate
+ /// byte[] and postings as separate int[], but as a result it
+ /// gives a substantial increase in search performance.
+ ///
+ /// </para>
+ /// <para>This postings format supports <seealso cref="TermsEnum#ord"/>
+ /// and <seealso cref="TermsEnum#seekExact(long)"/>.
+ ///
+ /// </para>
+ /// <para>Because this holds all term bytes as a single
+ /// byte[], you cannot have more than 2.1GB worth of term
+ /// bytes in a single segment.
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
+
+ public sealed class DirectPostingsFormat : PostingsFormat
+ {
+
+ private readonly int minSkipCount;
+ private readonly int lowFreqCutoff;
+
+ private const int DEFAULT_MIN_SKIP_COUNT = 8;
+ private const int DEFAULT_LOW_FREQ_CUTOFF = 32;
+
+ //private static final boolean DEBUG = true;
+
+ // TODO: allow passing/wrapping arbitrary postings format?
+
+ public DirectPostingsFormat() : this(DEFAULT_MIN_SKIP_COUNT, DEFAULT_LOW_FREQ_CUTOFF)
+ {
+ }
+
+ /// <summary>
+ /// minSkipCount is how many terms in a row must have the
+ /// same prefix before we put a skip pointer down. Terms
+ /// with docFreq <= lowFreqCutoff will use a single int[]
+ /// to hold all docs, freqs, positions and offsets; terms
+ /// with higher docFreq will use separate arrays.
+ /// </summary>
+ public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) : base("Direct")
+ {
+ this.minSkipCount = minSkipCount;
+ this.lowFreqCutoff = lowFreqCutoff;
+ }
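
Under the defaults above, a skip entry is only recorded once 8 consecutive terms share a prefix, and a term appearing in 32 or fewer documents is packed into a single int[]. A hedged usage sketch with custom cutoffs (the constructor and parameter names are taken from this class; any per-field codec wiring is omitted, since it depends on how much of the 4.x codec API has been ported):

    // Sketch only: construct the RAM-resident postings format with tighter cutoffs,
    // so skip pointers appear after 4 shared-prefix terms and terms seen in at most
    // 16 documents use the packed single-int[] representation.
    PostingsFormat direct = new DirectPostingsFormat(minSkipCount: 4, lowFreqCutoff: 16);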
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsConsumer fieldsConsumer(org.apache.lucene.index.SegmentWriteState state) throws java.io.IOException
+ public override FieldsConsumer fieldsConsumer(SegmentWriteState state)
+ {
+ return PostingsFormat.forName("Lucene41").fieldsConsumer(state);
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsProducer fieldsProducer(org.apache.lucene.index.SegmentReadState state) throws java.io.IOException
+ public override FieldsProducer fieldsProducer(SegmentReadState state)
+ {
+ FieldsProducer postings = PostingsFormat.forName("Lucene41").fieldsProducer(state);
+ if (state.context.context != IOContext.Context.MERGE)
+ {
+ FieldsProducer loadedPostings;
+ try
+ {
+ postings.checkIntegrity();
+ loadedPostings = new DirectFields(state, postings, minSkipCount, lowFreqCutoff);
+ }
+ finally
+ {
+ postings.close();
+ }
+ return loadedPostings;
+ }
+ else
+ {
+ // Don't load postings for merge:
+ return postings;
+ }
+ }
+
+ private sealed class DirectFields : FieldsProducer
+ {
+ internal readonly IDictionary<string, DirectField> fields = new SortedDictionary<string, DirectField>();
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public DirectFields(org.apache.lucene.index.SegmentReadState state, org.apache.lucene.index.Fields fields, int minSkipCount, int lowFreqCutoff) throws java.io.IOException
+ public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff)
+ {
+ foreach (string field in fields)
+ {
+ this.fields[field] = new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff);
+ }
+ }
+
+ public override IEnumerator<string> iterator()
+ {
+ return fields.Keys.GetEnumerator();
+ }
+
+ public override Terms terms(string field)
+ {
+ return fields[field];
+ }
+
+ public override int size()
+ {
+ return fields.Count;
+ }
+
+ public override long UniqueTermCount
+ {
+ get
+ {
+ long numTerms = 0;
+ foreach (DirectField field in fields.Values)
+ {
+ numTerms += field.terms.Length;
+ }
+ return numTerms;
+ }
+ }
+
+ public override void close()
+ {
+ }
+
+ public override long ramBytesUsed()
+ {
+ long sizeInBytes = 0;
+ foreach (KeyValuePair<string, DirectField> entry in fields)
+ {
+ sizeInBytes += entry.Key.Length * RamUsageEstimator.NUM_BYTES_CHAR;
+ sizeInBytes += entry.Value.ramBytesUsed();
+ }
+ return sizeInBytes;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void checkIntegrity() throws java.io.IOException
+ public override void checkIntegrity()
+ {
+ // If we read everything into RAM, we already validated it while loading;
+ // otherwise we returned the raw on-disk postings reader above.
+ }
+ }
+
+ private sealed class DirectField : Terms
+ {
+
+ private abstract class TermAndSkip
+ {
+ public int[] skips;
+
+ /// <summary>
+ /// Returns the approximate number of RAM bytes used </summary>
+ public abstract long ramBytesUsed();
+ }
+
+ private sealed class LowFreqTerm : TermAndSkip
+ {
+ public readonly int[] postings;
+ public readonly sbyte[] payloads;
+ public readonly int docFreq;
+ public readonly int totalTermFreq;
+
+ public LowFreqTerm(int[] postings, sbyte[] payloads, int docFreq, int totalTermFreq)
+ {
+ this.postings = postings;
+ this.payloads = payloads;
+ this.docFreq = docFreq;
+ this.totalTermFreq = totalTermFreq;
+ }
+
+ public override long ramBytesUsed()
+ {
+ return ((postings != null) ? RamUsageEstimator.sizeOf(postings) : 0) + ((payloads != null) ? RamUsageEstimator.sizeOf(payloads) : 0);
+ }
+ }
+
+ // TODO: maybe specialize into prx/no-prx/no-frq cases?
+ private sealed class HighFreqTerm : TermAndSkip
+ {
+ public readonly long totalTermFreq;
+ public readonly int[] docIDs;
+ public readonly int[] freqs;
+ public readonly int[][] positions;
+ public readonly sbyte[][][] payloads;
+
+ public HighFreqTerm(int[] docIDs, int[] freqs, int[][] positions, sbyte[][][] payloads, long totalTermFreq)
+ {
+ this.docIDs = docIDs;
+ this.freqs = freqs;
+ this.positions = positions;
+ this.payloads = payloads;
+ this.totalTermFreq = totalTermFreq;
+ }
+
+ public override long ramBytesUsed()
+ {
+ long sizeInBytes = 0;
+ sizeInBytes += (docIDs != null)? RamUsageEstimator.sizeOf(docIDs) : 0;
+ sizeInBytes += (freqs != null)? RamUsageEstimator.sizeOf(freqs) : 0;
+
+ if (positions != null)
+ {
+ foreach (int[] position in positions)
+ {
+ sizeInBytes += (position != null) ? RamUsageEstimator.sizeOf(position) : 0;
+ }
+ }
+
+ if (payloads != null)
+ {
+ foreach (sbyte[][] payload in payloads)
+ {
+ if (payload != null)
+ {
+ foreach (sbyte[] pload in payload)
+ {
+ sizeInBytes += (pload != null) ? RamUsageEstimator.sizeOf(pload) : 0;
+ }
+ }
+ }
+ }
+
+ return sizeInBytes;
+ }
+ }
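
A low-frequency term keeps everything in LowFreqTerm.postings, packed in the order used by the loader further down: per document a docID, then the freq when freqs are indexed, then freq positions, each followed by startOffset and endOffset when offsets are indexed (fields with payloads interleave additional per-position entries, which are not covered here). High-frequency terms instead get the parallel docIDs/freqs/positions arrays above. A decode sketch for the packed form, assuming a field without payloads (hypothetical helper, illustration only; needs using System for Console):

    // Walks a LowFreqTerm.postings array and prints its contents.
    // Assumes no payloads; when !hasFreq, each doc implicitly has freq 1
    // and carries no positions.
    static void DumpLowFreqPostings(int[] postings, bool hasFreq, bool hasPos, bool hasOffsets)
    {
        int i = 0;
        while (i < postings.Length)
        {
            int docID = postings[i++];
            int freq = hasFreq ? postings[i++] : 1;
            Console.Write("doc=" + docID + " freq=" + freq);
            if (hasPos)
            {
                for (int p = 0; p < freq; p++)
                {
                    int pos = postings[i++];
                    if (hasOffsets)
                    {
                        int start = postings[i++];
                        int end = postings[i++];
                        Console.Write(" pos=" + pos + " [" + start + "-" + end + "]");
                    }
                    else
                    {
                        Console.Write(" pos=" + pos);
                    }
                }
            }
            Console.WriteLine();
        }
    }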
+
+ internal readonly sbyte[] termBytes;
+ internal readonly int[] termOffsets;
+
+ internal readonly int[] skips;
+ internal readonly int[] skipOffsets;
+
+ internal readonly TermAndSkip[] terms;
+ internal readonly bool hasFreq;
+ internal readonly bool hasPos;
+ internal readonly bool hasOffsets_Renamed;
+ internal readonly bool hasPayloads_Renamed;
+ internal readonly long sumTotalTermFreq;
+ internal readonly int docCount;
+ internal readonly long sumDocFreq;
+ internal int skipCount;
+
+ // TODO: maybe make a separate builder? These are only
+ // used during load:
+ internal int count;
+ internal int[] sameCounts = new int[10];
+ internal readonly int minSkipCount;
+
+ private sealed class IntArrayWriter
+ {
+ internal int[] ints = new int[10];
+ internal int upto;
+
+ public void add(int value)
+ {
+ if (ints.Length == upto)
+ {
+ ints = ArrayUtil.grow(ints);
+ }
+ ints[upto++] = value;
+ }
+
+ public int[] get()
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int[] arr = new int[upto];
+ int[] arr = new int[upto];
+ Array.Copy(ints, 0, arr, 0, upto);
+ upto = 0;
+ return arr;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public DirectField(org.apache.lucene.index.SegmentReadState state, String field, org.apache.lucene.index.Terms termsIn, int minSkipCount, int lowFreqCutoff) throws java.io.IOException
+ public DirectField(SegmentReadState state, string field, Terms termsIn, int minSkipCount, int lowFreqCutoff)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.index.FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+ FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
+
+ sumTotalTermFreq = termsIn.SumTotalTermFreq;
+ sumDocFreq = termsIn.SumDocFreq;
+ docCount = termsIn.DocCount;
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int numTerms = (int) termsIn.size();
+ int numTerms = (int) termsIn.size();
+ if (numTerms == -1)
+ {
+ throw new System.ArgumentException("codec does not provide Terms.size()");
+ }
+ terms = new TermAndSkip[numTerms];
+ termOffsets = new int[1 + numTerms];
+
+ sbyte[] termBytes = new sbyte[1024];
+
+ this.minSkipCount = minSkipCount;
+
+ hasFreq = fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_ONLY) > 0;
+ hasPos = fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) > 0;
+ hasOffsets_Renamed = fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0;
+ hasPayloads_Renamed = fieldInfo.hasPayloads();
+
+ BytesRef term;
+ DocsEnum docsEnum = null;
+ DocsAndPositionsEnum docsAndPositionsEnum = null;
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.index.TermsEnum termsEnum = termsIn.iterator(null);
+ TermsEnum termsEnum = termsIn.iterator(null);
+ int termOffset = 0;
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final IntArrayWriter scratch = new IntArrayWriter();
+ IntArrayWriter scratch = new IntArrayWriter();
+
+ // Used for payloads, if any:
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.store.RAMOutputStream ros = new org.apache.lucene.store.RAMOutputStream();
+ RAMOutputStream ros = new RAMOutputStream();
+
+ // if (DEBUG) {
+ // System.out.println("\nLOAD terms seg=" + state.segmentInfo.name + " field=" + field + " hasOffsets=" + hasOffsets + " hasFreq=" + hasFreq + " hasPos=" + hasPos + " hasPayloads=" + hasPayloads);
+ // }
+
+ while ((term = termsEnum.next()) != null)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int docFreq = termsEnum.docFreq();
+ int docFreq = termsEnum.docFreq();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final long totalTermFreq = termsEnum.totalTermFreq();
+ long totalTermFreq = termsEnum.totalTermFreq();
+
+ // if (DEBUG) {
+ // System.out.println(" term=" + term.utf8ToString());
+ // }
+
+ termOffsets[count] = termOffset;
+
+ if (termBytes.Length < (termOffset + term.length))
+ {
+ termBytes = ArrayUtil.grow(termBytes, termOffset + term.length);
+ }
+ Array.Copy(term.bytes, term.offset, termBytes, termOffset, term.length);
+ termOffset += term.length;
+ termOffsets[count + 1] = termOffset;
+
+ if (hasPos)
+ {
+ docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
+ }
+ else
+ {
+ docsEnum = termsEnum.docs(null, docsEnum);
+ }
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final TermAndSkip ent;
+ TermAndSkip ent;
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.index.DocsEnum docsEnum2;
+ DocsEnum docsEnum2;
+ if (hasPos)
+ {
+ docsEnum2 = docsAndPositionsEnum;
+ }
+ else
+ {
+ docsEnum2 = docsEnum;
+ }
+
+ int docID;
+
+ if (docFreq <= lowFreqCutoff)
+ {
+
+ ros.reset();
+
+ // Pack postings for low-freq terms into a single int[]:
+ while ((docID = docsEnum2.nextDoc()) != DocsEnum.NO_MORE_DOCS)
+ {
+ scratch.add(docID);
+ if (hasFreq)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int freq = docsEnum2.freq();
+ int freq = docsEnum2.freq();
+ scratch.add(freq);
+ if (hasPos)
+ {
+ for (int pos = 0;pos < freq;pos++)
+ {
+ scratch.add(docsAndPositionsEnum.nextPosition());
+ if (hasOffsets_Renamed)
+ {
+ scratch.add(docsAndPositionsEnum.startOffset());
+ scratch.add(docsAndPositionsEnum.endOffset());
+ }
+ if (hasPayloads_Renamed)
+ {
+//JAVA TO C# CONVERTER WARNING: The origina
<TRUNCATED>