You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by pn...@apache.org on 2014/09/28 10:50:10 UTC
[06/10] Lucene.Net.Codecs/Sep fully ported,
work done on SimpleText and Memory as well
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d852d5b0/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
index da1d517..fb81d4d 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
@@ -1,762 +1,1010 @@
-package codecs.memory;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.TreeMap;
-
-import codecs.BlockTermState;
-import codecs.CodecUtil;
-import codecs.FieldsProducer;
-import codecs.PostingsReaderBase;
-import index.CorruptIndexException;
-import index.DocsAndPositionsEnum;
-import index.DocsEnum;
-import index.FieldInfo.IndexOptions;
-import index.FieldInfo;
-import index.FieldInfos;
-import index.IndexFileNames;
-import index.SegmentInfo;
-import index.SegmentReadState;
-import index.TermState;
-import index.Terms;
-import index.TermsEnum;
-import store.ByteArrayDataInput;
-import store.IndexInput;
-import util.ArrayUtil;
-import util.Bits;
-import util.BytesRef;
-import util.IOUtils;
-import util.RamUsageEstimator;
-import util.automaton.ByteRunAutomaton;
-import util.automaton.CompiledAutomaton;
-import util.fst.BytesRefFSTEnum.InputOutput;
-import util.fst.BytesRefFSTEnum;
-import util.fst.FST;
-import util.fst.Outputs;
-import util.fst.Util;
-
-/**
- * FST-based terms dictionary reader.
- *
- * The FST directly maps each term and its metadata,
- * it is memory resident.
- *
- * @lucene.experimental
- */
-
-public class FSTTermsReader extends FieldsProducer {
- final TreeMap<String, TermsReader> fields = new TreeMap<>();
- final PostingsReaderBase postingsReader;
- //static bool TEST = false;
- final int version;
-
- public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) {
- final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);
-
- this.postingsReader = postingsReader;
- final IndexInput in = state.directory.openInput(termsFileName, state.context);
-
- bool success = false;
- try {
- version = readHeader(in);
- if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
- CodecUtil.checksumEntireFile(in);
- }
- this.postingsReader.init(in);
- seekDir(in);
-
- final FieldInfos fieldInfos = state.fieldInfos;
- final int numFields = in.readVInt();
- for (int i = 0; i < numFields; i++) {
- int fieldNumber = in.readVInt();
- FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
- long numTerms = in.readVLong();
- long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
- long sumDocFreq = in.readVLong();
- int docCount = in.readVInt();
- int longsSize = in.readVInt();
- TermsReader current = new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
- TermsReader previous = fields.put(fieldInfo.name, current);
- checkFieldSummary(state.segmentInfo, in, current, previous);
- }
- success = true;
- } finally {
- if (success) {
- IOUtils.close(in);
- } else {
- IOUtils.closeWhileHandlingException(in);
- }
- }
- }
-
- private int readHeader(IndexInput in) {
- return CodecUtil.checkHeader(in, FSTTermsWriter.TERMS_CODEC_NAME,
- FSTTermsWriter.TERMS_VERSION_START,
- FSTTermsWriter.TERMS_VERSION_CURRENT);
- }
- private void seekDir(IndexInput in) {
- if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
- in.seek(in.length() - CodecUtil.footerLength() - 8);
- } else {
- in.seek(in.length() - 8);
- }
- in.seek(in.readLong());
- }
- private void checkFieldSummary(SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous) {
- // #docs with field must be <= #docs
- if (field.docCount < 0 || field.docCount > info.getDocCount()) {
- throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
- }
- // #postings must be >= #docs with field
- if (field.sumDocFreq < field.docCount) {
- throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (resource=" + in + ")");
- }
- // #positions must be >= #postings
- if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq) {
- throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (resource=" + in + ")");
- }
- if (previous != null) {
- throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (resource=" + in + ")");
- }
- }
-
- @Override
- public Iterator<String> iterator() {
- return Collections.unmodifiableSet(fields.keySet()).iterator();
- }
-
- @Override
- public Terms terms(String field) {
- Debug.Assert( field != null;
- return fields.get(field);
- }
-
- @Override
- public int size() {
- return fields.size();
- }
-
- @Override
- public void close() {
- try {
- IOUtils.close(postingsReader);
- } finally {
- fields.clear();
- }
- }
-
- final class TermsReader extends Terms {
- final FieldInfo fieldInfo;
- final long numTerms;
- final long sumTotalTermFreq;
- final long sumDocFreq;
- final int docCount;
- final int longsSize;
- final FST<FSTTermOutputs.TermData> dict;
-
- TermsReader(FieldInfo fieldInfo, IndexInput in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) {
- this.fieldInfo = fieldInfo;
- this.numTerms = numTerms;
- this.sumTotalTermFreq = sumTotalTermFreq;
- this.sumDocFreq = sumDocFreq;
- this.docCount = docCount;
- this.longsSize = longsSize;
- this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo, longsSize));
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- public bool hasFreqs() {
- return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
- }
-
- @Override
- public bool hasOffsets() {
- return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- }
-
- @Override
- public bool hasPositions() {
- return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
- }
-
- @Override
- public bool hasPayloads() {
- return fieldInfo.hasPayloads();
- }
-
- @Override
- public long size() {
- return numTerms;
- }
-
- @Override
- public long getSumTotalTermFreq() {
- return sumTotalTermFreq;
- }
-
- @Override
- public long getSumDocFreq() {
- return sumDocFreq;
- }
-
- @Override
- public int getDocCount() {
- return docCount;
- }
-
- @Override
- public TermsEnum iterator(TermsEnum reuse) {
- return new SegmentTermsEnum();
- }
-
- @Override
- public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) {
- return new IntersectTermsEnum(compiled, startTerm);
- }
-
- // Only wraps common operations for PBF interact
- abstract class BaseTermsEnum extends TermsEnum {
- /* Current term, null when enum ends or unpositioned */
- BytesRef term;
-
- /* Current term stats + decoded metadata (customized by PBF) */
- final BlockTermState state;
-
- /* Current term stats + undecoded metadata (long[] & byte[]) */
- FSTTermOutputs.TermData meta;
- ByteArrayDataInput bytesReader;
-
- /** Decodes metadata into customized term state */
- abstract void decodeMetaData() ;
-
- BaseTermsEnum() {
- this.state = postingsReader.newTermState();
- this.bytesReader = new ByteArrayDataInput();
- this.term = null;
- // NOTE: metadata will only be initialized in child class
- }
-
- @Override
- public TermState termState() {
- decodeMetaData();
- return state.clone();
- }
-
- @Override
- public BytesRef term() {
- return term;
- }
-
- @Override
- public int docFreq() {
- return state.docFreq;
- }
-
- @Override
- public long totalTermFreq() {
- return state.totalTermFreq;
- }
-
- @Override
- public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) {
- decodeMetaData();
- return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
- }
-
- @Override
- public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) {
- if (!hasPositions()) {
- return null;
- }
- decodeMetaData();
- return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
- }
-
- @Override
- public void seekExact(long ord) {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public long ord() {
- throw new UnsupportedOperationException();
- }
- }
-
-
- // Iterates through all terms in this field
- private final class SegmentTermsEnum extends BaseTermsEnum {
- final BytesRefFSTEnum<FSTTermOutputs.TermData> fstEnum;
-
- /* True when current term's metadata is decoded */
- bool decoded;
-
- /* True when current enum is 'positioned' by seekExact(TermState) */
- bool seekPending;
-
- SegmentTermsEnum() {
- super();
- this.fstEnum = new BytesRefFSTEnum<>(dict);
- this.decoded = false;
- this.seekPending = false;
- this.meta = null;
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- // Let PBF decode metadata from long[] and byte[]
- @Override
- void decodeMetaData() {
- if (!decoded && !seekPending) {
- if (meta.bytes != null) {
- bytesReader.reset(meta.bytes, 0, meta.bytes.length);
- }
- postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true);
- decoded = true;
- }
- }
-
- // Update current enum according to FSTEnum
- void updateEnum(final InputOutput<FSTTermOutputs.TermData> pair) {
- if (pair == null) {
- term = null;
- } else {
- term = pair.input;
- meta = pair.output;
- state.docFreq = meta.docFreq;
- state.totalTermFreq = meta.totalTermFreq;
- }
- decoded = false;
- seekPending = false;
- }
-
- @Override
- public BytesRef next() {
- if (seekPending) { // previously positioned, but termOutputs not fetched
- seekPending = false;
- SeekStatus status = seekCeil(term);
- Debug.Assert( status == SeekStatus.FOUND; // must positioned on valid term
- }
- updateEnum(fstEnum.next());
- return term;
- }
-
- @Override
- public bool seekExact(BytesRef target) {
- updateEnum(fstEnum.seekExact(target));
- return term != null;
- }
-
- @Override
- public SeekStatus seekCeil(BytesRef target) {
- updateEnum(fstEnum.seekCeil(target));
- if (term == null) {
- return SeekStatus.END;
- } else {
- return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
- }
- }
-
- @Override
- public void seekExact(BytesRef target, TermState otherState) {
- if (!target.equals(term)) {
- state.copyFrom(otherState);
- term = BytesRef.deepCopyOf(target);
- seekPending = true;
- }
- }
- }
-
- // Iterates intersect result with automaton (cannot seek!)
- private final class IntersectTermsEnum extends BaseTermsEnum {
- /* True when current term's metadata is decoded */
- bool decoded;
-
- /* True when there is pending term when calling next() */
- bool pending;
-
- /* stack to record how current term is constructed,
- * used to accumulate metadata or rewind term:
- * level == term.length + 1,
- * == 0 when term is null */
- Frame[] stack;
- int level;
-
- /* to which level the metadata is accumulated
- * so that we can accumulate metadata lazily */
- int metaUpto;
-
- /* term dict fst */
- final FST<FSTTermOutputs.TermData> fst;
- final FST.BytesReader fstReader;
- final Outputs<FSTTermOutputs.TermData> fstOutputs;
-
- /* query automaton to intersect with */
- final ByteRunAutomaton fsa;
-
- private final class Frame {
- /* fst stats */
- FST.Arc<FSTTermOutputs.TermData> fstArc;
-
- /* automaton stats */
- int fsaState;
-
- Frame() {
- this.fstArc = new FST.Arc<>();
- this.fsaState = -1;
- }
-
- public String toString() {
- return "arc=" + fstArc + " state=" + fsaState;
- }
- }
-
- IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) {
- super();
- //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
- this.fst = dict;
- this.fstReader = fst.getBytesReader();
- this.fstOutputs = dict.outputs;
- this.fsa = compiled.runAutomaton;
- this.level = -1;
- this.stack = new Frame[16];
- for (int i = 0 ; i < stack.length; i++) {
- this.stack[i] = new Frame();
- }
-
- Frame frame;
- frame = loadVirtualFrame(newFrame());
- this.level++;
- frame = loadFirstFrame(newFrame());
- pushFrame(frame);
-
- this.meta = null;
- this.metaUpto = 1;
- this.decoded = false;
- this.pending = false;
-
- if (startTerm == null) {
- pending = isAccept(topFrame());
- } else {
- doSeekCeil(startTerm);
- pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
- }
- }
-
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
-
- @Override
- void decodeMetaData() {
- Debug.Assert( term != null;
- if (!decoded) {
- if (meta.bytes != null) {
- bytesReader.reset(meta.bytes, 0, meta.bytes.length);
- }
- postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true);
- decoded = true;
- }
- }
-
- /** Lazily accumulate meta data, when we got a accepted term */
- void loadMetaData() {
- FST.Arc<FSTTermOutputs.TermData> last, next;
- last = stack[metaUpto].fstArc;
- while (metaUpto != level) {
- metaUpto++;
- next = stack[metaUpto].fstArc;
- next.output = fstOutputs.add(next.output, last.output);
- last = next;
- }
- if (last.isFinal()) {
- meta = fstOutputs.add(last.output, last.nextFinalOutput);
- } else {
- meta = last.output;
- }
- state.docFreq = meta.docFreq;
- state.totalTermFreq = meta.totalTermFreq;
- }
-
- @Override
- public SeekStatus seekCeil(BytesRef target) {
- decoded = false;
- term = doSeekCeil(target);
- loadMetaData();
- if (term == null) {
- return SeekStatus.END;
- } else {
- return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
- }
- }
-
- @Override
- public BytesRef next() {
- //if (TEST) System.out.println("Enum next()");
- if (pending) {
- pending = false;
- loadMetaData();
- return term;
- }
- decoded = false;
- DFS:
- while (level > 0) {
- Frame frame = newFrame();
- if (loadExpandFrame(topFrame(), frame) != null) { // has valid target
- pushFrame(frame);
- if (isAccept(frame)) { // gotcha
- break;
- }
- continue; // check next target
- }
- frame = popFrame();
- while(level > 0) {
- if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling
- pushFrame(frame);
- if (isAccept(frame)) { // gotcha
- break DFS;
- }
- continue DFS; // check next target
- }
- frame = popFrame();
- }
- return null;
- }
- loadMetaData();
- return term;
- }
-
- private BytesRef doSeekCeil(BytesRef target) {
- //if (TEST) System.out.println("Enum doSeekCeil()");
- Frame frame= null;
- int label, upto = 0, limit = target.length;
- while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
- frame = newFrame();
- label = target.bytes[upto] & 0xff;
- frame = loadCeilFrame(label, topFrame(), frame);
- if (frame == null || frame.fstArc.label != label) {
- break;
- }
- Debug.Assert( isValid(frame); // target must be fetched from automaton
- pushFrame(frame);
- upto++;
- }
- if (upto == limit) { // got target
- return term;
- }
- if (frame != null) { // got larger term('s prefix)
- pushFrame(frame);
- return isAccept(frame) ? term : next();
- }
- while (level > 0) { // got target's prefix, advance to larger term
- frame = popFrame();
- while (level > 0 && !canRewind(frame)) {
- frame = popFrame();
- }
- if (loadNextFrame(topFrame(), frame) != null) {
- pushFrame(frame);
- return isAccept(frame) ? term : next();
- }
- }
- return null;
- }
-
- /** Virtual frame, never pop */
- Frame loadVirtualFrame(Frame frame) {
- frame.fstArc.output = fstOutputs.getNoOutput();
- frame.fstArc.nextFinalOutput = fstOutputs.getNoOutput();
- frame.fsaState = -1;
- return frame;
- }
-
- /** Load frame for start arc(node) on fst */
- Frame loadFirstFrame(Frame frame) {
- frame.fstArc = fst.getFirstArc(frame.fstArc);
- frame.fsaState = fsa.getInitialState();
- return frame;
- }
-
- /** Load frame for target arc(node) on fst */
- Frame loadExpandFrame(Frame top, Frame frame) {
- if (!canGrow(top)) {
- return null;
- }
- frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
- frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
- //if (TEST) System.out.println(" loadExpand frame="+frame);
- if (frame.fsaState == -1) {
- return loadNextFrame(top, frame);
- }
- return frame;
- }
-
- /** Load frame for sibling arc(node) on fst */
- Frame loadNextFrame(Frame top, Frame frame) {
- if (!canRewind(frame)) {
- return null;
- }
- while (!frame.fstArc.isLast()) {
- frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
- frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
- if (frame.fsaState != -1) {
- break;
- }
- }
- //if (TEST) System.out.println(" loadNext frame="+frame);
- if (frame.fsaState == -1) {
- return null;
- }
- return frame;
- }
-
- /** Load frame for target arc(node) on fst, so that
- * arc.label >= label and !fsa.reject(arc.label) */
- Frame loadCeilFrame(int label, Frame top, Frame frame) {
- FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
- arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
- if (arc == null) {
- return null;
- }
- frame.fsaState = fsa.step(top.fsaState, arc.label);
- //if (TEST) System.out.println(" loadCeil frame="+frame);
- if (frame.fsaState == -1) {
- return loadNextFrame(top, frame);
- }
- return frame;
- }
-
- bool isAccept(Frame frame) { // reach a term both fst&fsa accepts
- return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal();
- }
- bool isValid(Frame frame) { // reach a prefix both fst&fsa won't reject
- return /*frame != null &&*/ frame.fsaState != -1;
- }
- bool canGrow(Frame frame) { // can walk forward on both fst&fsa
- return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
- }
- bool canRewind(Frame frame) { // can jump to sibling
- return !frame.fstArc.isLast();
- }
-
- void pushFrame(Frame frame) {
- term = grow(frame.fstArc.label);
- level++;
- //if (TEST) System.out.println(" term=" + term + " level=" + level);
- }
-
- Frame popFrame() {
- term = shrink();
- level--;
- metaUpto = metaUpto > level ? level : metaUpto;
- //if (TEST) System.out.println(" term=" + term + " level=" + level);
- return stack[level+1];
- }
-
- Frame newFrame() {
- if (level+1 == stack.length) {
- final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(stack, 0, temp, 0, stack.length);
- for (int i = stack.length; i < temp.length; i++) {
- temp[i] = new Frame();
- }
- stack = temp;
- }
- return stack[level+1];
- }
-
- Frame topFrame() {
- return stack[level];
- }
-
- BytesRef grow(int label) {
- if (term == null) {
- term = new BytesRef(new byte[16], 0, 0);
- } else {
- if (term.length == term.bytes.length) {
- term.grow(term.length+1);
- }
- term.bytes[term.length++] = (byte)label;
- }
- return term;
- }
-
- BytesRef shrink() {
- if (term.length == 0) {
- term = null;
- } else {
- term.length--;
- }
- return term;
- }
- }
- }
-
- static<T> void walk(FST<T> fst) {
- final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
- final BitSet seen = new BitSet();
- final FST.BytesReader reader = fst.getBytesReader();
- final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
- queue.add(startArc);
- while (!queue.isEmpty()) {
- final FST.Arc<T> arc = queue.remove(0);
- final long node = arc.target;
- //System.out.println(arc);
- if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
- seen.set((int) node);
- fst.readFirstRealTargetArc(node, arc, reader);
- while (true) {
- queue.add(new FST.Arc<T>().copyFrom(arc));
- if (arc.isLast()) {
- break;
- } else {
- fst.readNextRealArc(arc, reader);
- }
- }
- }
- }
- }
-
- @Override
- public long ramBytesUsed() {
- long ramBytesUsed = 0;
- for (TermsReader r : fields.values()) {
- ramBytesUsed += r.dict == null ? 0 : r.dict.sizeInBytes();
- }
- return ramBytesUsed;
- }
-
- @Override
- public void checkIntegrity() {
- postingsReader.checkIntegrity();
- }
-}
+using System;
+using System.Diagnostics;
+using System.Collections;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.codecs.memory
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ using CorruptIndexException = org.apache.lucene.index.CorruptIndexException;
+ using DocsAndPositionsEnum = org.apache.lucene.index.DocsAndPositionsEnum;
+ using DocsEnum = org.apache.lucene.index.DocsEnum;
+ using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
+ using FieldInfo = org.apache.lucene.index.FieldInfo;
+ using FieldInfos = org.apache.lucene.index.FieldInfos;
+ using IndexFileNames = org.apache.lucene.index.IndexFileNames;
+ using SegmentInfo = org.apache.lucene.index.SegmentInfo;
+ using SegmentReadState = org.apache.lucene.index.SegmentReadState;
+ using TermState = org.apache.lucene.index.TermState;
+ using Terms = org.apache.lucene.index.Terms;
+ using TermsEnum = org.apache.lucene.index.TermsEnum;
+ using ByteArrayDataInput = org.apache.lucene.store.ByteArrayDataInput;
+ using IndexInput = org.apache.lucene.store.IndexInput;
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+ using Bits = org.apache.lucene.util.Bits;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using RamUsageEstimator = org.apache.lucene.util.RamUsageEstimator;
+ using ByteRunAutomaton = org.apache.lucene.util.automaton.ByteRunAutomaton;
+ using CompiledAutomaton = org.apache.lucene.util.automaton.CompiledAutomaton;
+ using InputOutput = org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
+ using BytesRefFSTEnum = org.apache.lucene.util.fst.BytesRefFSTEnum;
+ using FST = org.apache.lucene.util.fst.FST;
+ using Outputs = org.apache.lucene.util.fst.Outputs;
+ using Util = org.apache.lucene.util.fst.Util;
+
+ /// <summary>
+ /// FST-based terms dictionary reader.
+ ///
+ /// The FST directly maps each term and its metadata,
+ /// it is memory resident.
+ ///
+ /// @lucene.experimental
+ /// </summary>
+
+ public class FSTTermsReader : FieldsProducer
+ {
+ internal readonly SortedDictionary<string, TermsReader> fields = new SortedDictionary<string, TermsReader>();
+ internal readonly PostingsReaderBase postingsReader;
+ //static boolean TEST = false;
+ internal readonly int version;
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public FSTTermsReader(org.apache.lucene.index.SegmentReadState state, org.apache.lucene.codecs.PostingsReaderBase postingsReader) throws java.io.IOException
+ public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final String termsFileName = org.apache.lucene.index.IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);
+ string termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);
+
+ this.postingsReader = postingsReader;
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.store.IndexInput in = state.directory.openInput(termsFileName, state.context);
+ IndexInput @in = state.directory.openInput(termsFileName, state.context);
+
+ bool success = false;
+ try
+ {
+ version = readHeader(@in);
+ if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM)
+ {
+ CodecUtil.checksumEntireFile(@in);
+ }
+ this.postingsReader.init(@in);
+ seekDir(@in);
+
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.index.FieldInfos fieldInfos = state.fieldInfos;
+ FieldInfos fieldInfos = state.fieldInfos;
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int numFields = in.readVInt();
+ int numFields = @in.readVInt();
+ for (int i = 0; i < numFields; i++)
+ {
+ int fieldNumber = @in.readVInt();
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
+ long numTerms = @in.readVLong();
+ long sumTotalTermFreq = fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY ? - 1 : @in.readVLong();
+ long sumDocFreq = @in.readVLong();
+ int docCount = @in.readVInt();
+ int longsSize = @in.readVInt();
+ TermsReader current = new TermsReader(this, fieldInfo, @in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
+ TermsReader previous = fields[fieldInfo.name] = current;
+ checkFieldSummary(state.segmentInfo, @in, current, previous);
+ }
+ success = true;
+ }
+ finally
+ {
+ if (success)
+ {
+ IOUtils.close(@in);
+ }
+ else
+ {
+ IOUtils.closeWhileHandlingException(@in);
+ }
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private int readHeader(org.apache.lucene.store.IndexInput in) throws java.io.IOException
+ private int readHeader(IndexInput @in)
+ {
+ return CodecUtil.checkHeader(@in, FSTTermsWriter.TERMS_CODEC_NAME, FSTTermsWriter.TERMS_VERSION_START, FSTTermsWriter.TERMS_VERSION_CURRENT);
+ }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void seekDir(org.apache.lucene.store.IndexInput in) throws java.io.IOException
+ private void seekDir(IndexInput @in)
+ {
+ if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM)
+ {
+ @in.seek(@in.length() - CodecUtil.footerLength() - 8);
+ }
+ else
+ {
+ @in.seek(@in.length() - 8);
+ }
+ @in.seek(@in.readLong());
+ }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void checkFieldSummary(org.apache.lucene.index.SegmentInfo info, org.apache.lucene.store.IndexInput in, TermsReader field, TermsReader previous) throws java.io.IOException
+ private void checkFieldSummary(SegmentInfo info, IndexInput @in, TermsReader field, TermsReader previous)
+ {
+ // #docs with field must be <= #docs
+ if (field.docCount < 0 || field.docCount > info.DocCount)
+ {
+ throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.DocCount + " (resource=" + @in + ")");
+ }
+ // #postings must be >= #docs with field
+ if (field.sumDocFreq < field.docCount)
+ {
+ throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (resource=" + @in + ")");
+ }
+ // #positions must be >= #postings
+ if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq)
+ {
+ throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (resource=" + @in + ")");
+ }
+ if (previous != null)
+ {
+ throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (resource=" + @in + ")");
+ }
+ }
+
+ public override IEnumerator<string> iterator()
+ {
+ return Collections.unmodifiableSet(fields.Keys).GetEnumerator();
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.Terms terms(String field) throws java.io.IOException
+ public override Terms terms(string field)
+ {
+ Debug.Assert(field != null);
+ return fields[field];
+ }
+
+ public override int size()
+ {
+ return fields.Count;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void close() throws java.io.IOException
+ public override void close()
+ {
+ try
+ {
+ IOUtils.close(postingsReader);
+ }
+ finally
+ {
+ fields.Clear();
+ }
+ }
+
+ internal sealed class TermsReader : Terms
+ {
+ private readonly FSTTermsReader outerInstance;
+
+ internal readonly FieldInfo fieldInfo;
+ internal readonly long numTerms;
+ internal readonly long sumTotalTermFreq;
+ internal readonly long sumDocFreq;
+ internal readonly int docCount;
+ internal readonly int longsSize;
+ internal readonly FST<FSTTermOutputs.TermData> dict;
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: TermsReader(org.apache.lucene.index.FieldInfo fieldInfo, org.apache.lucene.store.IndexInput in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws java.io.IOException
+ internal TermsReader(FSTTermsReader outerInstance, FieldInfo fieldInfo, IndexInput @in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize)
+ {
+ this.outerInstance = outerInstance;
+ this.fieldInfo = fieldInfo;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.longsSize = longsSize;
+ this.dict = new FST<>(@in, new FSTTermOutputs(fieldInfo, longsSize));
+ }
+
+ public override IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return BytesRef.UTF8SortedAsUnicodeComparator;
+ }
+ }
+
+ public override bool hasFreqs()
+ {
+ return fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+ }
+
+ public override bool hasOffsets()
+ {
+ return fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ }
+
+ public override bool hasPositions()
+ {
+ return fieldInfo.IndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ }
+
+ public override bool hasPayloads()
+ {
+ return fieldInfo.hasPayloads();
+ }
+
+ public override long size()
+ {
+ return numTerms;
+ }
+
+ public override long SumTotalTermFreq
+ {
+ get
+ {
+ return sumTotalTermFreq;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public long getSumDocFreq() throws java.io.IOException
+ public override long SumDocFreq
+ {
+ get
+ {
+ return sumDocFreq;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public int getDocCount() throws java.io.IOException
+ public override int DocCount
+ {
+ get
+ {
+ return docCount;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.TermsEnum iterator(org.apache.lucene.index.TermsEnum reuse) throws java.io.IOException
+ /// <summary>
+ /// Returns a new <c>SegmentTermsEnum</c> over this field's terms.
+ /// The <paramref name="reuse"/> argument is ignored; a fresh enum is always created. </summary>
+ public override TermsEnum iterator(TermsEnum reuse)
+ {
+ return new SegmentTermsEnum(this);
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.TermsEnum intersect(org.apache.lucene.util.automaton.CompiledAutomaton compiled, org.apache.lucene.util.BytesRef startTerm) throws java.io.IOException
+ /// <summary>
+ /// Returns a new <c>IntersectTermsEnum</c> built from the compiled automaton and the
+ /// optional start term (a null <paramref name="startTerm"/> is handled by that enum's constructor). </summary>
+ public override TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm)
+ {
+ return new IntersectTermsEnum(this, compiled, startTerm);
+ }
+
+ // Only wraps common operations for PBF interact
+ /// <summary>
+ /// Shared base of the terms enums in this reader. It holds the current term, its
+ /// term state and its raw metadata, and forwards postings requests to the postings
+ /// reader after asking the subclass to decode the metadata. Ord-based access
+ /// (<c>seekExact(long)</c>, <c>ord()</c>) is not supported.
+ /// </summary>
+ internal abstract class BaseTermsEnum : TermsEnum
+ {
+ private readonly FSTTermsReader.TermsReader outerInstance;
+
+ /* Current term, null when enum ends or unpositioned */
+ internal BytesRef term_Renamed;
+
+ /* Current term stats + decoded metadata (customized by PBF) */
+ internal readonly BlockTermState state;
+
+ /* Current term stats + undecoded metadata (long[] & byte[]) */
+ internal FSTTermOutputs.TermData meta;
+ internal ByteArrayDataInput bytesReader;
+
+ /// <summary>
+ /// Decodes metadata into customized term state </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: abstract void decodeMetaData() throws java.io.IOException;
+ internal abstract void decodeMetaData();
+
+ /// <summary>
+ /// Obtains a fresh term state from the postings reader and an empty byte reader;
+ /// the enum starts with no current term. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: BaseTermsEnum() throws java.io.IOException
+ internal BaseTermsEnum(FSTTermsReader.TermsReader outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ this.state = outerInstance.outerInstance.postingsReader.newTermState();
+ this.bytesReader = new ByteArrayDataInput();
+ this.term_Renamed = null;
+ // NOTE: metadata will only be initialized in child class
+ }
+
+ /// <summary>
+ /// Decodes the metadata, then returns a clone of the state rather than the
+ /// instance held by this enum. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.TermState termState() throws java.io.IOException
+ public override TermState termState()
+ {
+ decodeMetaData();
+ return state.clone();
+ }
+
+ /// <summary>
+ /// Returns the current term; null when there is none. </summary>
+ public override BytesRef term()
+ {
+ return term_Renamed;
+ }
+
+ /// <summary>
+ /// Returns <c>state.docFreq</c> as-is; no metadata decoding is triggered. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public int docFreq() throws java.io.IOException
+ public override int docFreq()
+ {
+ return state.docFreq;
+ }
+
+ /// <summary>
+ /// Returns <c>state.totalTermFreq</c> as-is; no metadata decoding is triggered. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public long totalTermFreq() throws java.io.IOException
+ public override long totalTermFreq()
+ {
+ return state.totalTermFreq;
+ }
+
+ /// <summary>
+ /// Decodes the metadata and asks the postings reader for a docs enum of the current term. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.DocsEnum docs(org.apache.lucene.util.Bits liveDocs, org.apache.lucene.index.DocsEnum reuse, int flags) throws java.io.IOException
+ public override DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
+ {
+ decodeMetaData();
+ return outerInstance.outerInstance.postingsReader.docs(outerInstance.fieldInfo, state, liveDocs, reuse, flags);
+ }
+
+ /// <summary>
+ /// Returns null when the field has no positions; otherwise decodes the metadata and
+ /// asks the postings reader for a docs-and-positions enum of the current term. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.index.DocsAndPositionsEnum docsAndPositions(org.apache.lucene.util.Bits liveDocs, org.apache.lucene.index.DocsAndPositionsEnum reuse, int flags) throws java.io.IOException
+ public override DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags)
+ {
+ if (!outerInstance.hasPositions())
+ {
+ return null;
+ }
+ decodeMetaData();
+ return outerInstance.outerInstance.postingsReader.docsAndPositions(outerInstance.fieldInfo, state, liveDocs, reuse, flags);
+ }
+
+ /// <summary>
+ /// Not supported; always throws <c>System.NotSupportedException</c>. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void seekExact(long ord) throws java.io.IOException
+ public override void seekExact(long ord)
+ {
+ throw new System.NotSupportedException();
+ }
+
+ /// <summary>
+ /// Not supported; always throws <c>System.NotSupportedException</c>. </summary>
+ public override long ord()
+ {
+ throw new System.NotSupportedException();
+ }
+ }
+
+
+ // Iterates through all terms in this field
+ /// <summary>
+ /// Terms enum over every term of the field, driven by a <c>BytesRefFSTEnum</c> on the
+ /// term dictionary. Metadata is decoded lazily, and <c>seekExact(BytesRef, TermState)</c>
+ /// only records the position: the FST enum is re-seeked on the following <c>next()</c>.
+ /// </summary>
+ private sealed class SegmentTermsEnum : BaseTermsEnum
+ {
+ private readonly FSTTermsReader.TermsReader outerInstance;
+
+ internal readonly BytesRefFSTEnum<FSTTermOutputs.TermData> fstEnum;
+
+ /* True when current term's metadata is decoded */
+ internal bool decoded;
+
+ /* True when current enum is 'positioned' by seekExact(TermState) */
+ internal bool seekPending;
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: SegmentTermsEnum() throws java.io.IOException
+ internal SegmentTermsEnum(FSTTermsReader.TermsReader outerInstance) : base(outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ // C# has no Java-style diamond operator ("new BytesRefFSTEnum<>"); the type
+ // argument must match the declared type of fstEnum.
+ this.fstEnum = new BytesRefFSTEnum<FSTTermOutputs.TermData>(outerInstance.dict);
+ this.decoded = false;
+ this.seekPending = false;
+ this.meta = null;
+ }
+
+ /// <summary>
+ /// Term ordering; always <c>BytesRef.UTF8SortedAsUnicodeComparator</c>. </summary>
+ public override IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return BytesRef.UTF8SortedAsUnicodeComparator;
+ }
+ }
+
+ // Let PBF decode metadata from long[] and byte[]
+ // Does nothing when the metadata is already decoded, or while a seekExact(TermState)
+ // is pending (in that case the state was copied from the caller's TermState).
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override void decodeMetaData() throws java.io.IOException
+ internal override void decodeMetaData()
+ {
+ if (!decoded && !seekPending)
+ {
+ if (meta.bytes != null)
+ {
+ bytesReader.reset(meta.bytes, 0, meta.bytes.Length);
+ }
+ outerInstance.outerInstance.postingsReader.decodeTerm(meta.longs, bytesReader, outerInstance.fieldInfo, state, true);
+ decoded = true;
+ }
+ }
+
+ // Update current enum according to FSTEnum
+ // A null pair clears the current term; otherwise term, metadata and the two term
+ // statistics are taken from the pair. Both lazy flags are reset in either case.
+//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
+//ORIGINAL LINE: void updateEnum(final org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput<FSTTermOutputs.TermData> pair)
+ internal void updateEnum(InputOutput<FSTTermOutputs.TermData> pair)
+ {
+ if (pair == null)
+ {
+ term_Renamed = null;
+ }
+ else
+ {
+ term_Renamed = pair.input;
+ meta = pair.output;
+ state.docFreq = meta.docFreq;
+ state.totalTermFreq = meta.totalTermFreq;
+ }
+ decoded = false;
+ seekPending = false;
+ }
+
+ /// <summary>
+ /// Advances to the next term and returns it (null at the end). If a
+ /// <c>seekExact(BytesRef, TermState)</c> is pending, the FST enum is first positioned
+ /// on the recorded term. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.util.BytesRef next() throws java.io.IOException
+ public override BytesRef next()
+ {
+ if (seekPending) // previously positioned, but termOutputs not fetched
+ {
+ seekPending = false;
+ SeekStatus status = seekCeil(term_Renamed);
+ Debug.Assert(status == SeekStatus.FOUND); // must positioned on valid term
+ }
+ updateEnum(fstEnum.next());
+ return term_Renamed;
+ }
+
+ /// <summary>
+ /// Seeks the FST enum to exactly <paramref name="target"/>; returns whether it was found. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean seekExact(org.apache.lucene.util.BytesRef target) throws java.io.IOException
+ public override bool seekExact(BytesRef target)
+ {
+ updateEnum(fstEnum.seekExact(target));
+ return term_Renamed != null;
+ }
+
+ /// <summary>
+ /// Seeks the FST enum to the ceiling of <paramref name="target"/>. Returns END when
+ /// there is no such term, FOUND when the term equals the target, NOT_FOUND otherwise. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public SeekStatus seekCeil(org.apache.lucene.util.BytesRef target) throws java.io.IOException
+ public override SeekStatus seekCeil(BytesRef target)
+ {
+ updateEnum(fstEnum.seekCeil(target));
+ if (term_Renamed == null)
+ {
+ return SeekStatus.END;
+ }
+ else
+ {
+ return term_Renamed.Equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+ }
+ }
+
+ /// <summary>
+ /// Positions the enum on <paramref name="target"/> using a caller-supplied term state,
+ /// without touching the FST enum. Nothing happens when the target already equals the
+ /// current term. </summary>
+ public override void seekExact(BytesRef target, TermState otherState)
+ {
+ if (!target.Equals(term_Renamed))
+ {
+ state.copyFrom(otherState);
+ term_Renamed = BytesRef.deepCopyOf(target);
+ seekPending = true;
+ }
+ }
+ }
+
+ // Iterates intersect result with automaton (cannot seek!)
+ private sealed class IntersectTermsEnum : BaseTermsEnum
+ {
+ private readonly FSTTermsReader.TermsReader outerInstance;
+
+ /* True when current term's metadata is decoded */
+ internal bool decoded;
+
+ /* True when there is pending term when calling next() */
+ internal bool pending;
+
+ /* stack to record how current term is constructed,
+ * used to accumulate metadata or rewind term:
+ * level == term.length + 1,
+ * == 0 when term is null */
+ internal Frame[] stack;
+ internal int level;
+
+ /* to which level the metadata is accumulated
+ * so that we can accumulate metadata lazily */
+ internal int metaUpto;
+
+ /* term dict fst */
+ internal readonly FST<FSTTermOutputs.TermData> fst;
+ internal readonly FST.BytesReader fstReader;
+ internal readonly Outputs<FSTTermOutputs.TermData> fstOutputs;
+
+ /* query automaton to intersect with */
+ internal readonly ByteRunAutomaton fsa;
+
+ /// <summary>
+ /// One level of the traversal stack: an FST arc paired with an automaton state.
+ /// A new frame has a fresh arc and an automaton state of -1.
+ /// </summary>
+ // Declared internal rather than private: the enclosing enum exposes Frame through
+ // internal members (the stack field and the load*/push/pop helpers), and C# rejects
+ // members that are more accessible than the types in their signatures.
+ internal sealed class Frame
+ {
+ private readonly FSTTermsReader.TermsReader.IntersectTermsEnum outerInstance;
+
+ /* fst stats */
+ internal FST.Arc<FSTTermOutputs.TermData> fstArc;
+
+ /* automaton stats */
+ internal int fsaState;
+
+ internal Frame(FSTTermsReader.TermsReader.IntersectTermsEnum outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ // C# has no Java-style diamond operator ("new FST.Arc<>"); spell out the type argument.
+ this.fstArc = new FST.Arc<FSTTermOutputs.TermData>();
+ this.fsaState = -1;
+ }
+
+ public override string ToString()
+ {
+ return "arc=" + fstArc + " state=" + fsaState;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: IntersectTermsEnum(org.apache.lucene.util.automaton.CompiledAutomaton compiled, org.apache.lucene.util.BytesRef startTerm) throws java.io.IOException
+ /// <summary>
+ /// Sets up the traversal stack and positions the enum. With a null
+ /// <paramref name="startTerm"/> the initial position is pending only if it is itself an
+ /// accepted term; otherwise the enum seeks to the ceiling of the start term and marks
+ /// the found term as pending only when it differs from the start term.
+ /// </summary>
+ internal IntersectTermsEnum(FSTTermsReader.TermsReader outerInstance, CompiledAutomaton compiled, BytesRef startTerm) : base(outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
+ this.fst = outerInstance.dict;
+ this.fstReader = fst.BytesReader;
+ this.fstOutputs = outerInstance.dict.outputs;
+ this.fsa = compiled.runAutomaton;
+ // level starts at -1 so that the first newFrame() below hands out stack[0].
+ this.level = -1;
+ this.stack = new Frame[16];
+ for (int i = 0 ; i < stack.Length; i++)
+ {
+ this.stack[i] = new Frame(this);
+ }
+
+ Frame frame;
+ // stack[0] becomes the virtual frame; level is bumped by hand because it is not pushed.
+ frame = loadVirtualFrame(newFrame());
+ this.level++;
+ // stack[1] holds the FST's first arc; pushing it creates the empty term and sets level to 1.
+ frame = loadFirstFrame(newFrame());
+ pushFrame(frame);
+
+ this.meta = null;
+ this.metaUpto = 1;
+ this.decoded = false;
+ this.pending = false;
+
+ if (startTerm == null)
+ {
+ pending = isAccept(topFrame());
+ }
+ else
+ {
+ doSeekCeil(startTerm);
+ pending = !startTerm.Equals(term_Renamed) && isValid(topFrame()) && isAccept(topFrame());
+ }
+ }
+
+ /// <summary>
+ /// Term ordering; always <c>BytesRef.UTF8SortedAsUnicodeComparator</c>. </summary>
+ public override IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return BytesRef.UTF8SortedAsUnicodeComparator;
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override void decodeMetaData() throws java.io.IOException
+ /// <summary>
+ /// Hands the current term's raw metadata to the postings reader, at most once per term. </summary>
+ internal override void decodeMetaData()
+ {
+ Debug.Assert(term_Renamed != null);
+ if (decoded)
+ {
+ // already decoded for the current term
+ return;
+ }
+ var rawBytes = meta.bytes;
+ if (rawBytes != null)
+ {
+ bytesReader.reset(rawBytes, 0, rawBytes.Length);
+ }
+ outerInstance.outerInstance.postingsReader.decodeTerm(meta.longs, bytesReader, outerInstance.fieldInfo, state, true);
+ decoded = true;
+ }
+
+ /// <summary>
+ /// Lazily accumulate meta data, when we got a accepted term </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: void loadMetaData() throws java.io.IOException
+ internal void loadMetaData()
+ {
+ // Fold the outputs of the frames between metaUpto and level into each other,
+ // storing the running sum on every arc that is passed.
+ FST.Arc<FSTTermOutputs.TermData> accumulated = stack[metaUpto].fstArc;
+ while (metaUpto != level)
+ {
+ metaUpto++;
+ FST.Arc<FSTTermOutputs.TermData> current = stack[metaUpto].fstArc;
+ current.output = fstOutputs.add(current.output, accumulated.output);
+ accumulated = current;
+ }
+ // A final arc additionally contributes its final output.
+ if (!accumulated.Final)
+ {
+ meta = accumulated.output;
+ }
+ else
+ {
+ meta = fstOutputs.add(accumulated.output, accumulated.nextFinalOutput);
+ }
+ state.docFreq = meta.docFreq;
+ state.totalTermFreq = meta.totalTermFreq;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public SeekStatus seekCeil(org.apache.lucene.util.BytesRef target) throws java.io.IOException
+ /// <summary>
+ /// Seeks to the ceiling of <paramref name="target"/> among the accepted terms and loads
+ /// its metadata. Returns END when no term is left, FOUND on an exact match and
+ /// NOT_FOUND otherwise. </summary>
+ public override SeekStatus seekCeil(BytesRef target)
+ {
+ decoded = false;
+ term_Renamed = doSeekCeil(target);
+ // metadata is loaded before the null check, exactly as in the original order
+ loadMetaData();
+ if (term_Renamed == null)
+ {
+ return SeekStatus.END;
+ }
+ if (term_Renamed.Equals(target))
+ {
+ return SeekStatus.FOUND;
+ }
+ return SeekStatus.NOT_FOUND;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.util.BytesRef next() throws java.io.IOException
+ /// <summary>
+ /// Moves to the next term accepted by both the FST and the automaton and returns it,
+ /// or null when the traversal is exhausted. A term left pending by the constructor is
+ /// returned first without moving. </summary>
+ public override BytesRef next()
+ {
+ //if (TEST) System.out.println("Enum next()");
+ if (pending)
+ {
+ pending = false;
+ loadMetaData();
+ return term_Renamed;
+ }
+ decoded = false;
+ // Depth-first search: first try to go one arc deeper from the top frame ...
+ while (level > 0)
+ {
+ Frame frame = newFrame();
+ if (loadExpandFrame(topFrame(), frame) != null) // has valid target
+ {
+ pushFrame(frame);
+ if (isAccept(frame)) // gotcha
+ {
+ break;
+ }
+ continue; // check next target
+ }
+ // ... otherwise pop and try the siblings of each popped frame, walking upwards.
+ frame = popFrame();
+ while (level > 0)
+ {
+ if (loadNextFrame(topFrame(), frame) != null) // has valid sibling
+ {
+ pushFrame(frame);
+ if (isAccept(frame)) // gotcha
+ {
+ goto DFSBreak;
+ }
+ goto DFSContinue; // check next target
+ }
+ frame = popFrame();
+ }
+ // popped everything without finding a sibling: no more terms
+ return null;
+ // the two labels stand in for Java's labeled "continue DFS" / "break DFS"
+ DFSContinue:;
+ }
+ DFSBreak:
+ loadMetaData();
+ return term_Renamed;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private org.apache.lucene.util.BytesRef doSeekCeil(org.apache.lucene.util.BytesRef target) throws java.io.IOException
+ /// <summary>
+ /// Positions the stack on the smallest accepted term that is not smaller than
+ /// <paramref name="target"/> and returns it, or returns null when the traversal runs out
+ /// of frames. Metadata is not loaded here. </summary>
+ internal BytesRef doSeekCeil(BytesRef target)
+ {
+ //if (TEST) System.out.println("Enum doSeekCeil()");
+ Frame frame = null;
+ int label , upto = 0, limit = target.length;
+ // Follow the target byte by byte for as long as an arc with exactly that label exists.
+ while (upto < limit) // to target prefix, or ceil label (rewind prefix)
+ {
+ frame = newFrame();
+ label = target.bytes[upto] & 0xff;
+ frame = loadCeilFrame(label, topFrame(), frame);
+ if (frame == null || frame.fstArc.label != label)
+ {
+ break;
+ }
+ Debug.Assert(isValid(frame)); // target must be fetched from automaton
+ pushFrame(frame);
+ upto++;
+ }
+ if (upto == limit) // got target
+ {
+ return term_Renamed;
+ }
+ // The loop stopped on an arc with a different label: continue from that arc.
+ if (frame != null) // got larger term('s prefix)
+ {
+ pushFrame(frame);
+ return isAccept(frame) ? term_Renamed : next();
+ }
+ // No ceiling arc at this depth: pop until a frame with a sibling is found.
+ while (level > 0) // got target's prefix, advance to larger term
+ {
+ frame = popFrame();
+ while (level > 0 && !canRewind(frame))
+ {
+ frame = popFrame();
+ }
+ if (loadNextFrame(topFrame(), frame) != null)
+ {
+ pushFrame(frame);
+ return isAccept(frame) ? term_Renamed : next();
+ }
+ }
+ return null;
+ }
+
+ /// <summary>
+ /// Virtual frame, never pop </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: Frame loadVirtualFrame(Frame frame) throws java.io.IOException
+ internal Frame loadVirtualFrame(Frame frame)
+ {
+ // Both outputs are the "no output" value and the automaton state is -1;
+ // the frame is returned so the call can be chained.
+ frame.fstArc.output = fstOutputs.NoOutput;
+ frame.fstArc.nextFinalOutput = fstOutputs.NoOutput;
+ frame.fsaState = -1;
+ return frame;
+ }
+
+ /// <summary>
+ /// Load frame for start arc(node) on fst </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: Frame loadFirstFrame(Frame frame) throws java.io.IOException
+ internal Frame loadFirstFrame(Frame frame)
+ {
+ // Pairs the FST's first arc with the automaton's initial state.
+ frame.fstArc = fst.getFirstArc(frame.fstArc);
+ frame.fsaState = fsa.InitialState;
+ return frame;
+ }
+
+ /// <summary>
+ /// Load frame for target arc(node) on fst </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: Frame loadExpandFrame(Frame top, Frame frame) throws java.io.IOException
+ internal Frame loadExpandFrame(Frame top, Frame frame)
+ {
+ if (canGrow(top))
+ {
+ // Read the first arc leaving top's target and step the automaton on its label.
+ frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
+ frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+ //if (TEST) System.out.println(" loadExpand frame="+frame);
+ // A rejected label (-1) falls through to the sibling arcs.
+ return frame.fsaState == -1 ? loadNextFrame(top, frame) : frame;
+ }
+ return null;
+ }
+
+ /// <summary>
+ /// Load frame for sibling arc(node) on fst </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: Frame loadNextFrame(Frame top, Frame frame) throws java.io.IOException
+ internal Frame loadNextFrame(Frame top, Frame frame)
+ {
+ // Nothing to do when the frame's arc is already the last sibling.
+ if (!canRewind(frame))
+ {
+ return null;
+ }
+ // Advance through the remaining siblings until the automaton accepts a label
+ // (state != -1) or the last sibling has been read.
+ while (!frame.fstArc.Last)
+ {
+ frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
+ frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+ if (frame.fsaState != -1)
+ {
+ break;
+ }
+ }
+ //if (TEST) System.out.println(" loadNext frame="+frame);
+ // Still -1 here means every remaining sibling was rejected.
+ if (frame.fsaState == -1)
+ {
+ return null;
+ }
+ return frame;
+ }
+
+ /// <summary>
+ /// Load frame for target arc(node) on fst, so that
+ /// arc.label >= label and !fsa.reject(arc.label)
+ /// </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: Frame loadCeilFrame(int label, Frame top, Frame frame) throws java.io.IOException
+ internal Frame loadCeilFrame(int label, Frame top, Frame frame)
+ {
+ // The frame's own arc object is handed to readCeilArc as the arc to fill in.
+ FST.Arc<FSTTermOutputs.TermData> ceilArc = Util.readCeilArc(label, fst, top.fstArc, frame.fstArc, fstReader);
+ if (ceilArc == null)
+ {
+ return null;
+ }
+ frame.fsaState = fsa.step(top.fsaState, ceilArc.label);
+ //if (TEST) System.out.println(" loadCeil frame="+frame);
+ // A rejected label (-1) falls through to the sibling arcs.
+ return frame.fsaState == -1 ? loadNextFrame(top, frame) : frame;
+ }
+
+ // True when the automaton accepts the frame's state and the frame's FST arc is final.
+ internal bool isAccept(Frame frame) // reach a term both fst&fsa accepts
+ {
+ return fsa.isAccept(frame.fsaState) && frame.fstArc.Final;
+ }
+ // True when the frame's automaton state is not the rejected state (-1).
+ internal bool isValid(Frame frame) // reach a prefix both fst&fsa won't reject
+ {
+ return frame.fsaState != -1; //frame != null &&
+ }
+ // True when the automaton state is not rejected (-1) and the arc's target has outgoing arcs.
+ internal bool canGrow(Frame frame) // can walk forward on both fst&fsa
+ {
+ return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
+ }
+ // True when the frame's arc is not the last arc of its node.
+ internal bool canRewind(Frame frame) // can jump to sibling
+ {
+ return !frame.fstArc.Last;
+ }
+
+ /// <summary>
+ /// Extends the current term with the frame's arc label and moves one level up the stack.
+ /// The frame itself is not stored here; callers obtain it from <c>newFrame()</c>, which
+ /// already returns the slot at <c>level + 1</c>. </summary>
+ internal void pushFrame(Frame frame)
+ {
+ term_Renamed = grow(frame.fstArc.label);
+ level++;
+ //if (TEST) System.out.println(" term=" + term + " level=" + level);
+ }
+
+ /// <summary>
+ /// Drops the last byte of the current term, moves one level down and returns the frame
+ /// that was on top before the call. </summary>
+ internal Frame popFrame()
+ {
+ term_Renamed = shrink();
+ level--;
+ // accumulated metadata can never reach beyond the current level
+ if (metaUpto > level)
+ {
+ metaUpto = level;
+ }
+ //if (TEST) System.out.println(" term=" + term + " level=" + level);
+ return stack[level + 1];
+ }
+
+ /// <summary>
+ /// Returns the stack slot just above the current level (<c>stack[level + 1]</c>) without
+ /// changing <c>level</c>. When that slot does not exist yet, the stack is reallocated
+ /// and the added slots are filled with fresh frames. </summary>
+ internal Frame newFrame()
+ {
+ if (level + 1 == stack.Length)
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final Frame[] temp = new Frame[org.apache.lucene.util.ArrayUtil.oversize(level+2, org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ Frame[] temp = new Frame[ArrayUtil.oversize(level + 2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+ // existing frames are carried over; only the new tail needs allocating
+ Array.Copy(stack, 0, temp, 0, stack.Length);
+ for (int i = stack.Length; i < temp.Length; i++)
+ {
+ temp[i] = new Frame(this);
+ }
+ stack = temp;
+ }
+ return stack[level + 1];
+ }
+
+ /// <summary>
+ /// Returns the frame at the current level, <c>stack[level]</c>. </summary>
+ internal Frame topFrame()
+ {
+ return stack[level];
+ }
+
+ /// <summary>
+ /// Appends <paramref name="label"/> as one byte to the current term and returns the term.
+ /// When there is no current term yet, an empty term backed by a 16-byte buffer is
+ /// created instead and the label is not appended. </summary>
+ internal BytesRef grow(int label)
+ {
+ if (term_Renamed == null)
+ {
+ term_Renamed = new BytesRef(new sbyte[16], 0, 0);
+ }
+ else
+ {
+ // bytes is an array, so its capacity is the .NET Length property (Java's lowercase
+ // "length" does not exist on arrays); term_Renamed.length is the BytesRef's own field.
+ if (term_Renamed.length == term_Renamed.bytes.Length)
+ {
+ term_Renamed.grow(term_Renamed.length + 1);
+ }
+ term_Renamed.bytes[term_Renamed.length++] = (sbyte)label;
+ }
+ return term_Renamed;
+ }
+
+ /// <summary>
+ /// Removes the last byte of the current term and returns the term; an already empty
+ /// term becomes null instead. </summary>
+ internal BytesRef shrink()
+ {
+ if (term_Renamed.length != 0)
+ {
+ term_Renamed.length--;
+ }
+ else
+ {
+ term_Renamed = null;
+ }
+ return term_Renamed;
+ }
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: static<T> void walk(org.apache.lucene.util.fst.FST<T> fst) throws java.io.IOException
+ /// <summary>
+ /// Breadth-first traversal over the arcs reachable from the first arc of
+ /// <paramref name="fst"/>; each target node is expanded at most once. The traversal
+ /// collects nothing and returns nothing.
+ /// </summary>
+ internal static void walk<T>(FST<T> fst)
+ {
+ // FIFO work list of arcs whose targets still have to be expanded.
+ Queue<FST.Arc<T>> queue = new Queue<FST.Arc<T>>();
+ // Nodes already expanded. A set keyed by the full long node address replaces the
+ // converter's "new BitArray()": BitArray has no parameterless constructor and,
+ // unlike java.util.BitSet, does not grow on demand.
+ HashSet<long> seen = new HashSet<long>();
+ FST.BytesReader reader = fst.BytesReader;
+ FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
+ queue.Enqueue(startArc);
+ while (queue.Count > 0)
+ {
+ // Java's List.remove(0) returns the removed head; List<T>.Remove(0) does not,
+ // so the head is dequeued instead.
+ FST.Arc<T> arc = queue.Dequeue();
+ long node = arc.target;
+ //System.out.println(arc);
+ // HashSet.Add returns false when the node was already present.
+ if (FST.targetHasArcs(arc) && seen.Add(node))
+ {
+ fst.readFirstRealTargetArc(node, arc, reader);
+ while (true)
+ {
+ // enqueue a copy, because arc is reused for the next sibling
+ queue.Enqueue((new FST.Arc<T>()).copyFrom(arc));
+ if (arc.Last)
+ {
+ break;
+ }
+ fst.readNextRealArc(arc, reader);
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Sums <c>sizeInBytes()</c> of the term dictionary of every field reader; readers
+ /// without a dictionary contribute nothing. </summary>
+ public override long ramBytesUsed()
+ {
+ long total = 0;
+ foreach (TermsReader reader in fields.Values)
+ {
+ if (reader.dict != null)
+ {
+ total += reader.dict.sizeInBytes();
+ }
+ }
+ return total;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void checkIntegrity() throws java.io.IOException
+ /// <summary>
+ /// Delegates the integrity check to the postings reader; nothing else is checked here. </summary>
+ public override void checkIntegrity()
+ {
+ postingsReader.checkIntegrity();
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d852d5b0/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs b/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
index 4b587b7..785b137 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
@@ -1,277 +1,329 @@
-package codecs.memory;
+using System.Collections.Generic;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+namespace org.apache.lucene.codecs.memory
+{
-import java.io.IOException;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Comparator;
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
-import index.FieldInfo.IndexOptions;
-import index.FieldInfo;
-import index.FieldInfos;
-import index.IndexFileNames;
-import index.SegmentWriteState;
-import store.DataOutput;
-import store.IndexOutput;
-import store.RAMOutputStream;
-import util.ArrayUtil;
-import util.BytesRef;
-import util.IOUtils;
-import util.IntsRef;
-import util.fst.Builder;
-import util.fst.FST;
-import util.fst.Util;
-import codecs.BlockTermState;
-import codecs.PostingsWriterBase;
-import codecs.PostingsConsumer;
-import codecs.FieldsConsumer;
-import codecs.TermsConsumer;
-import codecs.TermStats;
-import codecs.CodecUtil;
-/**
- * FST-based term dict, using metadata as FST output.
- *
- * The FST directly holds the mapping between <term, metadata>.
- *
- * Term metadata consists of three parts:
- * 1. term statistics: docFreq, totalTermFreq;
- * 2. monotonic long[], e.g. the pointer to the postings list for that term;
- * 3. generic byte[], e.g. other information need by postings reader.
- *
- * <p>
- * File:
- * <ul>
- * <li><tt>.tst</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
- * </ul>
- * <p>
- *
- * <a name="Termdictionary" id="Termdictionary"></a>
- * <h3>Term Dictionary</h3>
- * <p>
- * The .tst contains a list of FSTs, one for each field.
- * The FST maps a term to its corresponding statistics (e.g. docfreq)
- * and metadata (e.g. information for postings list reader like file pointer
- * to postings list).
- * </p>
- * <p>
- * Typically the metadata is separated into two parts:
- * <ul>
- * <li>
- * Monotonical long array: Some metadata will always be ascending in order
- * with the corresponding term. This part is used by FST to share outputs between arcs.
- * </li>
- * <li>
- * Generic byte array: Used to store non-monotonic metadata.
- * </li>
- * </ul>
- * </p>
- *
- * File format:
- * <ul>
- * <li>TermsDict(.tst) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
- * <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?,
- * SumDocFreq, DocCount, LongsSize, TermFST ><sup>NumFields</sup></li>
- * <li>TermFST --> {@link FST FST<TermData>}</li>
- * <li>TermData --> Flag, BytesSize?, LongDelta<sup>LongsSize</sup>?, Byte<sup>BytesSize</sup>?,
- * < DocFreq[Same?], (TotalTermFreq-DocFreq) > ? </li>
- * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
- * <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
- * <li>DocFreq, LongsSize, BytesSize, NumFields,
- * FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}</li>
- * <li>TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta -->
- * {@link DataOutput#writeVLong VLong}</li>
- * </ul>
- * <p>Notes:</p>
- * <ul>
- * <li>
- * The format of PostingsHeader and generic meta bytes are customized by the specific postings implementation:
- * they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
- * (non-monotonic ones like pulsed postings data).
- * </li>
- * <li>
- * The format of TermData is determined by FST, typically monotonic metadata will be dense around shallow arcs,
- * while in deeper arcs only generic bytes and term statistics exist.
- * </li>
- * <li>
- * The byte Flag is used to indicate which part of metadata exists on current arc. Specially the monotonic part
- * is omitted when it is an array of 0s.
- * </li>
- * <li>
- * Since LongsSize is per-field fixed, it is only written once in field summary.
- * </li>
- * </ul>
- *
- * @lucene.experimental
- */
+ using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
+ using FieldInfo = org.apache.lucene.index.FieldInfo;
+ using FieldInfos = org.apache.lucene.index.FieldInfos;
+ using IndexFileNames = org.apache.lucene.index.IndexFileNames;
+ using SegmentWriteState = org.apache.lucene.index.SegmentWriteState;
+ using DataOutput = org.apache.lucene.store.DataOutput;
+ using IndexOutput = org.apache.lucene.store.IndexOutput;
+ using RAMOutputStream = org.apache.lucene.store.RAMOutputStream;
+ using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+ using BytesRef = org.apache.lucene.util.BytesRef;
+ using IOUtils = org.apache.lucene.util.IOUtils;
+ using IntsRef = org.apache.lucene.util.IntsRef;
+ using Builder = org.apache.lucene.util.fst.Builder;
+ using FST = org.apache.lucene.util.fst.FST;
+ using Util = org.apache.lucene.util.fst.Util;
-public class FSTTermsWriter extends FieldsConsumer {
- static final String TERMS_EXTENSION = "tmp";
- static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
- public static final int TERMS_VERSION_START = 0;
- public static final int TERMS_VERSION_CHECKSUM = 1;
- public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_CHECKSUM;
-
- final PostingsWriterBase postingsWriter;
- final FieldInfos fieldInfos;
- IndexOutput out;
- final List<FieldMetaData> fields = new ArrayList<>();
+ /// <summary>
+ /// FST-based term dict, using metadata as FST output.
+ ///
+ /// The FST directly holds the mapping between &lt;term, metadata&gt;.
+ ///
+ /// Term metadata consists of three parts:
+ /// 1. term statistics: docFreq, totalTermFreq;
+ /// 2. monotonic long[], e.g. the pointer to the postings list for that term;
+ /// 3. generic byte[], e.g. other information needed by the postings reader.
+ ///
+ /// <para>
+ /// File:
+ /// <ul>
+ /// <li><tt>.tst</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ /// </ul>
+ /// </para>
+ /// <para>
+ ///
+ /// <a name="Termdictionary" id="Termdictionary"></a>
+ /// <h3>Term Dictionary</h3>
+ /// </para>
+ /// <para>
+ /// The .tst contains a list of FSTs, one for each field.
+ /// The FST maps a term to its corresponding statistics (e.g. docfreq)
+ /// and metadata (e.g. information for postings list reader like file pointer
+ /// to postings list).
+ /// </para>
+ /// <para>
+ /// Typically the metadata is separated into two parts:
+ /// <ul>
+ /// <li>
+ /// Monotonic long array: Some metadata will always be ascending in order
+ /// with the corresponding term. This part is used by FST to share outputs between arcs.
+ /// </li>
+ /// <li>
+ /// Generic byte array: Used to store non-monotonic metadata.
+ /// </li>
+ /// </ul>
+ /// </para>
+ ///
+ /// File format:
+ /// <ul>
+ /// <li>TermsDict(.tst) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
+ /// <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?,
+ /// SumDocFreq, DocCount, LongsSize, TermFST ><sup>NumFields</sup></li>
+ /// <li>TermFST --> <seealso cref="FST FST<TermData>"/></li>
+ /// <li>TermData --> Flag, BytesSize?, LongDelta<sup>LongsSize</sup>?, Byte<sup>BytesSize</sup>?,
+ /// < DocFreq[Same?], (TotalTermFreq-DocFreq) > ? </li>
+ /// <li>Header --> <seealso cref="CodecUtil#writeHeader CodecHeader"/></li>
+ /// <li>DirOffset --> <seealso cref="DataOutput#writeLong Uint64"/></li>
+ /// <li>DocFreq, LongsSize, BytesSize, NumFields,
+ /// FieldNumber, DocCount --> <seealso cref="DataOutput#writeVInt VInt"/></li>
+ /// <li>TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta -->
+ /// <seealso cref="DataOutput#writeVLong VLong"/></li>
+ /// </ul>
+ /// <para>Notes:</para>
+ /// <ul>
+ /// <li>
+ /// The format of PostingsHeader and generic meta bytes are customized by the specific postings implementation:
+ /// they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
+ /// (non-monotonic ones like pulsed postings data).
+ /// </li>
+ /// <li>
+ /// The format of TermData is determined by FST, typically monotonic metadata will be dense around shallow arcs,
+ /// while in deeper arcs only generic bytes and term statistics exist.
+ /// </li>
+ /// <li>
+ /// The Flag byte indicates which parts of the metadata exist on the current arc. In particular, the monotonic part
+ /// is omitted when it is an array of 0s.
+ /// </li>
+ /// <li>
+ /// Since LongsSize is per-field fixed, it is only written once in field summary.
+ /// </li>
+ /// </ul>
+ ///
+ /// @lucene.experimental
+ /// </summary>
- public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) {
- final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+ public class FSTTermsWriter : FieldsConsumer
+ {
+ internal const string TERMS_EXTENSION = "tmp"; // extension the constructor passes to IndexFileNames.segmentFileName when naming the terms file
+ internal const string TERMS_CODEC_NAME = "FST_TERMS_DICT"; // codec name that writeHeader hands to CodecUtil.writeHeader
+ public const int TERMS_VERSION_START = 0; // lowest format version number
+ public const int TERMS_VERSION_CHECKSUM = 1; // NOTE(review): presumably the version that introduced the CodecUtil.writeFooter call in close() -- confirm
+ public const int TERMS_VERSION_CURRENT = TERMS_VERSION_CHECKSUM; // version number that writeHeader stamps into new files
- this.postingsWriter = postingsWriter;
- this.fieldInfos = state.fieldInfos;
- this.out = state.directory.createOutput(termsFileName, state.context);
+ internal readonly PostingsWriterBase postingsWriter; // supplied to the constructor; closed together with @out in close()
+ internal readonly FieldInfos fieldInfos; // copied from state.fieldInfos in the constructor
+ internal IndexOutput @out; // terms file output; close() sets it to null so a repeated close() does nothing
+ internal readonly IList<FieldMetaData> fields = new List<FieldMetaData>(); // appended to by TermsWriter.finish for every field that has at least one term
- bool success = false;
- try {
- writeHeader(out);
- this.postingsWriter.init(out);
- success = true;
- } finally {
- if (!success) {
- IOUtils.closeWhileHandlingException(out);
- }
- }
- }
- private void writeHeader(IndexOutput out) {
- CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
- }
- private void writeTrailer(IndexOutput out, long dirStart) {
- out.writeLong(dirStart);
- }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public FSTTermsWriter(org.apache.lucene.index.SegmentWriteState state, org.apache.lucene.codecs.PostingsWriterBase postingsWriter) throws java.io.IOException
+ public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) // creates the terms file, writes its header and lets postingsWriter initialise itself on the same output
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final String termsFileName = org.apache.lucene.index.IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
+ string termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); // name built from segment name, segment suffix and TERMS_EXTENSION
- @Override
- public TermsConsumer addField(FieldInfo field) {
- return new TermsWriter(field);
- }
+ this.postingsWriter = postingsWriter;
+ this.fieldInfos = state.fieldInfos;
+ this.@out = state.directory.createOutput(termsFileName, state.context);
- @Override
- public void close() {
- if (out != null) {
- IOException ioe = null;
- try {
- // write field summary
- final long dirStart = out.getFilePointer();
-
- out.writeVInt(fields.size());
- for (FieldMetaData field : fields) {
- out.writeVInt(field.fieldInfo.number);
- out.writeVLong(field.numTerms);
- if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
- out.writeVLong(field.sumTotalTermFreq);
- }
- out.writeVLong(field.sumDocFreq);
- out.writeVInt(field.docCount);
- out.writeVInt(field.longsSize);
- field.dict.save(out);
- }
- writeTrailer(out, dirStart);
- CodecUtil.writeFooter(out);
- } catch (IOException ioe2) {
- ioe = ioe2;
- } finally {
- IOUtils.closeWhileHandlingException(ioe, out, postingsWriter);
- out = null;
- }
- }
- }
+ bool success = false; // stays false if writeHeader or init throws
+ try
+ {
+ writeHeader(@out);
+ this.postingsWriter.init(@out); // postings writer gets the same output, right after the codec header
+ success = true;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.closeWhileHandlingException(@out); // header/init failed: close the output that was just created
+ }
+ }
+ }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void writeHeader(org.apache.lucene.store.IndexOutput out) throws java.io.IOException
+ private void writeHeader(IndexOutput @out) // writes the codec header for the terms file to the given output
+ {
+ CodecUtil.writeHeader(@out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT); // header carries TERMS_CODEC_NAME and TERMS_VERSION_CURRENT
+ }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void writeTrailer(org.apache.lucene.store.IndexOutput out, long dirStart) throws java.io.IOException
+ private void writeTrailer(IndexOutput @out, long dirStart) // writes the trailer; close() passes the file offset where the field summary begins
+ {
+ @out.writeLong(dirStart); // the trailer is this single long
+ }
- private static class FieldMetaData {
- public final FieldInfo fieldInfo;
- public final long numTerms;
- public final long sumTotalTermFreq;
- public final long sumDocFreq;
- public final int docCount;
- public final int longsSize;
- public final FST<FSTTermOutputs.TermData> dict;
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.codecs.TermsConsumer addField(org.apache.lucene.index.FieldInfo field) throws java.io.IOException
+ public override TermsConsumer addField(FieldInfo field) // returns a fresh TermsWriter for the given field
+ {
+ return new TermsWriter(this, field); // 'this' becomes the TermsWriter's outerInstance
+ }
- public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<FSTTermOutputs.TermData> fst) {
- this.fieldInfo = fieldInfo;
- this.numTerms = numTerms;
- this.sumTotalTermFreq = sumTotalTermFreq;
- this.sumDocFreq = sumDocFreq;
- this.docCount = docCount;
- this.longsSize = longsSize;
- this.dict = fst;
- }
- }
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void close() throws java.io.IOException
+ public override void close() // writes the field summary, trailer and footer, then closes @out and postingsWriter
+ {
+ if (@out != null) // @out is reset to null below, so a second call skips everything
+ {
+ IOException ioe = null;
+ try
+ {
+ // write field summary
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final long dirStart = out.getFilePointer();
+ long dirStart = @out.FilePointer; // position where the field summary starts; passed to writeTrailer below
- final class TermsWriter extends TermsConsumer {
- private final Builder<FSTTermOutputs.TermData> builder;
- private final FSTTermOutputs outputs;
- private final FieldInfo fieldInfo;
- private final int longsSize;
- private long numTerms;
+ @out.writeVInt(fields.Count);
+ foreach (FieldMetaData field in fields)
+ {
+ @out.writeVInt(field.fieldInfo.number);
+ @out.writeVLong(field.numTerms);
+ if (field.fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY) // sumTotalTermFreq is not written for DOCS_ONLY fields
+ {
+ @out.writeVLong(field.sumTotalTermFreq);
+ }
+ @out.writeVLong(field.sumDocFreq);
+ @out.writeVInt(field.docCount);
+ @out.writeVInt(field.longsSize);
+ field.dict.save(@out); // the field's FST follows its summary values
+ }
+ writeTrailer(@out, dirStart);
+ CodecUtil.writeFooter(@out);
+ }
+ catch (IOException ioe2)
+ {
+ ioe = ioe2; // kept so it can be handed to closeWhileHandlingException in the finally block
+ }
+ finally
+ {
+ IOUtils.closeWhileHandlingException(ioe, @out, postingsWriter); // NOTE(review): presumably closes both and then rethrows ioe when it is non-null -- confirm against IOUtils
+ @out = null;
+ }
+ }
+ }
- private final IntsRef scratchTerm = new IntsRef();
- private final RAMOutputStream statsWriter = new RAMOutputStream();
- private final RAMOutputStream metaWriter = new RAMOutputStream();
+ private class FieldMetaData // read-only holder for one field's summary values and FST; created in TermsWriter.finish and written out by close()
+ {
+ public readonly FieldInfo fieldInfo;
+ public readonly long numTerms;
+ public readonly long sumTotalTermFreq;
+ public readonly long sumDocFreq;
+ public readonly int docCount;
+ public readonly int longsSize;
+ public readonly FST<FSTTermOutputs.TermData> dict; // the field's term FST, saved via dict.save in close()
- TermsWriter(FieldInfo fieldInfo) {
- this.numTerms = 0;
- this.fieldInfo = fieldInfo;
- this.longsSize = postingsWriter.setField(fieldInfo);
- this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
- this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
- }
+ public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<FSTTermOutputs.TermData> fst) // stores every argument unchanged; fst is kept as dict
+ {
+ this.fieldInfo = fieldInfo;
+ this.numTerms = numTerms;
+ this.sumTotalTermFreq = sumTotalTermFreq;
+ this.sumDocFreq = sumDocFreq;
+ this.docCount = docCount;
+ this.longsSize = longsSize;
+ this.dict = fst;
+ }
+ }
- @Override
- public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUnicodeComparator();
- }
+ internal sealed class TermsWriter : TermsConsumer
+ {
+ private readonly FSTTermsWriter outerInstance; // enclosing writer; gives access to its postingsWriter and fields list
- @Override
- public PostingsConsumer startTerm(BytesRef text) {
- postingsWriter.startTerm();
- return postingsWriter;
- }
+ internal readonly Builder<FSTTermOutputs.TermData> builder; // FST builder that finishTerm adds each term to
+ internal readonly FSTTermOutputs outputs; // outputs instance handed to the builder in the constructor
+ internal readonly FieldInfo fieldInfo;
+ internal readonly int longsSize; // value returned by postingsWriter.setField for this field; length of each TermData.longs array
+ internal long numTerms; // incremented once per finishTerm call
- @Override
- public void finishTerm(BytesRef text, TermStats stats) {
- // write term meta data into fst
- final BlockTermState state = postingsWriter.newTermState();
- final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
- meta.longs = new long[longsSize];
- meta.bytes = null;
- meta.docFreq = state.docFreq = stats.docFreq;
- meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq;
- postingsWriter.finishTerm(state);
- postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true);
- final int bytesSize = (int)metaWriter.getFilePointer();
- if (bytesSize > 0) {
- meta.bytes = new byte[bytesSize];
- metaWriter.writeTo(meta.bytes, 0);
- metaWriter.reset();
- }
- builder.add(Util.toIntsRef(text, scratchTerm), meta);
- numTerms++;
- }
+ internal readonly IntsRef scratchTerm = new IntsRef(); // scratch passed to Util.toIntsRef in finishTerm
+ internal readonly RAMOutputStream statsWriter = new RAMOutputStream(); // NOTE(review): not referenced by any TermsWriter member visible here
+ internal readonly RAMOutputStream metaWriter = new RAMOutputStream(); // buffer handed to encodeTerm in finishTerm; its bytes become TermData.bytes
- @Override
- public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) {
- // save FST dict
- if (numTerms > 0) {
- final FST<FSTTermOutputs.TermData> fst = builder.finish();
- fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
- }
- }
- }
-}
+ internal TermsWriter(FSTTermsWriter outerInstance, FieldInfo fieldInfo) // sets up per-field state: registers the field with the shared postingsWriter and creates the FST builder
+ {
+ this.outerInstance = outerInstance;
+ this.numTerms = 0;
+ this.fieldInfo = fieldInfo;
+ this.longsSize = outerInstance.postingsWriter.setField(fieldInfo); // setField's return value is kept as this field's longs-array length
+ this.outputs = new FSTTermOutputs(fieldInfo, longsSize);
+ this.builder = new Builder<FSTTermOutputs.TermData>(FST.INPUT_TYPE.BYTE1, outputs); // C# has no Java-style diamond operator, so the type argument is spelled out to match the field's declared type
+ }
+
+ public override IComparer<BytesRef> Comparator // comparator for this consumer's terms
+ {
+ get
+ {
+ return BytesRef.UTF8SortedAsUnicodeComparator; // always the comparator exposed by BytesRef
+ }
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public org.apache.lucene.codecs.PostingsConsumer startTerm(org.apache.lucene.util.BytesRef text) throws java.io.IOException
+ public override PostingsConsumer startTerm(BytesRef text) // begins a new term; the text argument is not used here
+ {
+ outerInstance.postingsWriter.startTerm();
+ return outerInstance.postingsWriter; // the shared postings writer itself is the consumer for this term's postings
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void finishTerm(org.apache.lucene.util.BytesRef text, org.apache.lucene.codecs.TermStats stats) throws java.io.IOException
+ public override void finishTerm(BytesRef text, TermStats stats) // builds the TermData for the finished term and adds the term to the FST builder
+ {
+ // write term meta data into fst
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.codecs.BlockTermState state = postingsWriter.newTermState();
+ BlockTermState state = outerInstance.postingsWriter.newTermState();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
+ FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData();
+ meta.longs = new long[longsSize]; // handed to encodeTerm below
+ meta.bytes = null; // stays null unless encodeTerm leaves bytes in metaWriter
+ meta.docFreq = state.docFreq = stats.docFreq; // stats go into both the FST output and the postings term state
+ meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq;
+ outerInstance.postingsWriter.finishTerm(state);
+ outerInstance.postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); // NOTE(review): presumably fills meta.longs and writes the remaining metadata into metaWriter -- confirm against PostingsWriterBase
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int bytesSize = (int)metaWriter.getFilePointer();
+ int bytesSize = (int)metaWriter.FilePointer; // used as the length of meta.bytes
+ if (bytesSize > 0)
+ {
+ meta.bytes = new sbyte[bytesSize];
+ metaWriter.writeTo(meta.bytes, 0);
+ metaWriter.reset(); // NOTE(review): presumably empties metaWriter for the next term; only reached when bytes were written
+ }
+ builder.add(Util.toIntsRef(text, scratchTerm), meta); // term text is converted to an IntsRef (via scratchTerm) as the FST input
+ numTerms++;
+ }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws java.io.IOException
+ public override void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) // finishes the field: records its FST and totals on the enclosing writer
+ {
+ // save FST dict
+ if (numTerms > 0) // a field for which finishTerm was never called is not recorded
+ {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.util.fst.FST<FSTTermOutputs.TermData> fst = builder.finish();
+ FST<FSTTermOutputs.TermData> fst = builder.finish();
+ outerInstance.fields.Add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); // close() later writes this entry into the field summary
+ }
+ }
+ }
+ }
+
+}
\ No newline at end of file