You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2008/10/25 12:40:01 UTC
svn commit: r707836 - /lucene/java/trunk/src/java/org/apache/lucene/index/
Author: mikemccand
Date: Sat Oct 25 03:40:00 2008
New Revision: 707836
URL: http://svn.apache.org/viewvc?rev=707836&view=rev
Log:
LUCENE-1426: next steps towards flexible indexing: use the same API when writing a new segment
Added:
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (with props)
lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java (with props)
Modified:
lucene/java/trunk/src/java/org/apache/lucene/index/DefaultSkipListWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocConsumer.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumer.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumers.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessor.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java
lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocConsumer.java
lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java
lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DefaultSkipListWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DefaultSkipListWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DefaultSkipListWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DefaultSkipListWriter.java Sat Oct 25 03:40:00 2008
@@ -54,6 +54,14 @@
lastSkipProxPointer = new long[numberOfSkipLevels];
}
+ void setFreqOutput(IndexOutput freqOutput) {
+ this.freqOutput = freqOutput;
+ }
+
+ void setProxOutput(IndexOutput proxOutput) {
+ this.proxOutput = proxOutput;
+ }
+
/**
* Sets the values for the current skip data.
*/
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocConsumer.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocConsumer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocConsumer.java Sat Oct 25 03:40:00 2008
@@ -22,8 +22,8 @@
abstract class DocConsumer {
abstract DocConsumerPerThread addThread(DocumentsWriterThreadState perThread) throws IOException;
- abstract void flush(final Collection threads, final DocumentsWriter.FlushState state) throws IOException;
- abstract void closeDocStore(final DocumentsWriter.FlushState state) throws IOException;
+ abstract void flush(final Collection threads, final SegmentWriteState state) throws IOException;
+ abstract void closeDocStore(final SegmentWriteState state) throws IOException;
abstract void abort();
abstract boolean freeRAM();
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumer.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumer.java Sat Oct 25 03:40:00 2008
@@ -26,11 +26,11 @@
/** Called when DocumentsWriter decides to create a new
* segment */
- abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
+ abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;
/** Called when DocumentsWriter decides to close the doc
* stores */
- abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
+ abstract void closeDocStore(SegmentWriteState state) throws IOException;
/** Called when an aborting exception is hit */
abstract void abort();
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumers.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumers.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumers.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldConsumers.java Sat Oct 25 03:40:00 2008
@@ -44,7 +44,7 @@
two.setFieldInfos(fieldInfos);
}
- public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+ public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {
Map oneThreadsAndFields = new HashMap();
Map twoThreadsAndFields = new HashMap();
@@ -76,7 +76,7 @@
two.flush(twoThreadsAndFields, state);
}
- public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ public void closeDocStore(SegmentWriteState state) throws IOException {
try {
one.closeDocStore(state);
} finally {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessor.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessor.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessor.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessor.java Sat Oct 25 03:40:00 2008
@@ -43,11 +43,11 @@
consumer.setFieldInfos(fieldInfos);
}
- public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ public void closeDocStore(SegmentWriteState state) throws IOException {
consumer.closeDocStore(state);
}
- public void flush(Collection threads, DocumentsWriter.FlushState state) throws IOException {
+ public void flush(Collection threads, SegmentWriteState state) throws IOException {
Map childThreadsAndFields = new HashMap();
Iterator it = threads.iterator();
@@ -63,7 +63,9 @@
// consumer can alter the FieldInfo* if necessary. EG,
// FreqProxTermsWriter does this with
// FieldInfo.storePayload.
- fieldInfos.write(state.directory, state.segmentName + ".fnm");
+ final String fileName = state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION);
+ fieldInfos.write(state.directory, fileName);
+ state.flushedFiles.add(fileName);
}
public void abort() {
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocFieldProcessorPerThread.java Sat Oct 25 03:40:00 2008
@@ -87,7 +87,7 @@
/** If there are fields we've seen but did not see again
* in the last run, then free them up. */
- void trimFields(DocumentsWriter.FlushState state) {
+ void trimFields(SegmentWriteState state) {
for(int i=0;i<fieldHash.length;i++) {
DocFieldProcessorPerField perField = fieldHash[i];
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocInverter.java Sat Oct 25 03:40:00 2008
@@ -44,7 +44,7 @@
endConsumer.setFieldInfos(fieldInfos);
}
- void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+ void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {
Map childThreadsAndFields = new HashMap();
Map endChildThreadsAndFields = new HashMap();
@@ -75,7 +75,7 @@
endConsumer.flush(endChildThreadsAndFields, state);
}
- public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ public void closeDocStore(SegmentWriteState state) throws IOException {
consumer.closeDocStore(state);
endConsumer.closeDocStore(state);
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/DocumentsWriter.java Sat Oct 25 03:40:00 2008
@@ -156,20 +156,6 @@
}
}
- static class FlushState {
- DocumentsWriter docWriter;
- Directory directory;
- String segmentName;
- String docStoreSegmentName;
- int numDocsInRAM;
- int numDocsInStore;
- Collection flushedFiles;
-
- public String segmentFileName(String ext) {
- return segmentName + "." + ext;
- }
- }
-
/** Consumer returns this on each doc. This holds any
* state that must be flushed synchronized "in docID
* order". We gather these and flush them in order. */
@@ -402,7 +388,7 @@
private Collection abortedFiles; // List of files that were written before last abort()
- private FlushState flushState;
+ private SegmentWriteState flushState;
Collection abortedFiles() {
return abortedFiles;
@@ -545,18 +531,7 @@
synchronized private void initFlushState(boolean onlyDocStore) {
initSegmentName(onlyDocStore);
-
- if (flushState == null) {
- flushState = new FlushState();
- flushState.directory = directory;
- flushState.docWriter = this;
- }
-
- flushState.docStoreSegmentName = docStoreSegment;
- flushState.segmentName = segment;
- flushState.numDocsInRAM = numDocsInRAM;
- flushState.numDocsInStore = numDocsInStore;
- flushState.flushedFiles = new HashSet();
+ flushState = new SegmentWriteState(this, directory, segment, docStoreSegment, numDocsInRAM, numDocsInStore, writer.getTermIndexInterval());
}
/** Flush all pending docs to a new segment */
@@ -602,7 +577,7 @@
message(message);
}
- flushedDocCount += flushState.numDocsInRAM;
+ flushedDocCount += flushState.numDocs;
doAfterFlush();
@@ -616,7 +591,7 @@
assert waitQueue.waitingBytes == 0;
- return flushState.numDocsInRAM;
+ return flushState.numDocs;
}
/** Build compound file for the segment we just flushed */
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,34 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/**
+ * NOTE: this API is experimental and will likely change
+ */
+
+abstract class FormatPostingsDocsConsumer {
+
+ /** Adds a new doc in this term. If this returns null
+ * then we just skip consuming positions/payloads. */
+ abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException;
+
+ /** Called when we are done adding docs to this term */
+ abstract void finish() throws IOException;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,127 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Consumes doc & freq, writing them using the current
+ * index file format */
+
+import java.io.IOException;
+
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.store.IndexOutput;
+
+final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer {
+
+ final IndexOutput out;
+ final FormatPostingsTermsWriter parent;
+ final FormatPostingsPositionsWriter posWriter;
+ final DefaultSkipListWriter skipListWriter;
+ final int skipInterval;
+ final int totalNumDocs;
+
+ boolean omitTF;
+ boolean storePayloads;
+ long freqStart;
+ FieldInfo fieldInfo;
+
+ FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException {
+ super();
+ this.parent = parent;
+ final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION);
+ state.flushedFiles.add(fileName);
+ out = parent.parent.dir.createOutput(fileName);
+ totalNumDocs = parent.parent.totalNumDocs;
+
+ // TODO: abstraction violation
+ skipInterval = parent.parent.termsOut.skipInterval;
+ skipListWriter = parent.parent.skipListWriter;
+ skipListWriter.setFreqOutput(out);
+
+ posWriter = new FormatPostingsPositionsWriter(state, this);
+ }
+
+ void setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ omitTF = fieldInfo.omitTf;
+ storePayloads = fieldInfo.storePayloads;
+ posWriter.setField(fieldInfo);
+ }
+
+ int lastDocID;
+ int df;
+
+ /** Adds a new doc in this term. If this returns null
+ * then we just skip consuming positions/payloads. */
+ FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException {
+
+ final int delta = docID - lastDocID;
+
+ if (docID < 0 || (df > 0 && delta <= 0))
+ throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
+
+ if ((++df % skipInterval) == 0) {
+ // TODO: abstraction violation
+ skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength);
+ skipListWriter.bufferSkip(df);
+ }
+
+ assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;
+
+ lastDocID = docID;
+ if (omitTF)
+ out.writeVInt(delta);
+ else if (1 == termDocFreq)
+ out.writeVInt((delta<<1) | 1);
+ else {
+ out.writeVInt(delta<<1);
+ out.writeVInt(termDocFreq);
+ }
+
+ return posWriter;
+ }
+
+ private final TermInfo termInfo = new TermInfo(); // minimize consing
+ final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
+
+ /** Called when we are done adding docs to this term */
+ void finish() throws IOException {
+ long skipPointer = skipListWriter.writeSkip(out);
+
+ // TODO: this is abstraction violation -- we should not
+ // peek up into parents terms encoding format
+ termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart));
+
+ // TODO: we could do this incrementally
+ UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8);
+
+ if (df > 0) {
+ parent.termsOut.add(fieldInfo.number,
+ utf8.result,
+ utf8.length,
+ termInfo);
+ }
+
+ lastDocID = 0;
+ df = 0;
+ }
+
+ void close() throws IOException {
+ out.close();
+ posWriter.close();
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,36 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/** Abstract API that consumes terms, doc, freq, prox and
+ * payloads postings. Concrete implementations of this
+ * actually do "something" with the postings (write it into
+ * the index in a specific format).
+ *
+ * NOTE: this API is experimental and will likely change
+ */
+abstract class FormatPostingsFieldsConsumer {
+
+ /** Add a new field */
+ abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException;
+
+ /** Called when we are done adding everything. */
+ abstract void finish() throws IOException;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,73 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.Directory;
+
+final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer {
+
+ final Directory dir;
+ final String segment;
+ final TermInfosWriter termsOut;
+ final FieldInfos fieldInfos;
+ final FormatPostingsTermsWriter termsWriter;
+ final DefaultSkipListWriter skipListWriter;
+ final int totalNumDocs;
+
+ public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException {
+ super();
+
+ dir = state.directory;
+ segment = state.segmentName;
+ totalNumDocs = state.numDocs;
+ this.fieldInfos = fieldInfos;
+ termsOut = new TermInfosWriter(dir,
+ segment,
+ fieldInfos,
+ state.termIndexInterval);
+
+ // TODO: this is a nasty abstraction violation (that we
+ // peek down to find freqOut/proxOut) -- we need a
+ // better abstraction here whereby these child consumers
+ // can provide skip data or not
+ skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
+ termsOut.maxSkipLevels,
+ totalNumDocs,
+ null,
+ null);
+
+ state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
+ state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
+
+ termsWriter = new FormatPostingsTermsWriter(state, this);
+ }
+
+ /** Add a new field */
+ FormatPostingsTermsConsumer addField(FieldInfo field) {
+ termsWriter.setField(field);
+ return termsWriter;
+ }
+
+ /** Called when we are done adding everything. */
+ void finish() throws IOException {
+ termsOut.close();
+ termsWriter.close();
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,32 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+
+abstract class FormatPostingsPositionsConsumer {
+
+ /** Add a new position & payload. If payloadLength > 0
+ * you must read those bytes from the IndexInput. */
+ abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException;
+
+ /** Called when we are done adding positions & payloads */
+ abstract void finish() throws IOException;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,87 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.IndexInput;
+
+import java.io.IOException;
+
+final class FormatPostingsPositionsWriter extends FormatPostingsPositionsConsumer {
+
+ final FormatPostingsDocsWriter parent;
+ final IndexOutput out;
+
+ boolean omitTF;
+ boolean storePayloads;
+ int lastPayloadLength = -1;
+
+ FormatPostingsPositionsWriter(SegmentWriteState state, FormatPostingsDocsWriter parent) throws IOException {
+ this.parent = parent;
+ omitTF = parent.omitTF;
+ if (parent.parent.parent.fieldInfos.hasProx()) {
+ // At least one field does not omit TF, so create the
+ // prox file
+ final String fileName = IndexFileNames.segmentFileName(parent.parent.parent.segment, IndexFileNames.PROX_EXTENSION);
+ state.flushedFiles.add(fileName);
+ out = parent.parent.parent.dir.createOutput(fileName);
+ parent.skipListWriter.setProxOutput(out);
+ } else
+ // Every field omits TF so we will write no prox file
+ out = null;
+ }
+
+ int lastPosition;
+
+ /** Add a new position & payload */
+ void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException {
+ assert !omitTF: "omitTF is true";
+ assert out != null;
+
+ final int delta = position - lastPosition;
+ lastPosition = position;
+
+ if (storePayloads) {
+ if (payloadLength != lastPayloadLength) {
+ lastPayloadLength = payloadLength;
+ out.writeVInt((delta<<1)|1);
+ out.writeVInt(payloadLength);
+ } else
+ out.writeVInt(delta << 1);
+ if (payloadLength > 0)
+ out.writeBytes(payload, payloadLength);
+ } else
+ out.writeVInt(delta);
+ }
+
+ void setField(FieldInfo fieldInfo) {
+ omitTF = fieldInfo.omitTf;
+ storePayloads = omitTF ? false : fieldInfo.storePayloads;
+ }
+
+ /** Called when we are done adding positions & payloads */
+ void finish() {
+ lastPosition = 0;
+ lastPayloadLength = -1;
+ }
+
+ void close() throws IOException {
+ if (out != null)
+ out.close();
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,46 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.util.ArrayUtil;
+
+/**
+ * NOTE: this API is experimental and will likely change
+ */
+
+abstract class FormatPostingsTermsConsumer {
+
+ /** Adds a new term in this field; term ends with U+FFFF
+ * char */
+ abstract FormatPostingsDocsConsumer addTerm(char[] text, int start) throws IOException;
+
+ char[] termBuffer;
+ FormatPostingsDocsConsumer addTerm(String text) throws IOException {
+ final int len = text.length();
+ if (termBuffer == null || termBuffer.length < 1+len)
+ termBuffer = new char[ArrayUtil.getNextSize(1+len)];
+ text.getChars(0, len, termBuffer, 0);
+ termBuffer[len] = 0xffff;
+ return addTerm(termBuffer, 0);
+ }
+
+ /** Called when we are done adding terms to this field */
+ abstract void finish() throws IOException;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsConsumer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,71 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer {
+
+ final FormatPostingsFieldsWriter parent;
+ final FormatPostingsDocsWriter docsWriter;
+ final TermInfosWriter termsOut;
+ FieldInfo fieldInfo;
+
+ FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException {
+ super();
+ this.parent = parent;
+ termsOut = parent.termsOut;
+ docsWriter = new FormatPostingsDocsWriter(state, this);
+ }
+
+ void setField(FieldInfo fieldInfo) {
+ this.fieldInfo = fieldInfo;
+ docsWriter.setField(fieldInfo);
+ }
+
+ char[] currentTerm;
+ int currentTermStart;
+
+ long freqStart;
+ long proxStart;
+
+ /** Adds a new term in this field */
+ FormatPostingsDocsConsumer addTerm(char[] text, int start) {
+ currentTerm = text;
+ currentTermStart = start;
+
+ // TODO: this is abstraction violation -- ideally this
+ // terms writer is not so "invasive", looking for file
+ // pointers in its child consumers.
+ freqStart = docsWriter.out.getFilePointer();
+ if (docsWriter.posWriter.out != null)
+ proxStart = docsWriter.posWriter.out.getFilePointer();
+
+ parent.skipListWriter.resetSkip();
+
+ return docsWriter;
+ }
+
+ /** Called when we are done adding terms to this field */
+ void finish() {
+ }
+
+ void close() throws IOException {
+ docsWriter.close();
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Sat Oct 25 03:40:00 2008
@@ -57,7 +57,7 @@
}
}
- void closeDocStore(DocumentsWriter.FlushState state) {}
+ void closeDocStore(SegmentWriteState state) {}
void abort() {}
@@ -66,7 +66,7 @@
// under the same FieldInfo together, up into TermsHash*.
// Other writers would presumably share alot of this...
- public void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
+ public void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {
// Gather all FieldData's that have postings, across all
// ThreadStates
@@ -92,22 +92,19 @@
Collections.sort(allFields);
final int numAllFields = allFields.size();
- final TermInfosWriter termsOut = new TermInfosWriter(state.directory,
- state.segmentName,
- fieldInfos,
- state.docWriter.writer.getTermIndexInterval());
-
- final IndexOutput freqOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
- final IndexOutput proxOut;
-
- if (fieldInfos.hasProx())
- proxOut = state.directory.createOutput(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
- else
- proxOut = null;
-
- final DefaultSkipListWriter skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
- termsOut.maxSkipLevels,
- state.numDocsInRAM, freqOut, proxOut);
+ // TODO: allow Lucene user to customize this consumer:
+ final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
+ /*
+ Current writer chain:
+ FormatPostingsFieldsConsumer
+ -> IMPL: FormatPostingsFieldsWriter
+ -> FormatPostingsTermsConsumer
+ -> IMPL: FormatPostingsTermsWriter
+ -> FormatPostingsDocConsumer
+ -> IMPL: FormatPostingsDocWriter
+ -> FormatPostingsPositionsConsumer
+ -> IMPL: FormatPostingsPositionsWriter
+ */
int start = 0;
while(start < numAllFields) {
@@ -129,7 +126,7 @@
// If this field has postings then add them to the
// segment
- appendPostings(state, fields, termsOut, freqOut, proxOut, skipListWriter);
+ appendPostings(fields, consumer);
for(int i=0;i<fields.length;i++) {
TermsHashPerField perField = fields[i].termsHashPerField;
@@ -149,51 +146,18 @@
perThread.termsHashPerThread.reset(true);
}
- freqOut.close();
- if (proxOut != null) {
- state.flushedFiles.add(state.segmentFileName(IndexFileNames.PROX_EXTENSION));
- proxOut.close();
- }
- termsOut.close();
-
- // Record all files we have flushed
- state.flushedFiles.add(state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION));
- state.flushedFiles.add(state.segmentFileName(IndexFileNames.FREQ_EXTENSION));
- state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION));
- state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION));
+ consumer.finish();
}
- final byte[] copyByteBuffer = new byte[4096];
-
- /** Copy numBytes from srcIn to destIn */
- void copyBytes(IndexInput srcIn, IndexOutput destIn, long numBytes) throws IOException {
- // TODO: we could do this more efficiently (save a copy)
- // because it's always from a ByteSliceReader ->
- // IndexOutput
- while(numBytes > 0) {
- final int chunk;
- if (numBytes > 4096)
- chunk = 4096;
- else
- chunk = (int) numBytes;
- srcIn.readBytes(copyByteBuffer, 0, chunk);
- destIn.writeBytes(copyByteBuffer, 0, chunk);
- numBytes -= chunk;
- }
- }
+ private byte[] payloadBuffer;
/* Walk through all unique text tokens (Posting
* instances) found in this field and serialize them
* into a single RAM segment. */
- void appendPostings(final DocumentsWriter.FlushState flushState,
- FreqProxTermsWriterPerField[] fields,
- TermInfosWriter termsOut,
- IndexOutput freqOut,
- IndexOutput proxOut,
- DefaultSkipListWriter skipListWriter)
+ void appendPostings(FreqProxTermsWriterPerField[] fields,
+ FormatPostingsFieldsConsumer consumer)
throws CorruptIndexException, IOException {
- final int fieldNumber = fields[0].fieldInfo.number;
int numFields = fields.length;
final FreqProxFieldMergeState[] mergeStates = new FreqProxFieldMergeState[numFields];
@@ -208,15 +172,12 @@
assert result;
}
- final int skipInterval = termsOut.skipInterval;
- final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;
+ final FormatPostingsTermsConsumer termsConsumer = consumer.addField(fields[0].fieldInfo);
- // If current field omits tf then it cannot store
- // payloads. We silently drop the payloads in this case:
- final boolean currentFieldStorePayloads = currentFieldOmitTf ? false : fields[0].fieldInfo.storePayloads;
-
FreqProxFieldMergeState[] termStates = new FreqProxFieldMergeState[numFields];
+ final boolean currentFieldOmitTf = fields[0].fieldInfo.omitTf;
+
while(numFields > 0) {
// Get the next term to merge
@@ -235,43 +196,21 @@
termStates[numToMerge++] = mergeStates[i];
}
- int df = 0;
- int lastPayloadLength = -1;
-
- int lastDoc = 0;
-
- final char[] text = termStates[0].text;
- final int start = termStates[0].textOffset;
-
- final long freqPointer = freqOut.getFilePointer();
- final long proxPointer;
- if (proxOut != null)
- proxPointer = proxOut.getFilePointer();
- else
- proxPointer = 0;
-
- skipListWriter.resetSkip();
+ final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(termStates[0].text, termStates[0].textOffset);
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
while(numToMerge > 0) {
- if ((++df % skipInterval) == 0) {
- skipListWriter.setSkipData(lastDoc, currentFieldStorePayloads, lastPayloadLength);
- skipListWriter.bufferSkip(df);
- }
-
FreqProxFieldMergeState minState = termStates[0];
for(int i=1;i<numToMerge;i++)
if (termStates[i].docID < minState.docID)
minState = termStates[i];
- final int doc = minState.docID;
final int termDocFreq = minState.termFreq;
- assert doc < flushState.numDocsInRAM;
- assert doc > lastDoc || df == 1;
+ final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(minState.docID, termDocFreq);
final ByteSliceReader prox = minState.prox;
@@ -279,47 +218,32 @@
// changing the format to match Lucene's segment
// format.
if (!currentFieldOmitTf) {
- // omitTf == false so we do write positions & payload
- assert proxOut != null;
+ // omitTf == false so we do write positions &
+ // payload
+ int position = 0;
for(int j=0;j<termDocFreq;j++) {
final int code = prox.readVInt();
- if (currentFieldStorePayloads) {
- final int payloadLength;
- if ((code & 1) != 0) {
- // This position has a payload
- payloadLength = prox.readVInt();
- } else
- payloadLength = 0;
- if (payloadLength != lastPayloadLength) {
- proxOut.writeVInt(code|1);
- proxOut.writeVInt(payloadLength);
- lastPayloadLength = payloadLength;
- } else
- proxOut.writeVInt(code & (~1));
- if (payloadLength > 0)
- copyBytes(prox, proxOut, payloadLength);
- } else {
- assert 0 == (code & 1);
- proxOut.writeVInt(code>>1);
- }
+ position += code >> 1;
+
+ final int payloadLength;
+ if ((code & 1) != 0) {
+ // This position has a payload
+ payloadLength = prox.readVInt();
+
+ if (payloadBuffer == null || payloadBuffer.length < payloadLength)
+ payloadBuffer = new byte[payloadLength];
+
+ prox.readBytes(payloadBuffer, 0, payloadLength);
+
+ } else
+ payloadLength = 0;
+
+ posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
} //End for
-
- final int newDocCode = (doc-lastDoc)<<1;
- if (1 == termDocFreq) {
- freqOut.writeVInt(newDocCode|1);
- } else {
- freqOut.writeVInt(newDocCode);
- freqOut.writeVInt(termDocFreq);
- }
- } else {
- // omitTf==true: we store only the docs, without
- // term freq, positions, payloads
- freqOut.writeVInt(doc-lastDoc);
+ posConsumer.finish();
}
- lastDoc = doc;
-
if (!minState.nextDoc()) {
// Remove from termStates
@@ -345,26 +269,10 @@
}
}
- assert df > 0;
-
- // Done merging this term
-
- long skipPointer = skipListWriter.writeSkip(freqOut);
-
- // Write term
- termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
-
- // TODO: we could do this incrementally
- UnicodeUtil.UTF16toUTF8(text, start, termsUTF8);
-
- // TODO: we could save O(n) re-scan of the term by
- // computing the shared prefix with the last term
- // while during the UTF8 encoding
- termsOut.add(fieldNumber,
- termsUTF8.result,
- termsUTF8.length,
- termInfo);
+ docConsumer.finish();
}
+
+ termsConsumer.finish();
}
private final TermInfo termInfo = new TermInfo(); // minimize consing
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java Sat Oct 25 03:40:00 2008
@@ -195,4 +195,8 @@
return true;
return false;
}
+
+ static String segmentFileName(String segmentName, String ext) {
+ return segmentName + "." + ext;
+ }
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocConsumer.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocConsumer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocConsumer.java Sat Oct 25 03:40:00 2008
@@ -29,10 +29,10 @@
abstract void abort();
/** Flush a new segment */
- abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
+ abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;
/** Close doc stores */
- abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
+ abstract void closeDocStore(SegmentWriteState state) throws IOException;
/** Attempt to free RAM, returning true if any RAM was
* freed */
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java Sat Oct 25 03:40:00 2008
@@ -22,8 +22,8 @@
abstract class InvertedDocEndConsumer {
abstract InvertedDocEndConsumerPerThread addThread(DocInverterPerThread docInverterPerThread);
- abstract void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException;
- abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
+ abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException;
+ abstract void closeDocStore(SegmentWriteState state) throws IOException;
abstract void abort();
abstract void setFieldInfos(FieldInfos fieldInfos);
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java Sat Oct 25 03:40:00 2008
@@ -54,7 +54,7 @@
/** Produce _X.nrm if any document had a field with norms
* not disabled */
- public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+ public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {
final Map byField = new HashMap();
@@ -133,7 +133,7 @@
}
}
- assert minDocID < state.numDocsInRAM;
+ assert minDocID < state.numDocs;
// Fill hole
for(;upto<minDocID;upto++)
@@ -154,16 +154,16 @@
}
// Fill final hole with defaultNorm
- for(;upto<state.numDocsInRAM;upto++)
+ for(;upto<state.numDocs;upto++)
normsOut.writeByte(defaultNorm);
} else if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
normCount++;
// Fill entire field with default norm:
- for(;upto<state.numDocsInRAM;upto++)
+ for(;upto<state.numDocs;upto++)
normsOut.writeByte(defaultNorm);
}
- assert 4+normCount*state.numDocsInRAM == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocsInRAM) + " actual=" + normsOut.getFilePointer();
+ assert 4+normCount*state.numDocs == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocs) + " actual=" + normsOut.getFilePointer();
}
} finally {
@@ -171,5 +171,5 @@
}
}
- void closeDocStore(DocumentsWriter.FlushState state) {}
+ void closeDocStore(SegmentWriteState state) {}
}
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Sat Oct 25 03:40:00 2008
@@ -21,6 +21,7 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
+import java.util.HashSet;
import java.util.List;
import org.apache.lucene.document.Document;
@@ -476,38 +477,28 @@
throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
}
- private IndexOutput freqOutput = null;
- private IndexOutput proxOutput = null;
- private TermInfosWriter termInfosWriter = null;
- private int skipInterval;
- private int maxSkipLevels;
private SegmentMergeQueue queue = null;
- private DefaultSkipListWriter skipListWriter = null;
private final void mergeTerms() throws CorruptIndexException, IOException {
+
+ SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);
+
+ final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
+
try {
- freqOutput = directory.createOutput(segment + ".frq");
- if (hasProx())
- proxOutput = directory.createOutput(segment + ".prx");
- termInfosWriter =
- new TermInfosWriter(directory, segment, fieldInfos,
- termIndexInterval);
- skipInterval = termInfosWriter.skipInterval;
- maxSkipLevels = termInfosWriter.maxSkipLevels;
- skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
queue = new SegmentMergeQueue(readers.size());
- mergeTermInfos();
+ mergeTermInfos(consumer);
} finally {
- if (freqOutput != null) freqOutput.close();
- if (proxOutput != null) proxOutput.close();
- if (termInfosWriter != null) termInfosWriter.close();
+ consumer.finish();
if (queue != null) queue.close();
}
}
- private final void mergeTermInfos() throws CorruptIndexException, IOException {
+ boolean omitTF;
+
+ private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException {
int base = 0;
final int readerCount = readers.size();
for (int i = 0; i < readerCount; i++) {
@@ -533,6 +524,9 @@
SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
+ String currentField = null;
+ FormatPostingsTermsConsumer termsConsumer = null;
+
while (queue.size() > 0) {
int matchSize = 0; // pop matching terms
match[matchSize++] = (SegmentMergeInfo) queue.pop();
@@ -544,7 +538,16 @@
top = (SegmentMergeInfo) queue.top();
}
- final int df = mergeTermInfo(match, matchSize); // add new TermInfo
+ if (currentField != term.field) {
+ currentField = term.field;
+ if (termsConsumer != null)
+ termsConsumer.finish();
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField);
+ termsConsumer = consumer.addField(fieldInfo);
+ omitTF = fieldInfo.omitTf;
+ }
+
+ int df = appendPostings(termsConsumer, match, matchSize); // add new TermInfo
if (checkAbort != null)
checkAbort.work(df/3.0);
@@ -559,44 +562,6 @@
}
}
- private final TermInfo termInfo = new TermInfo(); // minimize consing
-
- /** Merge one term found in one or more segments. The array <code>smis</code>
- * contains segments that are positioned at the same term. <code>N</code>
- * is the number of cells in the array actually occupied.
- *
- * @param smis array of segments
- * @param n number of cells in the array actually occupied
- * @throws CorruptIndexException if the index is corrupt
- * @throws IOException if there is a low-level IO error
- */
- private final int mergeTermInfo(SegmentMergeInfo[] smis, int n)
- throws CorruptIndexException, IOException {
- final long freqPointer = freqOutput.getFilePointer();
- final long proxPointer;
- if (proxOutput != null)
- proxPointer = proxOutput.getFilePointer();
- else
- proxPointer = 0;
-
- int df;
- if (fieldInfos.fieldInfo(smis[0].term.field).omitTf) { // append posting data
- df = appendPostingsNoTf(smis, n);
- } else{
- df = appendPostings(smis, n);
- }
-
- long skipPointer = skipListWriter.writeSkip(freqOutput);
-
- if (df > 0) {
- // add an entry to the dictionary with pointers to prox and freq files
- termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
- termInfosWriter.add(smis[0].term, termInfo);
- }
-
- return df;
- }
-
private byte[] payloadBuffer;
private int[][] docMaps;
int[][] getDocMaps() {
@@ -617,13 +582,11 @@
* @throws CorruptIndexException if the index is corrupt
* @throws IOException if there is a low-level IO error
*/
- private final int appendPostings(SegmentMergeInfo[] smis, int n)
- throws CorruptIndexException, IOException {
- int lastDoc = 0;
- int df = 0; // number of docs w/ term
- skipListWriter.resetSkip();
- boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
- int lastPayloadLength = -1; // ensures that we write the first length
+ private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
+ throws CorruptIndexException, IOException {
+
+ final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text);
+ int df = 0;
for (int i = 0; i < n; i++) {
SegmentMergeInfo smi = smis[i];
TermPositions postings = smi.getPositions();
@@ -631,114 +594,37 @@
int base = smi.base;
int[] docMap = smi.getDocMap();
postings.seek(smi.termEnum);
+
while (postings.next()) {
+ df++;
int doc = postings.doc();
if (docMap != null)
doc = docMap[doc]; // map around deletions
doc += base; // convert to merged space
- if (doc < 0 || (df > 0 && doc <= lastDoc))
- throw new CorruptIndexException("docs out of order (" + doc +
- " <= " + lastDoc + " )");
+ final int freq = postings.freq();
+ final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq);
- df++;
-
- if ((df % skipInterval) == 0) {
- skipListWriter.setSkipData(lastDoc, storePayloads, lastPayloadLength);
- skipListWriter.bufferSkip(df);
- }
-
- int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
- lastDoc = doc;
-
- int freq = postings.freq();
- if (freq == 1) {
- freqOutput.writeVInt(docCode | 1); // write doc & freq=1
- } else {
- freqOutput.writeVInt(docCode); // write doc
- freqOutput.writeVInt(freq); // write frequency in doc
- }
-
- /** See {@link DocumentWriter#writePostings(Posting[], String)} for
- * documentation about the encoding of positions and payloads
- */
- int lastPosition = 0; // write position deltas
- for (int j = 0; j < freq; j++) {
- int position = postings.nextPosition();
- int delta = position - lastPosition;
- if (storePayloads) {
- int payloadLength = postings.getPayloadLength();
- if (payloadLength == lastPayloadLength) {
- proxOutput.writeVInt(delta * 2);
- } else {
- proxOutput.writeVInt(delta * 2 + 1);
- proxOutput.writeVInt(payloadLength);
- lastPayloadLength = payloadLength;
- }
+ if (!omitTF) {
+ for (int j = 0; j < freq; j++) {
+ final int position = postings.nextPosition();
+ final int payloadLength = postings.getPayloadLength();
if (payloadLength > 0) {
- if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
+ if (payloadBuffer == null || payloadBuffer.length < payloadLength)
payloadBuffer = new byte[payloadLength];
- }
postings.getPayload(payloadBuffer, 0);
- proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
}
- } else {
- proxOutput.writeVInt(delta);
+ posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
}
- lastPosition = position;
+ posConsumer.finish();
}
}
}
- return df;
- }
+ docConsumer.finish();
- /** Process postings from multiple segments without tf, all positioned on the
- * same term. Writes out merged entries only into freqOutput, proxOut is not written.
- *
- * @param smis array of segments
- * @param n number of cells in the array actually occupied
- * @return number of documents across all segments where this term was found
- * @throws CorruptIndexException if the index is corrupt
- * @throws IOException if there is a low-level IO error
- */
- private final int appendPostingsNoTf(SegmentMergeInfo[] smis, int n)
- throws CorruptIndexException, IOException {
- int lastDoc = 0;
- int df = 0; // number of docs w/ term
- skipListWriter.resetSkip();
- int lastPayloadLength = -1; // ensures that we write the first length
- for (int i = 0; i < n; i++) {
- SegmentMergeInfo smi = smis[i];
- TermPositions postings = smi.getPositions();
- assert postings != null;
- int base = smi.base;
- int[] docMap = smi.getDocMap();
- postings.seek(smi.termEnum);
- while (postings.next()) {
- int doc = postings.doc();
- if (docMap != null)
- doc = docMap[doc]; // map around deletions
- doc += base; // convert to merged space
-
- if (doc < 0 || (df > 0 && doc <= lastDoc))
- throw new CorruptIndexException("docs out of order (" + doc +
- " <= " + lastDoc + " )");
-
- df++;
-
- if ((df % skipInterval) == 0) {
- skipListWriter.setSkipData(lastDoc, false, lastPayloadLength);
- skipListWriter.bufferSkip(df);
- }
-
- int docCode = (doc - lastDoc);
- lastDoc = doc;
- freqOutput.writeVInt(docCode); // write doc & freq=1
- }
- }
return df;
}
-
+
private void mergeNorms() throws IOException {
byte[] normBuffer = null;
IndexOutput output = null;
Added: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=707836&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java Sat Oct 25 03:40:00 2008
@@ -0,0 +1,50 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashSet;
+import java.util.Collection;
+
+import org.apache.lucene.store.Directory;
+
+class SegmentWriteState {
+ DocumentsWriter docWriter;
+ Directory directory;
+ String segmentName;
+ String docStoreSegmentName;
+ int numDocs;
+ int termIndexInterval;
+ int numDocsInStore;
+ Collection flushedFiles;
+
+ public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs,
+ int numDocsInStore, int termIndexInterval) {
+ this.docWriter = docWriter;
+ this.directory = directory;
+ this.segmentName = segmentName;
+ this.docStoreSegmentName = docStoreSegmentName;
+ this.numDocs = numDocs;
+ this.numDocsInStore = numDocsInStore;
+ this.termIndexInterval = termIndexInterval;
+ flushedFiles = new HashSet();
+ }
+
+ public String segmentFileName(String ext) {
+ return segmentName + "." + ext;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentWriteState.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java Sat Oct 25 03:40:00 2008
@@ -40,7 +40,7 @@
return new StoredFieldsWriterPerThread(docFieldProcessorPerThread, this);
}
- synchronized public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+ synchronized public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException {
if (state.numDocsInStore > 0) {
// It's possible that all documents seen in this segment
@@ -72,7 +72,7 @@
}
}
- synchronized public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ synchronized public void closeDocStore(SegmentWriteState state) throws IOException {
final int inc = state.numDocsInStore - lastDocID;
if (inc > 0) {
initFieldsWriter();
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java Sat Oct 25 03:40:00 2008
@@ -51,7 +51,7 @@
postings[i] = new PostingList();
}
- synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
+ synchronized void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {
if (tvx != null) {
@@ -80,7 +80,7 @@
}
}
- synchronized void closeDocStore(final DocumentsWriter.FlushState state) throws IOException {
+ synchronized void closeDocStore(final SegmentWriteState state) throws IOException {
if (tvx != null) {
// At least one doc in this run had term vectors
// enabled
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java Sat Oct 25 03:40:00 2008
@@ -85,7 +85,7 @@
nextTermsHash.abort();
}
- void shrinkFreePostings(Map threadsAndFields, DocumentsWriter.FlushState state) {
+ void shrinkFreePostings(Map threadsAndFields, SegmentWriteState state) {
assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
@@ -97,13 +97,13 @@
}
}
- synchronized void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ synchronized void closeDocStore(SegmentWriteState state) throws IOException {
consumer.closeDocStore(state);
if (nextTermsHash != null)
nextTermsHash.closeDocStore(state);
}
- synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
+ synchronized void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException {
Map childThreadsAndFields = new HashMap();
Map nextThreadsAndFields;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java?rev=707836&r1=707835&r2=707836&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java Sat Oct 25 03:40:00 2008
@@ -24,9 +24,9 @@
abstract int bytesPerPosting();
abstract void createPostings(RawPostingList[] postings, int start, int count);
abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread);
- abstract void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException;
+ abstract void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException;
abstract void abort();
- abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
+ abstract void closeDocStore(SegmentWriteState state) throws IOException;
FieldInfos fieldInfos;