Posted to java-commits@lucene.apache.org by mi...@apache.org on 2008/07/18 11:20:14 UTC
svn commit: r677865 [4/5] - in /lucene/java/trunk: ./
src/java/org/apache/lucene/index/ src/java/org/apache/lucene/store/
src/java/org/apache/lucene/util/ src/test/org/apache/lucene/
src/test/org/apache/lucene/index/ src/test/org/apache/lucene/search/
Added: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,175 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.search.Similarity;
+
+// TODO FI: norms could actually be stored as doc store
+
+/** Writes norms. Each thread/field pair accumulates the
+ * norms for the docs/fields it saw, then the flush method
+ * below merges all of these together into a single _X.nrm
+ * file.
+ */
+
+final class NormsWriter extends InvertedDocEndConsumer {
+
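+  // Norm byte written for any doc that recorded no norm for a given field (encodes 1.0f)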
+ private static final byte defaultNorm = Similarity.encodeNorm(1.0f);
+ private FieldInfos fieldInfos;
+ public InvertedDocEndConsumerPerThread addThread(DocInverterPerThread docInverterPerThread) {
+ return new NormsWriterPerThread(docInverterPerThread, this);
+ }
+
+ public void abort() {}
+
+ // We only write the _X.nrm file at flush
+ void files(Collection files) {}
+
+ void setFieldInfos(FieldInfos fieldInfos) {
+ this.fieldInfos = fieldInfos;
+ }
+
+  /** Produce _X.nrm if any document had a field whose
+   * norms were not disabled */
+ public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+
+ final Map byField = new HashMap();
+
+ // Typically, each thread will have encountered the same
+  // field. So first we collate by field, i.e., all
+ // per-thread field instances that correspond to the
+ // same FieldInfo
+ final Iterator it = threadsAndFields.entrySet().iterator();
+ while(it.hasNext()) {
+ Map.Entry entry = (Map.Entry) it.next();
+
+ Collection fields = (Collection) entry.getValue();
+ Iterator fieldsIt = fields.iterator();
+
+ while(fieldsIt.hasNext()) {
+ NormsWriterPerField perField = (NormsWriterPerField) fieldsIt.next();
+
+ if (perField.upto > 0) {
+ // It has some norms
+ List l = (List) byField.get(perField.fieldInfo);
+ if (l == null) {
+ l = new ArrayList();
+ byField.put(perField.fieldInfo, l);
+ }
+ l.add(perField);
+ } else
+ // Remove this field since we haven't seen it
+ // since the previous flush
+ fieldsIt.remove();
+ }
+ }
+
+ final String normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION;
+ state.flushedFiles.add(normsFileName);
+ IndexOutput normsOut = state.directory.createOutput(normsFileName);
+
+ try {
+ normsOut.writeBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.length);
+
+ final int numField = fieldInfos.size();
+
+ int normCount = 0;
+
+ for(int fieldNumber=0;fieldNumber<numField;fieldNumber++) {
+
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
+
+ List toMerge = (List) byField.get(fieldInfo);
+ int upto = 0;
+ if (toMerge != null) {
+
+ final int numFields = toMerge.size();
+
+ normCount++;
+
+ final NormsWriterPerField[] fields = new NormsWriterPerField[numFields];
+ int[] uptos = new int[numFields];
+
+ for(int j=0;j<numFields;j++)
+ fields[j] = (NormsWriterPerField) toMerge.get(j);
+
+ int numLeft = numFields;
+
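+        // Merge the per-thread postings for this field:
+        // repeatedly take the field instance whose next
+        // docID is smallest, writing defaultNorm for any
+        // docs in between that recorded no norm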
+ while(numLeft > 0) {
+
+ assert uptos[0] < fields[0].docIDs.length : " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.length);
+
+ int minLoc = 0;
+ int minDocID = fields[0].docIDs[uptos[0]];
+
+ for(int j=1;j<numLeft;j++) {
+ final int docID = fields[j].docIDs[uptos[j]];
+ if (docID < minDocID) {
+ minDocID = docID;
+ minLoc = j;
+ }
+ }
+
+ assert minDocID < state.numDocsInRAM;
+
+ // Fill hole
+ for(;upto<minDocID;upto++)
+ normsOut.writeByte(defaultNorm);
+
+ normsOut.writeByte(fields[minLoc].norms[uptos[minLoc]]);
+ (uptos[minLoc])++;
+ upto++;
+
+ if (uptos[minLoc] == fields[minLoc].upto) {
+ fields[minLoc].reset();
+ if (minLoc != numLeft-1) {
+ fields[minLoc] = fields[numLeft-1];
+ uptos[minLoc] = uptos[numLeft-1];
+ }
+ numLeft--;
+ }
+ }
+
+ // Fill final hole with defaultNorm
+ for(;upto<state.numDocsInRAM;upto++)
+ normsOut.writeByte(defaultNorm);
+ } else if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
+ normCount++;
+ // Fill entire field with default norm:
+ for(;upto<state.numDocsInRAM;upto++)
+ normsOut.writeByte(defaultNorm);
+ }
+
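+        // Sanity check: the file so far should be the 4-byte
+        // norms header plus one byte per doc for each field
+        // that has norms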
+ assert 4+normCount*state.numDocsInRAM == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocsInRAM) + " actual=" + normsOut.getFilePointer();
+ }
+
+ } finally {
+ normsOut.close();
+ }
+ }
+
+ void closeDocStore(DocumentsWriter.FlushState state) {}
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,77 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.search.Similarity;
+
+/** Taps into DocInverter, as an InvertedDocEndConsumer,
+ * which is called at the end of inverting each field. We
+ * just look at the length for the field (fieldState.length)
+ * and record the norm. */
+
+final class NormsWriterPerField extends InvertedDocEndConsumerPerField implements Comparable {
+
+ final NormsWriterPerThread perThread;
+ final FieldInfo fieldInfo;
+ final DocumentsWriter.DocState docState;
+
+ // Holds all docID/norm pairs we've seen
+ int[] docIDs = new int[1];
+ byte[] norms = new byte[1];
+ int upto;
+
+ final DocInverter.FieldInvertState fieldState;
+
+ public void reset() {
+ // Shrink back if we are overallocated now:
+ docIDs = ArrayUtil.shrink(docIDs, upto);
+ norms = ArrayUtil.shrink(norms, upto);
+ upto = 0;
+ }
+
+ public NormsWriterPerField(final DocInverterPerField docInverterPerField, final NormsWriterPerThread perThread, final FieldInfo fieldInfo) {
+ this.perThread = perThread;
+ this.fieldInfo = fieldInfo;
+ docState = perThread.docState;
+ fieldState = docInverterPerField.fieldState;
+ }
+
+ void abort() {
+ upto = 0;
+ }
+
+ public int compareTo(Object other) {
+ return fieldInfo.name.compareTo(((NormsWriterPerField) other).fieldInfo.name);
+ }
+
+ void finish() {
+ assert docIDs.length == norms.length;
+ if (fieldInfo.isIndexed && !fieldInfo.omitNorms) {
+ if (docIDs.length <= upto) {
+ assert docIDs.length == upto;
+ docIDs = ArrayUtil.grow(docIDs, 1+upto);
+ norms = ArrayUtil.grow(norms, 1+upto);
+ }
+ final float norm = fieldState.boost * docState.similarity.lengthNorm(fieldInfo.name, fieldState.length);
+ norms[upto] = Similarity.encodeNorm(norm);
+ docIDs[upto] = docState.docID;
+ upto++;
+ }
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerField.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerThread.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerThread.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerThread.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,41 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+final class NormsWriterPerThread extends InvertedDocEndConsumerPerThread {
+ final NormsWriter normsWriter;
+ final DocumentsWriter.DocState docState;
+
+ public NormsWriterPerThread(DocInverterPerThread docInverterPerThread, NormsWriter normsWriter) {
+ this.normsWriter = normsWriter;
+ docState = docInverterPerThread.docState;
+ }
+
+ InvertedDocEndConsumerPerField addField(DocInverterPerField docInverterPerField, final FieldInfo fieldInfo) {
+ return new NormsWriterPerField(docInverterPerField, this, fieldInfo);
+ }
+
+ void abort() {}
+
+ void startDocument() {}
+ void finishDocument() {}
+
+ boolean freeRAM() {
+ return false;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/NormsWriterPerThread.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/RawPostingList.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/RawPostingList.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/RawPostingList.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/RawPostingList.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,36 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/** This is the base class for an in-memory posting list,
+ * keyed by a Token. {@link TermsHash} maintains a hash
+ * table holding one instance of this per unique Token.
+ * Consumers of TermsHash ({@link TermsHashConsumer}) must
+ * subclass this class with their own concrete class.
+ * {@link FreqProxTermsWriter.PostingList} is the
+ * subclass used for the freq/prox postings, and {@link
+ * TermVectorsTermsWriter.PostingList} is the subclass
+ * used to hold TermVectors postings. */
+
+abstract class RawPostingList {
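+  // Bytes consumed per posting: the JVM object header plus the three int fields below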
+ final static int BYTES_SIZE = DocumentsWriter.OBJECT_HEADER_BYTES + 3*DocumentsWriter.INT_NUM_BYTE;
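+  // Roughly: textStart addresses this term's text in the shared
+  // char pool, intStart its per-stream write pointers in the int
+  // pool, and byteStart its first slice in the byte pool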
+ int textStart;
+ int intStart;
+ int byteStart;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/RawPostingList.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,190 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+import java.io.IOException;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+
+/** This is a DocFieldConsumer that writes stored fields. */
+final class StoredFieldsWriter extends DocFieldConsumer {
+
+ FieldsWriter fieldsWriter;
+ final DocumentsWriter docWriter;
+ int lastDocID;
+
+ PerDoc[] docFreeList = new PerDoc[1];
+ int freeCount;
+
+ public StoredFieldsWriter(DocumentsWriter docWriter) {
+ this.docWriter = docWriter;
+ }
+
+ public DocFieldConsumerPerThread addThread(DocFieldProcessorPerThread docFieldProcessorPerThread) throws IOException {
+ return new StoredFieldsWriterPerThread(docFieldProcessorPerThread, this);
+ }
+
+ synchronized public void flush(Map threadsAndFields, DocumentsWriter.FlushState state) throws IOException {
+
+ if (state.numDocsInStore > 0) {
+ // It's possible that all documents seen in this segment
+ // hit non-aborting exceptions, in which case we will
+ // not have yet init'd the FieldsWriter:
+ initFieldsWriter();
+
+ // Fill fdx file to include any final docs that we
+ // skipped because they hit non-aborting exceptions
+ fill(state.numDocsInStore - docWriter.getDocStoreOffset());
+ }
+
+ if (fieldsWriter != null)
+ fieldsWriter.flush();
+ }
+
+ private void initFieldsWriter() throws IOException {
+ if (fieldsWriter == null) {
+ final String docStoreSegment = docWriter.getDocStoreSegment();
+ if (docStoreSegment != null) {
+ assert docStoreSegment != null;
+ fieldsWriter = new FieldsWriter(docWriter.directory,
+ docStoreSegment,
+ fieldInfos);
+ docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.FIELDS_EXTENSION);
+ docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
+ lastDocID = 0;
+ }
+ }
+ }
+
+ synchronized public void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ final int inc = state.numDocsInStore - lastDocID;
+ if (inc > 0) {
+ initFieldsWriter();
+ fill(state.numDocsInStore - docWriter.getDocStoreOffset());
+ }
+
+ if (fieldsWriter != null) {
+ fieldsWriter.close();
+ fieldsWriter = null;
+ lastDocID = 0;
+ assert state.docStoreSegmentName != null;
+ state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION);
+ state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
+
+ state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION);
+ state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
+
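+      // The fdx file holds a 4-byte header plus one 8-byte pointer per doc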
+ if (4+state.numDocsInStore*8 != state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION))
+ throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) + " length in bytes of " + state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
+ }
+ }
+
+ int allocCount;
+
+ synchronized PerDoc getPerDoc() {
+ if (freeCount == 0) {
+ allocCount++;
+ if (allocCount > docFreeList.length) {
+ // Grow our free list up front to make sure we have
+ // enough space to recycle all outstanding PerDoc
+ // instances
+ assert allocCount == 1+docFreeList.length;
+ docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ }
+ return new PerDoc();
+ } else
+ return docFreeList[--freeCount];
+ }
+
+ synchronized void abort() {
+ if (fieldsWriter != null) {
+ try {
+ fieldsWriter.close();
+ } catch (Throwable t) {
+ }
+ fieldsWriter = null;
+ lastDocID = 0;
+ }
+ }
+
+ /** Fills in any hole in the docIDs */
+ void fill(int docID) throws IOException {
+ final int docStoreOffset = docWriter.getDocStoreOffset();
+
+ // We must "catch up" for all docs before us
+ // that had no stored fields:
+ final int end = docID+docStoreOffset;
+ while(lastDocID < end) {
+ fieldsWriter.skipDocument();
+ lastDocID++;
+ }
+ }
+
+ synchronized void finishDocument(PerDoc perDoc) throws IOException {
+ assert docWriter.writer.testPoint("StoredFieldsWriter.finishDocument start");
+ initFieldsWriter();
+
+ fill(perDoc.docID);
+
+ // Append stored fields to the real FieldsWriter:
+ fieldsWriter.flushDocument(perDoc.numStoredFields, perDoc.fdt);
+ lastDocID++;
+ perDoc.reset();
+ free(perDoc);
+ assert docWriter.writer.testPoint("StoredFieldsWriter.finishDocument end");
+ }
+
+ public boolean freeRAM() {
+ return false;
+ }
+
+ synchronized void free(PerDoc perDoc) {
+ assert freeCount < docFreeList.length;
+ assert 0 == perDoc.numStoredFields;
+ assert 0 == perDoc.fdt.length();
+ assert 0 == perDoc.fdt.getFilePointer();
+ docFreeList[freeCount++] = perDoc;
+ }
+
+ class PerDoc extends DocumentsWriter.DocWriter {
+
+ // TODO: use something more memory efficient; for small
+    // docs the 1024-byte buffer of RAMOutputStream wastes a lot
+ RAMOutputStream fdt = new RAMOutputStream();
+ int numStoredFields;
+
+ void reset() {
+ fdt.reset();
+ numStoredFields = 0;
+ }
+
+ void abort() {
+ reset();
+ free(this);
+ }
+
+ public long sizeInBytes() {
+ return fdt.sizeInBytes();
+ }
+
+ public void finish() throws IOException {
+ finishDocument(this);
+ }
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,66 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.document.Fieldable;
+
+final class StoredFieldsWriterPerField extends DocFieldConsumerPerField {
+
+ final StoredFieldsWriterPerThread perThread;
+ final FieldInfo fieldInfo;
+ final DocumentsWriter.DocState docState;
+
+ public StoredFieldsWriterPerField(StoredFieldsWriterPerThread perThread, FieldInfo fieldInfo) {
+ this.perThread = perThread;
+ this.fieldInfo = fieldInfo;
+ docState = perThread.docState;
+ }
+
+ // Process all occurrences of a single field in one doc;
+ // count is 1 if a given field occurs only once in the
+ // Document, which is the "typical" case
+ public void processFields(Fieldable[] fields, int count) throws IOException {
+
+ final StoredFieldsWriter.PerDoc doc;
+ if (perThread.doc == null) {
+ doc = perThread.doc = perThread.storedFieldsWriter.getPerDoc();
+ doc.docID = docState.docID;
+ perThread.localFieldsWriter.setFieldsStream(doc.fdt);
+ assert doc.numStoredFields == 0: "doc.numStoredFields=" + doc.numStoredFields;
+ assert 0 == doc.fdt.length();
+ assert 0 == doc.fdt.getFilePointer();
+ } else {
+ doc = perThread.doc;
+ assert doc.docID == docState.docID: "doc.docID=" + doc.docID + " docState.docID=" + docState.docID;
+ }
+
+ for(int i=0;i<count;i++) {
+ final Fieldable field = fields[i];
+ if (field.isStored()) {
+ perThread.localFieldsWriter.writeField(fieldInfo, field);
+ assert docState.testPoint("StoredFieldsWriterPerField.processFields.writeField");
+ doc.numStoredFields++;
+ }
+ }
+ }
+
+ void abort() {
+ }
+}
+
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerField.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,67 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.store.IndexOutput;
+
+final class StoredFieldsWriterPerThread extends DocFieldConsumerPerThread {
+
+ final FieldsWriter localFieldsWriter;
+ final StoredFieldsWriter storedFieldsWriter;
+ final DocumentsWriter.DocState docState;
+
+ StoredFieldsWriter.PerDoc doc;
+
+ public StoredFieldsWriterPerThread(DocFieldProcessorPerThread docFieldProcessorPerThread, StoredFieldsWriter storedFieldsWriter) throws IOException {
+ this.storedFieldsWriter = storedFieldsWriter;
+ this.docState = docFieldProcessorPerThread.docState;
+ localFieldsWriter = new FieldsWriter((IndexOutput) null, (IndexOutput) null, storedFieldsWriter.fieldInfos);
+ }
+
+ public void startDocument() {
+ if (doc != null) {
+ // Only happens if previous document hit non-aborting
+ // exception while writing stored fields into
+ // localFieldsWriter:
+ doc.reset();
+ doc.docID = docState.docID;
+ }
+ }
+
+ public DocumentsWriter.DocWriter finishDocument() {
+ // If there were any stored fields in this doc, doc will
+ // be non-null; else it's null.
+ try {
+ return doc;
+ } finally {
+ doc = null;
+ }
+ }
+
+ public void abort() {
+ if (doc != null) {
+ doc.abort();
+ doc = null;
+ }
+ }
+
+ public DocFieldConsumerPerField addField(FieldInfo fieldInfo) {
+ return new StoredFieldsWriterPerField(this, fieldInfo);
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,291 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+
+final class TermVectorsTermsWriter extends TermsHashConsumer {
+
+ final DocumentsWriter docWriter;
+ TermVectorsWriter termVectorsWriter;
+ PerDoc[] docFreeList = new PerDoc[1];
+ int freeCount;
+ IndexOutput tvx;
+ IndexOutput tvd;
+ IndexOutput tvf;
+ int lastDocID;
+
+ public TermVectorsTermsWriter(DocumentsWriter docWriter) {
+ this.docWriter = docWriter;
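+    // Two byte streams per posting: stream 0 holds positions, stream 1 holds offsets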
+ streamCount = 2;
+ }
+
+ public TermsHashConsumerPerThread addThread(TermsHashPerThread termsHashPerThread) {
+ return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
+ }
+
+ void createPostings(RawPostingList[] postings, int start, int count) {
+ final int end = start + count;
+ for(int i=start;i<end;i++)
+ postings[i] = new PostingList();
+ }
+
+ synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
+
+ if (tvx != null) {
+
+ if (state.numDocsInStore > 0)
+ // In case there are some final documents that we
+ // didn't see (because they hit a non-aborting exception):
+ fill(state.numDocsInStore - docWriter.getDocStoreOffset());
+
+ tvx.flush();
+ tvd.flush();
+ tvf.flush();
+ }
+
+ Iterator it = threadsAndFields.entrySet().iterator();
+ while(it.hasNext()) {
+ Map.Entry entry = (Map.Entry) it.next();
+ Iterator it2 = ((Collection) entry.getValue()).iterator();
+ while(it2.hasNext()) {
+ TermVectorsTermsWriterPerField perField = (TermVectorsTermsWriterPerField) it2.next();
+ perField.termsHashPerField.reset();
+ perField.shrinkHash();
+ }
+
+ TermVectorsTermsWriterPerThread perThread = (TermVectorsTermsWriterPerThread) entry.getKey();
+ perThread.termsHashPerThread.reset(true);
+ }
+ }
+
+ synchronized void closeDocStore(final DocumentsWriter.FlushState state) throws IOException {
+ if (tvx != null) {
+ // At least one doc in this run had term vectors
+ // enabled
+ fill(state.numDocsInStore - docWriter.getDocStoreOffset());
+ tvx.close();
+ tvf.close();
+ tvd.close();
+ tvx = null;
+ assert state.docStoreSegmentName != null;
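+      // The tvx file holds a 4-byte header plus two 8-byte pointers (into tvd and tvf) per doc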
+ if (4+state.numDocsInStore*16 != state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION))
+ throw new RuntimeException("after flush: tvx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION) + " length in bytes of " + state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+
+ state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+ state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+ state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+
+ docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+ docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+ docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+
+ lastDocID = 0;
+ }
+ }
+
+ int allocCount;
+
+ synchronized PerDoc getPerDoc() {
+ if (freeCount == 0) {
+ allocCount++;
+ if (allocCount > docFreeList.length) {
+ // Grow our free list up front to make sure we have
+ // enough space to recycle all outstanding PerDoc
+ // instances
+ assert allocCount == 1+docFreeList.length;
+ docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)];
+ }
+ return new PerDoc();
+ } else
+ return docFreeList[--freeCount];
+ }
+
+ /** Fills in no-term-vectors for all docs we haven't seen
+ * since the last doc that had term vectors. */
+ void fill(int docID) throws IOException {
+ final int docStoreOffset = docWriter.getDocStoreOffset();
+ final int end = docID+docStoreOffset;
+ if (lastDocID < end) {
+ final long tvfPosition = tvf.getFilePointer();
+ while(lastDocID < end) {
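+        // Each skipped doc gets a tvx entry pointing at a tvd record that declares zero vector fields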
+ tvx.writeLong(tvd.getFilePointer());
+ tvd.writeVInt(0);
+ tvx.writeLong(tvfPosition);
+ lastDocID++;
+ }
+ }
+ }
+
+ synchronized void initTermVectorsWriter() throws IOException {
+ if (tvx == null) {
+
+ final String docStoreSegment = docWriter.getDocStoreSegment();
+
+ if (docStoreSegment == null)
+ return;
+
+ assert docStoreSegment != null;
+
+ // If we hit an exception while init'ing the term
+ // vector output files, we must abort this segment
+ // because those files will be in an unknown
+ // state:
+ tvx = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+ tvd = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+ tvf = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+
+ tvx.writeInt(TermVectorsReader.FORMAT_CURRENT);
+ tvd.writeInt(TermVectorsReader.FORMAT_CURRENT);
+ tvf.writeInt(TermVectorsReader.FORMAT_CURRENT);
+
+ docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+ docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+ docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+
+ lastDocID = 0;
+ }
+ }
+
+ synchronized void finishDocument(PerDoc perDoc) throws IOException {
+
+ assert docWriter.writer.testPoint("TermVectorsTermsWriter.finishDocument start");
+
+ initTermVectorsWriter();
+
+ fill(perDoc.docID);
+
+ // Append term vectors to the real outputs:
+ tvx.writeLong(tvd.getFilePointer());
+ tvx.writeLong(tvf.getFilePointer());
+ tvd.writeVInt(perDoc.numVectorFields);
+ if (perDoc.numVectorFields > 0) {
+ for(int i=0;i<perDoc.numVectorFields;i++)
+ tvd.writeVInt(perDoc.fieldNumbers[i]);
+ assert 0 == perDoc.fieldPointers[0];
+ long lastPos = perDoc.fieldPointers[0];
+ for(int i=1;i<perDoc.numVectorFields;i++) {
+ long pos = perDoc.fieldPointers[i];
+ tvd.writeVLong(pos-lastPos);
+ lastPos = pos;
+ }
+ perDoc.tvf.writeTo(tvf);
+ perDoc.tvf.reset();
+ perDoc.numVectorFields = 0;
+ }
+
+ assert lastDocID == perDoc.docID + docWriter.getDocStoreOffset();
+
+ lastDocID++;
+
+ free(perDoc);
+ assert docWriter.writer.testPoint("TermVectorsTermsWriter.finishDocument end");
+ }
+
+ public boolean freeRAM() {
+ // We don't hold any state beyond one doc, so we don't
+ // free persistent RAM here
+ return false;
+ }
+
+ public void abort() {
+ if (tvx != null) {
+ try {
+ tvx.close();
+ } catch (Throwable t) {
+ }
+ tvx = null;
+ }
+ if (tvd != null) {
+ try {
+ tvd.close();
+ } catch (Throwable t) {
+ }
+ tvd = null;
+ }
+ if (tvf != null) {
+ try {
+ tvf.close();
+ } catch (Throwable t) {
+ }
+ tvf = null;
+ }
+ lastDocID = 0;
+ }
+
+ synchronized void free(PerDoc doc) {
+ assert freeCount < docFreeList.length;
+ docFreeList[freeCount++] = doc;
+ }
+
+ class PerDoc extends DocumentsWriter.DocWriter {
+
+ // TODO: use something more memory efficient; for small
+    // docs the 1024-byte buffer of RAMOutputStream wastes a lot
+ RAMOutputStream tvf = new RAMOutputStream();
+ int numVectorFields;
+
+ int[] fieldNumbers = new int[1];
+ long[] fieldPointers = new long[1];
+
+ void reset() {
+ tvf.reset();
+ numVectorFields = 0;
+ }
+
+ void abort() {
+ reset();
+ free(this);
+ }
+
+ void addField(final int fieldNumber) {
+ if (numVectorFields == fieldNumbers.length) {
+ fieldNumbers = ArrayUtil.grow(fieldNumbers);
+ fieldPointers = ArrayUtil.grow(fieldPointers);
+ }
+ fieldNumbers[numVectorFields] = fieldNumber;
+ fieldPointers[numVectorFields] = tvf.getFilePointer();
+ numVectorFields++;
+ }
+
+ public long sizeInBytes() {
+ return tvf.sizeInBytes();
+ }
+
+ public void finish() throws IOException {
+ finishDocument(this);
+ }
+ }
+
+ static final class PostingList extends RawPostingList {
+ int freq; // How many times this term occurred in the current doc
+ int lastOffset; // Last offset we saw
+ int lastPosition; // Last position where this term occurred
+ }
+
+ int bytesPerPosting() {
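+    // Base posting size plus the three int fields of PostingList above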
+ return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,235 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.store.IndexOutput;
+
+final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
+
+ final TermVectorsTermsWriterPerThread perThread;
+ final TermsHashPerField termsHashPerField;
+ final TermVectorsTermsWriter termsWriter;
+ final FieldInfo fieldInfo;
+ final DocumentsWriter.DocState docState;
+ final DocInverter.FieldInvertState fieldState;
+
+ boolean doVectors;
+ boolean doVectorPositions;
+ boolean doVectorOffsets;
+
+ int maxNumPostings;
+
+ public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
+ this.termsHashPerField = termsHashPerField;
+ this.perThread = perThread;
+ this.termsWriter = perThread.termsWriter;
+ this.fieldInfo = fieldInfo;
+ docState = termsHashPerField.docState;
+ fieldState = termsHashPerField.fieldState;
+ }
+
+ boolean start(Fieldable[] fields, int count) {
+ doVectors = false;
+ doVectorPositions = false;
+ doVectorOffsets = false;
+
+ for(int i=0;i<count;i++) {
+ Fieldable field = fields[i];
+ if (field.isIndexed() && field.isTermVectorStored()) {
+ doVectors = true;
+ doVectorPositions |= field.isStorePositionWithTermVector();
+ doVectorOffsets |= field.isStoreOffsetWithTermVector();
+ }
+ }
+
+ if (doVectors) {
+ if (perThread.doc == null) {
+ perThread.doc = termsWriter.getPerDoc();
+ perThread.doc.docID = docState.docID;
+ assert perThread.doc.numVectorFields == 0;
+ assert 0 == perThread.doc.tvf.length();
+ assert 0 == perThread.doc.tvf.getFilePointer();
+ } else {
+ assert perThread.doc.docID == docState.docID;
+
+ if (termsHashPerField.numPostings != 0)
+ // Only necessary if previous doc hit a
+ // non-aborting exception while writing vectors in
+ // this field:
+ termsHashPerField.reset();
+ }
+ }
+
+ // TODO: only if needed for performance
+ //perThread.postingsCount = 0;
+
+ return doVectors;
+ }
+
+ public void abort() {}
+
+ /** Called once per field per document if term vectors
+ * are enabled, to write the vectors to
+ * RAMOutputStream, which is then quickly flushed to
+   * the real term vectors files in the Directory. */
+ void finish() throws IOException {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
+
+ final int numPostings = termsHashPerField.numPostings;
+
+ assert numPostings >= 0;
+
+ if (!doVectors || numPostings == 0)
+ return;
+
+ if (numPostings > maxNumPostings)
+ maxNumPostings = numPostings;
+
+ final IndexOutput tvf = perThread.doc.tvf;
+
+    // This is called once, after inverting all occurrences
+ // of a given field in the doc. At this point we flush
+ // our hash into the DocWriter.
+
+ assert fieldInfo.storeTermVector;
+ assert perThread.vectorFieldsInOrder(fieldInfo);
+
+ perThread.doc.addField(termsHashPerField.fieldInfo.number);
+
+ final RawPostingList[] postings = termsHashPerField.sortPostings();
+
+ tvf.writeVInt(numPostings);
+ byte bits = 0x0;
+ if (doVectorPositions)
+ bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
+ if (doVectorOffsets)
+ bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
+ tvf.writeByte(bits);
+
+ int encoderUpto = 0;
+ int lastTermBytesCount = 0;
+
+ final ByteSliceReader reader = perThread.vectorSliceReader;
+ final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
+ for(int j=0;j<numPostings;j++) {
+ final TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
+ final int freq = posting.freq;
+
+ final char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ // We swap between two encoders to save copying
+ // last Term's byte array
+ final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
+
+ // TODO: we could do this incrementally
+ UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
+ final int termBytesCount = utf8Result.length;
+
+ // TODO: UTF16toUTF8 could tell us this prefix
+ // Compute common prefix between last term and
+ // this term
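+      // Illustration: if the previous term's UTF-8 bytes were
+      // "apple" and this term's are "apply", prefix becomes 4 and
+      // only the one-byte suffix "y" is written below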
+ int prefix = 0;
+ if (j > 0) {
+ final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result;
+ final byte[] termBytes = perThread.utf8Results[encoderUpto].result;
+ while(prefix < lastTermBytesCount && prefix < termBytesCount) {
+ if (lastTermBytes[prefix] != termBytes[prefix])
+ break;
+ prefix++;
+ }
+ }
+ encoderUpto = 1-encoderUpto;
+ lastTermBytesCount = termBytesCount;
+
+ final int suffix = termBytesCount - prefix;
+ tvf.writeVInt(prefix);
+ tvf.writeVInt(suffix);
+ tvf.writeBytes(utf8Result.result, prefix, suffix);
+ tvf.writeVInt(freq);
+
+ if (doVectorPositions) {
+ termsHashPerField.initReader(reader, posting, 0);
+ reader.writeTo(tvf);
+ }
+
+ if (doVectorOffsets) {
+ termsHashPerField.initReader(reader, posting, 1);
+ reader.writeTo(tvf);
+ }
+ }
+
+ termsHashPerField.reset();
+ perThread.termsHashPerThread.reset(false);
+ }
+
+ void shrinkHash() {
+ termsHashPerField.shrinkHash(maxNumPostings);
+ maxNumPostings = 0;
+ }
+
+ void newTerm(Token t, RawPostingList p0) {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
+
+ TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
+
+ p.freq = 1;
+
+ if (doVectorOffsets) {
+ final int startOffset = fieldState.offset + t.startOffset();
+ final int endOffset = fieldState.offset + t.endOffset();
+ termsHashPerField.writeVInt(1, startOffset);
+ termsHashPerField.writeVInt(1, endOffset - startOffset);
+ p.lastOffset = endOffset;
+ }
+
+ if (doVectorPositions) {
+ termsHashPerField.writeVInt(0, fieldState.position);
+ p.lastPosition = fieldState.position;
+ }
+ }
+
+ void addTerm(Token t, RawPostingList p0) {
+
+ assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
+
+ TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
+ p.freq++;
+
+ if (doVectorOffsets) {
+ final int startOffset = fieldState.offset + t.startOffset();
+ final int endOffset = fieldState.offset + t.endOffset();
+ termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
+ termsHashPerField.writeVInt(1, endOffset - startOffset);
+ p.lastOffset = endOffset;
+ }
+
+ if (doVectorPositions) {
+ termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition);
+ p.lastPosition = fieldState.position;
+ }
+ }
+
+ void skippingLongTerm(Token t) {}
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,87 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.UnicodeUtil;
+
+final class TermVectorsTermsWriterPerThread extends TermsHashConsumerPerThread {
+
+ final TermVectorsTermsWriter termsWriter;
+ final TermsHashPerThread termsHashPerThread;
+ final DocumentsWriter.DocState docState;
+
+ TermVectorsTermsWriter.PerDoc doc;
+
+ public TermVectorsTermsWriterPerThread(TermsHashPerThread termsHashPerThread, TermVectorsTermsWriter termsWriter) {
+ this.termsWriter = termsWriter;
+ this.termsHashPerThread = termsHashPerThread;
+ docState = termsHashPerThread.docState;
+ }
+
+ // Used by perField when serializing the term vectors
+ final ByteSliceReader vectorSliceReader = new ByteSliceReader();
+
+ final UnicodeUtil.UTF8Result utf8Results[] = {new UnicodeUtil.UTF8Result(),
+ new UnicodeUtil.UTF8Result()};
+
+ public void startDocument() {
+ assert clearLastVectorFieldName();
+ if (doc != null) {
+ doc.reset();
+ doc.docID = docState.docID;
+ }
+ }
+
+ public DocumentsWriter.DocWriter finishDocument() {
+ try {
+ return doc;
+ } finally {
+ doc = null;
+ }
+ }
+
+ public TermsHashConsumerPerField addField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo) {
+ return new TermVectorsTermsWriterPerField(termsHashPerField, this, fieldInfo);
+ }
+
+ public void abort() {
+ if (doc != null) {
+ doc.abort();
+ doc = null;
+ }
+ }
+
+ // Called only by assert
+ final boolean clearLastVectorFieldName() {
+ lastVectorFieldName = null;
+ return true;
+ }
+
+ // Called only by assert
+ String lastVectorFieldName;
+ final boolean vectorFieldsInOrder(FieldInfo fi) {
+ try {
+ if (lastVectorFieldName != null)
+ return lastVectorFieldName.compareTo(fi.name) < 0;
+ else
+ return true;
+ } finally {
+ lastVectorFieldName = fi.name;
+ }
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerThread.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,244 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.HashSet;
+import java.util.Arrays;
+import java.io.IOException;
+
+import org.apache.lucene.util.ArrayUtil;
+
+/** This class implements {@link InvertedDocConsumer}, which
+ * is passed each token produced by the analyzer on each
+ * field. It stores these tokens in a hash table, and
+ * allocates separate byte streams per token. Consumers of
+ * this class, eg {@link FreqProxTermsWriter} and {@link
+ * TermVectorsTermsWriter}, write their own byte streams
+ * under each term.
+ */
+
+final class TermsHash extends InvertedDocConsumer {
+
+ final TermsHashConsumer consumer;
+ final TermsHash nextTermsHash;
+ final int bytesPerPosting;
+ final int postingsFreeChunk;
+ final int streamCount;
+ final DocumentsWriter docWriter;
+
+ TermsHash primaryTermsHash;
+
+ RawPostingList[] postingsFreeList = new RawPostingList[1];
+ int postingsFreeCount;
+ int postingsAllocCount;
+ boolean trackAllocations;
+
+ public TermsHash(final DocumentsWriter docWriter, boolean trackAllocations, final TermsHashConsumer consumer, final TermsHash nextTermsHash) {
+ this.docWriter = docWriter;
+ this.consumer = consumer;
+ this.streamCount = consumer.streamCount;
+ this.nextTermsHash = nextTermsHash;
+ this.trackAllocations = trackAllocations;
+
+ // Why + 4*POINTER_NUM_BYTE below?
+ // +1: Posting is referenced by postingsFreeList array
+ // +3: Posting is referenced by hash, which
+ // targets 25-50% fill factor; approximate this
+ // as 3X # pointers
+ bytesPerPosting = consumer.bytesPerPosting() + 4*DocumentsWriter.POINTER_NUM_BYTE;
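+    // freeRAM() releases recycled postings in chunks whose aggregate size is about one byte block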
+ postingsFreeChunk = (int) (DocumentsWriter.BYTE_BLOCK_SIZE / bytesPerPosting);
+ }
+
+ InvertedDocConsumerPerThread addThread(DocInverterPerThread docInverterPerThread) {
+ return new TermsHashPerThread(docInverterPerThread, this, nextTermsHash, null);
+ }
+
+ TermsHashPerThread addThread(DocInverterPerThread docInverterPerThread, TermsHashPerThread primaryPerThread) {
+ return new TermsHashPerThread(docInverterPerThread, this, nextTermsHash, primaryPerThread);
+ }
+
+ void setFieldInfos(FieldInfos fieldInfos) {
+ this.fieldInfos = fieldInfos;
+ consumer.setFieldInfos(fieldInfos);
+ }
+
+ synchronized public void abort() {
+ consumer.abort();
+ if (nextTermsHash != null)
+ nextTermsHash.abort();
+ }
+
+ void shrinkFreePostings(Map threadsAndFields, DocumentsWriter.FlushState state) {
+
+ assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer;
+
+ final int newSize = ArrayUtil.getShrinkSize(postingsFreeList.length, postingsAllocCount);
+ if (newSize != postingsFreeList.length) {
+ RawPostingList[] newArray = new RawPostingList[newSize];
+ System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount);
+ postingsFreeList = newArray;
+ }
+ }
+
+ synchronized void closeDocStore(DocumentsWriter.FlushState state) throws IOException {
+ consumer.closeDocStore(state);
+ if (nextTermsHash != null)
+ nextTermsHash.closeDocStore(state);
+ }
+
+ synchronized void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException {
+ Map childThreadsAndFields = new HashMap();
+ Map nextThreadsAndFields;
+
+ if (nextTermsHash != null)
+ nextThreadsAndFields = new HashMap();
+ else
+ nextThreadsAndFields = null;
+
+ Iterator it = threadsAndFields.entrySet().iterator();
+ while(it.hasNext()) {
+
+ Map.Entry entry = (Map.Entry) it.next();
+
+ TermsHashPerThread perThread = (TermsHashPerThread) entry.getKey();
+
+ Collection fields = (Collection) entry.getValue();
+
+ Iterator fieldsIt = fields.iterator();
+ Collection childFields = new HashSet();
+ Collection nextChildFields;
+
+ if (nextTermsHash != null)
+ nextChildFields = new HashSet();
+ else
+ nextChildFields = null;
+
+ while(fieldsIt.hasNext()) {
+ TermsHashPerField perField = (TermsHashPerField) fieldsIt.next();
+ childFields.add(perField.consumer);
+ if (nextTermsHash != null)
+ nextChildFields.add(perField.nextPerField);
+ }
+
+ childThreadsAndFields.put(perThread.consumer, childFields);
+ if (nextTermsHash != null)
+ nextThreadsAndFields.put(perThread.nextPerThread, nextChildFields);
+ }
+
+ consumer.flush(childThreadsAndFields, state);
+
+ shrinkFreePostings(threadsAndFields, state);
+
+ if (nextTermsHash != null)
+ nextTermsHash.flush(nextThreadsAndFields, state);
+ }
+
+ synchronized public boolean freeRAM() {
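+    // Release up to one chunk of recycled postings and credit
+    // the freed bytes back to the DocumentsWriter accounting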
+
+ if (!trackAllocations)
+ return false;
+
+ boolean any;
+ final int numToFree;
+ if (postingsFreeCount >= postingsFreeChunk)
+ numToFree = postingsFreeChunk;
+ else
+ numToFree = postingsFreeCount;
+ any = numToFree > 0;
+ if (any) {
+ Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null);
+ postingsFreeCount -= numToFree;
+ postingsAllocCount -= numToFree;
+ docWriter.bytesAllocated(-numToFree * bytesPerPosting);
+ any = true;
+ }
+
+ if (nextTermsHash != null)
+ any |= nextTermsHash.freeRAM();
+
+ return any;
+ }
+
+ // USE ONLY FOR DEBUGGING!
+ /*
+ public String getPostingText() {
+ char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
+ int upto = p.textStart & CHAR_BLOCK_MASK;
+ while(text[upto] != 0xffff)
+ upto++;
+ return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK));
+ }
+ */
+
+ synchronized public void recyclePostings(final RawPostingList[] postings, final int numPostings) {
+
+ assert postings.length >= numPostings;
+
+ // Move all Postings from this ThreadState back to our
+ // free list. We pre-allocated this array while we were
+ // creating Postings to make sure it's large enough
+ assert postingsFreeCount + numPostings <= postingsFreeList.length;
+ System.arraycopy(postings, 0, postingsFreeList, postingsFreeCount, numPostings);
+ postingsFreeCount += numPostings;
+ }
+
+ synchronized public void getPostings(final RawPostingList[] postings) {
+
+ assert docWriter.writer.testPoint("TermsHash.getPostings start");
+
+ assert postingsFreeCount <= postingsFreeList.length;
+ assert postingsFreeCount <= postingsAllocCount: "postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount;
+
+ final int numToCopy;
+ if (postingsFreeCount < postings.length)
+ numToCopy = postingsFreeCount;
+ else
+ numToCopy = postings.length;
+ final int start = postingsFreeCount-numToCopy;
+ assert start >= 0;
+ assert start + numToCopy <= postingsFreeList.length;
+ assert numToCopy <= postings.length;
+ System.arraycopy(postingsFreeList, start,
+ postings, 0, numToCopy);
+
+ // Directly allocate the remainder if any
+ if (numToCopy < postings.length) {
+ final int extra = postings.length - numToCopy;
+ final int newPostingsAllocCount = postingsAllocCount + extra;
+
+ if (newPostingsAllocCount > postingsFreeList.length)
+ postingsFreeList = new RawPostingList[ArrayUtil.getNextSize(newPostingsAllocCount)];
+
+ consumer.createPostings(postings, numToCopy, extra);
+ assert docWriter.writer.testPoint("TermsHash.getPostings after create");
+ postingsAllocCount += extra;
+
+ if (trackAllocations)
+ docWriter.bytesAllocated(extra * bytesPerPosting);
+ }
+
+ postingsFreeCount -= numToCopy;
+
+ if (trackAllocations)
+ docWriter.bytesUsed(postings.length * bytesPerPosting);
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHash.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,38 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Map;
+
+abstract class TermsHashConsumer {
+ abstract int bytesPerPosting();
+ abstract void createPostings(RawPostingList[] postings, int start, int count);
+ abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread);
+ abstract void flush(Map threadsAndFields, final DocumentsWriter.FlushState state) throws IOException;
+ abstract void abort();
+ abstract void closeDocStore(DocumentsWriter.FlushState state) throws IOException;
+
+ int streamCount;
+
+ FieldInfos fieldInfos;
+
+ void setFieldInfos(FieldInfos fieldInfos) {
+ this.fieldInfos = fieldInfos;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,35 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.Token;
+
+/** Implement this class to plug into the TermsHash
+ * processor, which inverts & stores Tokens into a hash
+ * table and provides an API for writing bytes into
+ * multiple streams for each unique Token. */
+abstract class TermsHashConsumerPerField {
+ abstract boolean start(Fieldable[] fields, int count) throws IOException;
+ abstract void finish() throws IOException;
+ abstract void skippingLongTerm(Token t) throws IOException;
+ abstract void newTerm(Token t, RawPostingList p) throws IOException;
+ abstract void addTerm(Token t, RawPostingList p) throws IOException;
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,27 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+abstract class TermsHashConsumerPerThread {
+ abstract void startDocument() throws IOException;
+ abstract DocumentsWriter.DocWriter finishDocument() throws IOException;
+ abstract public TermsHashConsumerPerField addField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo);
+ abstract public void abort();
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,545 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.util.UnicodeUtil;
+
+final class TermsHashPerField extends InvertedDocConsumerPerField {
+
+ final TermsHashConsumerPerField consumer;
+ final TermsHashPerField nextPerField;
+ final TermsHashPerThread perThread;
+ final DocumentsWriter.DocState docState;
+ final DocInverter.FieldInvertState fieldState;
+
+ // Copied from our perThread
+ final CharBlockPool charPool;
+ final IntBlockPool intPool;
+ final ByteBlockPool bytePool;
+
+ final int streamCount;
+ final int numPostingInt;
+
+ final FieldInfo fieldInfo;
+
+ boolean postingsCompacted;
+ int numPostings;
+ private int postingsHashSize = 4;
+ private int postingsHashHalfSize = postingsHashSize/2;
+ private int postingsHashMask = postingsHashSize-1;
+ private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
+ private RawPostingList p;
+
+ public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
+ this.perThread = perThread;
+ intPool = perThread.intPool;
+ charPool = perThread.charPool;
+ bytePool = perThread.bytePool;
+ docState = perThread.docState;
+ fieldState = docInverterPerField.fieldState;
+ streamCount = perThread.termsHash.streamCount;
+ numPostingInt = 2*streamCount;
+ this.consumer = perThread.consumer.addField(this, fieldInfo);
+ this.fieldInfo = fieldInfo;
+ if (nextPerThread != null)
+ nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
+ else
+ nextPerField = null;
+ }
+
+ void shrinkHash(int targetSize) {
+ assert postingsCompacted || numPostings == 0;
+
+ // Cannot use ArrayUtil.shrink because we require power
+ // of 2:
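+ // (the hash is addressed with code & (size-1), which
+ // only works when size is a power of 2)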
+ int newSize = postingsHash.length;
+ while(newSize >= 8 && newSize/4 > targetSize) {
+ newSize /= 2;
+ }
+
+ if (newSize != postingsHash.length) {
+ postingsHash = new RawPostingList[newSize];
+ postingsHashSize = newSize;
+ postingsHashHalfSize = newSize/2;
+ postingsHashMask = newSize-1;
+ }
+ }
+
+ public void reset() {
+ if (!postingsCompacted)
+ compactPostings();
+ assert numPostings <= postingsHash.length;
+ if (numPostings > 0) {
+ perThread.termsHash.recyclePostings(postingsHash, numPostings);
+ Arrays.fill(postingsHash, 0, numPostings, null);
+ numPostings = 0;
+ }
+ postingsCompacted = false;
+ if (nextPerField != null)
+ nextPerField.reset();
+ }
+
+ synchronized public void abort() {
+ reset();
+ if (nextPerField != null)
+ nextPerField.abort();
+ }
+
+ public void initReader(ByteSliceReader reader, RawPostingList p, int stream) {
+ assert stream < streamCount;
+ final int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ final int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
+ reader.init(bytePool,
+ p.byteStart+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
+ ints[upto+stream]);
+ }
+
+ private synchronized void compactPostings() {
+ int upto = 0;
+ for(int i=0;i<postingsHashSize;i++) {
+ if (postingsHash[i] != null) {
+ if (upto < i) {
+ postingsHash[upto] = postingsHash[i];
+ postingsHash[i] = null;
+ }
+ upto++;
+ }
+ }
+
+ assert upto == numPostings;
+ postingsCompacted = true;
+ }
+
+ /** Collapse the hash table & sort in-place. */
+ public RawPostingList[] sortPostings() {
+ compactPostings();
+ quickSort(postingsHash, 0, numPostings-1);
+ return postingsHash;
+ }
+
+ void quickSort(RawPostingList[] postings, int lo, int hi) {
+ if (lo >= hi)
+ return;
+ else if (hi == 1+lo) {
+ if (comparePostings(postings[lo], postings[hi]) > 0) {
+ final RawPostingList tmp = postings[lo];
+ postings[lo] = postings[hi];
+ postings[hi] = tmp;
+ }
+ return;
+ }
+
+ int mid = (lo + hi) >>> 1;
+
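+ // Median-of-three: order postings[lo], postings[mid],
+ // postings[hi] so that postings[mid] becomes the pivot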
+ if (comparePostings(postings[lo], postings[mid]) > 0) {
+ RawPostingList tmp = postings[lo];
+ postings[lo] = postings[mid];
+ postings[mid] = tmp;
+ }
+
+ if (comparePostings(postings[mid], postings[hi]) > 0) {
+ RawPostingList tmp = postings[mid];
+ postings[mid] = postings[hi];
+ postings[hi] = tmp;
+
+ if (comparePostings(postings[lo], postings[mid]) > 0) {
+ RawPostingList tmp2 = postings[lo];
+ postings[lo] = postings[mid];
+ postings[mid] = tmp2;
+ }
+ }
+
+ int left = lo + 1;
+ int right = hi - 1;
+
+ if (left >= right)
+ return;
+
+ RawPostingList partition = postings[mid];
+
+ for (;;) {
+ while (comparePostings(postings[right], partition) > 0)
+ --right;
+
+ while (left < right && comparePostings(postings[left], partition) <= 0)
+ ++left;
+
+ if (left < right) {
+ RawPostingList tmp = postings[left];
+ postings[left] = postings[right];
+ postings[right] = tmp;
+ --right;
+ } else {
+ break;
+ }
+ }
+
+ quickSort(postings, lo, left);
+ quickSort(postings, left + 1, hi);
+ }
+
+ /** Compares term text for two Posting instances and
+ * returns a value < 0 if p1 < p2, > 0 if p1 > p2, else 0. */
+ int comparePostings(RawPostingList p1, RawPostingList p2) {
+
+ if (p1 == p2)
+ return 0;
+
+ final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+ final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ assert text1 != text2 || pos1 != pos2;
+
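+ // Term text in the char pool is terminated by 0xffff,
+ // so when one term is a prefix of the other the
+ // shorter term sorts first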
+ while(true) {
+ final char c1 = text1[pos1++];
+ final char c2 = text2[pos2++];
+ if (c1 != c2) {
+ if (0xffff == c2)
+ return 1;
+ else if (0xffff == c1)
+ return -1;
+ else
+ return c1-c2;
+ } else
+ // This method should never compare equal postings
+ // unless p1==p2
+ assert c1 != 0xffff;
+ }
+ }
+
+ /** Test whether the text for the current RawPostingList p equals
+ * the current tokenText. */
+ private boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
+
+ final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ assert text != null;
+ int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+
+ int tokenPos = 0;
+ for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+ if (tokenText[tokenPos] != text[pos])
+ return false;
+ return 0xffff == text[pos];
+ }
+
+ private boolean doCall;
+ private boolean doNextCall;
+
+ boolean start(Fieldable[] fields, int count) throws IOException {
+ doCall = consumer.start(fields, count);
+ if (nextPerField != null)
+ doNextCall = nextPerField.start(fields, count);
+ return doCall || doNextCall;
+ }
+
+ // Secondary entry point (for 2nd & subsequent TermsHash):
+ // the token text has already been "interned" into
+ // textStart, so we hash by textStart
+ public void add(Token token, int textStart) throws IOException {
+
+ int code = textStart;
+
+ int hashPos = code & postingsHashMask;
+
+ assert !postingsCompacted;
+
+ // Locate RawPostingList in hash
+ p = postingsHash[hashPos];
+
+ if (p != null && p.textStart != textStart) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code>>8)+code)|1;
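+ // The stride is forced odd; since the table size is a
+ // power of 2 the stride is coprime with it, so the
+ // probe sequence eventually visits every slot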
+ do {
+ code += inc;
+ hashPos = code & postingsHashMask;
+ p = postingsHash[hashPos];
+ } while (p != null && p.textStart != textStart);
+ }
+
+ if (p == null) {
+
+ // First time we are seeing this token since we last
+ // flushed the hash.
+
+ // Refill?
+ if (0 == perThread.freePostingsCount)
+ perThread.morePostings();
+
+ // Pull next free RawPostingList from free list
+ p = perThread.freePostings[--perThread.freePostingsCount];
+ assert p != null;
+
+ p.textStart = textStart;
+
+ assert postingsHash[hashPos] == null;
+ postingsHash[hashPos] = p;
+ numPostings++;
+
+ if (numPostings == postingsHashHalfSize)
+ rehashPostings(2*postingsHashSize);
+
+ // Init stream slices
+ if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+ intPool.nextBuffer();
+
+ if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ bytePool.nextBuffer();
+
+ intUptos = intPool.buffer;
+ intUptoStart = intPool.intUpto;
+ intPool.intUpto += streamCount;
+
+ p.intStart = intUptoStart + intPool.intOffset;
+
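+ // One first-level byte slice per stream; the parallel
+ // int entries record each stream's current write
+ // position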
+ for(int i=0;i<streamCount;i++) {
+ final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+ intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+ }
+ p.byteStart = intUptos[intUptoStart];
+
+ consumer.newTerm(token, p);
+
+ } else {
+ intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
+ consumer.addTerm(token, p);
+ }
+ }
+
+ // Primary entry point (for first TermsHash)
+ void add(Token token) throws IOException {
+
+ assert !postingsCompacted;
+
+ // We are first in the chain so we must "intern" the
+ // term text into the textStart address
+
+ // Get the text of this term.
+ final char[] tokenText = token.termBuffer();
+ final int tokenTextLen = token.termLength();
+
+ // Compute hashcode & replace any invalid UTF16 sequences
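+ // (scanning backwards so a low surrogate can check the
+ // preceding char for its high-surrogate partner;
+ // unpaired surrogates are replaced with U+FFFD)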
+ int downto = tokenTextLen;
+ int code = 0;
+ while (downto > 0) {
+ char ch = tokenText[--downto];
+
+ if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
+ if (0 == downto) {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ } else {
+ final char ch2 = tokenText[downto-1];
+ if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
+ // OK: high followed by low. This is a valid
+ // surrogate pair.
+ code = ((code*31) + ch)*31+ch2;
+ downto--;
+ continue;
+ } else {
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+ }
+ }
+ } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END)
+ // Unpaired
+ ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
+
+ code = (code*31) + ch;
+ }
+
+ int hashPos = code & postingsHashMask;
+
+ // Locate RawPostingList in hash
+ p = postingsHash[hashPos];
+
+ if (p != null && !postingEquals(tokenText, tokenTextLen)) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ hashPos = code & postingsHashMask;
+ p = postingsHash[hashPos];
+ } while (p != null && !postingEquals(tokenText, tokenTextLen));
+ }
+
+ if (p == null) {
+
+ // First time we are seeing this token since we last
+ // flushed the hash.
+ final int textLen1 = 1+tokenTextLen;
+ if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
+ if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
+ // Just skip this term, to remain as robust as
+ // possible during indexing. A TokenFilter
+ // can be inserted into the analyzer chain if
+ // other behavior is wanted (pruning the term
+ // to a prefix, throwing an exception, etc).
+
+ if (docState.maxTermPrefix == null)
+ docState.maxTermPrefix = new String(tokenText, 0, 30);
+
+ consumer.skippingLongTerm(token);
+ return;
+ }
+ charPool.nextBuffer();
+ }
+
+ // Refill?
+ if (0 == perThread.freePostingsCount)
+ perThread.morePostings();
+
+ // Pull next free RawPostingList from free list
+ p = perThread.freePostings[--perThread.freePostingsCount];
+ assert p != null;
+
+ final char[] text = charPool.buffer;
+ final int textUpto = charPool.charUpto;
+ p.textStart = textUpto + charPool.charOffset;
+ charPool.charUpto += textLen1;
+ System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
+ text[textUpto+tokenTextLen] = 0xffff;
+
+ assert postingsHash[hashPos] == null;
+ postingsHash[hashPos] = p;
+ numPostings++;
+
+ if (numPostings == postingsHashHalfSize)
+ rehashPostings(2*postingsHashSize);
+
+ // Init stream slices
+ if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
+ intPool.nextBuffer();
+
+ if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ bytePool.nextBuffer();
+
+ intUptos = intPool.buffer;
+ intUptoStart = intPool.intUpto;
+ intPool.intUpto += streamCount;
+
+ p.intStart = intUptoStart + intPool.intOffset;
+
+ for(int i=0;i<streamCount;i++) {
+ final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
+ intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
+ }
+ p.byteStart = intUptos[intUptoStart];
+
+ consumer.newTerm(token, p);
+
+ } else {
+ intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
+ intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
+ consumer.addTerm(token, p);
+ }
+
+ if (doNextCall)
+ nextPerField.add(token, p.textStart);
+ }
+
+ int[] intUptos;
+ int intUptoStart;
+
+ void writeByte(int stream, byte b) {
+ int upto = intUptos[intUptoStart+stream];
+ byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+ assert bytes != null;
+ int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+ if (bytes[offset] != 0) {
+ // End of slice; allocate a new one
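+ // (slices are zero filled except for a non-zero level
+ // marker in their last byte, so a non-zero byte at the
+ // write position means we hit the slice boundary)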
+ offset = bytePool.allocSlice(bytes, offset);
+ bytes = bytePool.buffer;
+ intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
+ }
+ bytes[offset] = b;
+ (intUptos[intUptoStart+stream])++;
+ }
+
+ public void writeBytes(int stream, byte[] b, int offset, int len) {
+ // TODO: optimize
+ final int end = offset + len;
+ for(int i=offset;i<end;i++)
+ writeByte(stream, b[i]);
+ }
+
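+ // Standard VInt encoding: 7 bits per byte, least
+ // significant group first, high bit set on all bytes
+ // except the last. For example, 300 (0x12C) is written
+ // as 0xAC 0x02.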
+ void writeVInt(int stream, int i) {
+ while ((i & ~0x7F) != 0) {
+ writeByte(stream, (byte)((i & 0x7f) | 0x80));
+ i >>>= 7;
+ }
+ writeByte(stream, (byte) i);
+ }
+
+ void finish() throws IOException {
+ consumer.finish();
+ if (nextPerField != null)
+ nextPerField.finish();
+ }
+
+ /** Called when the postings hash is too small for the number
+ * of unique terms (> 50% occupied) or too large (< 20% occupied). */
+ void rehashPostings(final int newSize) {
+
+ final int newMask = newSize-1;
+
+ RawPostingList[] newHash = new RawPostingList[newSize];
+ for(int i=0;i<postingsHashSize;i++) {
+ RawPostingList p0 = postingsHash[i];
+ if (p0 != null) {
+ int code;
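+ // The rehash must use the same hash function as add():
+ // the primary chain recomputes it from the interned
+ // term text (terminated by 0xffff); secondary chains
+ // hash the textStart address directly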
+ if (perThread.primary) {
+ final int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
+ final char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
+ int pos = start;
+ while(text[pos] != 0xffff)
+ pos++;
+ code = 0;
+ while (pos > start)
+ code = (code*31) + text[--pos];
+ } else
+ code = p0.textStart;
+
+ int hashPos = code & newMask;
+ assert hashPos >= 0;
+ if (newHash[hashPos] != null) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ hashPos = code & newMask;
+ } while (newHash[hashPos] != null);
+ }
+ newHash[hashPos] = p0;
+ }
+ }
+
+ postingsHashMask = newMask;
+ postingsHash = newHash;
+ postingsHashSize = newSize;
+ postingsHashHalfSize = newSize >> 1;
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerField.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=677865&view=auto
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerThread.java (added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerThread.java Fri Jul 18 02:20:12 2008
@@ -0,0 +1,116 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+final class TermsHashPerThread extends InvertedDocConsumerPerThread {
+
+ final TermsHash termsHash;
+ final TermsHashConsumerPerThread consumer;
+ final TermsHashPerThread nextPerThread;
+
+ final CharBlockPool charPool;
+ final IntBlockPool intPool;
+ final ByteBlockPool bytePool;
+ final boolean primary;
+ final DocumentsWriter.DocState docState;
+
+ final RawPostingList[] freePostings = new RawPostingList[256];
+ int freePostingsCount;
+
+ public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
+ docState = docInverterPerThread.docState;
+
+ this.termsHash = termsHash;
+ this.consumer = termsHash.consumer.addThread(this);
+
+ if (nextTermsHash != null) {
+ // We are primary
+ charPool = new CharBlockPool(termsHash.docWriter);
+ primary = true;
+ } else {
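+ // Secondary TermsHash: share the primary's char pool,
+ // since term text is interned only once by the
+ // primary chain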
+ charPool = primaryPerThread.charPool;
+ primary = false;
+ }
+
+ intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations);
+ bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations);
+
+ if (nextTermsHash != null)
+ nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
+ else
+ nextPerThread = null;
+ }
+
+ InvertedDocConsumerPerField addField(DocInverterPerField docInverterPerField, final FieldInfo fieldInfo) {
+ return new TermsHashPerField(docInverterPerField, this, nextPerThread, fieldInfo);
+ }
+
+ synchronized public void abort() {
+ reset(true);
+ consumer.abort();
+ if (nextPerThread != null)
+ nextPerThread.abort();
+ }
+
+ // perField calls this when it needs more postings:
+ void morePostings() throws IOException {
+ assert freePostingsCount == 0;
+ termsHash.getPostings(freePostings);
+ freePostingsCount = freePostings.length;
+ for(int i=0;i<freePostingsCount;i++)
+ assert freePostings[i] != null;
+ }
+
+ public void startDocument() throws IOException {
+ consumer.startDocument();
+ if (nextPerThread != null)
+ nextPerThread.consumer.startDocument();
+ }
+
+ public DocumentsWriter.DocWriter finishDocument() throws IOException {
+ final DocumentsWriter.DocWriter doc = consumer.finishDocument();
+
+ final DocumentsWriter.DocWriter doc2;
+ if (nextPerThread != null)
+ doc2 = nextPerThread.consumer.finishDocument();
+ else
+ doc2 = null;
+ if (doc == null)
+ return doc2;
+ else {
+ doc.setNext(doc2);
+ return doc;
+ }
+ }
+
+ // Clear all state
+ void reset(boolean recyclePostings) {
+ intPool.reset();
+ bytePool.reset();
+
+ if (primary)
+ charPool.reset();
+
+ if (recyclePostings) {
+ termsHash.recyclePostings(freePostings, freePostingsCount);
+ freePostingsCount = 0;
+ }
+ }
+}
Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/TermsHashPerThread.java
------------------------------------------------------------------------------
svn:eol-style = native