You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by si...@apache.org on 2011/02/09 10:36:03 UTC
svn commit: r1068809 [7/36] - in /lucene/dev/branches/docvalues: ./
dev-tools/eclipse/ dev-tools/idea/.idea/ dev-tools/idea/.idea/copyright/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/queryparser/ dev-tools/...
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/ParallelReader.java Wed Feb 9 09:35:27 2011
@@ -24,12 +24,12 @@ import org.apache.lucene.document.Fielda
import org.apache.lucene.index.values.DocValues;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Pair;
-import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.MapBackedSet;
import java.io.IOException;
import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
/** An IndexReader which reads multiple, parallel indexes. Each index added
@@ -57,7 +57,7 @@ public class ParallelReader extends Inde
private Map<IndexReader,Collection<String>> readerToFields = new HashMap<IndexReader,Collection<String>>();
private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>();
private Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
-
+ private final ReaderContext topLevelReaderContext = new AtomicReaderContext(this);
private int maxDoc;
private int numDocs;
private boolean hasDeletions;
@@ -76,6 +76,7 @@ public class ParallelReader extends Inde
public ParallelReader(boolean closeSubReaders) throws IOException {
super();
this.incRefReaders = !closeSubReaders;
+ readerFinishedListeners = new MapBackedSet<ReaderFinishedListener>(new ConcurrentHashMap<ReaderFinishedListener,Boolean>());
}
/** {@inheritDoc} */
@@ -92,7 +93,7 @@ public class ParallelReader extends Inde
buffer.append(')');
return buffer.toString();
}
-
+
/** Add an IndexReader.
* @throws IOException if there is a low-level IO error
*/
@@ -452,6 +453,8 @@ public class ParallelReader extends Inde
return bytes;
if (!hasNorms(field))
return null;
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
bytes = MultiNorms.norms(reader, field);
normsCache.put(field, bytes);
@@ -459,23 +462,6 @@ public class ParallelReader extends Inde
}
@Override
- public synchronized void norms(String field, byte[] result, int offset)
- throws IOException {
- // TODO: maybe optimize
- ensureOpen();
- IndexReader reader = fieldToReader.get(field);
- if (reader==null)
- return;
-
- byte[] norms = norms(field);
- if (norms == null) {
- Arrays.fill(result, offset, result.length, Similarity.getDefault().encodeNormValue(1.0f));
- } else {
- System.arraycopy(norms, 0, result, offset, maxDoc());
- }
- }
-
- @Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
IndexReader reader = fieldToReader.get(field);
@@ -560,8 +546,6 @@ public class ParallelReader extends Inde
readers.get(i).close();
}
}
-
- FieldCache.DEFAULT.purge(this);
}
@Override
@@ -574,6 +558,26 @@ public class ParallelReader extends Inde
}
return fieldSet;
}
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return topLevelReaderContext;
+ }
+
+ @Override
+ public void addReaderFinishedListener(ReaderFinishedListener listener) {
+ super.addReaderFinishedListener(listener);
+ for (IndexReader reader : readers) {
+ reader.addReaderFinishedListener(listener);
+ }
+ }
+
+ @Override
+ public void removeReaderFinishedListener(ReaderFinishedListener listener) {
+ super.removeReaderFinishedListener(listener);
+ for (IndexReader reader : readers) {
+ reader.removeReaderFinishedListener(listener);
+ }
+ }
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PayloadProcessorProvider.java Wed Feb 9 09:35:27 2011
@@ -24,7 +24,7 @@ import org.apache.lucene.util.BytesRef;
/**
* Provides a {@link DirPayloadProcessor} to be used for a {@link Directory}.
- * This allows using differnt {@link DirPayloadProcessor}s for different
+ * This allows using different {@link DirPayloadProcessor}s for different
* directories, for e.g. to perform different processing of payloads of
* different directories.
* <p>
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PerFieldCodecWrapper.java Wed Feb 9 09:35:27 2011
@@ -224,6 +224,7 @@ final class PerFieldCodecWrapper extends
}
}
+ @Override
public FieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
return new FieldsReader(state.dir, state.fieldInfos, state.segmentInfo,
@@ -233,7 +234,7 @@ final class PerFieldCodecWrapper extends
@Override
public void files(Directory dir, SegmentInfo info, String codecId, Set<String> files)
throws IOException {
- // ignore codecid sicne segmentCodec will assign it per codec
+ // ignore codecid since segmentCodec will assign it per codec
segmentCodecs.files(dir, info, files);
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/PersistentSnapshotDeletionPolicy.java Wed Feb 9 09:35:27 2011
@@ -103,7 +103,7 @@ public class PersistentSnapshotDeletionP
* @param mode
* specifies whether a new index should be created, deleting all
* existing snapshots information (immediately), or open an existing
- * index, initializing the class with the snapsthots information.
+ * index, initializing the class with the snapshots information.
* @param matchVersion
* specifies the {@link Version} that should be used when opening the
* IndexWriter.
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfo.java Wed Feb 9 09:35:27 2011
@@ -20,6 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.Constants;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter;
@@ -67,10 +68,11 @@ public final class SegmentInfo {
private boolean isCompoundFile;
- private List<String> files; // cached list of files that this segment uses
+ private volatile List<String> files; // cached list of files that this segment uses
// in the Directory
- long sizeInBytes = -1; // total byte size of all of our files (computed on demand)
+ private volatile long sizeInBytesNoStore = -1; // total byte size of all but the store files (computed on demand)
+ private volatile long sizeInBytesWithStore = -1; // total byte size of all of our files (computed on demand)
private int docStoreOffset; // if this segment shares stored fields & vectors, this
// offset is where in that file this segment's docs begin
@@ -88,6 +90,17 @@ public final class SegmentInfo {
private Map<String,String> diagnostics;
+ // Tracks the Lucene version this segment was created with, since 3.1. Null
+ // indicates an older than 3.0 index, and it's used to detect a too old index.
+ // The format expected is "x.y" - "2.x" for pre-3.0 indexes (or null), and
+ // specific versions afterwards ("3.0", "3.1" etc.).
+ // see Constants.LUCENE_MAIN_VERSION.
+ private String version;
+
+ // NOTE: only used in-RAM by IW to track buffered deletes;
+ // this is never written to/read from the Directory
+ private long bufferedDeletesGen;
+
public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile,
boolean hasProx, SegmentCodecs segmentCodecs, boolean hasVectors) {
this.name = name;
@@ -96,10 +109,12 @@ public final class SegmentInfo {
delGen = NO;
this.isCompoundFile = isCompoundFile;
this.docStoreOffset = -1;
+ this.docStoreSegment = name;
this.hasProx = hasProx;
this.segmentCodecs = segmentCodecs;
this.hasVectors = hasVectors;
delCount = 0;
+ version = Constants.LUCENE_MAIN_VERSION;
}
/**
@@ -107,11 +122,13 @@ public final class SegmentInfo {
*/
void reset(SegmentInfo src) {
clearFiles();
+ version = src.version;
name = src.name;
docCount = src.docCount;
dir = src.dir;
delGen = src.delGen;
docStoreOffset = src.docStoreOffset;
+ docStoreSegment = src.docStoreSegment;
docStoreIsCompoundFile = src.docStoreIsCompoundFile;
hasVectors = src.hasVectors;
hasProx = src.hasProx;
@@ -146,6 +163,9 @@ public final class SegmentInfo {
*/
public SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException {
this.dir = dir;
+ if (format <= DefaultSegmentInfosWriter.FORMAT_3_1) {
+ version = input.readString();
+ }
name = input.readString();
docCount = input.readInt();
delGen = input.readLong();
@@ -219,26 +239,41 @@ public final class SegmentInfo {
}
}
}
-
- /** Returns total size in bytes of all of files used by
- * this segment. */
+
+ /**
+ * Returns total size in bytes of all of files used by this segment (if
+ * {@code includeDocStores} is true), or the size of all files except the
+ * store files otherwise.
+ */
public long sizeInBytes(boolean includeDocStores) throws IOException {
- if (sizeInBytes == -1) {
- List<String> files = files();
- final int size = files.size();
- sizeInBytes = 0;
- for(int i=0;i<size;i++) {
- final String fileName = files.get(i);
- if (!includeDocStores && IndexFileNames.isDocStoreFile(fileName)) {
+ if (includeDocStores) {
+ if (sizeInBytesWithStore != -1) {
+ return sizeInBytesWithStore;
+ }
+ long sum = 0;
+ for (final String fileName : files()) {
+ // We don't count bytes used by a shared doc store
+ // against this segment
+ if (docStoreOffset == -1 || !IndexFileNames.isDocStoreFile(fileName)) {
+ sum += dir.fileLength(fileName);
+ }
+ }
+ sizeInBytesWithStore = sum;
+ return sizeInBytesWithStore;
+ } else {
+ if (sizeInBytesNoStore != -1) {
+ return sizeInBytesNoStore;
+ }
+ long sum = 0;
+ for (final String fileName : files()) {
+ if (IndexFileNames.isDocStoreFile(fileName)) {
continue;
}
- // We don't count bytes used by a shared doc store
- // against this segment:
- if (docStoreOffset == -1 || !IndexFileNames.isDocStoreFile(fileName))
- sizeInBytes += dir.fileLength(fileName);
+ sum += dir.fileLength(fileName);
}
+ sizeInBytesNoStore = sum;
+ return sizeInBytesNoStore;
}
- return sizeInBytes;
}
public boolean getHasVectors() throws IOException {
@@ -286,6 +321,7 @@ public final class SegmentInfo {
si.normGen = normGen.clone();
}
si.hasVectors = hasVectors;
+ si.version = version;
return si;
}
@@ -426,6 +462,8 @@ public final class SegmentInfo {
public void write(IndexOutput output)
throws IOException {
assert delCount <= docCount: "delCount=" + delCount + " docCount=" + docCount + " segment=" + name;
+ // Write the Lucene version that created this segment, since 3.1
+ output.writeString(version);
output.writeString(name);
output.writeInt(docCount);
output.writeLong(delGen);
@@ -554,7 +592,8 @@ public final class SegmentInfo {
* files this segment has. */
private void clearFiles() {
files = null;
- sizeInBytes = -1;
+ sizeInBytesNoStore = -1;
+ sizeInBytesWithStore = -1;
}
/** {@inheritDoc} */
@@ -566,8 +605,9 @@ public final class SegmentInfo {
/** Used for debugging. Format may suddenly change.
*
* <p>Current format looks like
- * <code>_a:c45/4->_1</code>, which means the segment's
- * name is <code>_a</code>; it's using compound file
+ * <code>_a(3.1):c45/4->_1</code>, which means the segment's
+ * name is <code>_a</code>; it was created with Lucene 3.1 (or
+ * '?' if it's unknown); it's using compound file
* format (would be <code>C</code> if not compound); it
* has 45 documents; it has 4 deletions (this part is
* left off when there are no deletions); it's using the
@@ -577,7 +617,7 @@ public final class SegmentInfo {
public String toString(Directory dir, int pendingDelCount) {
StringBuilder s = new StringBuilder();
- s.append(name).append(':');
+ s.append(name).append('(').append(version == null ? "?" : version).append(')').append(':');
char cfs = getUseCompoundFile() ? 'c' : 'C';
s.append(cfs);
@@ -625,4 +665,32 @@ public final class SegmentInfo {
public int hashCode() {
return dir.hashCode() + name.hashCode();
}
+
+ /**
+ * Used by DefaultSegmentInfosReader to upgrade a 3.0 segment to record its
+ * version as "3.0". This method can be removed when we're not required to
+ * support 3x indexes anymore, e.g. in 5.0.
+ * <p>
+ * <b>NOTE:</b> this method is used for internal purposes only - you should
+ * not modify the version of a SegmentInfo, or it may result in unexpected
+ * exceptions thrown when you attempt to open the index.
+ *
+ * @lucene.internal
+ */
+ public void setVersion(String version) {
+ this.version = version;
+ }
+
+ /** Returns the version of the code which wrote the segment. */
+ public String getVersion() {
+ return version;
+ }
+
+ long getBufferedDeletesGen() {
+ return bufferedDeletesGen;
+ }
+
+ void setBufferedDeletesGen(long v) {
+ bufferedDeletesGen = v;
+ }
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfos.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfos.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfos.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentInfos.java Wed Feb 9 09:35:27 2011
@@ -308,6 +308,19 @@ public final class SegmentInfos extends
}
}
+ /** Prunes any segment whose docs are all deleted. */
+ public void pruneDeletedSegments() {
+ int segIdx = 0;
+ while(segIdx < size()) {
+ final SegmentInfo info = info(segIdx);
+ if (info.getDelCount() == info.docCount) {
+ remove(segIdx);
+ } else {
+ segIdx++;
+ }
+ }
+ }
+
/**
* Returns a copy of this instance, also copying each
* SegmentInfo.
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentMerger.java Wed Feb 9 09:35:27 2011
@@ -19,6 +19,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
@@ -59,7 +60,7 @@ final class SegmentMerger {
private int mergedDocs;
- private final CheckAbort checkAbort;
+ private final MergeState.CheckAbort checkAbort;
/** Maximum number of contiguous documents to bulk-copy
when merging stored fields */
@@ -78,9 +79,9 @@ final class SegmentMerger {
this.fieldInfos = fieldInfos;
segment = name;
if (merge != null) {
- checkAbort = new CheckAbort(merge, directory);
+ checkAbort = new MergeState.CheckAbort(merge, directory);
} else {
- checkAbort = new CheckAbort(null, null) {
+ checkAbort = new MergeState.CheckAbort(null, null) {
@Override
public void work(double units) throws MergeAbortedException {
// do nothing
@@ -266,7 +267,7 @@ final class SegmentMerger {
// details.
throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
- segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval, codecInfo, new AtomicLong(0));
+ segmentWriteState = new SegmentWriteState(null, directory, segment, fieldInfos, docCount, termIndexInterval, codecInfo, null, new AtomicLong(0));
return docCount;
}
@@ -508,6 +509,7 @@ final class SegmentMerger {
mergeState.hasPayloadProcessorProvider = payloadProcessorProvider != null;
mergeState.dirPayloadProcessor = new PayloadProcessorProvider.DirPayloadProcessor[mergeState.readerCount];
mergeState.currentPayloadProcessor = new PayloadProcessorProvider.PayloadProcessor[mergeState.readerCount];
+ mergeState.checkAbort = checkAbort;
docBase = 0;
int inputDocBase = 0;
@@ -571,13 +573,6 @@ final class SegmentMerger {
}
private void mergeNorms() throws IOException {
- // get needed buffer size by finding the largest segment
- int bufferSize = 0;
- for (IndexReader reader : readers) {
- bufferSize = Math.max(bufferSize, reader.maxDoc());
- }
-
- byte[] normBuffer = null;
IndexOutput output = null;
try {
for (int i = 0, numFieldInfos = fieldInfos.size(); i < numFieldInfos; i++) {
@@ -587,12 +582,15 @@ final class SegmentMerger {
output = directory.createOutput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION));
output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
}
- if (normBuffer == null) {
- normBuffer = new byte[bufferSize];
- }
for (IndexReader reader : readers) {
final int maxDoc = reader.maxDoc();
- reader.norms(fi.name, normBuffer, 0);
+ byte normBuffer[] = reader.norms(fi.name);
+ if (normBuffer == null) {
+ // Can be null if this segment doesn't have
+ // any docs with this field
+ normBuffer = new byte[maxDoc];
+ Arrays.fill(normBuffer, (byte)0);
+ }
if (!reader.hasDeletions()) {
//optimized case for segments without deleted docs
output.writeBytes(normBuffer, maxDoc);
@@ -616,31 +614,4 @@ final class SegmentMerger {
}
}
}
-
- static class CheckAbort {
- private double workCount;
- private MergePolicy.OneMerge merge;
- private Directory dir;
- public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
- this.merge = merge;
- this.dir = dir;
- }
-
- /**
- * Records the fact that roughly units amount of work
- * have been done since this method was last called.
- * When adding time-consuming code into SegmentMerger,
- * you should test different values for units to ensure
- * that the time in between calls to merge.checkAborted
- * is up to ~ 1 second.
- */
- public void work(double units) throws MergePolicy.MergeAbortedException {
- workCount += units;
- if (workCount >= 10000.0) {
- merge.checkAborted(dir);
- workCount = 0;
- }
- }
- }
-
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentReader.java Wed Feb 9 09:35:27 2011
@@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -32,7 +31,6 @@ import java.util.concurrent.atomic.Atomi
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -46,7 +44,6 @@ import org.apache.lucene.index.values.In
import org.apache.lucene.index.values.DocValues;
import org.apache.lucene.index.values.Floats;
import org.apache.lucene.index.values.Type;
-import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
import org.apache.lucene.util.BytesRef;
/**
@@ -57,7 +54,7 @@ public class SegmentReader extends Index
private SegmentInfo si;
private int readBufferSize;
-
+ private final ReaderContext readerContext = new AtomicReaderContext(this);
CloseableThreadLocal<FieldsReader> fieldsReaderLocal = new FieldsReaderLocal();
CloseableThreadLocal<TermVectorsReader> termVectorsLocal = new CloseableThreadLocal<TermVectorsReader>();
@@ -190,13 +187,9 @@ public class SegmentReader extends Index
storeCFSReader.close();
}
- // Force FieldCache to evict our entries at this
- // point. If the exception occurred while
- // initializing the core readers, then
- // origInstance will be null, and we don't want
- // to call FieldCache.purge (it leads to NPE):
+ // Now, notify any ReaderFinished listeners:
if (origInstance != null) {
- FieldCache.DEFAULT.purge(origInstance);
+ origInstance.notifyReaderFinishedListeners();
}
}
}
@@ -233,13 +226,7 @@ public class SegmentReader extends Index
assert storeDir != null;
}
- final String storesSegment;
- if (si.getDocStoreOffset() != -1) {
- storesSegment = si.getDocStoreSegment();
- } else {
- storesSegment = segment;
- }
-
+ final String storesSegment = si.getDocStoreSegment();
fieldsReaderOrig = new FieldsReader(storeDir, storesSegment, fieldInfos, readBufferSize,
si.getDocStoreOffset(), si.docCount);
@@ -342,29 +329,6 @@ public class SegmentReader extends Index
}
}
- // Load bytes but do not cache them if they were not
- // already cached
- public synchronized void bytes(byte[] bytesOut, int offset, int len) throws IOException {
- assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
- if (bytes != null) {
- // Already cached -- copy from cache:
- assert len <= maxDoc();
- System.arraycopy(bytes, 0, bytesOut, offset, len);
- } else {
- // Not cached
- if (origNorm != null) {
- // Ask origNorm to load
- origNorm.bytes(bytesOut, offset, len);
- } else {
- // We are orig -- read ourselves from disk:
- synchronized(in) {
- in.seek(normSeek);
- in.readBytes(bytesOut, offset, len, false);
- }
- }
- }
- }
-
// Load & cache full bytes array. Returns bytes.
public synchronized byte[] bytes() throws IOException {
assert refCount > 0 && (origNorm == null || origNorm.refCount > 0);
@@ -669,6 +633,7 @@ public class SegmentReader extends Index
clone.si = si;
clone.readBufferSize = readBufferSize;
clone.pendingDeleteCount = pendingDeleteCount;
+ clone.readerFinishedListeners = readerFinishedListeners;
if (!openReadOnly && hasChanges) {
// My pending changes transfer to the new reader
@@ -999,22 +964,6 @@ public class SegmentReader extends Index
norm.copyOnWrite()[doc] = value; // set the value
}
- /** Read norms into a pre-allocated array. */
- @Override
- public synchronized void norms(String field, byte[] bytes, int offset)
- throws IOException {
-
- ensureOpen();
- Norm norm = norms.get(field);
- if (norm == null) {
- Arrays.fill(bytes, offset, bytes.length, Similarity.getDefault().encodeNormValue(1.0f));
- return;
- }
-
- norm.bytes(bytes, offset, maxDoc());
- }
-
-
private void openNorms(Directory cfsDir, int readBufferSize) throws IOException {
long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
int maxDoc = maxDoc();
@@ -1191,6 +1140,11 @@ public class SegmentReader extends Index
buffer.append(si.toString(core.dir, pendingDeleteCount));
return buffer.toString();
}
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
+ }
/**
* Return the name of the segment this reader is reading.
@@ -1254,6 +1208,16 @@ public class SegmentReader extends Index
return core.termsIndexDivisor;
}
+ @Override
+ protected void readerFinished() {
+ // Do nothing here -- we have more careful control on
+ // when to notify that a SegmentReader has finished,
+ // because a given core is shared across many cloned
+ // SegmentReaders. We only notify once that core is no
+ // longer used (all SegmentReaders sharing it have been
+ // closed).
+ }
+
@Override
public DocValues docValues(String field) throws IOException {
return core.fields.docValues(field);
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Wed Feb 9 09:35:27 2011
@@ -23,6 +23,7 @@ import java.util.HashSet;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BitVector;
/**
* @lucene.experimental
@@ -37,6 +38,16 @@ public class SegmentWriteState {
public final Collection<String> flushedFiles;
public final AtomicLong bytesUsed;
+ // Deletes to apply while we are flushing the segment. A
+ // Term is enrolled in here if it was deleted at one
+ // point, and it's mapped to the docIDUpto, meaning any
+ // docID < docIDUpto containing this term should be
+ // deleted.
+ public final BufferedDeletes segDeletes;
+
+ // Lazily created:
+ public BitVector deletedDocs;
+
final SegmentCodecs segmentCodecs;
public final String codecId;
@@ -62,8 +73,9 @@ public class SegmentWriteState {
public SegmentWriteState(PrintStream infoStream, Directory directory, String segmentName, FieldInfos fieldInfos,
- int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, AtomicLong bytesUsed) {
+ int numDocs, int termIndexInterval, SegmentCodecs segmentCodecs, BufferedDeletes segDeletes, AtomicLong bytesUsed) {
this.infoStream = infoStream;
+ this.segDeletes = segDeletes;
this.directory = directory;
this.segmentName = segmentName;
this.fieldInfos = fieldInfos;
@@ -88,6 +100,7 @@ public class SegmentWriteState {
segmentCodecs = state.segmentCodecs;
flushedFiles = state.flushedFiles;
this.codecId = codecId;
+ segDeletes = state.segDeletes;
bytesUsed = state.bytesUsed;
}
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java Wed Feb 9 09:35:27 2011
@@ -18,13 +18,9 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.HashMap;
-import java.util.List;
-import java.util.ArrayList;
import java.util.Map;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ReaderUtil; // javadoc
@@ -55,10 +51,12 @@ import org.apache.lucene.index.MultiRead
public final class SlowMultiReaderWrapper extends FilterIndexReader {
+ private final ReaderContext readerContext;
private final Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
public SlowMultiReaderWrapper(IndexReader other) {
super(other);
+ readerContext = new AtomicReaderContext(this); // emulate atomic reader!
}
@Override
@@ -85,22 +83,17 @@ public final class SlowMultiReaderWrappe
return bytes;
if (!hasNorms(field))
return null;
-
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
+
bytes = MultiNorms.norms(in, field);
normsCache.put(field, bytes);
return bytes;
}
-
+
@Override
- public synchronized void norms(String field, byte[] bytes, int offset) throws IOException {
- // TODO: maybe optimize
- ensureOpen();
- byte[] norms = norms(field);
- if (norms == null) {
- Arrays.fill(bytes, offset, bytes.length, Similarity.getDefault().encodeNormValue(1.0f));
- } else {
- System.arraycopy(norms, 0, bytes, offset, maxDoc());
- }
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
}
@Override
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Wed Feb 9 09:35:27 2011
@@ -281,6 +281,7 @@ final class TermVectorsTermsWriterPerFie
int[] lastOffsets; // Last offset we saw
int[] lastPositions; // Last position where this term occurred
+ @Override
ParallelPostingsArray newInstance(int size) {
return new TermVectorsPostingsArray(size);
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/Terms.java Wed Feb 9 09:35:27 2011
@@ -57,6 +57,18 @@ public abstract class Terms {
}
}
+ /** Returns the total number of occurrences of the
+ * specified term text across all documents. Returns 0
+ * if the term does not exist. */
+ public long totalTermFreq(BytesRef text) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(text) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.totalTermFreq();
+ } else {
+ return 0;
+ }
+ }
+
/** Get {@link DocsEnum} for the specified term. This
* method may return null if the term does not exist. */
public DocsEnum docs(Bits skipDocs, BytesRef text, DocsEnum reuse) throws IOException {
@@ -80,11 +92,59 @@ public abstract class Terms {
}
}
+ /**
+ * Expert: Get {@link DocsEnum} for the specified {@link TermState}.
+ * This method may return <code>null</code> if the term does not exist.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ termsEnum.seek(term, termState);
+ return termsEnum.docs(skipDocs, reuse);
+ }
+
+ /**
+ * Get {@link DocsAndPositionsEnum} for the specified {@link TermState}. This
+ * method may return <code>null</code> if the term does not exist, or positions were
+ * not indexed.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ termsEnum.seek(term, termState);
+ return termsEnum.docsAndPositions(skipDocs, reuse);
+ }
+
public long getUniqueTermCount() throws IOException {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
- protected TermsEnum getThreadTermsEnum() throws IOException {
+ /** Returns the sum of {@link TermsEnum#totalTermFreq} for
+ * all terms in this field, or -1 if this measure isn't
+ * stored by the codec (or if this field omits term freq
+ * and positions). Note that, just like other term
+ * measures, this measure does not take deleted documents
+ * into account. */
+ public abstract long getSumTotalTermFreq() throws IOException;
+
+ /**
+ * Returns a thread-private {@link TermsEnum} instance. Obtaining
+ * {@link TermsEnum} from this method might be more efficient than using
+ * {@link #iterator()} directly since this method doesn't necessarily create a
+ * new {@link TermsEnum} instance.
+ * <p>
+ * NOTE: {@link TermsEnum} instances obtained from this method must not be
+ * shared across threads. The enum should only be used within a local context
+ * where other threads can't access it.
+ *
+ * @return a thread-private {@link TermsEnum} instance
+ * @throws IOException
+ * if an IOException occurs
+ * @lucene.internal
+ */
+ public TermsEnum getThreadTermsEnum() throws IOException {
TermsEnum termsEnum = threadEnums.get();
if (termsEnum == null) {
termsEnum = iterator();
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/TermsEnum.java Wed Feb 9 09:35:27 2011
@@ -73,7 +73,34 @@ public abstract class TermsEnum {
* may be before or after the current ord. See {@link
* #seek(BytesRef)}. */
public abstract SeekStatus seek(long ord) throws IOException;
-
+
+ /**
+ * Expert: Seeks a specific position by {@link TermState} previously obtained
+ * from {@link #termState()}. Callers should maintain the {@link TermState} to
+ * use this method. Low-level implementations may position the TermsEnum
+ * without re-seeking the term dictionary.
+ * <p>
+ * Seeking by {@link TermState} should only be used if the enum the state was
+ * obtained from and the enum the state is used for seeking are obtained from
+ * the same {@link IndexReader}; otherwise a {@link #seek(BytesRef, TermState)} call can
+ * leave the enum in an undefined state.
+ * <p>
+ * NOTE: Using this method with an incompatible {@link TermState} might leave
+ * this {@link TermsEnum} in undefined state. On a segment level
+ * {@link TermState} instances are compatible only iff the source and the
+ * target {@link TermsEnum} operate on the same field. If operating on segment
+ * level, TermState instances must not be used across segments.
+ * <p>
+ * NOTE: A seek by {@link TermState} might not restore the
+ * {@link AttributeSource}'s state. {@link AttributeSource} states must be
+ * maintained separately if this method is used.
+ * @param term the term the TermState corresponds to
+ * @param state the {@link TermState}
+ * */
+ public void seek(BytesRef term, TermState state) throws IOException {
+ seek(term);
+ }
+
/** Increments the enumeration to the next element.
* Returns the resulting term, or null if the end was
* hit. The returned BytesRef may be re-used across calls
@@ -97,7 +124,15 @@ public abstract class TermsEnum {
* term. Do not call this before calling next() for the
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
- public abstract int docFreq();
+ public abstract int docFreq() throws IOException;
+
+ /** Returns the total number of occurrences of this term
+ * across all documents (the sum of the freq() for each
+ * doc that has this term). This will be -1 if the
+ * codec doesn't support this measure. Note that, like
+ * other term measures, this measure does not take
+ * deleted documents into account. */
+ public abstract long totalTermFreq() throws IOException;
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
@@ -116,6 +151,25 @@ public abstract class TermsEnum {
* the postings by this codec. */
public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ /**
+ * Expert: Returns the TermsEnums internal state to position the TermsEnum
+ * without re-seeking the term dictionary.
+ * <p>
+ * NOTE: A seek by {@link TermState} might not capture the
+ * {@link AttributeSource}'s state. Callers must maintain the
+ * {@link AttributeSource} states separately
+ *
+ * @see TermState
+ * @see #seek(BytesRef, TermState)
+ */
+ public TermState termState() throws IOException {
+ return new TermState() {
+ @Override
+ public void copyFrom(TermState other) {
+ }
+ };
+ }
+
/** Return the {@link BytesRef} Comparator used to sort
* terms provided by the iterator. This may return
* null if there are no terms. Callers may invoke this
@@ -123,10 +177,6 @@ public abstract class TermsEnum {
* instance & reuse it. */
public abstract Comparator<BytesRef> getComparator() throws IOException;
- /** Optional optimization hint: informs the codec that the
- * current term is likely to be re-seek'd-to soon. */
- public abstract void cacheCurrentTerm() throws IOException;
-
/** An empty TermsEnum for quickly returning an empty instance e.g.
* in {@link org.apache.lucene.search.MultiTermQuery}
* <p><em>Please note:</em> This enum should be unmodifiable,
@@ -142,9 +192,6 @@ public abstract class TermsEnum {
public SeekStatus seek(long ord) { return SeekStatus.END; }
@Override
- public void cacheCurrentTerm() {}
-
- @Override
public BytesRef term() {
throw new IllegalStateException("this method should never be called");
}
@@ -158,6 +205,11 @@ public abstract class TermsEnum {
public int docFreq() {
throw new IllegalStateException("this method should never be called");
}
+
+ @Override
+ public long totalTermFreq() {
+ throw new IllegalStateException("this method should never be called");
+ }
@Override
public long ord() {
@@ -183,5 +235,15 @@ public abstract class TermsEnum {
public synchronized AttributeSource attributes() {
return super.attributes();
}
+
+ @Override
+ public TermState termState() throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
+
+ @Override
+ public void seek(BytesRef term, TermState state) throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
};
}
Copied: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java (from r1068464, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java?p2=lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java&p1=lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java&r1=1068464&r2=1068809&rev=1068809&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java Wed Feb 9 09:35:27 2011
@@ -35,6 +35,7 @@ import org.apache.lucene.index.TermState
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.standard.StandardPostingsReader; // javadocs
+import org.apache.lucene.index.values.DocValues;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -242,6 +243,11 @@ public class BlockTermsReader extends Fi
public TermsEnum terms() throws IOException {
return current.iterator();
}
+
+ @Override
+ public DocValues docValues() throws IOException {
+ return null;
+ }
}
private class FieldReader extends Terms implements Closeable {
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java Wed Feb 9 09:35:27 2011
@@ -19,7 +19,10 @@ package org.apache.lucene.index.codecs;
import java.io.IOException;
+import org.apache.lucene.index.CompoundFileReader;
import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldsReader;
+import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.SegmentInfo;
@@ -55,7 +58,41 @@ public class DefaultSegmentInfosReader e
infos.counter = input.readInt(); // read counter
for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
- infos.add(new SegmentInfo(directory, format, input, codecs));
+ SegmentInfo si = new SegmentInfo(directory, format, input, codecs);
+ if (si.getVersion() == null) {
+ // Could be a 3.0 - try to open the doc stores - if it fails, it's a
+ // 2.x segment, and an IndexFormatTooOldException will be thrown,
+ // which is what we want.
+ Directory dir = directory;
+ if (si.getDocStoreOffset() != -1) {
+ if (si.getDocStoreIsCompoundFile()) {
+ dir = new CompoundFileReader(dir, IndexFileNames.segmentFileName(
+ si.getDocStoreSegment(), "",
+ IndexFileNames.COMPOUND_FILE_STORE_EXTENSION), 1024);
+ }
+ } else if (si.getUseCompoundFile()) {
+ dir = new CompoundFileReader(dir, IndexFileNames.segmentFileName(
+ si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), 1024);
+ }
+
+ try {
+ FieldsReader.checkCodeVersion(dir, si.getDocStoreSegment());
+ } finally {
+ // If we opened the directory, close it
+ if (dir != directory) dir.close();
+ }
+
+ // Above call succeeded, so it's a 3.0 segment. Upgrade it so the next
+ // time the segment is read, its version won't be null and we won't
+ // need to open FieldsReader every time for each such segment.
+ si.setVersion("3.0");
+ } else if (si.getVersion().equals("2.x")) {
+ // If it's a 3x index touched by 3.1+ code, then segments record their
+ // version, whether they are 2.x ones or not. We detect that and throw
+ // appropriate exception.
+ throw new IndexFormatTooOldException(si.name, si.getVersion());
+ }
+ infos.add(si);
}
infos.userData = input.readStringStringMap();
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java Wed Feb 9 09:35:27 2011
@@ -38,9 +38,12 @@ public class DefaultSegmentInfosWriter e
/** Each segment records whether it has term vectors */
public static final int FORMAT_HAS_VECTORS = -10;
+ /** Each segment records the Lucene version that created it. */
+ public static final int FORMAT_3_1 = -11;
+
/** Each segment records whether its postings are written
* in the new flex format */
- public static final int FORMAT_4_0 = -11;
+ public static final int FORMAT_4_0 = -12;
/** This must always point to the most recent file format.
* whenever you add a new format, make it 1 smaller (negative version logic)! */
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Wed Feb 9 09:35:27 2011
@@ -44,7 +44,7 @@ public class FixedGapTermsIndexReader ex
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiple to long in multiple
- // places (error proned), we use long here:
+ // places (error prone), we use long here:
private long totalIndexInterval;
private int indexDivisor;
@@ -94,6 +94,7 @@ public class FixedGapTermsIndexReader ex
// Read directory
final int numFields = in.readVInt();
+ //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final int numIndexTerms = in.readVInt();
@@ -132,7 +133,6 @@ public class FixedGapTermsIndexReader ex
private class IndexEnum extends FieldIndexEnum {
private final FieldIndexData.CoreFieldIndex fieldIndex;
private final BytesRef term = new BytesRef();
- private final BytesRef nextTerm = new BytesRef();
private long ord;
public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
@@ -192,7 +192,7 @@ public class FixedGapTermsIndexReader ex
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
- termBytesReader.fillSlice(nextTerm, fieldIndex.termBytesStart + offset, length);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
@@ -242,9 +242,6 @@ public class FixedGapTermsIndexReader ex
this.packedOffsetsStart = packedOffsetsStart;
this.numIndexTerms = numIndexTerms;
- // We still create the indexReader when indexDivisor
- // is -1, so that PrefixCodedTermsReader can call
- // isIndexTerm for each field:
if (indexDivisor > 0) {
loadTermsIndex();
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Wed Feb 9 09:35:27 2011
@@ -53,7 +53,6 @@ public class FixedGapTermsIndexWriter ex
private final List<SimpleFieldWriter> fields = new ArrayList<SimpleFieldWriter>();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION);
@@ -71,13 +70,9 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) {
- SimpleFieldWriter writer = new SimpleFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) {
+ //System.out.println("FGW: addField=" + field.name);
+ SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -119,44 +114,19 @@ public class FixedGapTermsIndexWriter ex
private final BytesRef lastTerm = new BytesRef();
- SimpleFieldWriter(FieldInfo fieldInfo) {
+ SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
- termsStart = lastTermsPointer = termsOut.getFilePointer();
+ termsStart = lastTermsPointer = termsFilePointer;
termLengths = new short[0];
termsPointerDeltas = new int[0];
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
// First term is first indexed term:
+ //System.out.println("FGW: checkIndexTerm text=" + text.utf8ToString());
if (0 == (numTerms++ % termIndexInterval)) {
-
- final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
-
- // write only the min prefix that shows the diff
- // against prior term
- out.writeBytes(text.bytes, text.offset, indexedTermLength);
-
- if (termLengths.length == numIndexTerms) {
- termLengths = ArrayUtil.grow(termLengths);
- }
- if (termsPointerDeltas.length == numIndexTerms) {
- termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
- }
-
- // save delta terms pointer
- final long fp = termsOut.getFilePointer();
- termsPointerDeltas[numIndexTerms] = (int) (fp - lastTermsPointer);
- lastTermsPointer = fp;
-
- // save term length (in bytes)
- assert indexedTermLength <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) indexedTermLength;
- totTermLength += indexedTermLength;
-
- lastTerm.copy(text);
- numIndexTerms++;
return true;
} else {
if (0 == numTerms % termIndexInterval) {
@@ -169,13 +139,41 @@ public class FixedGapTermsIndexWriter ex
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int indexedTermLength = indexedTermPrefixLength(lastTerm, text);
+ //System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" + termsFilePointer);
+
+ // write only the min prefix that shows the diff
+ // against prior term
+ out.writeBytes(text.bytes, text.offset, indexedTermLength);
+
+ if (termLengths.length == numIndexTerms) {
+ termLengths = ArrayUtil.grow(termLengths);
+ }
+ if (termsPointerDeltas.length == numIndexTerms) {
+ termsPointerDeltas = ArrayUtil.grow(termsPointerDeltas);
+ }
+
+ // save delta terms pointer
+ termsPointerDeltas[numIndexTerms] = (int) (termsFilePointer - lastTermsPointer);
+ lastTermsPointer = termsFilePointer;
+
+ // save term length (in bytes)
+ assert indexedTermLength <= Short.MAX_VALUE;
+ termLengths[numIndexTerms] = (short) indexedTermLength;
+ totTermLength += indexedTermLength;
+
+ lastTerm.copy(text);
+ numIndexTerms++;
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
// write primary terms dict offsets
packedIndexStart = out.getFilePointer();
- final long maxValue = termsOut.getFilePointer();
- PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(maxValue));
+ PackedInts.Writer w = PackedInts.getWriter(out, numIndexTerms, PackedInts.bitsRequired(termsFilePointer));
// relative to our indexStart
long upto = 0;
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MergeState.java Wed Feb 9 09:35:27 2011
@@ -17,13 +17,16 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
+import java.util.List;
+
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.PayloadProcessorProvider.DirPayloadProcessor;
import org.apache.lucene.index.PayloadProcessorProvider.PayloadProcessor;
+import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
-import java.util.List;
/** Holds common state used during segment merging
*
@@ -37,6 +40,7 @@ public class MergeState {
public int[] docBase; // New docID base per reader
public int mergedDocCount; // Total # merged docs
public Bits multiDeletedDocs;
+ public CheckAbort checkAbort;
// Updated per field;
public FieldInfo fieldInfo;
@@ -45,5 +49,30 @@ public class MergeState {
public boolean hasPayloadProcessorProvider;
public DirPayloadProcessor[] dirPayloadProcessor;
public PayloadProcessor[] currentPayloadProcessor;
-
+
+ public static class CheckAbort {
+ private double workCount;
+ private MergePolicy.OneMerge merge;
+ private Directory dir;
+ public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
+ this.merge = merge;
+ this.dir = dir;
+ }
+
+ /**
+ * Records the fact that roughly units amount of work
+ * have been done since this method was last called.
+ * When adding time-consuming code into SegmentMerger,
+ * you should test different values for units to ensure
+ * that the time in between calls to merge.checkAborted
+ * is up to ~ 1 second.
+ */
+ public void work(double units) throws MergePolicy.MergeAbortedException {
+ workCount += units;
+ if (workCount >= 10000.0) {
+ merge.checkAborted(dir);
+ workCount = 0;
+ }
+ }
+ }
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/MultiLevelSkipListReader.java Wed Feb 9 09:35:27 2011
@@ -172,6 +172,8 @@ public abstract class MultiLevelSkipList
public void init(long skipPointer, int df) {
this.skipPointer[0] = skipPointer;
this.docCount = df;
+ assert skipPointer >= 0 && skipPointer <= skipStream[0].length()
+ : "invalid skip pointer: " + skipPointer + ", length=" + skipStream[0].length();
Arrays.fill(skipDoc, 0);
Arrays.fill(numSkipped, 0);
Arrays.fill(childPointer, 0);
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java Wed Feb 9 09:35:27 2011
@@ -30,9 +30,9 @@ import org.apache.lucene.util.BytesRef;
public abstract class PostingsConsumer {
- /** Adds a new doc in this term. Return null if this
- * consumer doesn't need to see the positions for this
- * doc. */
+ /** Adds a new doc in this term. If this field omits term
+ * freqs & positions then termDocFreq should be ignored,
+ * and, finishDoc will not be called. */
public abstract void startDoc(int docID, int termDocFreq) throws IOException;
public static class PostingsMergeState {
@@ -49,14 +49,16 @@ public abstract class PostingsConsumer {
public abstract void addPosition(int position, BytesRef payload) throws IOException;
/** Called when we are done adding positions & payloads
- * for each doc */
+ * for each doc. Not called when the field omits term
+ * freq and positions. */
public abstract void finishDoc() throws IOException;
/** Default merge impl: append documents, mapping around
* deletes */
- public int merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
+ public TermStats merge(final MergeState mergeState, final DocsEnum postings) throws IOException {
int df = 0;
+ long totTF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
while(true) {
@@ -67,6 +69,7 @@ public abstract class PostingsConsumer {
this.startDoc(doc, postings.freq());
this.finishDoc();
df++;
+ totTF++;
}
} else {
final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings;
@@ -77,6 +80,7 @@ public abstract class PostingsConsumer {
}
final int freq = postingsEnum.freq();
this.startDoc(doc, freq);
+ totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
@@ -91,6 +95,6 @@ public abstract class PostingsConsumer {
df++;
}
}
- return df;
+ return new TermStats(df, totTF);
}
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java Wed Feb 9 09:35:27 2011
@@ -28,12 +28,12 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; // javadocs
-/** PrefixCodedTermsReader interacts with a single instance
- * of this to manage creation of {@link DocsEnum} and
+/** BlockTermsReader interacts with a single instance
+ * of this class to manage creation of {@link DocsEnum} and
* {@link DocsAndPositionsEnum} instances. It provides an
* IndexInput (termsIn) where this class may read any
* previously stored data that it had written in its
- * corresponding {@link StandardPostingsWriter} at indexing
+ * corresponding {@link PostingsWriterBase} at indexing
* time.
* @lucene.experimental */
@@ -42,17 +42,23 @@ public abstract class PostingsReaderBase
public abstract void init(IndexInput termsIn) throws IOException;
/** Return a newly created empty TermState */
- public abstract TermState newTermState() throws IOException;
+ public abstract BlockTermState newTermState() throws IOException;
- public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException;
+ /** Actually decode metadata for next term */
+ public abstract void nextTerm(FieldInfo fieldInfo, BlockTermState state) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
+ public abstract DocsEnum docs(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract void close() throws IOException;
+
+ /** Reads data for all terms in the next block; this
+ * method should merely load the byte[] blob but not
+ * decode, which is done in {@link #nextTerm}. */
+ public abstract void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState termState) throws IOException;
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/PostingsWriterBase.java Wed Feb 9 09:35:27 2011
@@ -33,8 +33,10 @@ public abstract class PostingsWriterBase
public abstract void startTerm() throws IOException;
+ public abstract void flushTermsBlock() throws IOException;
+
/** Finishes the current term */
- public abstract void finishTerm(int numDocs, boolean isIndexTerm) throws IOException;
+ public abstract void finishTerm(TermStats stats) throws IOException;
public abstract void setField(FieldInfo fieldInfo);
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java Wed Feb 9 09:35:27 2011
@@ -38,10 +38,10 @@ public abstract class TermsConsumer {
public abstract PostingsConsumer startTerm(BytesRef text) throws IOException;
/** Finishes the current term; numDocs must be > 0. */
- public abstract void finishTerm(BytesRef text, int numDocs) throws IOException;
+ public abstract void finishTerm(BytesRef text, TermStats stats) throws IOException;
/** Called when we are done adding terms to this field */
- public abstract void finish() throws IOException;
+ public abstract void finish(long sumTotalTermFreq) throws IOException;
/** Return the BytesRef Comparator used to sort terms
* before feeding to this API. */
@@ -55,6 +55,8 @@ public abstract class TermsConsumer {
BytesRef term;
assert termsEnum != null;
+ long sumTotalTermFreq = 0;
+ long sumDF = 0;
if (mergeState.fieldInfo.omitTermFreqAndPositions) {
if (docsEnum == null) {
@@ -69,9 +71,14 @@ public abstract class TermsConsumer {
if (docsEnumIn != null) {
docsEnum.reset(docsEnumIn);
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, docsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, docsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumDF += stats.docFreq;
+ if (sumDF > 60000) {
+ mergeState.checkAbort.work(sumDF/5.0);
+ sumDF = 0;
+ }
}
}
}
@@ -94,14 +101,20 @@ public abstract class TermsConsumer {
}
}
final PostingsConsumer postingsConsumer = startTerm(term);
- final int numDocs = postingsConsumer.merge(mergeState, postingsEnum);
- if (numDocs > 0) {
- finishTerm(term, numDocs);
+ final TermStats stats = postingsConsumer.merge(mergeState, postingsEnum);
+ if (stats.docFreq > 0) {
+ finishTerm(term, stats);
+ sumTotalTermFreq += stats.totalTermFreq;
+ sumDF += stats.docFreq;
+ if (sumDF > 60000) {
+ mergeState.checkAbort.work(sumDF/5.0);
+ sumDF = 0;
+ }
}
}
}
}
- finish();
+ finish(sumTotalTermFreq);
}
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexWriterBase.java Wed Feb 9 09:35:27 2011
@@ -17,7 +17,6 @@ package org.apache.lucene.index.codecs;
* limitations under the License.
*/
-import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
@@ -25,14 +24,13 @@ import java.io.IOException;
/** @lucene.experimental */
public abstract class TermsIndexWriterBase {
- public abstract void setTermsOutput(IndexOutput out);
-
public abstract class FieldWriter {
- public abstract boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException;
- public abstract void finish() throws IOException;
+ public abstract boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException;
+ public abstract void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException;
+ public abstract void finish(long termsFilePointer) throws IOException;
}
- public abstract FieldWriter addField(FieldInfo fieldInfo) throws IOException;
+ public abstract FieldWriter addField(FieldInfo fieldInfo, long termsFilePointer) throws IOException;
public abstract void close() throws IOException;
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexReader.java Wed Feb 9 09:35:27 2011
@@ -164,9 +164,6 @@ public class VariableGapTermsIndexReader
this.fieldInfo = fieldInfo;
this.indexStart = indexStart;
- // We still create the indexReader when indexDivisor
- // is -1, so that PrefixCodedTermsReader can call
- // isIndexTerm for each field:
if (indexDivisor > 0) {
loadTermsIndex();
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/VariableGapTermsIndexWriter.java Wed Feb 9 09:35:27 2011
@@ -52,14 +52,14 @@ public class VariableGapTermsIndexWriter
private final List<FSTFieldWriter> fields = new ArrayList<FSTFieldWriter>();
private final FieldInfos fieldInfos; // unread
- private IndexOutput termsOut;
private final IndexTermSelector policy;
/** @lucene.experimental */
public static abstract class IndexTermSelector {
// Called sequentially on every term being written,
// returning true if this term should be indexed
- public abstract boolean isIndexTerm(BytesRef term, int docFreq);
+ public abstract boolean isIndexTerm(BytesRef term, TermStats stats);
+ public abstract void newField(FieldInfo fieldInfo);
}
/** Same policy as {@link FixedGapTermsIndexWriter} */
@@ -74,15 +74,20 @@ public class VariableGapTermsIndexWriter
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
if (count >= interval) {
- count = 0;
+ count = 1;
return true;
} else {
count++;
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
/** Sets an index term when docFreq >= docFreqThresh, or
@@ -96,18 +101,26 @@ public class VariableGapTermsIndexWriter
public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
this.interval = interval;
this.docFreqThresh = docFreqThresh;
+
+ // First term is first indexed term:
+ count = interval;
}
@Override
- public boolean isIndexTerm(BytesRef term, int docFreq) {
- if (docFreq >= docFreqThresh || count >= interval) {
- count = 0;
+ public boolean isIndexTerm(BytesRef term, TermStats stats) {
+ if (stats.docFreq >= docFreqThresh || count >= interval) {
+ count = 1;
return true;
} else {
count++;
return false;
}
}
+
+ @Override
+ public void newField(FieldInfo fieldInfo) {
+ count = interval;
+ }
}
// TODO: it'd be nice to let the FST builder prune based
@@ -158,14 +171,10 @@ public class VariableGapTermsIndexWriter
}
@Override
- public void setTermsOutput(IndexOutput termsOut) {
- this.termsOut = termsOut;
- }
-
- @Override
- public FieldWriter addField(FieldInfo field) throws IOException {
- //System.out.println("VGW: field=" + field.name);
- FSTFieldWriter writer = new FSTFieldWriter(field);
+ public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
+ ////System.out.println("VGW: field=" + field.name);
+ policy.newField(field);
+ FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
fields.add(writer);
return writer;
}
@@ -200,42 +209,48 @@ public class VariableGapTermsIndexWriter
private final BytesRef lastTerm = new BytesRef();
private boolean first = true;
- public FSTFieldWriter(FieldInfo fieldInfo) throws IOException {
+ public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
this.fieldInfo = fieldInfo;
fstOutputs = PositiveIntOutputs.getSingleton(true);
fstBuilder = new Builder<Long>(FST.INPUT_TYPE.BYTE1,
0, 0, true,
fstOutputs);
indexStart = out.getFilePointer();
- //System.out.println("VGW: field=" + fieldInfo.name);
+ ////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
- fstBuilder.add(new BytesRef(), fstOutputs.get(termsOut.getFilePointer()));
+ fstBuilder.add(new BytesRef(), fstOutputs.get(termsFilePointer));
}
@Override
- public boolean checkIndexTerm(BytesRef text, int docFreq) throws IOException {
- if (policy.isIndexTerm(text, docFreq) || first) {
+ public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
+ //System.out.println("VGW: index term=" + text.utf8ToString());
+ // NOTE: we must force the first term per field to be
+ // indexed, in case policy doesn't:
+ if (policy.isIndexTerm(text, stats) || first) {
first = false;
- //System.out.println("VGW: index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
- final int lengthSave = text.length;
- text.length = indexedTermPrefixLength(lastTerm, text);
- try {
- fstBuilder.add(text, fstOutputs.get(termsOut.getFilePointer()));
- } finally {
- text.length = lengthSave;
- }
- lastTerm.copy(text);
+ //System.out.println(" YES");
return true;
} else {
- //System.out.println("VGW: not index term=" + text.utf8ToString() + " fp=" + termsOut.getFilePointer());
lastTerm.copy(text);
return false;
}
}
@Override
- public void finish() throws IOException {
+ public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
+ final int lengthSave = text.length;
+ text.length = indexedTermPrefixLength(lastTerm, text);
+ try {
+ fstBuilder.add(text, fstOutputs.get(termsFilePointer));
+ } finally {
+ text.length = lengthSave;
+ }
+ lastTerm.copy(text);
+ }
+
+ @Override
+ public void finish(long termsFilePointer) throws IOException {
fst = fstBuilder.finish();
if (fst != null) {
fst.save(out);
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexInput.java Wed Feb 9 09:35:27 2011
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.i
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -149,7 +150,7 @@ public abstract class FixedIntBlockIndex
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readVInt();
@@ -205,5 +206,10 @@ public abstract class FixedIntBlockIndex
other.upto = upto;
return other;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/FixedIntBlockIndexOutput.java Wed Feb 9 09:35:27 2011
@@ -111,6 +111,11 @@ public abstract class FixedIntBlockIndex
lastUpto = upto;
lastFP = fp;
}
+
+ @Override
+ public String toString() {
+ return "fp=" + fp + " upto=" + upto;
+ }
}
@Override
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/intblock/VariableIntBlockIndexInput.java Wed Feb 9 09:35:27 2011
@@ -24,6 +24,7 @@ package org.apache.lucene.index.codecs.i
import java.io.IOException;
import org.apache.lucene.index.codecs.sep.IntIndexInput;
+import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IntsRef;
@@ -168,7 +169,7 @@ public abstract class VariableIntBlockIn
private int upto;
@Override
- public void read(final IndexInput indexIn, final boolean absolute) throws IOException {
+ public void read(final DataInput indexIn, final boolean absolute) throws IOException {
if (absolute) {
fp = indexIn.readVLong();
upto = indexIn.readByte()&0xFF;
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java Wed Feb 9 09:35:27 2011
@@ -269,6 +269,11 @@ public class PreFlexFields extends Field
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
+
+ @Override
+ public long getSumTotalTermFreq() {
+ return -1;
+ }
}
private class PreTermsEnum extends TermsEnum {
@@ -540,7 +545,7 @@ public class PreFlexFields extends Field
// We can easily detect S in UTF8: if a byte has
// prefix 11110 (0xf0), then that byte and the
// following 3 bytes encode a single unicode codepoint
- // in S. Similary,we can detect E: if a byte has
+ // in S. Similarly, we can detect E: if a byte has
// prefix 1110111 (0xee), then that byte and the
// following 2 bytes encode a single unicode codepoint
// in E.
@@ -749,11 +754,6 @@ public class PreFlexFields extends Field
}
@Override
- public void cacheCurrentTerm() throws IOException {
- getTermsDict().cacheCurrentTerm(termEnum);
- }
-
- @Override
public SeekStatus seek(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@@ -950,6 +950,11 @@ public class PreFlexFields extends Field
}
@Override
+ public long totalTermFreq() {
+ return -1;
+ }
+
+ @Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof PreDocsEnum)) {
@@ -982,7 +987,7 @@ public class PreFlexFields extends Field
private final class PreDocsEnum extends DocsEnum {
final private SegmentTermDocs docs;
-
+ private int docID = -1;
PreDocsEnum() throws IOException {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
@@ -1000,18 +1005,18 @@ public class PreFlexFields extends Field
@Override
public int nextDoc() throws IOException {
if (docs.next()) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (docs.skipTo(target)) {
- return docs.doc();
+ return docID = docs.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1022,7 +1027,7 @@ public class PreFlexFields extends Field
@Override
public int docID() {
- return docs.doc();
+ return docID;
}
@Override
@@ -1038,7 +1043,7 @@ public class PreFlexFields extends Field
private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
final private SegmentTermPositions pos;
-
+ private int docID = -1;
PreDocsAndPositionsEnum() throws IOException {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
@@ -1056,18 +1061,18 @@ public class PreFlexFields extends Field
@Override
public int nextDoc() throws IOException {
if (pos.next()) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (pos.skipTo(target)) {
- return pos.doc();
+ return docID = pos.doc();
} else {
- return NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
@@ -1078,16 +1083,18 @@ public class PreFlexFields extends Field
@Override
public int docID() {
- return pos.doc();
+ return docID;
}
@Override
public int nextPosition() throws IOException {
+ assert docID != NO_MORE_DOCS;
return pos.nextPosition();
}
@Override
public boolean hasPayload() {
+ assert docID != NO_MORE_DOCS;
return pos.isPayloadAvailable();
}
Modified: lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java?rev=1068809&r1=1068808&r2=1068809&view=diff
==============================================================================
--- lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (original)
+++ lucene/dev/branches/docvalues/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java Wed Feb 9 09:35:27 2011
@@ -45,7 +45,7 @@ public final class SegmentTermEnum imple
// whenever you add a new format, make it 1 smaller (negative version logic)!
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
- // when removing support for old versions, levae the last supported version here
+ // when removing support for old versions, leave the last supported version here
public static final int FORMAT_MINIMUM = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
private TermBuffer termBuffer = new TermBuffer();