Posted to commits@lucene.apache.org by bu...@apache.org on 2011/01/13 20:53:39 UTC
svn commit: r1058718 [4/18] - in /lucene/dev/branches/realtime_search: ./
lucene/ lucene/contrib/ lucene/contrib/ant/src/java/org/apache/lucene/ant/
lucene/contrib/ant/src/test/org/apache/lucene/ant/
lucene/contrib/benchmark/ lucene/contrib/demo/src/ja...
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiReader.java Thu Jan 13 19:53:21 2011
@@ -18,14 +18,11 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.Collection;
-import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@@ -35,10 +32,9 @@ import org.apache.lucene.util.ReaderUtil
* their content. */
public class MultiReader extends IndexReader implements Cloneable {
protected IndexReader[] subReaders;
+ private final ReaderContext topLevelContext;
private int[] starts; // 1st docno for each segment
- private final Map<IndexReader,ReaderUtil.Slice> subReaderToSlice = new HashMap<IndexReader,ReaderUtil.Slice>();
private boolean[] decrefOnClose; // remember which subreaders to decRef on close
- private Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
private int maxDoc = 0;
private int numDocs = -1;
private boolean hasDeletions = false;
@@ -51,7 +47,7 @@ public class MultiReader extends IndexRe
* @param subReaders set of (sub)readers
*/
public MultiReader(IndexReader... subReaders) throws IOException {
- initialize(subReaders, true);
+ topLevelContext = initialize(subReaders, true);
}
/**
@@ -63,14 +59,13 @@ public class MultiReader extends IndexRe
* @param subReaders set of (sub)readers
*/
public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
- initialize(subReaders, closeSubReaders);
+ topLevelContext = initialize(subReaders, closeSubReaders);
}
- private void initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
+ private ReaderContext initialize(IndexReader[] subReaders, boolean closeSubReaders) throws IOException {
this.subReaders = subReaders.clone();
starts = new int[subReaders.length + 1]; // build starts array
decrefOnClose = new boolean[subReaders.length];
-
for (int i = 0; i < subReaders.length; i++) {
starts[i] = maxDoc;
maxDoc += subReaders[i].maxDoc(); // compute maxDocs
@@ -85,14 +80,9 @@ public class MultiReader extends IndexRe
if (subReaders[i].hasDeletions()) {
hasDeletions = true;
}
-
- final ReaderUtil.Slice slice = new ReaderUtil.Slice(starts[i],
- subReaders[i].maxDoc(),
- i);
- subReaderToSlice.put(subReaders[i], slice);
}
-
starts[subReaders.length] = maxDoc;
+ return ReaderUtil.buildReaderContext(this);
}
@Override
@@ -101,11 +91,6 @@ public class MultiReader extends IndexRe
}
@Override
- public int getSubReaderDocBase(IndexReader subReader) {
- return subReaderToSlice.get(subReader).start;
- }
-
- @Override
public Fields fields() throws IOException {
throw new UnsupportedOperationException("please use MultiFields.getFields, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level Fields");
}
@@ -316,45 +301,12 @@ public class MultiReader extends IndexRe
@Override
public synchronized byte[] norms(String field) throws IOException {
- ensureOpen();
- byte[] bytes = normsCache.get(field);
- if (bytes != null)
- return bytes; // cache hit
- if (!hasNorms(field))
- return null;
-
- bytes = new byte[maxDoc()];
- for (int i = 0; i < subReaders.length; i++)
- subReaders[i].norms(field, bytes, starts[i]);
- normsCache.put(field, bytes); // update cache
- return bytes;
- }
-
- @Override
- public synchronized void norms(String field, byte[] result, int offset)
- throws IOException {
- ensureOpen();
- byte[] bytes = normsCache.get(field);
- for (int i = 0; i < subReaders.length; i++) // read from segments
- subReaders[i].norms(field, result, offset + starts[i]);
-
- if (bytes==null && !hasNorms(field)) {
- Arrays.fill(result, offset, result.length, Similarity.getDefault().encodeNormValue(1.0f));
- } else if (bytes != null) { // cache hit
- System.arraycopy(bytes, 0, result, offset, maxDoc());
- } else {
- for (int i = 0; i < subReaders.length; i++) { // read from segments
- subReaders[i].norms(field, result, offset + starts[i]);
- }
- }
+ throw new UnsupportedOperationException("please use MultiNorms.norms, or wrap your IndexReader with SlowMultiReaderWrapper, if you really need a top level norms");
}
@Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
- synchronized (normsCache) {
- normsCache.remove(field); // clear cache
- }
int i = readerIndex(n); // find segment num
subReaders[i].setNorm(n-starts[i], field, value); // dispatch
}
@@ -433,4 +385,8 @@ public class MultiReader extends IndexRe
public IndexReader[] getSequentialSubReaders() {
return subReaders;
}
+
+ public ReaderContext getTopReaderContext() {
+ return topLevelContext;
+ }
}
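Not part of this commit: a minimal sketch of how a caller might use the replacements named in the new UnsupportedOperationException messages (MultiNorms and SlowMultiReaderWrapper). How the sub-readers are obtained and the field name "body" are illustrative.

import java.io.IOException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.MultiNorms;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.SlowMultiReaderWrapper;

public class TopLevelNormsExample {
  static void topLevelViews(IndexReader r1, IndexReader r2) throws IOException {
    IndexReader multi = new MultiReader(r1, r2);

    // Compute top-level norms on demand; MultiReader no longer keeps a norms cache.
    byte[] norms = MultiNorms.norms(multi, "body");

    // Top-level Fields view over the composite reader:
    Fields fields = MultiFields.getFields(multi);

    // Or wrap the composite so it behaves like a single atomic reader;
    // the wrapper caches norms per field (see SlowMultiReaderWrapper below).
    IndexReader atomic = new SlowMultiReaderWrapper(multi);
    byte[] cached = atomic.norms("body");
  }
}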
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java Thu Jan 13 19:53:21 2011
@@ -91,13 +91,6 @@ public final class MultiTermsEnum extend
}
@Override
- public void cacheCurrentTerm() throws IOException {
- for(int i=0;i<numTop;i++) {
- top[i].terms.cacheCurrentTerm();
- }
- }
-
- @Override
public Comparator<BytesRef> getComparator() {
return termComp;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriter.java Thu Jan 13 19:53:21 2011
@@ -21,7 +21,6 @@ import java.io.IOException;
import java.util.Collection;
import java.util.Map;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.IndexOutput;
// TODO FI: norms could actually be stored as doc store
@@ -33,7 +32,6 @@ import org.apache.lucene.store.IndexOutp
final class NormsWriter extends InvertedDocEndConsumer {
- private static final byte defaultNorm = Similarity.getDefault().encodeNormValue(1.0f);
private FieldInfos fieldInfos;
@Override
@@ -51,9 +49,11 @@ final class NormsWriter extends Inverted
* not disabled */
@Override
public void flush(Map<FieldInfo,InvertedDocEndConsumerPerField> fieldsToFlush, SegmentWriteState state) throws IOException {
+ if (!fieldInfos.hasNorms()) {
+ return;
+ }
final String normsFileName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.NORMS_EXTENSION);
- state.flushedFiles.add(normsFileName);
IndexOutput normsOut = state.directory.createOutput(normsFileName);
try {
@@ -79,7 +79,7 @@ final class NormsWriter extends Inverted
normsOut.writeByte(toWrite.norms[upto]);
upto++;
} else {
- normsOut.writeByte(defaultNorm);
+ normsOut.writeByte((byte) 0);
}
}
@@ -91,7 +91,7 @@ final class NormsWriter extends Inverted
normCount++;
// Fill entire field with default norm:
for(;upto<state.numDocs;upto++)
- normsOut.writeByte(defaultNorm);
+ normsOut.writeByte((byte) 0);
}
assert 4+normCount*state.numDocs == normsOut.getFilePointer() : ".nrm file size mismatch: expected=" + (4+normCount*state.numDocs) + " actual=" + normsOut.getFilePointer();
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/NormsWriterPerField.java Thu Jan 13 19:53:21 2011
@@ -18,7 +18,6 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.search.Similarity;
/** Taps into DocInverter, as an InvertedDocEndConsumer,
* which is called at the end of inverting each field. We
@@ -71,7 +70,7 @@ final class NormsWriterPerField extends
norms = ArrayUtil.grow(norms, 1+upto);
}
final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);
- norms[upto] = Similarity.getDefault().encodeNormValue(norm);
+ norms[upto] = docState.similarity.encodeNormValue(norm);
docIDs[upto] = docState.docID;
upto++;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/ParallelReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/ParallelReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/ParallelReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/ParallelReader.java Thu Jan 13 19:53:21 2011
@@ -53,7 +53,8 @@ public class ParallelReader extends Inde
private SortedMap<String,IndexReader> fieldToReader = new TreeMap<String,IndexReader>();
private Map<IndexReader,Collection<String>> readerToFields = new HashMap<IndexReader,Collection<String>>();
private List<IndexReader> storedFieldReaders = new ArrayList<IndexReader>();
-
+ private Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
+ private final ReaderContext topLevelReaderContext = new AtomicReaderContext(this);
private int maxDoc;
private int numDocs;
private boolean hasDeletions;
@@ -88,7 +89,7 @@ public class ParallelReader extends Inde
buffer.append(')');
return buffer.toString();
}
-
+
/** Add an IndexReader.
* @throws IOException if there is a low-level IO error
*/
@@ -141,6 +142,9 @@ public class ParallelReader extends Inde
reader.incRef();
}
decrefOnClose.add(Boolean.valueOf(incRefReaders));
+ synchronized(normsCache) {
+ normsCache.clear(); // TODO: don't need to clear this for all fields really?
+ }
}
private class ParallelFieldsEnum extends FieldsEnum {
@@ -278,6 +282,7 @@ public class ParallelReader extends Inde
if (reopened) {
List<Boolean> newDecrefOnClose = new ArrayList<Boolean>();
+ // TODO: maybe add a special reopen-ctor for norm-copying?
ParallelReader pr = new ParallelReader();
for (int i = 0; i < readers.size(); i++) {
IndexReader oldReader = readers.get(i);
@@ -419,27 +424,36 @@ public class ParallelReader extends Inde
}
@Override
- public byte[] norms(String field) throws IOException {
+ public synchronized byte[] norms(String field) throws IOException {
ensureOpen();
IndexReader reader = fieldToReader.get(field);
- return reader==null ? null : reader.norms(field);
- }
- @Override
- public void norms(String field, byte[] result, int offset)
- throws IOException {
- ensureOpen();
- IndexReader reader = fieldToReader.get(field);
- if (reader!=null)
- reader.norms(field, result, offset);
+ if (reader==null)
+ return null;
+
+ byte[] bytes = normsCache.get(field);
+ if (bytes != null)
+ return bytes;
+ if (!hasNorms(field))
+ return null;
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
+
+ bytes = MultiNorms.norms(reader, field);
+ normsCache.put(field, bytes);
+ return bytes;
}
@Override
protected void doSetNorm(int n, String field, byte value)
throws CorruptIndexException, IOException {
IndexReader reader = fieldToReader.get(field);
- if (reader!=null)
+ if (reader!=null) {
+ synchronized(normsCache) {
+ normsCache.remove(field);
+ }
reader.doSetNorm(n, field, value);
+ }
}
@Override
@@ -529,6 +543,11 @@ public class ParallelReader extends Inde
}
return fieldSet;
}
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return topLevelReaderContext;
+ }
+
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentInfo.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentInfo.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentInfo.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentInfo.java Thu Jan 13 19:53:21 2011
@@ -68,7 +68,8 @@ public final class SegmentInfo {
private List<String> files; // cached list of files that this segment uses
// in the Directory
- long sizeInBytes = -1; // total byte size of all of our files (computed on demand)
+ private long sizeInBytesNoStore = -1; // total byte size of all but the store files (computed on demand)
+ private long sizeInBytesWithStore = -1; // total byte size of all of our files (computed on demand)
@Deprecated private int docStoreOffset; // if this segment shares stored fields & vectors, this
// offset is where in that file this segment's docs begin
@@ -220,25 +221,33 @@ public final class SegmentInfo {
}
}
- /** Returns total size in bytes of all of files used by
- * this segment. */
+ /**
+ * Returns total size in bytes of all of files used by this segment (if
+ * {@code includeDocStores} is true), or the size of all files except the
+ * store files otherwise.
+ */
public long sizeInBytes(boolean includeDocStores) throws IOException {
- if (sizeInBytes == -1) {
- List<String> files = files();
- final int size = files.size();
- sizeInBytes = 0;
- for(int i=0;i<size;i++) {
- final String fileName = files.get(i);
- if (!includeDocStores && IndexFileNames.isDocStoreFile(fileName)) {
+ if (includeDocStores) {
+ if (sizeInBytesWithStore != -1) return sizeInBytesWithStore;
+ sizeInBytesWithStore = 0;
+ for (final String fileName : files()) {
+ // We don't count bytes used by a shared doc store against this segment
+ if (docStoreOffset == -1 || !IndexFileNames.isDocStoreFile(fileName)) {
+ sizeInBytesWithStore += dir.fileLength(fileName);
+ }
+ }
+ return sizeInBytesWithStore;
+ } else {
+ if (sizeInBytesNoStore != -1) return sizeInBytesNoStore;
+ sizeInBytesNoStore = 0;
+ for (final String fileName : files()) {
+ if (IndexFileNames.isDocStoreFile(fileName)) {
continue;
}
- // We don't count bytes used by a shared doc store
- // against this segment:
- if (docStoreOffset == -1 || !IndexFileNames.isDocStoreFile(fileName))
- sizeInBytes += dir.fileLength(fileName);
+ sizeInBytesNoStore += dir.fileLength(fileName);
}
+ return sizeInBytesNoStore;
}
- return sizeInBytes;
}
public boolean getHasVectors() throws IOException {
@@ -549,7 +558,8 @@ public final class SegmentInfo {
* files this segment has. */
private void clearFiles() {
files = null;
- sizeInBytes = -1;
+ sizeInBytesNoStore = -1;
+ sizeInBytesWithStore = -1;
}
/** {@inheritDoc} */
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentMerger.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentMerger.java Thu Jan 13 19:53:21 2011
@@ -19,9 +19,8 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collection;
-import java.util.Set;
-import java.util.HashSet;
import java.util.List;
import org.apache.lucene.document.Document;
@@ -127,41 +126,11 @@ final class SegmentMerger {
return mergedDocs;
}
- final Collection<String> getMergedFiles(final SegmentInfo info) throws IOException {
- Set<String> fileSet = new HashSet<String>();
-
- // Basic files
- for (String ext : IndexFileNames.COMPOUND_EXTENSIONS_NOT_CODEC) {
- fileSet.add(IndexFileNames.segmentFileName(segment, "", ext));
- }
-
- segmentWriteState.segmentCodecs.files(directory, info, fileSet);
-
- // Fieldable norm files
- int numFIs = fieldInfos.size();
- for (int i = 0; i < numFIs; i++) {
- FieldInfo fi = fieldInfos.fieldInfo(i);
- if (fi.isIndexed && !fi.omitNorms) {
- fileSet.add(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION));
- break;
- }
- }
-
- // Vector files
- if (fieldInfos.hasVectors()) {
- for (String ext : IndexFileNames.VECTOR_EXTENSIONS) {
- fileSet.add(IndexFileNames.segmentFileName(segment, "", ext));
- }
- }
-
- return fileSet;
- }
-
final Collection<String> createCompoundFile(String fileName, final SegmentInfo info)
throws IOException {
// Now merge all added files
- Collection<String> files = getMergedFiles(info);
+ Collection<String> files = info.files();
CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort);
for (String file : files) {
cfsWriter.addFile(file);
@@ -602,13 +571,6 @@ final class SegmentMerger {
}
private void mergeNorms() throws IOException {
- // get needed buffer size by finding the largest segment
- int bufferSize = 0;
- for (IndexReader reader : readers) {
- bufferSize = Math.max(bufferSize, reader.maxDoc());
- }
-
- byte[] normBuffer = null;
IndexOutput output = null;
try {
for (int i = 0, numFieldInfos = fieldInfos.size(); i < numFieldInfos; i++) {
@@ -618,12 +580,15 @@ final class SegmentMerger {
output = directory.createOutput(IndexFileNames.segmentFileName(segment, "", IndexFileNames.NORMS_EXTENSION));
output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
}
- if (normBuffer == null) {
- normBuffer = new byte[bufferSize];
- }
for (IndexReader reader : readers) {
final int maxDoc = reader.maxDoc();
- reader.norms(fi.name, normBuffer, 0);
+ byte normBuffer[] = reader.norms(fi.name);
+ if (normBuffer == null) {
+ // Can be null if this segment doesn't have
+ // any docs with this field
+ normBuffer = new byte[maxDoc];
+ Arrays.fill(normBuffer, (byte)0);
+ }
if (!reader.hasDeletions()) {
//optimized case for segments without deleted docs
output.writeBytes(normBuffer, maxDoc);
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReadState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReadState.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReadState.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReadState.java Thu Jan 13 19:53:21 2011
@@ -33,7 +33,7 @@ public class SegmentReadState {
// terms index on init (preflex is the only one currently
// that must do so), then it should negate this value to
// get the app's terms divisor:
- public final int termsIndexDivisor;
+ public int termsIndexDivisor;
public final String codecId;
public SegmentReadState(Directory dir, SegmentInfo info,
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentReader.java Thu Jan 13 19:53:21 2011
@@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -31,7 +30,6 @@ import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -51,7 +49,7 @@ public class SegmentReader extends Index
private SegmentInfo si;
private int readBufferSize;
-
+ private final ReaderContext readerContext = new AtomicReaderContext(this);
CloseableThreadLocal<FieldsReader> fieldsReaderLocal = new FieldsReaderLocal();
CloseableThreadLocal<TermVectorsReader> termVectorsLocal = new CloseableThreadLocal<TermVectorsReader>();
@@ -991,22 +989,6 @@ public class SegmentReader extends Index
norm.copyOnWrite()[doc] = value; // set the value
}
- /** Read norms into a pre-allocated array. */
- @Override
- public synchronized void norms(String field, byte[] bytes, int offset)
- throws IOException {
-
- ensureOpen();
- Norm norm = norms.get(field);
- if (norm == null) {
- Arrays.fill(bytes, offset, bytes.length, Similarity.getDefault().encodeNormValue(1.0f));
- return;
- }
-
- norm.bytes(bytes, offset, maxDoc());
- }
-
-
private void openNorms(Directory cfsDir, int readBufferSize) throws IOException {
long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
int maxDoc = maxDoc();
@@ -1183,6 +1165,11 @@ public class SegmentReader extends Index
buffer.append(si.toString(core.dir, pendingDeleteCount));
return buffer.toString();
}
+
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
+ }
/**
* Return the name of the segment this reader is reading.
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SegmentWriteState.java Thu Jan 13 19:53:21 2011
@@ -18,8 +18,6 @@ package org.apache.lucene.index;
*/
import java.io.PrintStream;
-import java.util.Collection;
-import java.util.HashSet;
import org.apache.lucene.store.Directory;
@@ -33,7 +31,6 @@ public class SegmentWriteState {
public final FieldInfos fieldInfos;
public final int numDocs;
public boolean hasVectors;
- public final Collection<String> flushedFiles;
final SegmentCodecs segmentCodecs;
public final String codecId;
@@ -43,7 +40,7 @@ public class SegmentWriteState {
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.*/
- public final int termIndexInterval;
+ public int termIndexInterval; // TODO: this should be private to the codec, not settable here or in IWC
/** Expert: The fraction of TermDocs entries stored in skip tables,
* used to accelerate {@link DocsEnum#advance(int)}. Larger values result in
@@ -68,7 +65,6 @@ public class SegmentWriteState {
this.numDocs = numDocs;
this.termIndexInterval = termIndexInterval;
this.segmentCodecs = segmentCodecs;
- flushedFiles = new HashSet<String>();
codecId = "";
}
@@ -83,7 +79,6 @@ public class SegmentWriteState {
numDocs = state.numDocs;
termIndexInterval = state.termIndexInterval;
segmentCodecs = state.segmentCodecs;
- flushedFiles = state.flushedFiles;
this.codecId = codecId;
}
}
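For reference, the term index interval kept on SegmentWriteState is normally configured through IndexWriterConfig rather than set here directly (per the new TODO it is expected to move into the codec). A hedged sketch under that assumption; the Version constant and interval value are illustrative.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

public class TermIndexIntervalExample {
  static IndexWriter openWriter(Directory dir, Analyzer analyzer) throws IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_40, analyzer);
    // Smaller interval: faster term dictionary lookups, larger in-memory terms index.
    conf.setTermIndexInterval(64);
    return new IndexWriter(dir, conf);
  }
}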
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/SlowMultiReaderWrapper.java Thu Jan 13 19:53:21 2011
@@ -18,6 +18,9 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ReaderUtil; // javadoc
@@ -48,8 +51,12 @@ import org.apache.lucene.index.MultiRead
public final class SlowMultiReaderWrapper extends FilterIndexReader {
+ private final ReaderContext readerContext;
+ private final Map<String,byte[]> normsCache = new HashMap<String,byte[]>();
+
public SlowMultiReaderWrapper(IndexReader other) {
super(other);
+ readerContext = new AtomicReaderContext(this); // emulate atomic reader!
}
@Override
@@ -62,9 +69,39 @@ public final class SlowMultiReaderWrappe
return MultiFields.getDeletedDocs(in);
}
+
@Override
public IndexReader[] getSequentialSubReaders() {
return null;
}
+
+ @Override
+ public synchronized byte[] norms(String field) throws IOException {
+ ensureOpen();
+ byte[] bytes = normsCache.get(field);
+ if (bytes != null)
+ return bytes;
+ if (!hasNorms(field))
+ return null;
+ if (normsCache.containsKey(field)) // cached omitNorms, not missing key
+ return null;
+
+ bytes = MultiNorms.norms(in, field);
+ normsCache.put(field, bytes);
+ return bytes;
+ }
+ @Override
+ public ReaderContext getTopReaderContext() {
+ return readerContext;
+ }
+
+ @Override
+ protected void doSetNorm(int n, String field, byte value)
+ throws CorruptIndexException, IOException {
+ synchronized(normsCache) {
+ normsCache.remove(field);
+ }
+ in.doSetNorm(n, field, value);
+ }
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/StoredFieldsWriter.java Thu Jan 13 19:53:21 2011
@@ -70,11 +70,7 @@ final class StoredFieldsWriter {
fieldsWriter = null;
lastDocID = 0;
- String fieldsName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELDS_EXTENSION);
String fieldsIdxName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELDS_INDEX_EXTENSION);
- state.flushedFiles.add(fieldsName);
- state.flushedFiles.add(fieldsIdxName);
-
if (4 + ((long) state.numDocs) * 8 != state.directory.fileLength(fieldsIdxName)) {
throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocs + " docs vs " + state.directory.fileLength(fieldsIdxName) + " length in bytes of " + fieldsIdxName + " file exists?=" + state.directory.fileExists(fieldsIdxName));
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java Thu Jan 13 19:53:21 2011
@@ -54,9 +54,6 @@ final class TermVectorsTermsWriter exten
fill(state.numDocs);
assert state.segmentName != null;
String idxName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.VECTORS_INDEX_EXTENSION);
- String fldName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.VECTORS_FIELDS_EXTENSION);
- String docName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-
tvx.close();
tvf.close();
tvd.close();
@@ -64,10 +61,6 @@ final class TermVectorsTermsWriter exten
if (4+((long) state.numDocs)*16 != state.directory.fileLength(idxName))
throw new RuntimeException("after flush: tvx size mismatch: " + state.numDocs + " docs vs " + state.directory.fileLength(idxName) + " length in bytes of " + idxName + " file exists?=" + state.directory.fileExists(idxName));
- state.flushedFiles.add(idxName);
- state.flushedFiles.add(fldName);
- state.flushedFiles.add(docName);
-
lastDocID = 0;
state.hasVectors = hasVectors;
hasVectors = false;
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/Terms.java Thu Jan 13 19:53:21 2011
@@ -80,11 +80,57 @@ public abstract class Terms {
}
}
+ /**
+ * Expert: Get {@link DocsEnum} for the specified {@link TermState}.
+ * This method may return <code>null</code> if the term does not exist.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsEnum docs(Bits skipDocs, BytesRef term, TermState termState, DocsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.docs(skipDocs, reuse);
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * Get {@link DocsEnum} for the specified {@link TermState}. This
+ * method may return <code>null</code> if the term does not exist, or positions were
+ * not indexed.
+ *
+ * @see TermsEnum#termState()
+ * @see TermsEnum#seek(BytesRef, TermState) */
+ public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, BytesRef term, TermState termState, DocsAndPositionsEnum reuse) throws IOException {
+ final TermsEnum termsEnum = getThreadTermsEnum();
+ if (termsEnum.seek(term, termState) == TermsEnum.SeekStatus.FOUND) {
+ return termsEnum.docsAndPositions(skipDocs, reuse);
+ } else {
+ return null;
+ }
+ }
+
public long getUniqueTermCount() throws IOException {
throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()");
}
- protected TermsEnum getThreadTermsEnum() throws IOException {
+ /**
+ * Returns a thread-private {@link TermsEnum} instance. Obtaining
+ * {@link TermsEnum} from this method might be more efficient than using
+ * {@link #iterator()} directly since this method doesn't necessarily create a
+ * new {@link TermsEnum} instance.
+ * <p>
+ * NOTE: {@link TermsEnum} instances obtained from this method must not be
+ * shared across threads. The enum should only be used within a local context
+ * where other threads can't access it.
+ *
+ * @return a thread-private {@link TermsEnum} instance
+ * @throws IOException
+ * if an IOException occurs
+ * @lucene.internal
+ */
+ public TermsEnum getThreadTermsEnum() throws IOException {
TermsEnum termsEnum = threadEnums.get();
if (termsEnum == null) {
termsEnum = iterator();
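Not part of this commit: a minimal sketch of how a caller might use the new expert Terms.docs variant with a TermState captured earlier from a TermsEnum over the same field. The null checks and method name are illustrative.

import java.io.IOException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermStateDocsExample {
  static DocsEnum docsFromState(Terms terms, BytesRef term) throws IOException {
    TermsEnum te = terms.iterator();
    if (te.seek(term) != TermsEnum.SeekStatus.FOUND) {
      return null;                        // term is not present in this reader
    }
    TermState state = te.termState();     // capture the enum's internal position
    // ... later, against the same reader and field, skip the dictionary lookup:
    return terms.docs(null, term, state, null);
  }
}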
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/TermsEnum.java Thu Jan 13 19:53:21 2011
@@ -73,7 +73,34 @@ public abstract class TermsEnum {
* may be before or after the current ord. See {@link
* #seek(BytesRef)}. */
public abstract SeekStatus seek(long ord) throws IOException;
-
+
+ /**
+ * Expert: Seeks a specific position by {@link TermState} previously obtained
+ * from {@link #termState()}. Callers should maintain the {@link TermState} to
+ * use this method. Low-level implementations may position the TermsEnum
+ * without re-seeking the term dictionary.
+ * <p>
+ * Seeking by {@link TermState} should only be used iff the enum the state was
+ * obtained from and the enum the state is used for seeking were obtained from
+ * the same {@link IndexReader}; otherwise a {@link #seek(BytesRef, TermState)} call can
+ * leave the enum in an undefined state.
+ * <p>
+ * NOTE: Using this method with an incompatible {@link TermState} might leave
+ * this {@link TermsEnum} in an undefined state. On a segment level
+ * {@link TermState} instances are compatible only iff the source and the
+ * target {@link TermsEnum} operate on the same field. If operating on segment
+ * level, TermState instances must not be used across segments.
+ * <p>
+ * NOTE: A seek by {@link TermState} might not restore the
+ * {@link AttributeSource}'s state. {@link AttributeSource} states must be
+ * maintained separately if this method is used.
+ * @param term the term the TermState corresponds to
+ * @param state the {@link TermState}
+ * */
+ public SeekStatus seek(BytesRef term, TermState state) throws IOException {
+ return seek(term);
+ }
+
/** Increments the enumeration to the next element.
* Returns the resulting term, or null if the end was
* hit. The returned BytesRef may be re-used across calls
@@ -98,7 +125,7 @@ public abstract class TermsEnum {
* first time, after next() returns null or seek returns
* {@link SeekStatus#END}.*/
public abstract int docFreq();
-
+
/** Get {@link DocsEnum} for the current term. Do not
* call this before calling {@link #next} or {@link
* #seek} for the first time. This method will not
@@ -116,6 +143,25 @@ public abstract class TermsEnum {
* the postings by this codec. */
public abstract DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ /**
+ * Expert: Returns the TermsEnum's internal state to position the TermsEnum
+ * without re-seeking the term dictionary.
+ * <p>
+ * NOTE: A seek by {@link TermState} might not capture the
+ * {@link AttributeSource}'s state. Callers must maintain the
+ * {@link AttributeSource} states separately.
+ *
+ * @see TermState
+ * @see #seek(BytesRef, TermState)
+ */
+ public TermState termState() throws IOException {
+ return new TermState() {
+ @Override
+ public void copyFrom(TermState other) {
+ }
+ };
+ }
+
/** Return the {@link BytesRef} Comparator used to sort
* terms provided by the iterator. This may return
* null if there are no terms. Callers may invoke this
@@ -123,10 +169,6 @@ public abstract class TermsEnum {
* instance & reuse it. */
public abstract Comparator<BytesRef> getComparator() throws IOException;
- /** Optional optimization hint: informs the codec that the
- * current term is likely to be re-seek'd-to soon. */
- public abstract void cacheCurrentTerm() throws IOException;
-
/** An empty TermsEnum for quickly returning an empty instance e.g.
* in {@link org.apache.lucene.search.MultiTermQuery}
* <p><em>Please note:</em> This enum should be unmodifiable,
@@ -142,9 +184,6 @@ public abstract class TermsEnum {
public SeekStatus seek(long ord) { return SeekStatus.END; }
@Override
- public void cacheCurrentTerm() {}
-
- @Override
public BytesRef term() {
throw new IllegalStateException("this method should never be called");
}
@@ -183,5 +222,15 @@ public abstract class TermsEnum {
public synchronized AttributeSource attributes() {
return super.attributes();
}
+
+ @Override
+ public TermState termState() throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
+
+ @Override
+ public SeekStatus seek(BytesRef term, TermState state) throws IOException {
+ throw new IllegalStateException("this method should never be called");
+ }
};
}
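Not part of this commit: a hedged sketch of the seek-by-state round trip added above, capturing termState() from one enum and repositioning a second enum obtained from the same reader and field. Method and variable names are illustrative.

import java.io.IOException;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermStateSeekExample {
  static int docFreqViaState(Terms terms, BytesRef term) throws IOException {
    TermsEnum first = terms.iterator();
    if (first.seek(term) != TermsEnum.SeekStatus.FOUND) {
      return 0;
    }
    TermState state = first.termState();  // remember where the term lives

    TermsEnum second = terms.iterator();  // must come from the same reader and field
    second.seek(term, state);             // repositions without re-seeking the dictionary
    return second.docFreq();
  }
}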
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesReader.java Thu Jan 13 19:53:21 2011
@@ -36,13 +36,17 @@ final class DeltaBytesReader {
term.copy(text);
}
- void read() throws IOException {
+ boolean read() throws IOException {
final int start = in.readVInt();
+ if (start == DeltaBytesWriter.TERM_EOF) {
+ return false;
+ }
final int suffix = in.readVInt();
assert start <= term.length: "start=" + start + " length=" + term.length;
final int newLength = start+suffix;
term.grow(newLength);
in.readBytes(term.bytes, start, suffix);
term.length = newLength;
+ return true;
}
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/DeltaBytesWriter.java Thu Jan 13 19:53:21 2011
@@ -20,11 +20,18 @@ package org.apache.lucene.index.codecs;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
import java.io.IOException;
final class DeltaBytesWriter {
+ // Must be bigger than
+ // DocumentsWriter.MAX_TERM_LENGTH_UTF8. If you change
+ // this it's an index format change, so that change must be
+ // versioned:
+ final static int TERM_EOF = BYTE_BLOCK_SIZE;
+
private byte[] lastBytes = new byte[10];
private int lastLength;
final IndexOutput out;
@@ -45,8 +52,9 @@ final class DeltaBytesWriter {
final int limit = length < lastLength ? length : lastLength;
while(start < limit) {
- if (bytes[upto] != lastBytes[start])
+ if (bytes[upto] != lastBytes[start]) {
break;
+ }
start++;
upto++;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexReader.java Thu Jan 13 19:53:21 2011
@@ -33,29 +33,6 @@ import java.util.Collection;
import java.util.Comparator;
import java.io.IOException;
-/**
- * Uses a simplistic format to record terms dict index
- * information. Limititations:
- *
- * - Index for all fields is loaded entirely into RAM up
- * front
- * - Index is stored in RAM using shared byte[] that
- * wastefully expand every term. Using FST to share
- * common prefix & suffix would save RAM.
- * - Index is taken at regular numTerms (every 128 by
- * default); might be better to do it by "net docFreqs"
- * encountered, so that for spans of low-freq terms we
- * take index less often.
- *
- * A better approach might be something similar to how
- * postings are encoded, w/ multi-level skips. Ie, load all
- * terms index data into memory, as a single large compactly
- * encoded stream (eg delta bytes + delta offset). Index
- * that w/ multi-level skipper. Then to look up a term is
- * the equivalent binary search, using the skipper instead,
- * while data remains compressed in memory.
- */
-
import org.apache.lucene.index.IndexFileNames;
/** @lucene.experimental */
@@ -74,7 +51,7 @@ public class FixedGapTermsIndexReader ex
final private int indexInterval;
// Closed if indexLoaded is true:
- final private IndexInput in;
+ private IndexInput in;
private volatile boolean indexLoaded;
private final Comparator<BytesRef> termComp;
@@ -85,7 +62,7 @@ public class FixedGapTermsIndexReader ex
private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
private PagedBytes.Reader termBytesReader;
- final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
+ final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<FieldInfo,FieldIndexData>();
// start of the field info data
protected long dirOffset;
@@ -95,7 +72,7 @@ public class FixedGapTermsIndexReader ex
this.termComp = termComp;
- IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
+ in = dir.openInput(IndexFileNames.segmentFileName(segment, codecId, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION));
boolean success = false;
@@ -116,49 +93,137 @@ public class FixedGapTermsIndexReader ex
seekDir(in, dirOffset);
// Read directory
- final int numFields = in.readInt();
-
+ final int numFields = in.readVInt();
for(int i=0;i<numFields;i++) {
- final int field = in.readInt();
- final int numIndexTerms = in.readInt();
- final long termsStart = in.readLong();
- final long indexStart = in.readLong();
- final long packedIndexStart = in.readLong();
- final long packedOffsetsStart = in.readLong();
+ final int field = in.readVInt();
+ final int numIndexTerms = in.readVInt();
+ final long termsStart = in.readVLong();
+ final long indexStart = in.readVLong();
+ final long packedIndexStart = in.readVLong();
+ final long packedOffsetsStart = in.readVLong();
assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
- if (numIndexTerms > 0) {
- final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
- }
+ final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ fields.put(fieldInfo, new FieldIndexData(fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
}
success = true;
} finally {
if (indexDivisor > 0) {
in.close();
- this.in = null;
+ in = null;
if (success) {
indexLoaded = true;
}
termBytesReader = termBytes.freeze(true);
- } else {
- this.in = in;
}
}
}
+ @Override
+ public int getDivisor() {
+ return indexDivisor;
+ }
+
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_START, FixedGapTermsIndexWriter.VERSION_START);
dirOffset = input.readLong();
}
- private final class FieldIndexReader extends FieldReader {
+ private class IndexEnum extends FieldIndexEnum {
+ private final FieldIndexData.CoreFieldIndex fieldIndex;
+ private final BytesRef term = new BytesRef();
+ private final BytesRef nextTerm = new BytesRef();
+ private long ord;
- final private FieldInfo fieldInfo;
+ public IndexEnum(FieldIndexData.CoreFieldIndex fieldIndex) {
+ this.fieldIndex = fieldIndex;
+ }
+
+ @Override
+ public BytesRef term() {
+ return term;
+ }
+
+ @Override
+ public long seek(BytesRef target) {
+ int lo = 0; // binary search
+ int hi = fieldIndex.numIndexTerms - 1;
+ assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
+
+ while (hi >= lo) {
+ int mid = (lo + hi) >>> 1;
+
+ final long offset = fieldIndex.termOffsets.get(mid);
+ final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+
+ int delta = termComp.compare(target, term);
+ if (delta < 0) {
+ hi = mid - 1;
+ } else if (delta > 0) {
+ lo = mid + 1;
+ } else {
+ assert mid >= 0;
+ ord = mid*totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
+ }
+ }
- private volatile CoreFieldIndex coreIndex;
+ if (hi < 0) {
+ assert hi == -1;
+ hi = 0;
+ }
- private final IndexInput in;
+ final long offset = fieldIndex.termOffsets.get(hi);
+ final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+
+ ord = hi*totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
+ }
+
+ @Override
+ public long next() {
+ final int idx = 1 + (int) (ord / totalIndexInterval);
+ if (idx >= fieldIndex.numIndexTerms) {
+ return -1;
+ }
+ ord += totalIndexInterval;
+
+ final long offset = fieldIndex.termOffsets.get(idx);
+ final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
+ termBytesReader.fillSlice(nextTerm, fieldIndex.termBytesStart + offset, length);
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
+ }
+
+ @Override
+ public long ord() {
+ return ord;
+ }
+
+ @Override
+ public long seek(long ord) {
+ int idx = (int) (ord / totalIndexInterval);
+ // caller must ensure ord is in bounds
+ assert idx < fieldIndex.numIndexTerms;
+ final long offset = fieldIndex.termOffsets.get(idx);
+ final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
+ termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
+ this.ord = idx * totalIndexInterval;
+ return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
+ }
+ }
+
+ @Override
+ public boolean supportsOrd() {
+ return true;
+ }
+
+ private final class FieldIndexData {
+
+ final private FieldInfo fieldInfo;
+
+ volatile CoreFieldIndex coreIndex;
private final long indexStart;
private final long termsStart;
@@ -167,11 +232,10 @@ public class FixedGapTermsIndexReader ex
private final int numIndexTerms;
- public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
- long packedOffsetsStart) throws IOException {
+ public FieldIndexData(FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
+ long packedOffsetsStart) throws IOException {
this.fieldInfo = fieldInfo;
- this.in = in;
this.termsStart = termsStart;
this.indexStart = indexStart;
this.packedIndexStart = packedIndexStart;
@@ -182,12 +246,7 @@ public class FixedGapTermsIndexReader ex
// is -1, so that PrefixCodedTermsReader can call
// isIndexTerm for each field:
if (indexDivisor > 0) {
- coreIndex = new CoreFieldIndex(indexStart,
- termsStart,
- packedIndexStart,
- packedOffsetsStart,
- numIndexTerms);
-
+ loadTermsIndex();
}
}
@@ -197,46 +256,11 @@ public class FixedGapTermsIndexReader ex
}
}
- @Override
- public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) {
- if (onlyLoaded) {
- return ord % totalIndexInterval == 0;
- } else {
- return ord % indexInterval == 0;
- }
- }
-
- @Override
- public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- } else {
- return coreIndex.nextIndexTerm(ord, result);
- }
- }
-
- @Override
- public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
- // You must call loadTermsIndex if you had specified -1 for indexDivisor
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
- coreIndex.getIndexOffset(term, result);
- }
-
- @Override
- public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
- // You must call loadTermsIndex if you had specified
- // indexDivisor < 0 to ctor
- if (coreIndex == null) {
- throw new IllegalStateException("terms index was not loaded");
- }
- coreIndex.getIndexOffset(ord, result);
- }
-
private final class CoreFieldIndex {
- final private long termBytesStart;
+ // where this field's terms begin in the packed byte[]
+ // data
+ final long termBytesStart;
// offset into index termBytes
final PackedInts.Reader termOffsets;
@@ -245,7 +269,6 @@ public class FixedGapTermsIndexReader ex
final PackedInts.Reader termsDictOffsets;
final int numIndexTerms;
-
final long termsStart;
public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
@@ -315,7 +338,6 @@ public class FixedGapTermsIndexReader ex
termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
termOffsetsM.set(upto, termOffsetUpto);
- upto++;
long termOffset = termOffsetsIter.next();
long nextTermOffset = termOffsetsIter.next();
@@ -328,6 +350,11 @@ public class FixedGapTermsIndexReader ex
termBytes.copy(clone, numTermBytes);
termOffsetUpto += numTermBytes;
+ upto++;
+ if (upto == this.numIndexTerms) {
+ break;
+ }
+
// skip terms:
termsDictOffsetsIter.next();
for(int i=0;i<indexDivisor-2;i++) {
@@ -344,71 +371,10 @@ public class FixedGapTermsIndexReader ex
}
}
}
-
- public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
- int idx = 1 + (int) (ord / totalIndexInterval);
- if (idx < numIndexTerms) {
- fillResult(idx, result);
- return true;
- } else {
- return false;
- }
- }
-
- private void fillResult(int idx, TermsIndexResult result) {
- final long offset = termOffsets.get(idx);
- final int length = (int) (termOffsets.get(1+idx) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
- result.position = idx * totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(idx);
- }
-
- public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
- int lo = 0; // binary search
- int hi = numIndexTerms - 1;
- assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
-
- while (hi >= lo) {
- int mid = (lo + hi) >>> 1;
-
- final long offset = termOffsets.get(mid);
- final int length = (int) (termOffsets.get(1+mid) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
-
- int delta = termComp.compare(term, result.term);
- if (delta < 0) {
- hi = mid - 1;
- } else if (delta > 0) {
- lo = mid + 1;
- } else {
- assert mid >= 0;
- result.position = mid*totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(mid);
- return;
- }
- }
- if (hi < 0) {
- assert hi == -1;
- hi = 0;
- }
-
- final long offset = termOffsets.get(hi);
- final int length = (int) (termOffsets.get(1+hi) - offset);
- termBytesReader.fillSlice(result.term, termBytesStart + offset, length);
-
- result.position = hi*totalIndexInterval;
- result.offset = termsStart + termsDictOffsets.get(hi);
- }
-
- public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
- int idx = (int) (ord / totalIndexInterval);
- // caller must ensure ord is in bounds
- assert idx < numIndexTerms;
- fillResult(idx, result);
- }
}
}
+ // Externally synced in IndexWriter
@Override
public void loadTermsIndex(int indexDivisor) throws IOException {
if (!indexLoaded) {
@@ -420,7 +386,7 @@ public class FixedGapTermsIndexReader ex
}
this.totalIndexInterval = indexInterval * this.indexDivisor;
- Iterator<FieldIndexReader> it = fields.values().iterator();
+ Iterator<FieldIndexData> it = fields.values().iterator();
while(it.hasNext()) {
it.next().loadTermsIndex();
}
@@ -432,8 +398,13 @@ public class FixedGapTermsIndexReader ex
}
@Override
- public FieldReader getField(FieldInfo fieldInfo) {
- return fields.get(fieldInfo);
+ public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
+ final FieldIndexData fieldData = fields.get(fieldInfo);
+ if (fieldData.coreIndex == null) {
+ return null;
+ } else {
+ return new IndexEnum(fieldData.coreIndex);
+ }
}
public static void files(Directory dir, SegmentInfo info, String id, Collection<String> files) {
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/FixedGapTermsIndexWriter.java Thu Jan 13 19:53:21 2011
@@ -31,7 +31,14 @@ import java.util.List;
import java.util.ArrayList;
import java.io.IOException;
-/** @lucene.experimental */
+/**
+ * Selects every Nth term as an index term, and holds term
+ * bytes fully expanded in memory. This terms index
+ * supports seeking by ord. See {@link
+ * VariableGapTermsIndexWriter} for a more memory efficient
+ * terms index that does not support seeking by ord.
+ *
+ * @lucene.experimental */
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
protected final IndexOutput out;
@@ -50,7 +57,6 @@ public class FixedGapTermsIndexWriter ex
public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
final String indexFileName = IndexFileNames.segmentFileName(state.segmentName, state.codecId, TERMS_INDEX_EXTENSION);
- state.flushedFiles.add(indexFileName);
termIndexInterval = state.termIndexInterval;
out = state.directory.createOutput(indexFileName);
fieldInfos = state.fieldInfos;
@@ -203,15 +209,25 @@ public class FixedGapTermsIndexWriter ex
final long dirStart = out.getFilePointer();
final int fieldCount = fields.size();
- out.writeInt(fieldCount);
+ int nonNullFieldCount = 0;
for(int i=0;i<fieldCount;i++) {
SimpleFieldWriter field = fields.get(i);
- out.writeInt(field.fieldInfo.number);
- out.writeInt(field.numIndexTerms);
- out.writeLong(field.termsStart);
- out.writeLong(field.indexStart);
- out.writeLong(field.packedIndexStart);
- out.writeLong(field.packedOffsetsStart);
+ if (field.numIndexTerms > 0) {
+ nonNullFieldCount++;
+ }
+ }
+
+ out.writeVInt(nonNullFieldCount);
+ for(int i=0;i<fieldCount;i++) {
+ SimpleFieldWriter field = fields.get(i);
+ if (field.numIndexTerms > 0) {
+ out.writeVInt(field.fieldInfo.number);
+ out.writeVInt(field.numIndexTerms);
+ out.writeVLong(field.termsStart);
+ out.writeVLong(field.indexStart);
+ out.writeVLong(field.packedIndexStart);
+ out.writeVLong(field.packedOffsetsStart);
+ }
}
writeTrailer(dirStart);
out.close();
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java Thu Jan 13 19:53:21 2011
@@ -30,9 +30,9 @@ import org.apache.lucene.util.BytesRef;
public abstract class PostingsConsumer {
- /** Adds a new doc in this term. Return null if this
- * consumer doesn't need to see the positions for this
- * doc. */
+ /** Adds a new doc in this term. If this field omits term
+ * freqs & positions then termDocFreq should be ignored
+ * and finishDoc will not be called. */
public abstract void startDoc(int docID, int termDocFreq) throws IOException;
public static class PostingsMergeState {
@@ -49,7 +49,8 @@ public abstract class PostingsConsumer {
public abstract void addPosition(int position, BytesRef payload) throws IOException;
/** Called when we are done adding positions & payloads
- * for each doc */
+ * for each doc. Not called when the field omits term
+ * freq and positions. */
public abstract void finishDoc() throws IOException;
/** Default merge impl: append documents, mapping around
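The reworded javadoc above pins down the contract: startDoc is always called per document, but termDocFreq is meaningless and finishDoc is never called when the field omits term freqs and positions. A minimal sketch of a consumer honoring that contract, assuming startDoc/addPosition/finishDoc are the only abstract methods (class and field names are illustrative):

    import java.io.IOException;
    import org.apache.lucene.index.codecs.PostingsConsumer;
    import org.apache.lucene.util.BytesRef;

    // Hypothetical consumer that only counts; it must not depend on
    // finishDoc() for fields that omit term freqs & positions.
    final class CountingPostingsConsumer extends PostingsConsumer {
      long docCount;
      long positionCount;

      @Override
      public void startDoc(int docID, int termDocFreq) throws IOException {
        docCount++;          // ignore termDocFreq: undefined when freqs are omitted
      }

      @Override
      public void addPosition(int position, BytesRef payload) throws IOException {
        positionCount++;     // never called when positions are omitted
      }

      @Override
      public void finishDoc() throws IOException {
        // per the javadoc above, skipped when the field omits freqs & positions
      }
    }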
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PostingsReaderBase.java Thu Jan 13 19:53:21 2011
@@ -42,17 +42,17 @@ public abstract class PostingsReaderBase
public abstract void init(IndexInput termsIn) throws IOException;
/** Return a newly created empty TermState */
- public abstract TermState newTermState() throws IOException;
+ public abstract PrefixCodedTermState newTermState() throws IOException;
- public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState state, boolean isIndexTerm) throws IOException;
+ public abstract void readTerm(IndexInput termsIn, FieldInfo fieldInfo, PrefixCodedTermState state, boolean isIndexTerm) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsEnum docs(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
+ public abstract DocsEnum docs(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsEnum reuse) throws IOException;
/** Must fully consume state, since after this call that
* TermState may be reused. */
- public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, TermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
+ public abstract DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, PrefixCodedTermState state, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException;
public abstract void close() throws IOException;
}
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsReader.java Thu Jan 13 19:53:21 2011
@@ -31,6 +31,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
@@ -68,7 +69,7 @@ public class PrefixCodedTermsReader exte
private final Comparator<BytesRef> termComp;
// Caches the most recently looked-up field + terms:
- private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache;
+ private final DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState> termsCache;
// Reads the terms index
private TermsIndexReaderBase indexReader;
@@ -84,11 +85,6 @@ public class PrefixCodedTermsReader exte
public FieldAndTerm() {
}
- public FieldAndTerm(String field, BytesRef term) {
- this.field = field;
- this.term = new BytesRef(term);
- }
-
public FieldAndTerm(FieldAndTerm other) {
field = other.field;
term = new BytesRef(other.term);
@@ -116,7 +112,7 @@ public class PrefixCodedTermsReader exte
throws IOException {
this.postingsReader = postingsReader;
- termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize);
+ termsCache = new DoubleBarrelLRUCache<FieldAndTerm,PrefixCodedTermState>(termsCacheSize);
this.termComp = termComp;
@@ -140,12 +136,10 @@ public class PrefixCodedTermsReader exte
final long numTerms = in.readLong();
assert numTerms >= 0;
final long termsStartPointer = in.readLong();
- final TermsIndexReaderBase.FieldReader fieldIndexReader;
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
- fieldIndexReader = indexReader.getField(fieldInfo);
if (numTerms > 0) {
assert !fields.containsKey(fieldInfo.name);
- fields.put(fieldInfo.name, new FieldReader(fieldIndexReader, fieldInfo, numTerms, termsStartPointer));
+ fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer));
}
}
success = true;
@@ -251,14 +245,12 @@ public class PrefixCodedTermsReader exte
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
- final TermsIndexReaderBase.FieldReader fieldIndexReader;
- FieldReader(TermsIndexReaderBase.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
+ FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.termsStartPointer = termsStartPointer;
- this.fieldIndexReader = fieldIndexReader;
}
@Override
@@ -281,18 +273,25 @@ public class PrefixCodedTermsReader exte
return numTerms;
}
- // Iterates through terms in this field
- private class SegmentTermsEnum extends TermsEnum {
+ // Iterates through terms in this field, not supporting ord()
+ private final class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
private final DeltaBytesReader bytesReader;
- private final TermState state;
+ private final PrefixCodedTermState state;
private boolean seekPending;
- private final TermsIndexReaderBase.TermsIndexResult indexResult = new TermsIndexReaderBase.TermsIndexResult();
private final FieldAndTerm fieldTerm = new FieldAndTerm();
+ private final TermsIndexReaderBase.FieldIndexEnum indexEnum;
+ private boolean positioned;
+ private boolean didIndexNext;
+ private BytesRef nextIndexTerm;
+ private boolean isIndexTerm;
+ private final boolean doOrd;
SegmentTermsEnum() throws IOException {
in = (IndexInput) PrefixCodedTermsReader.this.in.clone();
in.seek(termsStartPointer);
+ indexEnum = indexReader.getFieldEnum(fieldInfo);
+ doOrd = indexReader.supportsOrd();
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
@@ -304,12 +303,26 @@ public class PrefixCodedTermsReader exte
return termComp;
}
- @Override
- public void cacheCurrentTerm() {
- TermState stateCopy = (TermState) state.clone();
- stateCopy.filePointer = in.getFilePointer();
- termsCache.put(new FieldAndTerm(fieldInfo.name, bytesReader.term),
- stateCopy);
+ // called only from assert
+ private boolean first;
+ private int indexTermCount;
+
+ private boolean startSeek() {
+ first = true;
+ indexTermCount = 0;
+ return true;
+ }
+
+ private boolean checkSeekScan() {
+ if (!first && isIndexTerm) {
+ indexTermCount++;
+ if (indexTermCount >= indexReader.getDivisor()) {
+ //System.out.println("now fail count=" + indexTermCount);
+ return false;
+ }
+ }
+ first = false;
+ return true;
}
/** Seeks until the first term that's >= the provided
@@ -317,16 +330,24 @@ public class PrefixCodedTermsReader exte
* is found, SeekStatus.NOT_FOUND if a different term
* was found, SeekStatus.END if we hit EOF */
@Override
- public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+ public SeekStatus seek(final BytesRef term, final boolean useCache) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ //System.out.println("te.seek term=" + fieldInfo.name + ":" + term.utf8ToString() + " current=" + term().utf8ToString() + " useCache=" + useCache + " this=" + this);
+
// Check cache
fieldTerm.term = term;
TermState cachedState;
if (useCache) {
cachedState = termsCache.get(fieldTerm);
if (cachedState != null) {
- state.copy(cachedState);
- seekPending = true;
- bytesReader.term.copy(term);
+ state.copyFrom(cachedState);
+ setTermState(term, state);
+ positioned = false;
+ //System.out.println(" cached!");
return SeekStatus.FOUND;
}
} else {
@@ -335,36 +356,54 @@ public class PrefixCodedTermsReader exte
boolean doSeek = true;
- if (state.ord != -1) {
- // we are positioned
+ if (positioned) {
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
// already at the requested term
return SeekStatus.FOUND;
- }
+ } else if (cmp < 0) {
+
+ if (seekPending) {
+ seekPending = false;
+ in.seek(state.filePointer);
+ indexEnum.seek(bytesReader.term);
+ didIndexNext = false;
+ }
+
+ // Target term is after current term
+ if (!didIndexNext) {
+ if (indexEnum.next() == -1) {
+ nextIndexTerm = null;
+ } else {
+ nextIndexTerm = indexEnum.term();
+ }
+ //System.out.println(" now do index next() nextIndexTerm=" + (nextIndexTerm == null ? "null" : nextIndexTerm.utf8ToString()));
+ didIndexNext = true;
+ }
- if (cmp < 0 &&
- fieldIndexReader.nextIndexTerm(state.ord, indexResult) &&
- termComp.compare(indexResult.term, term) > 0) {
- // Optimization: requested term is within the
- // same index block we are now in; skip seeking
- // (but do scanning):
- doSeek = false;
+ if (nextIndexTerm == null || termComp.compare(term, nextIndexTerm) < 0) {
+ // Optimization: requested term is within the
+ // same index block we are now in; skip seeking
+ // (but do scanning):
+ doSeek = false;
+ //System.out.println(" skip seek: nextIndexTerm=" + nextIndexTerm);
+ }
}
}
- // Used only for assert:
- final long startOrd;
-
if (doSeek) {
- // As index to find biggest index term that's <=
- // our text:
- fieldIndexReader.getIndexOffset(term, indexResult);
+ positioned = true;
- in.seek(indexResult.offset);
+ // Ask terms index to find biggest index term that's <=
+ // our text:
+ in.seek(indexEnum.seek(term));
+ didIndexNext = false;
+ if (doOrd) {
+ state.ord = indexEnum.ord()-1;
+ }
seekPending = false;
// NOTE: the first next() after an index seek is
@@ -373,78 +412,56 @@ public class PrefixCodedTermsReader exte
// those bytes in the primary file, but then when
// scanning over an index term we'd have to
// special case it:
- bytesReader.reset(indexResult.term);
-
- state.ord = indexResult.position-1;
- assert state.ord >= -1: "ord=" + state.ord + " pos=" + indexResult.position;
-
- startOrd = indexResult.position;
+ bytesReader.reset(indexEnum.term());
+ //System.out.println(" doSeek term=" + indexEnum.term().utf8ToString() + " vs target=" + term.utf8ToString());
} else {
- startOrd = -1;
+ //System.out.println(" skip seek");
}
+ assert startSeek();
+
// Now scan:
- while(next() != null) {
+ while (next() != null) {
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
-
- if (doSeek && useCache) {
- // Store in cache
- FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
- cachedState = (TermState) state.clone();
- // this is fp after current term
- cachedState.filePointer = in.getFilePointer();
- termsCache.put(entryKey, cachedState);
+ // Done!
+ if (useCache) {
+ cacheTerm(fieldTerm);
}
-
+
return SeekStatus.FOUND;
} else if (cmp > 0) {
return SeekStatus.NOT_FOUND;
}
+
// The purpose of the terms dict index is to seek
// the enum to the closest index term before the
// term we are looking for. So, we should never
// cross another index term (besides the first
// one) while we are scanning:
- assert state.ord == startOrd || !fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true): "state.ord=" + state.ord + " startOrd=" + startOrd + " ir.isIndexTerm=" + fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true) + " state.docFreq=" + state.docFreq;
+ assert checkSeekScan();
}
+ positioned = false;
return SeekStatus.END;
}
- @Override
- public SeekStatus seek(long ord) throws IOException {
-
- // TODO: should we cache term lookup by ord as well...?
-
- if (ord >= numTerms) {
- state.ord = numTerms-1;
- return SeekStatus.END;
- }
-
- fieldIndexReader.getIndexOffset(ord, indexResult);
- in.seek(indexResult.offset);
- seekPending = false;
-
- // NOTE: the first next() after an index seek is
- // wasteful, since it redundantly reads the same
- // bytes into the buffer
- bytesReader.reset(indexResult.term);
-
- state.ord = indexResult.position-1;
- assert state.ord >= -1: "ord=" + state.ord;
-
- // Now, scan:
- int left = (int) (ord - state.ord);
- while(left > 0) {
- final BytesRef term = next();
- assert term != null;
- left--;
- }
-
- // always found
- return SeekStatus.FOUND;
+ private final void setTermState(BytesRef term, final TermState termState) {
+ assert termState != null && termState instanceof PrefixCodedTermState;
+ state.copyFrom(termState);
+ seekPending = true;
+ bytesReader.term.copy(term);
+ }
+
+ private final void cacheTerm(FieldAndTerm other) {
+ // Store in cache
+ final FieldAndTerm entryKey = new FieldAndTerm(other);
+ final PrefixCodedTermState cachedState = (PrefixCodedTermState) state.clone();
+ // this is fp after current term
+ cachedState.filePointer = in.getFilePointer();
+ termsCache.put(entryKey, cachedState);
}
+
@Override
public BytesRef term() {
@@ -452,38 +469,40 @@ public class PrefixCodedTermsReader exte
}
@Override
- public long ord() {
- return state.ord;
- }
-
- @Override
public BytesRef next() throws IOException {
if (seekPending) {
seekPending = false;
in.seek(state.filePointer);
+ indexEnum.seek(bytesReader.term);
+ didIndexNext = false;
}
- if (state.ord >= numTerms-1) {
+ if (!bytesReader.read()) {
+ //System.out.println("te.next end!");
+ positioned = false;
return null;
}
- bytesReader.read();
- state.docFreq = in.readVInt();
+ final byte b = in.readByte();
+ isIndexTerm = (b & 0x80) != 0;
+
+ if ((b & 0x40) == 0) {
+ // Fast case -- docFreq fits in 6 bits
+ state.docFreq = b & 0x3F;
+ } else {
+ state.docFreq = (in.readVInt() << 6) | (b & 0x3F);
+ }
- // TODO: would be cleaner, but space-wasting, to
- // simply record a bit into each index entry as to
- // whether it's an index entry or not, rather than
- // re-compute that information... or, possibly store
- // a "how many terms until next index entry" in each
- // index entry, but that'd require some tricky
- // lookahead work when writing the index
postingsReader.readTerm(in,
fieldInfo, state,
- fieldIndexReader.isIndexTerm(1+state.ord, state.docFreq, false));
-
- state.ord++;
+ isIndexTerm);
+ if (doOrd) {
+ state.ord++;
+ }
+ positioned = true;
+ //System.out.println("te.next term=" + bytesReader.term.utf8ToString());
return bytesReader.term;
}
@@ -494,7 +513,7 @@ public class PrefixCodedTermsReader exte
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
- DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
+ final DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
assert docsEnum != null;
return docsEnum;
}
@@ -507,6 +526,66 @@ public class PrefixCodedTermsReader exte
return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
}
}
+
+ @Override
+ public SeekStatus seek(BytesRef term, TermState otherState) throws IOException {
+ assert otherState != null && otherState instanceof PrefixCodedTermState;
+ assert otherState.getClass() == this.state.getClass() : "Illegal TermState type " + otherState.getClass();
+ assert ((PrefixCodedTermState)otherState).ord < numTerms;
+ setTermState(term, otherState);
+ positioned = false;
+ return SeekStatus.FOUND;
+ }
+
+ @Override
+ public TermState termState() throws IOException {
+ final PrefixCodedTermState newTermState = (PrefixCodedTermState) state.clone();
+ newTermState.filePointer = in.getFilePointer();
+ return newTermState;
+ }
+
+ @Override
+ public SeekStatus seek(long ord) throws IOException {
+
+ if (indexEnum == null) {
+ throw new IllegalStateException("terms index was not loaded");
+ }
+
+ if (ord >= numTerms) {
+ state.ord = numTerms-1;
+ return SeekStatus.END;
+ }
+
+ in.seek(indexEnum.seek(ord));
+ seekPending = false;
+ positioned = true;
+
+ // NOTE: the first next() after an index seek is
+ // wasteful, since it redundantly reads the same
+ // bytes into the buffer
+ bytesReader.reset(indexEnum.term());
+
+ state.ord = indexEnum.ord()-1;
+ assert state.ord >= -1: "ord=" + state.ord;
+
+ // Now, scan:
+ int left = (int) (ord - state.ord);
+ while(left > 0) {
+ final BytesRef term = next();
+ assert term != null;
+ left--;
+ }
+
+ // always found
+ return SeekStatus.FOUND;
+ }
+
+ public long ord() {
+ if (!doOrd) {
+ throw new UnsupportedOperationException();
+ }
+ return state.ord;
+ }
}
}
}
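The new termState() / seek(term, state) pair above lets a caller capture where a term lives and later jump straight back to it without consulting the terms index or rescanning. A hedged usage sketch (the TermsEnum is assumed to come from this codec; names are illustrative):

    import java.io.IOException;
    import org.apache.lucene.index.TermState;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    // Illustrative only: remember a term's position, then return to it cheaply.
    final class TermStateSketch {
      static void example(TermsEnum termsEnum, BytesRef term) throws IOException {
        if (termsEnum.seek(term, true) == TermsEnum.SeekStatus.FOUND) {
          // Clone of the enum's state; filePointer is just past the current term:
          final TermState state = termsEnum.termState();

          // ... the enum may be moved elsewhere in the meantime ...

          // Repositions directly from the saved state (returns SeekStatus.FOUND):
          termsEnum.seek(term, state);
          // docs()/docsAndPositions() can now be pulled for this term again
        }
      }
    }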
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/PrefixCodedTermsWriter.java Thu Jan 13 19:53:21 2011
@@ -74,7 +74,6 @@ public class PrefixCodedTermsWriter exte
this.termComp = termComp;
out = state.directory.createOutput(termsFileName);
termsIndexWriter.setTermsOutput(out);
- state.flushedFiles.add(termsFileName);
fieldInfos = state.fieldInfos;
writeHeader(out);
@@ -93,7 +92,7 @@ public class PrefixCodedTermsWriter exte
}
@Override
- public TermsConsumer addField(FieldInfo field) {
+ public TermsConsumer addField(FieldInfo field) throws IOException {
assert currentField == null || currentField.name.compareTo(field.name) < 0;
currentField = field;
TermsIndexWriterBase.FieldWriter fieldIndexWriter = termsIndexWriter.addField(field);
@@ -173,12 +172,25 @@ public class PrefixCodedTermsWriter exte
public void finishTerm(BytesRef text, int numDocs) throws IOException {
assert numDocs > 0;
+ //System.out.println("finishTerm term=" + fieldInfo.name + ":" + text.utf8ToString() + " fp=" + out.getFilePointer());
final boolean isIndexTerm = fieldIndexWriter.checkIndexTerm(text, numDocs);
termWriter.write(text);
- out.writeVInt(numDocs);
+ final int highBit = isIndexTerm ? 0x80 : 0;
+ //System.out.println(" isIndex=" + isIndexTerm);
+ // This is a vInt, except we steal the top bit to record
+ // whether this was an indexed term:
+ if ((numDocs & ~0x3F) == 0) {
+ // Fast case -- docFreq fits in 6 bits
+ out.writeByte((byte) (highBit | numDocs));
+ } else {
+ // Write bottom 6 bits of docFreq, then write the
+ // remainder as vInt:
+ out.writeByte((byte) (highBit | 0x40 | (numDocs & 0x3F)));
+ out.writeVInt(numDocs >>> 6);
+ }
postingsWriter.finishTerm(numDocs, isIndexTerm);
numTerms++;
}
@@ -186,6 +198,8 @@ public class PrefixCodedTermsWriter exte
// Finishes all terms in this field
@Override
public void finish() throws IOException {
+ // EOF marker:
+ out.writeVInt(DeltaBytesWriter.TERM_EOF);
fieldIndexWriter.finish();
}
}
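The finishTerm change above folds two flags into the leading docFreq byte: bit 0x80 marks an index term, bit 0x40 means the remaining docFreq bits follow as a vInt, and the low 6 bits hold docFreq (or its low 6 bits). The matching decode is in SegmentTermsEnum.next() in PrefixCodedTermsReader above. A self-contained sketch of the same packing, using a plain byte list in place of IndexOutput/IndexInput (everything here is illustrative, not the shipping code):

    import java.util.ArrayList;
    import java.util.List;

    // Round-trips (isIndexTerm, docFreq) through the byte packing described above.
    public final class DocFreqPackingSketch {

      static void encode(List<Byte> out, boolean isIndexTerm, int docFreq) {
        final int highBit = isIndexTerm ? 0x80 : 0;
        if ((docFreq & ~0x3F) == 0) {
          out.add((byte) (highBit | docFreq));                 // fits in 6 bits
        } else {
          out.add((byte) (highBit | 0x40 | (docFreq & 0x3F)));
          writeVInt(out, docFreq >>> 6);                       // remainder as vInt
        }
      }

      static void writeVInt(List<Byte> out, int i) {
        while ((i & ~0x7F) != 0) {
          out.add((byte) ((i & 0x7F) | 0x80));
          i >>>= 7;
        }
        out.add((byte) i);
      }

      // Mirrors the reader: returns {isIndexTerm ? 1 : 0, docFreq}.
      static int[] decode(List<Byte> in, int[] pos) {
        final byte b = in.get(pos[0]++);
        final int isIndexTerm = (b & 0x80) != 0 ? 1 : 0;
        final int docFreq;
        if ((b & 0x40) == 0) {
          docFreq = b & 0x3F;                                  // fast case
        } else {
          docFreq = (readVInt(in, pos) << 6) | (b & 0x3F);
        }
        return new int[] { isIndexTerm, docFreq };
      }

      static int readVInt(List<Byte> in, int[] pos) {
        byte b = in.get(pos[0]++);
        int i = b & 0x7F;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
          b = in.get(pos[0]++);
          i |= (b & 0x7F) << shift;
        }
        return i;
      }

      public static void main(String[] args) {
        final List<Byte> bytes = new ArrayList<Byte>();
        encode(bytes, true, 17);       // small docFreq, index term: one byte
        encode(bytes, false, 100000);  // large docFreq: flag byte + vInt
        final int[] pos = { 0 };
        System.out.println(java.util.Arrays.toString(decode(bytes, pos))); // [1, 17]
        System.out.println(java.util.Arrays.toString(decode(bytes, pos))); // [0, 100000]
      }
    }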
Modified: lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java?rev=1058718&r1=1058717&r2=1058718&view=diff
==============================================================================
--- lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java (original)
+++ lucene/dev/branches/realtime_search/lucene/src/java/org/apache/lucene/index/codecs/TermsIndexReaderBase.java Thu Jan 13 19:53:21 2011
@@ -21,6 +21,7 @@ import org.apache.lucene.index.FieldInfo
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
+import java.io.Closeable;
import java.util.Collection;
@@ -35,42 +36,40 @@ import java.util.Collection;
* indexed terms (many pairs of CharSequence text + long
* fileOffset), and then this reader must be able to
* retrieve the nearest index term to a provided term
- * text.
+ * text.
* @lucene.experimental */
-public abstract class TermsIndexReaderBase {
+public abstract class TermsIndexReaderBase implements Closeable {
- static class TermsIndexResult {
- long position;
- final BytesRef term = new BytesRef();
- long offset;
- };
-
- public abstract class FieldReader {
- /** Returns position of "largest" index term that's <=
- * text. Returned TermsIndexResult may be reused
- * across calls. This resets internal state, and
- * expects that you'll then scan the file and
- * sequentially call isIndexTerm for each term
- * encountered. */
- public abstract void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException;
-
- public abstract void getIndexOffset(long ord, TermsIndexResult result) throws IOException;
-
- /** Call this sequentially for each term encoutered,
- * after calling {@link #getIndexOffset}. */
- public abstract boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) throws IOException;
-
- /** Finds the next index term, after the specified
- * ord. Returns true if one exists. */
- public abstract boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException;
- }
-
- public abstract FieldReader getField(FieldInfo fieldInfo);
+ public abstract FieldIndexEnum getFieldEnum(FieldInfo fieldInfo);
public abstract void loadTermsIndex(int indexDivisor) throws IOException;
public abstract void close() throws IOException;
public abstract void getExtensions(Collection<String> extensions);
-}
\ No newline at end of file
+
+ public abstract boolean supportsOrd();
+
+ public abstract int getDivisor();
+
+ // Similar to TermsEnum, except the only "metadata" it
+ // reports for a given indexed term is the long fileOffset
+ // into the main terms dict (_X.tis) file:
+ public static abstract class FieldIndexEnum {
+
+ /** Seeks to "largest" indexed term that's <=
+ * term; retruns file pointer index (into the main
+ * terms index file) for that term */
+ public abstract long seek(BytesRef term) throws IOException;
+
+ /** Returns -1 at end */
+ public abstract long next() throws IOException;
+
+ public abstract BytesRef term();
+
+ // Only impl'd if supportsOrd() returns true!
+ public abstract long seek(long ord) throws IOException;
+ public abstract long ord();
+ }
+}
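FieldIndexEnum above is consumed by the terms dictionary much as the new SegmentTermsEnum.seek() does: ask the index enum for the file pointer of the largest indexed term <= the target, position the terms file there, then scan forward. A condensed sketch of that protocol (scanNextTerm() is a hypothetical stand-in for the DeltaBytesReader/docFreq decoding shown earlier):

    import java.io.IOException;
    import java.util.Comparator;
    import org.apache.lucene.index.codecs.TermsIndexReaderBase.FieldIndexEnum;
    import org.apache.lucene.store.IndexInput;
    import org.apache.lucene.util.BytesRef;

    // Illustrative only: how a terms dict seek leans on the index enum.
    abstract class TermsDictSeekSketch {
      IndexInput in;                      // clone of the terms dict file
      FieldIndexEnum indexEnum;
      Comparator<BytesRef> termComp;

      // Reads the next term from the terms dict file; null at end of field.
      abstract BytesRef scanNextTerm() throws IOException;

      boolean seekExact(BytesRef target) throws IOException {
        in.seek(indexEnum.seek(target));  // largest indexed term <= target
        BytesRef term;
        while ((term = scanNextTerm()) != null) {
          final int cmp = termComp.compare(term, target);
          if (cmp == 0) {
            return true;                  // found the exact term
          } else if (cmp > 0) {
            return false;                 // scanned past it: not present
          }
        }
        return false;                     // exhausted the field
      }
    }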