You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/11/15 22:14:28 UTC
svn commit: r595458 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Author: mikemccand
Date: Thu Nov 15 13:14:27 2007
New Revision: 595458
URL: http://svn.apache.org/viewvc?rev=595458&view=rev
Log:
LUCENE-1052: add set/getTermInfosIndexDivisor to IndexReader to reduce memory usage of the TermInfos index
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
lucene/java/trunk/src/java/org/apache/lucene/index/MultiSegmentReader.java
lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
lucene/java/trunk/src/java/org/apache/lucene/index/TermInfosReader.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentReader.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentTermDocs.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu Nov 15 13:14:27 2007
@@ -55,6 +55,12 @@
scheduler is now ConcurrentMergeScheduler (see
LUCENE-870). (Steven Parkes via Mike McCandless)
+ 6. LUCENE-1052: Add IndexReader.setTermInfosIndexDivisor(int) method
+ that allows you to reduce memory usage of the termInfos by further
+ sub-sampling (over the termIndexInterval that was used during
+ indexing) which terms are loaded into memory. (Chuck Williams,
+ Doug Cutting via Mike McCandless)
+
Bug fixes
1. LUCENE-933: QueryParser fixed to not produce empty sub
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexReader.java Thu Nov 15 13:14:27 2007
@@ -298,6 +298,35 @@
throw new UnsupportedOperationException("This reader does not support this method.");
}
+ /**<p>For IndexReader implementations that use
+ * TermInfosReader to read terms, this sets the
+ * indexDivisor to subsample the number of indexed terms
+ * loaded into memory. This has the same effect as {@link
+ * IndexWriter#setTermIndexInterval} except that setting
+ * must be done at indexing time while this setting can be
+ * set per reader. When set to N, then one in every
+ * N*termIndexInterval terms in the index is loaded into
+ * memory. By setting this to a value > 1 you can reduce
+ * memory usage, at the expense of higher latency when
+ * loading a TermInfo. The default value is 1.</p>
+ *
+ * <b>NOTE:</b> you must call this before the term
+ * index is loaded. If the index is already loaded,
+ * an IllegalStateException is thrown.
+ * @throws IllegalStateException if the term index has already been loaded into memory
+ */
+ public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
+ throw new UnsupportedOperationException("This reader does not support this method.");
+ }
+
+ /** <p>For IndexReader implementations that use
+ * TermInfosReader to read terms, this returns the
+ * current indexDivisor.</p>
+ * @see #setTermInfosIndexDivisor */
+ public int getTermInfosIndexDivisor() {
+ throw new UnsupportedOperationException("This reader does not support this method.");
+ }
+
/**
* Check whether this IndexReader is still using the
* current (i.e., most recently committed) version of the
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MultiSegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MultiSegmentReader.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MultiSegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MultiSegmentReader.java Thu Nov 15 13:14:27 2007
@@ -299,6 +299,17 @@
return fieldSet;
}
+ public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
+ for (int i = 0; i < subReaders.length; i++)
+ subReaders[i].setTermInfosIndexDivisor(indexDivisor);
+ }
+
+ public int getTermInfosIndexDivisor() throws IllegalStateException {
+ if (subReaders.length > 0)
+ return subReaders[0].getTermInfosIndexDivisor();
+ else
+ throw new IllegalStateException("no readers");
+ }
static class MultiTermEnum extends TermEnum {
private SegmentMergeQueue queue;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Thu Nov 15 13:14:27 2007
@@ -444,6 +444,14 @@
return si.docCount;
}
+ public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
+ tis.setIndexDivisor(indexDivisor);
+ }
+
+ public int getTermInfosIndexDivisor() {
+ return tis.getIndexDivisor();
+ }
+
/**
* @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
*/
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/TermInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/TermInfosReader.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/TermInfosReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/TermInfosReader.java Thu Nov 15 13:14:27 2007
@@ -40,6 +40,9 @@
private long[] indexPointers;
private SegmentTermEnum indexEnum;
+
+ private int indexDivisor = 1;
+ private int totalIndexInterval;
TermInfosReader(Directory dir, String seg, FieldInfos fis)
throws CorruptIndexException, IOException {
@@ -58,6 +61,7 @@
origEnum = new SegmentTermEnum(directory.openInput(segment + ".tis",
readBufferSize), fieldInfos, false);
size = origEnum.size;
+ totalIndexInterval = origEnum.indexInterval;
indexEnum = new SegmentTermEnum(directory.openInput(segment + ".tii",
readBufferSize), fieldInfos, true);
@@ -82,6 +86,43 @@
public int getMaxSkipLevels() {
return origEnum.maxSkipLevels;
}
+
+ /**
+ * <p>Sets the indexDivisor, which subsamples the number
+ * of indexed terms loaded into memory. This has a
+ * similar effect to {@link
+ * IndexWriter#setTermIndexInterval} except that setting
+ * must be done at indexing time while this setting can be
+ * set per reader. When set to N, then one in every
+ * N*termIndexInterval terms in the index is loaded into
+ * memory. By setting this to a value > 1 you can reduce
+ * memory usage, at the expense of higher latency when
+ * loading a TermInfo. The default value is 1.</p>
+ *
+ * <b>NOTE:</b> you must call this before the term
+ * index is loaded. If the index is already loaded,
+ * an IllegalStateException is thrown.
+ *
+ * @throws IllegalStateException if the term index has
+ * already been loaded into memory.
+ */
+ public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
+ if (indexDivisor < 1)
+ throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);
+
+ if (indexTerms != null)
+ throw new IllegalStateException("index terms are already loaded");
+
+ this.indexDivisor = indexDivisor;
+ totalIndexInterval = origEnum.indexInterval * indexDivisor;
+ }
+
+ /** Returns the indexDivisor.
+ * @see #setIndexDivisor
+ */
+ public int getIndexDivisor() {
+ return indexDivisor;
+ }
final void close() throws IOException {
if (origEnum != null)
@@ -106,10 +147,10 @@
}
private synchronized void ensureIndexIsRead() throws IOException {
- if (indexTerms != null) // index already read
- return; // do nothing
+ if (indexTerms != null) // index already read
+ return; // do nothing
try {
- int indexSize = (int)indexEnum.size; // otherwise read index
+ int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
@@ -119,6 +160,10 @@
indexTerms[i] = indexEnum.term();
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
+
+ for (int j = 1; j < indexDivisor; j++)
+ if (!indexEnum.next())
+ break;
}
} finally {
indexEnum.close();
@@ -146,8 +191,8 @@
private final void seekEnum(int indexOffset) throws IOException {
getEnum().seek(indexPointers[indexOffset],
- (indexOffset * getEnum().indexInterval) - 1,
- indexTerms[indexOffset], indexInfos[indexOffset]);
+ (indexOffset * totalIndexInterval) - 1,
+ indexTerms[indexOffset], indexInfos[indexOffset]);
}
/** Returns the TermInfo for a Term in the set, or null. */
@@ -161,7 +206,7 @@
if (enumerator.term() != null // term is at or past current
&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
|| term.compareTo(enumerator.term()) >= 0)) {
- int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
+ int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
if (indexTerms.length == enumOffset // but before end of block
|| term.compareTo(indexTerms[enumOffset]) < 0)
return scanEnum(term); // no need to seek
@@ -189,10 +234,10 @@
SegmentTermEnum enumerator = getEnum();
if (enumerator != null && enumerator.term() != null &&
position >= enumerator.position &&
- position < (enumerator.position + enumerator.indexInterval))
+ position < (enumerator.position + totalIndexInterval))
return scanEnum(position); // can avoid seek
- seekEnum(position / enumerator.indexInterval); // must seek
+ seekEnum(position/totalIndexInterval); // must seek
return scanEnum(position);
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentReader.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentReader.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentReader.java Thu Nov 15 13:14:27 2007
@@ -28,6 +28,9 @@
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.search.Similarity;
public class TestSegmentReader extends LuceneTestCase {
private RAMDirectory dir = new RAMDirectory();
@@ -204,4 +207,19 @@
assertTrue("We do not have 4 term freq vectors, we have: " + results.length, results.length == 4);
}
+ public void testIndexDivisor() throws IOException {
+ dir = new MockRAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ SegmentInfo si = DocHelper.writeDoc(dir, testDoc);
+
+ reader = SegmentReader.get(si);
+ reader.setTermInfosIndexDivisor(3);
+ testDocument();
+ testDelete();
+ testGetFieldNameVariations();
+ testNorms();
+ testTerms();
+ testTermVectors();
+ }
}
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentTermDocs.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentTermDocs.java?rev=595458&r1=595457&r2=595458&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentTermDocs.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestSegmentTermDocs.java Thu Nov 15 13:14:27 2007
@@ -19,12 +19,14 @@
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
+import org.apache.lucene.search.Similarity;
public class TestSegmentTermDocs extends LuceneTestCase {
private Document testDoc = new Document();
@@ -46,8 +48,13 @@
}
public void testTermDocs() throws IOException {
+ testTermDocs(1);
+ }
+
+ public void testTermDocs(int indexDivisor) throws IOException {
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(info);
+ reader.setTermInfosIndexDivisor(indexDivisor);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
@@ -63,9 +70,14 @@
}
public void testBadSeek() throws IOException {
+ testBadSeek(1);
+ }
+
+ public void testBadSeek(int indexDivisor) throws IOException {
{
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(info);
+ reader.setTermInfosIndexDivisor(indexDivisor);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
@@ -76,6 +88,7 @@
{
//After adding the document, we should be able to read it back in
SegmentReader reader = SegmentReader.get(info);
+ reader.setTermInfosIndexDivisor(indexDivisor);
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
@@ -86,6 +99,10 @@
}
public void testSkipTo() throws IOException {
+ testSkipTo(1);
+ }
+
+ public void testSkipTo(int indexDivisor) throws IOException {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
@@ -106,6 +123,9 @@
writer.close();
IndexReader reader = IndexReader.open(dir);
+ reader.setTermInfosIndexDivisor(indexDivisor);
+ assertEquals(indexDivisor, reader.getTermInfosIndexDivisor());
+
TermDocs tdocs = reader.termDocs();
// without optimization (assumption skipInterval == 16)
@@ -209,6 +229,31 @@
dir.close();
}
+ public void testIndexDivisor() throws IOException {
+ dir = new MockRAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, testDoc);
+ testTermDocs(2);
+ testBadSeek(2);
+ testSkipTo(2);
+ }
+
+ public void testIndexDivisorAfterLoad() throws IOException {
+ dir = new MockRAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ SegmentInfo si = DocHelper.writeDoc(dir, testDoc);
+ SegmentReader reader = SegmentReader.get(si);
+ assertEquals(1, reader.docFreq(new Term("keyField", "Keyword")));
+ try {
+ reader.setTermInfosIndexDivisor(2);
+ fail("did not hit IllegalStateException exception");
+ } catch (IllegalStateException ise) {
+ // expected
+ }
+ }
+
private void addDoc(IndexWriter writer, String value) throws IOException
{
Document doc = new Document();