You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2011/04/12 21:35:55 UTC
svn commit: r1091557 - in /lucene/dev/trunk: ./ lucene/ lucene/contrib/lucli/
lucene/src/java/org/apache/lucene/index/codecs/preflex/
lucene/src/java/org/apache/lucene/store/
lucene/src/test/org/apache/lucene/index/ solr/ solr/client/ solr/contrib/
sol...
Author: mikemccand
Date: Tue Apr 12 19:35:54 2011
New Revision: 1091557
URL: http://svn.apache.org/viewvc?rev=1091557&view=rev
Log:
LUCENE-3024: handle > 2.1B terms (again)
Modified:
lucene/dev/trunk/ (props changed)
lucene/dev/trunk/lucene/ (props changed)
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/contrib/lucli/build.xml (props changed)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/store/DataInput.java (props changed)
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
lucene/dev/trunk/solr/ (props changed)
lucene/dev/trunk/solr/CHANGES.txt (props changed)
lucene/dev/trunk/solr/LICENSE.txt (props changed)
lucene/dev/trunk/solr/NOTICE.txt (props changed)
lucene/dev/trunk/solr/README.txt (props changed)
lucene/dev/trunk/solr/build.xml (props changed)
lucene/dev/trunk/solr/client/ (props changed)
lucene/dev/trunk/solr/common-build.xml (props changed)
lucene/dev/trunk/solr/contrib/ (props changed)
lucene/dev/trunk/solr/example/ (props changed)
lucene/dev/trunk/solr/lib/ (props changed)
lucene/dev/trunk/solr/site/ (props changed)
lucene/dev/trunk/solr/src/ (props changed)
lucene/dev/trunk/solr/testlogging.properties (props changed)
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1091557&r1=1091556&r2=1091557&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Tue Apr 12 19:35:54 2011
@@ -404,6 +404,12 @@ Optimizations
* LUCENE-2990: ArrayUtil/CollectionUtil.*Sort() methods now exit early
on empty or one-element lists/arrays. (Uwe Schindler)
+Bug fixes
+
+* LUCENE-3024: An index with more than 2.1B terms was hitting
+ ArrayIndexOutOfBoundsException (AIOOBE) when seeking a TermEnum (e.g.,
+ as used by Solr's faceting). (Tom Burton-West, Mike McCandless)
+
======================= Lucene 3.1.0 =======================
Changes in backwards compatibility policy
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java?rev=1091557&r1=1091556&r2=1091557&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java Tue Apr 12 19:35:54 2011
@@ -310,7 +310,7 @@ public final class TermInfosReader {
}
} else {
assert sameTermInfo(ti, tiOrd, enumerator);
- assert (int) enumerator.position == tiOrd.termOrd;
+ assert enumerator.position == tiOrd.termOrd;
}
} else {
ti = null;
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java?rev=1091557&r1=1091556&r2=1091557&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java Tue Apr 12 19:35:54 2011
@@ -19,11 +19,17 @@ package org.apache.lucene.index;
import org.apache.lucene.util.*;
import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.codecs.CodecProvider;
+import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
import org.junit.Ignore;
// NOTE: this test will fail w/ PreFlexRW codec! (Because
@@ -36,7 +42,7 @@ import org.junit.Ignore;
//
// ant compile-test
//
-// java -server -Xmx2g -Xms2g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=SimpleFSDirectory -Dtests.codec=Standard -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
+// java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
//
public class Test2BTerms extends LuceneTestCase {
@@ -45,17 +51,21 @@ public class Test2BTerms extends LuceneT
private final static BytesRef bytes = new BytesRef(TOKEN_LEN);
- private static final class MyTokenStream extends TokenStream {
+ private final static class MyTokenStream extends TokenStream {
private final int tokensPerDoc;
private int tokenCount;
- private int byteUpto;
+ public final List<BytesRef> savedTerms = new ArrayList<BytesRef>();
+ private int nextSave;
+ private final Random random;
- public MyTokenStream(int tokensPerDoc) {
+ public MyTokenStream(Random random, int tokensPerDoc) {
super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
this.tokensPerDoc = tokensPerDoc;
addAttribute(TermToBytesRefAttribute.class);
bytes.length = TOKEN_LEN;
+ this.random = random;
+ nextSave = _TestUtil.nextInt(random, 500000, 1000000);
}
@Override
@@ -65,6 +75,10 @@ public class Test2BTerms extends LuceneT
}
random.nextBytes(bytes.bytes);
tokenCount++;
+ if (--nextSave == 0) {
+ savedTerms.add(new BytesRef(bytes));
+ nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+ }
return true;
}
@@ -131,47 +145,104 @@ public class Test2BTerms extends LuceneT
throw new RuntimeException("thist test cannot run with PreFlex codec");
}
- long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
+ final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
+
+ final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);
- int TERMS_PER_DOC = 1000000;
+ List<BytesRef> savedTerms = null;
Directory dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
- IndexWriter w = new IndexWriter(
- dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).
- setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH).
- setRAMBufferSizeMB(256.0).
- setMergeScheduler(new ConcurrentMergeScheduler()).
- setMergePolicy(newLogMergePolicy(false, 10))
- );
-
- MergePolicy mp = w.getConfig().getMergePolicy();
- if (mp instanceof LogByteSizeMergePolicy) {
- // 1 petabyte:
- ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
- }
-
- Document doc = new Document();
- Field field = new Field("field", new MyTokenStream(TERMS_PER_DOC));
- field.setOmitTermFreqAndPositions(true);
- field.setOmitNorms(true);
- doc.add(field);
- //w.setInfoStream(System.out);
- final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
- for(int i=0;i<numDocs;i++) {
- final long t0 = System.currentTimeMillis();
- w.addDocument(doc);
- System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
- }
- System.out.println("now optimize...");
- w.optimize();
- w.close();
+ //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
+ if (true) {
+ IndexWriter w = new IndexWriter(dir,
+ new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
+ .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+ .setRAMBufferSizeMB(256.0)
+ .setMergeScheduler(new ConcurrentMergeScheduler())
+ .setMergePolicy(newLogMergePolicy(false, 10)));
+
+ MergePolicy mp = w.getConfig().getMergePolicy();
+ if (mp instanceof LogByteSizeMergePolicy) {
+ // 1 petabyte:
+ ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
+ }
+
+ Document doc = new Document();
+ final MyTokenStream ts = new MyTokenStream(random, TERMS_PER_DOC);
+ Field field = new Field("field", ts);
+ field.setOmitTermFreqAndPositions(true);
+ field.setOmitNorms(true);
+ doc.add(field);
+ //w.setInfoStream(System.out);
+ final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
+
+ System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
+ System.out.println("numDocs=" + numDocs);
+
+ for(int i=0;i<numDocs;i++) {
+ final long t0 = System.currentTimeMillis();
+ w.addDocument(doc);
+ System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
+ }
+ savedTerms = ts.savedTerms;
+
+ System.out.println("TEST: optimize");
+ w.optimize();
+ System.out.println("TEST: close writer");
+ w.close();
+ }
+
+ System.out.println("TEST: open reader");
+ final IndexReader r = IndexReader.open(dir);
+ if (savedTerms == null) {
+ savedTerms = findTerms(r);
+ }
+ final int numSavedTerms = savedTerms.size();
+ final List<BytesRef> bigOrdTerms = new ArrayList<BytesRef>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
+ System.out.println("TEST: test big ord terms...");
+ testSavedTerms(r, bigOrdTerms);
+ System.out.println("TEST: test all saved terms...");
+ testSavedTerms(r, savedTerms);
+ r.close();
- System.out.println("now CheckIndex...");
+ System.out.println("TEST: now CheckIndex...");
CheckIndex.Status status = _TestUtil.checkIndex(dir);
final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
dir.close();
}
+
+ private List<BytesRef> findTerms(IndexReader r) throws IOException {
+ System.out.println("TEST: findTerms");
+ final TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
+ final List<BytesRef> savedTerms = new ArrayList<BytesRef>();
+ int nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+ BytesRef term;
+ while((term = termsEnum.next()) != null) {
+ if (--nextSave == 0) {
+ savedTerms.add(new BytesRef(term));
+ System.out.println("TEST: add " + term);
+ nextSave = _TestUtil.nextInt(random, 500000, 1000000);
+ }
+ }
+ return savedTerms;
+ }
+
+ private void testSavedTerms(IndexReader r, List<BytesRef> terms) throws IOException {
+ System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
+ IndexSearcher s = new IndexSearcher(r);
+ Collections.shuffle(terms);
+ TermsEnum termsEnum = MultiFields.getTerms(r, "field").iterator();
+ for(int iter=0;iter<10*terms.size();iter++) {
+ final BytesRef term = terms.get(random.nextInt(terms.size()));
+ System.out.println("TEST: search " + term);
+ final long t0 = System.currentTimeMillis();
+ assertTrue(s.search(new TermQuery(new Term("field", term)), 1).totalHits > 0);
+ final long t1 = System.currentTimeMillis();
+ System.out.println(" took " + (t1-t0) + " millis");
+
+ assertEquals(TermsEnum.SeekStatus.FOUND, termsEnum.seek(term));
+ }
+ }
}