You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/10/04 12:02:50 UTC
svn commit: r1004179 - in /lucene/dev/trunk/lucene/src:
java/org/apache/lucene/index/CheckIndex.java
test/org/apache/lucene/index/Test2BTerms.java
Author: mikemccand
Date: Mon Oct 4 10:02:50 2010
New Revision: 1004179
URL: http://svn.apache.org/viewvc?rev=1004179&view=rev
Log:
LUCENE-2682: add test that confirms we can index more than 2B terms in one segment, @Ignore'd since it takes > 4 hrs to run on a fast machine
Added:
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (with props)
Modified:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1004179&r1=1004178&r2=1004179&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java Mon Oct 4 10:02:50 2010
@@ -18,6 +18,8 @@ package org.apache.lucene.index;
*/
import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -573,6 +575,8 @@ public class CheckIndex {
final int maxDoc = reader.maxDoc();
final Bits delDocs = reader.getDeletedDocs();
+ final IndexSearcher is = new IndexSearcher(reader);
+
try {
if (infoStream != null) {
@@ -584,7 +588,10 @@ public class CheckIndex {
msg("OK [no fields/terms]");
return status;
}
-
+
+ DocsEnum docs = null;
+ DocsAndPositionsEnum postings = null;
+
final FieldsEnum fieldsEnum = fields.iterator();
while(true) {
final String field = fieldsEnum.next();
@@ -594,9 +601,6 @@ public class CheckIndex {
final TermsEnum terms = fieldsEnum.terms();
- DocsEnum docs = null;
- DocsAndPositionsEnum postings = null;
-
boolean hasOrd = true;
final long termCountStart = status.termCount;
@@ -706,6 +710,70 @@ public class CheckIndex {
}
}
}
+
+ // Test seek to last term:
+ if (lastTerm != null) {
+ if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
+ throw new RuntimeException("seek to last term " + lastTerm + " failed");
+ }
+
+ is.search(new TermQuery(new Term(field, lastTerm)), 1);
+ }
+
+ // Test seeking by ord
+ if (hasOrd && status.termCount-termCountStart > 0) {
+ long termCount;
+ try {
+ termCount = fields.terms(field).getUniqueTermCount();
+ } catch (UnsupportedOperationException uoe) {
+ termCount = -1;
+ }
+
+ if (termCount != -1 && termCount != status.termCount - termCountStart) {
+ throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
+ }
+
+ termCount = status.termCount;
+
+ int seekCount = (int) Math.min(10000L, termCount);
+ if (seekCount > 0) {
+ BytesRef[] seekTerms = new BytesRef[seekCount];
+
+ // Seek by ord
+ for(int i=seekCount-1;i>=0;i--) {
+ long ord = i*(termCount/seekCount);
+ terms.seek(ord);
+ seekTerms[i] = new BytesRef(terms.term());
+ }
+
+ // Seek by term
+ long totDocCount = 0;
+ for(int i=seekCount-1;i>=0;i--) {
+ if (terms.seek(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) {
+ throw new RuntimeException("seek to existing term " + seekTerms[i] + " failed");
+ }
+
+ docs = terms.docs(delDocs, docs);
+ if (docs == null) {
+ throw new RuntimeException("null DocsEnum from to existing term " + seekTerms[i]);
+ }
+
+ while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+ totDocCount++;
+ }
+ }
+
+ // TermQuery
+ long totDocCount2 = 0;
+ for(int i=0;i<seekCount;i++) {
+ totDocCount2 += is.search(new TermQuery(new Term(field, seekTerms[i])), 1).totalHits;
+ }
+
+ if (totDocCount != totDocCount2) {
+ throw new RuntimeException("search to seek terms produced wrong number of hits: " + totDocCount + " vs " + totDocCount2);
+ }
+ }
+ }
}
msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java?rev=1004179&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java Mon Oct 4 10:02:50 2010
@@ -0,0 +1,169 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.store.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.codecs.CodecProvider;
+import java.io.IOException;
+import org.junit.Ignore;
+
+// NOTE: this test will fail w/ PreFlexRW codec! (Because
+// this test uses full binary term space, but PreFlex cannot
+// handle this since it requires the terms are UTF8 bytes).
+//
+// Also, SimpleText codec will consume very large amounts of
+// disk (but, should run successfully). Best to run w/
+// -Dtests.codec=Standard, and w/ plenty of RAM, eg:
+//
+// ant compile-core compile-test
+//
+// java -server -Xmx2g -Xms2g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/java:./build/classes/demo -Dlucene.version=4.0-dev -Dtests.codec=Standard -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
+//
+
+public class Test2BTerms extends LuceneTestCase {
+
+ private final static BytesRef bytes = new BytesRef(20);
+
+ private static final class MyTokenStream extends TokenStream {
+
+ private final int tokensPerDoc;
+ private int tokenCount;
+
+ public MyTokenStream(int tokensPerDoc) {
+ super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
+ this.tokensPerDoc = tokensPerDoc;
+ addAttribute(TermToBytesRefAttribute.class);
+ }
+
+ public boolean incrementToken() {
+ if (tokenCount >= tokensPerDoc) {
+ return false;
+ }
+ final byte[] bs = bytes.bytes;
+ for(int i=bytes.length-1;i>=0;i--) {
+ int b = bs[i]&0xff;
+ if (b == 0xff) {
+ bs[i] = 0;
+ } else {
+ bs[i] = (byte) (++b);
+ tokenCount++;
+ return true;
+ }
+ }
+ bytes.length++;
+ bs[0] = 1;
+ tokenCount++;
+ return true;
+ }
+
+ public void reset() {
+ tokenCount = 0;
+ }
+
+ private final static class MyTermAttributeImpl extends AttributeImpl implements TermToBytesRefAttribute {
+ public int toBytesRef(BytesRef bs) {
+ bs.bytes = bytes.bytes;
+ bs.offset = bytes.offset;
+ bs.length = bytes.length;
+ return bytes.hashCode();
+ }
+ @Override
+ public void clear() {
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return other == this;
+ }
+
+ @Override
+ public int hashCode() {
+ return System.identityHashCode(this);
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ }
+
+ @Override
+ public Object clone() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private static final class MyAttributeFactory extends AttributeFactory {
+ private final AttributeFactory delegate;
+
+ public MyAttributeFactory(AttributeFactory delegate) {
+ this.delegate = delegate;
+ }
+
+ @Override
+ public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
+ if (attClass == TermToBytesRefAttribute.class)
+ return new MyTermAttributeImpl();
+ if (CharTermAttribute.class.isAssignableFrom(attClass))
+ throw new IllegalArgumentException("no");
+ return delegate.createAttributeInstance(attClass);
+ }
+ }
+ }
+
+ @Ignore("Takes ~4 hours to run on a fast machine!! And requires that you don't use PreFlex codec.")
+ public void test2BTerms() throws IOException {
+
+ if ("PreFlex".equals(CodecProvider.getDefaultCodec())) {
+ throw new RuntimeException("thist test cannot run with PreFlex codec");
+ }
+
+ long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
+
+ int TERMS_PER_DOC = 1000000;
+
+ Directory dir = FSDirectory.open(_TestUtil.getTempDir("2BTerms"));
+ IndexWriter w = new IndexWriter(dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+ .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+ .setRAMBufferSizeMB(256.0).setMergeScheduler(new ConcurrentMergeScheduler()));
+ ((LogMergePolicy) w.getConfig().getMergePolicy()).setUseCompoundFile(false);
+ ((LogMergePolicy) w.getConfig().getMergePolicy()).setUseCompoundDocStore(false);
+ ((LogMergePolicy) w.getConfig().getMergePolicy()).setMergeFactor(10);
+
+ Document doc = new Document();
+ Field field = new Field("field", new MyTokenStream(TERMS_PER_DOC));
+ field.setOmitTermFreqAndPositions(true);
+ field.setOmitNorms(true);
+ doc.add(field);
+ //w.setInfoStream(System.out);
+ final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
+ for(int i=0;i<numDocs;i++) {
+ w.addDocument(doc);
+ System.out.println(i + " of " + numDocs);
+ }
+ System.out.println("now optimize...");
+ w.optimize();
+ w.close();
+
+ _TestUtil.checkIndex(dir);
+ dir.close();
+ }
+}
Propchange: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
------------------------------------------------------------------------------
svn:eol-style = native