Posted to commits@lucene.apache.org by mi...@apache.org on 2010/10/04 12:02:50 UTC

svn commit: r1004179 - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/index/CheckIndex.java test/org/apache/lucene/index/Test2BTerms.java

Author: mikemccand
Date: Mon Oct  4 10:02:50 2010
New Revision: 1004179

URL: http://svn.apache.org/viewvc?rev=1004179&view=rev
Log:
LUCENE-2682: add test that confirms we can index more than 2B terms in one segment, @Ignored since it takes > 4 hrs to run on a fast machine
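
(For scale, from the new test: TERM_COUNT = Integer.MAX_VALUE + 100,000,000 =
2,247,483,647 and TERMS_PER_DOC = 1,000,000, so numDocs = 2,247; the test
therefore indexes 2,247 * 1,000,000 = 2,247,000,000 unique terms, past the
2^31-1 = 2,147,483,647 boundary.)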

Added:
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java   (with props)
Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java?rev=1004179&r1=1004178&r2=1004179&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/CheckIndex.java Mon Oct  4 10:02:50 2010
@@ -18,6 +18,8 @@ package org.apache.lucene.index;
  */
 
 import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
@@ -573,6 +575,8 @@ public class CheckIndex {
     final int maxDoc = reader.maxDoc();
     final Bits delDocs = reader.getDeletedDocs();
 
+    final IndexSearcher is = new IndexSearcher(reader);
+
     try {
 
       if (infoStream != null) {
@@ -584,7 +588,10 @@ public class CheckIndex {
         msg("OK [no fields/terms]");
         return status;
       }
-      
+     
+      DocsEnum docs = null;
+      DocsAndPositionsEnum postings = null;
+
       final FieldsEnum fieldsEnum = fields.iterator();
       while(true) {
         final String field = fieldsEnum.next();
@@ -594,9 +601,6 @@ public class CheckIndex {
         
         final TermsEnum terms = fieldsEnum.terms();
 
-        DocsEnum docs = null;
-        DocsAndPositionsEnum postings = null;
-
         boolean hasOrd = true;
         final long termCountStart = status.termCount;
 
@@ -706,6 +710,70 @@ public class CheckIndex {
             }
           }
         }
+
+        // Test seek to last term:
+        if (lastTerm != null) {
+          if (terms.seek(lastTerm) != TermsEnum.SeekStatus.FOUND) {
+            throw new RuntimeException("seek to last term " + lastTerm + " failed");
+          }
+
+          is.search(new TermQuery(new Term(field, lastTerm)), 1);
+        }
+
+        // Test seeking by ord
+        if (hasOrd && status.termCount-termCountStart > 0) {
+          long termCount;
+          try {
+            termCount = fields.terms(field).getUniqueTermCount();
+          } catch (UnsupportedOperationException uoe) {
+            termCount = -1;
+          }
+
+          if (termCount != -1 && termCount != status.termCount - termCountStart) {
+            throw new RuntimeException("termCount mismatch " + termCount + " vs " + (status.termCount - termCountStart));
+          }
+
+          termCount = status.termCount - termCountStart;
+
+          int seekCount = (int) Math.min(10000L, termCount);
+          if (seekCount > 0) {
+            BytesRef[] seekTerms = new BytesRef[seekCount];
+            
+            // Seek by ord
+            for(int i=seekCount-1;i>=0;i--) {
+              long ord = i*(termCount/seekCount);
+              terms.seek(ord);
+              seekTerms[i] = new BytesRef(terms.term());
+            }
+
+            // Seek by term
+            long totDocCount = 0;
+            for(int i=seekCount-1;i>=0;i--) {
+              if (terms.seek(seekTerms[i]) != TermsEnum.SeekStatus.FOUND) {
+                throw new RuntimeException("seek to existing term " + seekTerms[i] + " failed");
+              }
+              
+              docs = terms.docs(delDocs, docs);
+              if (docs == null) {
+                throw new RuntimeException("null DocsEnum from to existing term " + seekTerms[i]);
+              }
+
+              while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+                totDocCount++;
+              }
+            }
+
+            // TermQuery
+            long totDocCount2 = 0;
+            for(int i=0;i<seekCount;i++) {
+              totDocCount2 += is.search(new TermQuery(new Term(field, seekTerms[i])), 1).totalHits;
+            }
+
+            if (totDocCount != totDocCount2) {
+              throw new RuntimeException("search to seek terms produced wrong number of hits: " + totDocCount + " vs " + totDocCount2);
+            }
+          }
+        }
       }
 
       msg("OK [" + status.termCount + " terms; " + status.totFreq + " terms/docs pairs; " + status.totPos + " tokens]");

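These new seek-by-ord and TermQuery cross-checks run as part of the normal
term-index validation. A minimal sketch of driving them programmatically
(the class name and index path are illustrative; this assumes the CheckIndex
API as of this revision):

    import java.io.File;

    import org.apache.lucene.index.CheckIndex;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class CheckIndexExample {
      public static void main(String[] args) throws Exception {
        // Open the index and run all of CheckIndex's validations, including
        // the new last-term seek, seek-by-ord and TermQuery hit-count checks.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        CheckIndex checker = new CheckIndex(dir);
        checker.setInfoStream(System.out);  // print per-segment progress
        CheckIndex.Status status = checker.checkIndex();
        if (!status.clean) {
          System.out.println("index has problems");
        }
        dir.close();
      }
    }
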
Added: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java?rev=1004179&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java (added)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java Mon Oct  4 10:02:50 2010
@@ -0,0 +1,169 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.*;
+import org.apache.lucene.store.*;
+import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.index.codecs.CodecProvider;
+import java.io.IOException;
+import org.junit.Ignore;
+
+// NOTE: this test will fail w/ the PreFlexRW codec!  (Because this
+// test uses the full binary term space, but PreFlex cannot handle
+// that since it requires that terms be UTF8 bytes).
+//
+// Also, SimpleText codec will consume very large amounts of
+// disk (but, should run successfully).  Best to run w/
+// -Dtests.codec=Standard, and w/ plenty of RAM, eg:
+//
+//   ant compile-core compile-test
+//
+//  java -server -Xmx2g -Xms2g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/java:./build/classes/demo -Dlucene.version=4.0-dev -Dtests.codec=Standard -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
+//
+
+public class Test2BTerms extends LuceneTestCase {
+
+  private final static BytesRef bytes = new BytesRef(20);
+
+  private static final class MyTokenStream extends TokenStream {
+
+    private final int tokensPerDoc;
+    private int tokenCount;
+
+    public MyTokenStream(int tokensPerDoc) {
+      super(new MyAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY));
+      this.tokensPerDoc = tokensPerDoc;
+      addAttribute(TermToBytesRefAttribute.class);
+    }
+    
+    public boolean incrementToken() {
+      if (tokenCount >= tokensPerDoc) {
+        return false;
+      }
+      final byte[] bs = bytes.bytes;
+      for(int i=bytes.length-1;i>=0;i--) {
+        int b = bs[i]&0xff;
+        if (b == 0xff) {
+          bs[i] = 0;
+        } else {
+          bs[i] = (byte) (++b);
+          tokenCount++;
+          return true;
+        }
+      }
+      bytes.length++;
+      bs[0] = 1;
+      tokenCount++;
+      return true;
+    }
+
+    public void reset() {
+      tokenCount = 0;
+    }
+
+    private final static class MyTermAttributeImpl extends AttributeImpl implements TermToBytesRefAttribute {
+      public int toBytesRef(BytesRef bs) {
+        bs.bytes = bytes.bytes;
+        bs.offset = bytes.offset;
+        bs.length = bytes.length;
+        return bytes.hashCode();
+      }
+      @Override
+      public void clear() {
+      }
+
+      @Override
+      public boolean equals(Object other) {
+        return other == this;
+      }
+
+      @Override
+      public int hashCode() {
+        return System.identityHashCode(this);
+      }
+    
+      @Override
+      public void copyTo(AttributeImpl target) {
+      }
+    
+      @Override
+      public Object clone() {
+        throw new UnsupportedOperationException();
+      }
+    }
+
+    private static final class MyAttributeFactory extends AttributeFactory {
+      private final AttributeFactory delegate;
+
+      public MyAttributeFactory(AttributeFactory delegate) {
+        this.delegate = delegate;
+      }
+  
+      @Override
+      public AttributeImpl createAttributeInstance(Class<? extends Attribute> attClass) {
+        if (attClass == TermToBytesRefAttribute.class)
+          return new MyTermAttributeImpl();
+        if (CharTermAttribute.class.isAssignableFrom(attClass))
+          throw new IllegalArgumentException("no");
+        return delegate.createAttributeInstance(attClass);
+      }
+    }
+  }
+
+  @Ignore("Takes ~4 hours to run on a fast machine!!  And requires that you don't use PreFlex codec.")
+  public void test2BTerms() throws IOException {
+
+    if ("PreFlex".equals(CodecProvider.getDefaultCodec())) {
+      throw new RuntimeException("thist test cannot run with PreFlex codec");
+    }
+
+    long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
+
+    int TERMS_PER_DOC = 1000000;
+
+    Directory dir = FSDirectory.open(_TestUtil.getTempDir("2BTerms"));
+    IndexWriter w = new IndexWriter(dir,
+                                    newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
+                                                  .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+                                                .setRAMBufferSizeMB(256.0).setMergeScheduler(new ConcurrentMergeScheduler()));
+    ((LogMergePolicy) w.getConfig().getMergePolicy()).setUseCompoundFile(false);
+    ((LogMergePolicy) w.getConfig().getMergePolicy()).setUseCompoundDocStore(false);
+    ((LogMergePolicy) w.getConfig().getMergePolicy()).setMergeFactor(10);
+
+    Document doc = new Document();
+    Field field = new Field("field", new MyTokenStream(TERMS_PER_DOC));
+    field.setOmitTermFreqAndPositions(true);
+    field.setOmitNorms(true);
+    doc.add(field);
+    //w.setInfoStream(System.out);
+    final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
+    for(int i=0;i<numDocs;i++) {
+      w.addDocument(doc);
+      System.out.println(i + " of " + numDocs);
+    }
+    System.out.println("now optimize...");
+    w.optimize();
+    w.close();
+
+    _TestUtil.checkIndex(dir);
+    dir.close();
+  }
+}

Propchange: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/Test2BTerms.java
------------------------------------------------------------------------------
    svn:eol-style = native
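
The same checks also run from CheckIndex's command-line entry point; a
minimal invocation (classpath and index path are illustrative):

    java -ea:org.apache.lucene... -cp build/classes/java \
        org.apache.lucene.index.CheckIndex /path/to/2BTerms/index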