You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by GitBox <gi...@apache.org> on 2019/02/19 18:19:56 UTC

[GitHub] msokolov commented on a change in pull request #579: PET: prorated early termination

msokolov commented on a change in pull request #579: PET: prorated early termination
URL: https://github.com/apache/lucene-solr/pull/579#discussion_r258167454
 
 

 ##########
 File path: lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollectorEarlyTermination.java
 ##########
 @@ -234,13 +242,189 @@ public void testCanEarlyTerminateOnPrefix() {
         new Sort(new SortField("c", SortField.Type.LONG), new SortField("b", SortField.Type.STRING))));
   }
 
+  private Document numberedDocument(int num, int iterm) {
+    final Document doc = new Document();
+    doc.add(new NumericDocValuesField("ndv1", num));
+    doc.add(new StringField("s", terms.get(iterm % terms.size()), Store.YES));
+    return doc;
+  }
+
+  private void createRandomTerms() {
+    final int numTerms = TestUtil.nextInt(random(), 1, numDocs / 5);
+    Set<String> randomTerms = new HashSet<>();
+    while (randomTerms.size() < numTerms) {
+      randomTerms.add(TestUtil.randomSimpleString(random()));
+    }
+    terms = new ArrayList<>(randomTerms);
+  }
+
+  private void createUniformIndexX() throws IOException {
+    dir = newDirectory();
+    numDocs = atLeast(150);
+    int numSegs = atLeast(5);
+    // Create segments of random pre-determined sizes so we can distribute the documents uniformly
+    // among them
+    int docsRemaining = numDocs;
+    List<Integer> segmentSizes = new ArrayList<>();
+    for (int i = 0; i < numSegs - 1; i++) {
+      int size = random().nextInt(docsRemaining - numSegs + i);
+      segmentSizes.add(size);
+      docsRemaining -= size;
+    }
+    segmentSizes.add(docsRemaining);
+    List<List<Document>> segDocs = new ArrayList<>();
+    for (int i = 0; i < numSegs; i++) {
+      segDocs.add(new ArrayList<>());
+    }
+    createRandomTerms();
+    List<List<Document>> segDocsToFill = new ArrayList<>(segDocs);
+    for (int seg = 0, i = 0, j = 0; i < numDocs; ++i) {
+      // Create documents with the sort key and terms uniformly distributed among segments
+      seg %= segDocsToFill.size();
+      if (seg == 0) {
+        // this causes equal numbers of docs with "score" j to be added to each segment that has at least j documents
+        // TODO: sometimes do not increment j (so we get a more random setup), but we must increment it when we complete a segment
+        ++j;
+      }
+      List<Document> docs = segDocsToFill.get(seg);
+      docs.add(numberedDocument(j, j));
+      if (docs.size() == segmentSizes.get(seg)) {
+        segmentSizes.remove(seg);
+        segDocsToFill.remove(seg);
+      } else {
+        ++seg;
+      }
+    }
+    final long seed = random().nextLong();
+    final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(new Random(seed)));
+    // one segment per commit so we can control the segment sizes
+    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
+    iwc.setIndexSort(sort);
+    iw = new RandomIndexWriter(new Random(seed), dir, iwc);
+    for (int seg = 0; seg < segDocs.size(); seg++) {
+      for (Document doc : segDocs.get(seg)) {
+        iw.addDocument(doc);
+      }
+      iw.commit();
+    }
+    reader = iw.getReader();
+  }
+
+  private void createUniformIndex() throws IOException {
+    dir = newDirectory();
+    numDocs = atLeast(150);
+    int numSegs = atLeast(5);
+    // Create segments of random pre-determined sizes so we can distribute the documents uniformly
+    // among them
+    int docsRemaining = numDocs;
+    List<Integer> segmentSizes = new ArrayList<>();
+    for (int i = 0; i < numSegs - 1; i++) {
+      int size = random().nextInt(docsRemaining - numSegs + i);
+      segmentSizes.add(size);
+      docsRemaining -= size;
+    }
+    segmentSizes.add(docsRemaining);
+    createRandomTerms();
+    final long seed = random().nextLong();
+    final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(new Random(seed)));
+    // one segment per commit so we can control the segment sizes
+    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
+    iwc.setMaxBufferedDocs(numDocs);
+    iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);
+    iwc.setIndexSort(sort);
+    writer = new IndexWriter(dir, iwc);
+    for (int seg = 0; seg < numSegs; seg++) {
+      int size = segmentSizes.get(seg);
+      double step = numDocs / (double) size;
+      for (int i = 0; i < size; i++) {
+        int num = (int) Math.round(i * step);
+        Document doc = numberedDocument(num, num);
+        writer.addDocument(doc);
+      }
+      writer.commit();
+    }
+    reader = DirectoryReader.open(writer);
+  }  
+
+  private void createSkewedIndex() throws IOException {
+    dir = newDirectory();
+    numDocs = atLeast(150);
+    createRandomTerms();
+    final long seed = random().nextLong();
+    final IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(new Random(seed)));
+    // one segment per commit so we can control the segment sizes
+    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
+    iwc.setIndexSort(sort);
+    writer = new IndexWriter(dir, iwc);
+    for (int i = 0; i < numDocs; ++i) {
+      // insert the documents in order, so successive segments have increasingly larger documents
+      writer.addDocument(numberedDocument(i, i));
+      if (random().nextInt(numDocs / 10) == 0) {
+        // Make about 10 random-sized segments
+        writer.commit();
+      }
+    }
+    reader = DirectoryReader.open(writer);
+  }
+
+  public void testProratedEarlyTermination() throws IOException {
 
 Review comment:
  Hmm, not explicitly. It's the default now, so it is tested implicitly in many cases, but an explicit test would be good to demonstrate that everything still works OK even when the index distribution is skewed. I'll add that.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org