Posted to commits@lucene.apache.org by sh...@apache.org on 2016/09/12 12:02:43 UTC

[1/2] lucene-solr:branch_6_2: LUCENE-7417: Highlighter WSTE didn't handle single-term MultiPhraseQuery. Also updated to Java 5 for-each in this method.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6_2 8027eb980 -> c7b3e9ae3


LUCENE-7417: Highlighter WSTE (WeightedSpanTermExtractor) didn't handle a
single-term MultiPhraseQuery. Also updated this method to use Java 5 for-each loops.

(cherry picked from commit 3966f99)

(cherry picked from commit 514bb1b)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/cddeb9dc
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/cddeb9dc
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/cddeb9dc

Branch: refs/heads/branch_6_2
Commit: cddeb9dc3c8322b4149b910f509a93be37f5c17b
Parents: 8027eb9
Author: David Smiley <ds...@apache.org>
Authored: Fri Sep 9 19:36:39 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Sep 12 17:31:19 2016 +0530

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |  6 ++++
 .../highlight/WeightedSpanTermExtractor.java    | 26 +++++++++--------
 .../search/highlight/HighlighterTest.java       | 30 +++++++++++++-------
 3 files changed, 39 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cddeb9dc/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fdc6d03..e9ae13d 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -10,6 +10,12 @@ API Changes
 * LUCENE-7436: MinHashFilter's constructor, and some of its default
   settings, should be public.  (Doug Turnbull via Mike McCandless)
 
+Bug Fixes
+
+* LUCENE-7417: The standard Highlighter could throw an IllegalArgumentException when
+  trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
+  term.  (Thomas Kappler via David Smiley)
+
 ======================= Lucene 6.2.0 =======================
 
 API Changes

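For context, the degenerate query described in the entry above can be constructed as follows. This is a minimal sketch distilled from the new HighlighterTest case later in this mail, not part of the patch itself; the field and term values are the test's own.

  // A MultiPhraseQuery holding exactly one term at one position -- the
  // degenerate case that used to trip the standard Highlighter.
  MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
      .add(new Term("substring", "uchu"))
      .build();

Before this fix, feeding such a query to the Highlighter through QueryScorer could end in the IllegalArgumentException mentioned above.
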
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cddeb9dc/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
index 7507bdd..644dad3 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
@@ -118,8 +118,7 @@ public class WeightedSpanTermExtractor {
       Term[] phraseQueryTerms = phraseQuery.getTerms();
       if (phraseQueryTerms.length == 1) {
         extractWeightedSpanTerms(terms, new SpanTermQuery(phraseQueryTerms[0]), boost);
-      }
-      else {
+      } else {
         SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length];
         for (int i = 0; i < phraseQueryTerms.length; i++) {
           clauses[i] = new SpanTermQuery(phraseQueryTerms[i]);
@@ -153,8 +152,8 @@ public class WeightedSpanTermExtractor {
       // this query is TermContext sensitive.
       extractWeightedTerms(terms, query, boost);
     } else if (query instanceof DisjunctionMaxQuery) {
-      for (Iterator<Query> iterator = ((DisjunctionMaxQuery) query).iterator(); iterator.hasNext();) {
-        extract(iterator.next(), boost, terms);
+      for (Query clause : ((DisjunctionMaxQuery) query)) {
+        extract(clause, boost, terms);
       }
     } else if (query instanceof ToParentBlockJoinQuery) {
       extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms);
@@ -184,16 +183,15 @@ public class WeightedSpanTermExtractor {
             disjuncts = (disjunctLists[positions[i]] = new ArrayList<>(termArray.length));
             ++distinctPositions;
           }
-          for (int j = 0; j < termArray.length; ++j) {
-            disjuncts.add(new SpanTermQuery(termArray[j]));
+          for (Term aTermArray : termArray) {
+            disjuncts.add(new SpanTermQuery(aTermArray));
           }
         }
 
         int positionGaps = 0;
         int position = 0;
         final SpanQuery[] clauses = new SpanQuery[distinctPositions];
-        for (int i = 0; i < disjunctLists.length; ++i) {
-          List<SpanQuery> disjuncts = disjunctLists[i];
+        for (List<SpanQuery> disjuncts : disjunctLists) {
           if (disjuncts != null) {
             clauses[position++] = new SpanOrQuery(disjuncts
                 .toArray(new SpanQuery[disjuncts.size()]));
@@ -202,11 +200,15 @@ public class WeightedSpanTermExtractor {
           }
         }
 
-        final int slop = mpq.getSlop();
-        final boolean inorder = (slop == 0);
+        if (clauses.length == 1) {
+          extractWeightedSpanTerms(terms, clauses[0], boost);
+        } else {
+          final int slop = mpq.getSlop();
+          final boolean inorder = (slop == 0);
 
-        SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
-        extractWeightedSpanTerms(terms, sp, boost);
+          SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder);
+          extractWeightedSpanTerms(terms, sp, boost);
+        }
       }
     } else if (query instanceof MatchAllDocsQuery) {
       //nothing

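The fix mirrors the existing single-term PhraseQuery branch at the top of this method: when the MultiPhraseQuery boils down to a single SpanQuery clause, that clause is extracted directly instead of being wrapped in a SpanNearQuery. A condensed sketch of the end-to-end path, assuming the n-gram analyzer, field name, and inputs from the new HighlighterTest below (imports as in that test):

  // Highlight a single-term MultiPhraseQuery over an n-gram analyzed field.
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new NGramTokenizer(4, 4));
    }
  };

  MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
      .add(new Term("substring", "uchu")).build();

  Highlighter highlighter = new Highlighter(
      new SimpleHTMLFormatter("<b>", "</b>"),
      new QueryScorer(mpq, "substring"));
  highlighter.setTextFragmenter(new SimpleFragmenter(100));

  // With the fix applied this returns "B<b>uchu</b>ng" rather than throwing.
  String fragment = highlighter.getBestFragment(analyzer, "substring", "Buchung");
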
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/cddeb9dc/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
index cf727d7..fc402ba 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@@ -94,7 +94,6 @@ import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.automaton.Automata;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
-import org.junit.Test;
 import org.w3c.dom.Element;
 import org.w3c.dom.NodeList;
 
@@ -1580,30 +1579,39 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
     helper.start();
   }
 
-  @Test
   public void testHighlighterWithPhraseQuery() throws IOException, InvalidTokenOffsetsException {
+    final String fieldName = "substring";
+
+    final PhraseQuery query = new PhraseQuery(fieldName, new BytesRef[] { new BytesRef("uchu") });
+
+    assertHighlighting(query, new SimpleHTMLFormatter("<b>", "</b>"), "Buchung", "B<b>uchu</b>ng", fieldName);
+  }
+
+  public void testHighlighterWithMultiPhraseQuery() throws IOException, InvalidTokenOffsetsException {
+    final String fieldName = "substring";
+
+    final MultiPhraseQuery mpq = new MultiPhraseQuery.Builder()
+        .add(new Term(fieldName, "uchu")).build();
 
+    assertHighlighting(mpq, new SimpleHTMLFormatter("<b>", "</b>"), "Buchung", "B<b>uchu</b>ng", fieldName);
+  }
+
+  private void assertHighlighting(Query query, Formatter formatter, String text, String expected, String fieldName)
+      throws IOException, InvalidTokenOffsetsException {
     final Analyzer analyzer = new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         return new TokenStreamComponents(new NGramTokenizer(4, 4));
       }
     };
-    final String fieldName = "substring";
-
-    final List<BytesRef> list = new ArrayList<>();
-    list.add(new BytesRef("uchu"));
-    final PhraseQuery query = new PhraseQuery(fieldName, list.toArray(new BytesRef[list.size()]));
 
     final QueryScorer fragmentScorer = new QueryScorer(query, fieldName);
-    final SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<b>", "</b>");
 
     final Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
     highlighter.setTextFragmenter(new SimpleFragmenter(100));
-    final String fragment = highlighter.getBestFragment(analyzer, fieldName, "Buchung");
-
-    assertEquals("B<b>uchu</b>ng",fragment);
+    final String fragment = highlighter.getBestFragment(analyzer, fieldName, text);
 
+    assertEquals(expected, fragment);
   }
 
   public void testUnRewrittenQuery() throws Exception {

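Note that the @Test annotation (and its import) could be dropped because LuceneTestCase's runner also picks up public void test* methods by naming convention; the two scenarios now share the assertHighlighting helper, so the PhraseQuery and MultiPhraseQuery cases exercise identical analyzer, scorer, and fragmenter plumbing.
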

[2/2] lucene-solr:branch_6_2: LUCENE-7440: fix MultiLevelSkipListReader overflow

Posted by sh...@apache.org.
LUCENE-7440: fix MultiLevelSkipListReader overflow

(cherry picked from commit cf72eeb)


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/c7b3e9ae
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/c7b3e9ae
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/c7b3e9ae

Branch: refs/heads/branch_6_2
Commit: c7b3e9ae3695a13dacb81312db0d470ada273808
Parents: cddeb9d
Author: yonik <yo...@apache.org>
Authored: Sun Sep 11 01:28:24 2016 +0530
Committer: Shalin Shekhar Mangar <sh...@apache.org>
Committed: Mon Sep 12 17:32:22 2016 +0530

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   4 +
 .../lucene/codecs/MultiLevelSkipListReader.java |   9 +-
 .../org/apache/lucene/index/Test2BDocs.java     | 135 +++++++++++++++++++
 3 files changed, 145 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7b3e9ae/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index e9ae13d..f62e470 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -16,6 +16,10 @@ Bug Fixes
   trying to highlight a query containing a degenerate case of a MultiPhraseQuery with one
   term.  (Thomas Kappler via David Smiley)
 
+* LUCENE-7440: Document id skipping (PostingsEnum.advance) could throw an
+  ArrayIndexOutOfBoundsException exception on large index segments (>1.8B docs)
+  with large skips. (yonik)
+
 ======================= Lucene 6.2.0 =======================
 
 API Changes

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7b3e9ae/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
index 72ffe9f..c937886 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/MultiLevelSkipListReader.java
@@ -63,7 +63,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
   /**  skipInterval of each level. */
   private int skipInterval[];
 
-  /** Number of docs skipped per level. */
+  /** Number of docs skipped per level.
+   * It's possible for some values to overflow a signed int, but this has been accounted for.
+   */
   private int[] numSkipped;
 
   /** Doc id of current skip entry per level. */
@@ -150,8 +152,9 @@ public abstract class MultiLevelSkipListReader implements Closeable {
     setLastSkipData(level);
       
     numSkipped[level] += skipInterval[level];
-      
-    if (numSkipped[level] > docCount) {
+
+    // numSkipped may overflow a signed int, so compare as unsigned.
+    if (Integer.compareUnsigned(numSkipped[level], docCount) > 0) {
       // this skip list is exhausted
       skipDoc[level] = Integer.MAX_VALUE;
       if (numberOfSkipLevels > level) numberOfSkipLevels = level; 

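To see why the unsigned comparison matters: numSkipped accumulates skipInterval per level, and on a segment approaching Integer.MAX_VALUE documents the running total can wrap negative, so the old signed check never reports the skip list as exhausted. A standalone sketch with illustrative values (not from the patch):

  // Illustrative only: how int overflow defeats the old signed check.
  public class UnsignedCompareDemo {
    public static void main(String[] args) {
      int docCount   = 2_000_000_000;                // ~2B docs in one segment
      int numSkipped = 2_000_000_000 + 300_000_000;  // 2.3B wraps to -1_994_967_296

      // Old check: prints false, because the wrapped value is negative.
      System.out.println(numSkipped > docCount);

      // New check: prints true, because the bits are read as 2,300,000,000.
      System.out.println(Integer.compareUnsigned(numSkipped, docCount) > 0);
    }
  }
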
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/c7b3e9ae/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java b/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java
new file mode 100644
index 0000000..4fab45a
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/Test2BDocs.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.store.BaseDirectoryWrapper;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.LuceneTestCase.Monster;
+import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
+import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
+import org.apache.lucene.util.TestUtil;
+import org.apache.lucene.util.TimeUnits;
+
+@SuppressCodecs({"SimpleText", "Memory", "Direct"})
+@TimeoutSuite(millis = 80 * TimeUnits.HOUR) // effectively no limit
+@Monster("Takes ~30min")
+@SuppressSysoutChecks(bugUrl = "Stuff gets printed")
+public class Test2BDocs extends LuceneTestCase {
+  
+  // indexes Integer.MAX_VALUE docs with indexed field(s)
+  public void test2BDocs() throws Exception {
+    BaseDirectoryWrapper dir = newFSDirectory(createTempDir("2BDocs"));
+    if (dir instanceof MockDirectoryWrapper) {
+      ((MockDirectoryWrapper)dir).setThrottling(MockDirectoryWrapper.Throttling.NEVER);
+    }
+    
+    IndexWriter w = new IndexWriter(dir,
+        new IndexWriterConfig(new MockAnalyzer(random()))
+        .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+        .setRAMBufferSizeMB(256.0)
+        .setMergeScheduler(new ConcurrentMergeScheduler())
+        .setMergePolicy(newLogMergePolicy(false, 10))
+        .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
+        .setCodec(TestUtil.getDefaultCodec()));
+
+    Document doc = new Document();
+    Field field = new Field("f1", "a", StringField.TYPE_NOT_STORED);
+    doc.add(field);
+    
+    for (int i = 0; i < IndexWriter.MAX_DOCS; i++) {
+      w.addDocument(doc);
+      if (i % (10*1000*1000) == 0) {
+        System.out.println("indexed: " + i);
+        System.out.flush();
+      }
+    }
+    
+    w.forceMerge(1);
+    w.close();
+    
+    System.out.println("verifying...");
+    System.out.flush();
+    
+    DirectoryReader r = DirectoryReader.open(dir);
+
+    BytesRef term = new BytesRef(1);
+    term.bytes[0] = (byte)'a';
+    term.length = 1;
+
+    long skips = 0;
+
+    Random rnd = random();
+
+    long start = System.nanoTime();
+
+    for (LeafReaderContext context : r.leaves()) {
+      LeafReader reader = context.reader();
+      int lim = context.reader().maxDoc();
+
+      Terms terms = reader.fields().terms("f1");
+      for (int i=0; i<10000; i++) {
+        TermsEnum te = terms.iterator();
+        assertTrue( te.seekExact(term) );
+        PostingsEnum docs = te.postings(null);
+
+        // skip randomly through the term
+        for (int target = -1;;)
+        {
+          int maxSkipSize = lim - target + 1;
+          // do a smaller skip half of the time
+          if (rnd.nextBoolean()) {
+            maxSkipSize = Math.min(256, maxSkipSize);
+          }
+          int newTarget = target + rnd.nextInt(maxSkipSize) + 1;
+          if (newTarget >= lim) {
+            if (target+1 >= lim) break; // we already skipped to end, so break.
+            newTarget = lim-1;  // skip to end
+          }
+          target = newTarget;
+
+          int res = docs.advance(target);
+          if (res == PostingsEnum.NO_MORE_DOCS) break;
+
+          assertTrue( res >= target );
+
+          skips++;
+          target = res;
+        }
+      }
+    }
+    
+    r.close();
+    dir.close();
+
+    long end = System.nanoTime();
+
+    System.out.println("Skip count=" + skips + " seconds=" + TimeUnit.NANOSECONDS.toSeconds(end-start));
+    assert skips > 0;
+  }
+  
+}
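
Note that Test2BDocs is gated behind @Monster, so it does not run in a default test invocation; at the time of this commit such tests were typically enabled explicitly (e.g. with -Dtests.monster=true in the ant build). Per the annotations it takes on the order of 30 minutes and suppresses the usual sysout checks.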