You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2020/10/02 02:34:08 UTC

[lucene-solr] branch branch_8x updated: LUCENE-9458: WDGF should tie-break by endOffset (#1740)

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 587f730  LUCENE-9458: WDGF should tie-break by endOffset (#1740)
587f730 is described below

commit 587f7302b9de62f840d85ca325a660a9fbe1e3b0
Author: David Smiley <ds...@apache.org>
AuthorDate: Thu Oct 1 22:27:45 2020 -0400

    LUCENE-9458: WDGF should tie-break by endOffset (#1740)
    
    Can happen with catenateAll and not generating word xor number part when the input ends with the non-generated sub-token.
    Fuzzing revealed that only start & end offsets are needed to order sub-tokens.
    
    (cherry picked from commit 0303063e12049d9b470757c5ef2fb9afa6c5cd18)
---
 lucene/CHANGES.txt                                 |  3 ++
 .../miscellaneous/WordDelimiterGraphFilter.java    | 24 ++++------
 .../miscellaneous/WordDelimiterIterator.java       | 14 +++++-
 .../TestWordDelimiterGraphFilter.java              | 56 ++++++++++++++++++++--
 4 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f55d61a..36cf2d7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -73,6 +73,9 @@ Improvements
 * LUCENE-9539: Use more compact datastructures to represent sorted doc-values in memory when 
   sorting a segment before flush and in SortingCodecReader. (Simon Willnauer)
 
+* LUCENE-9458: WordDelimiterGraphFilter should order tokens at the same position by endOffset to
+  emit longer tokens first.  The same graph is produced. (David Smiley)
+
 Optimizations
 ---------------------
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 9d03c7e..70e1e1e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -447,26 +447,18 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
   private class PositionSorter extends InPlaceMergeSorter {
     @Override
     protected int compare(int i, int j) {
-      // smaller start position
-      int iPosStart = bufferedParts[4*i];
-      int jPosStart = bufferedParts[4*j];
-      int cmp = Integer.compare(iPosStart, jPosStart);
-      if (cmp != 0) {
-        return cmp;
-      }
-
-      // longest pos length:
-      int iPosEnd = bufferedParts[4*i+1];
-      int jPosEnd = bufferedParts[4*j+1];
-      cmp = Integer.compare(jPosEnd, iPosEnd);
+      // smaller start offset
+      int iOff = bufferedParts[4 * i + 2];
+      int jOff = bufferedParts[4 * j + 2];
+      int cmp = Integer.compare(iOff, jOff);
       if (cmp != 0) {
         return cmp;
       }
 
-      // smaller start offset
-      int iOff = bufferedParts[4*i + 2];
-      int jOff = bufferedParts[4*j + 2];
-      return Integer.compare(iOff, jOff);
+      // longer end offset
+      iOff = bufferedParts[4 * i + 3];
+      jOff = bufferedParts[4 * j + 3];
+      return Integer.compare(jOff, iOff);
     }
 
     @Override
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
index f3541ac..b7435d6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
@@ -16,6 +16,8 @@
  */
 package org.apache.lucene.analysis.miscellaneous;
 
+import java.util.Locale;
+
 /**
  * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
  * @lucene.internal
@@ -113,7 +115,17 @@ public final class WordDelimiterIterator {
     this.splitOnNumerics = splitOnNumerics;
     this.stemEnglishPossessive = stemEnglishPossessive;
   }
-  
+
+  @Override
+  public String toString() {
+    if (end == DONE) {
+      return "DONE";
+    }
+    return new String(text, current, end - current)
+        + " [" + current + "-" + end + "]"
+        + " type=" + String.format(Locale.ROOT, "%#02x", type());
+  }
+
   /**
    * Advance to the next subword in the string.
    *
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 648c366..5f26a16 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -401,6 +401,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
   public void testCatenateAllEmittedBeforeParts() throws Exception {
     // no number parts
     final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL;
+    final boolean useCharFilter = true;
+    final boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
 
     //not using getAnalyzer because we want adjustInternalOffsets=true
     Analyzer a = new Analyzer() {
@@ -414,17 +416,61 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     // input starts with a number, but we don't generate numbers.
     //   Nonetheless preserve-original and concatenate-all show up first.
     assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
-        new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7});
-
-    boolean useCharFilter = true;
-    boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
+        new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7}, new int[]{1, 0, 0});
     checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect);
-
     verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
 
+    // input ends with a number, but we don't generate numbers
+    assertTokenStreamContents(a.tokenStream("dummy", "other-9"),
+        new String[] { "other-9", "other9", "other" }, new int[]{0, 0, 0}, new int[]{7, 7, 5}, new int[]{1, 0, 0});
+    checkAnalysisConsistency(random(), a, useCharFilter, "other-9", graphOffsetsAreCorrect);
+    verify("9-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+
     a.close();
   }
 
+  /*
+  static char[] fuzzDict = {'-', 'H', 'w', '4'};
+  public void testFuzz() throws IOException {
+    //System.out.println(getGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | CATENATE_WORDS), "H-H")); // orig:[H H, HH H] orig; fixed posInc:"[HH H H]"
+    //System.out.println(getGraphStrings(getAnalyzer(CATENATE_WORDS | CATENATE_ALL), "H-4")); // fixPos:[H H4] final:"[H4 H]"
+
+    StringBuilder input = new StringBuilder("000000"); // fill with arbitrary chars; not too long or too short
+
+    for (int flags = 0; flags < IGNORE_KEYWORDS; flags++) { // all interesting bit flags precede IGNORE_KEYWORDS
+      System.out.println("Flags: " + flags + " " + WordDelimiterGraphFilter.flagsToString(flags));
+      final Analyzer analyzer = getAnalyzer(flags);
+      fuzzLoop(input, 0, analyzer);
+    }
+  }
+
+  public void fuzzLoop(StringBuilder input, int inputPrefixLenFuzzed, Analyzer analyzer) throws IOException {
+    if (inputPrefixLenFuzzed < input.length()) {
+      for (char c : fuzzDict) {
+        input.setCharAt(inputPrefixLenFuzzed, c);
+        fuzzLoop(input, inputPrefixLenFuzzed + 1, analyzer); // recursive
+      }
+      return;
+    }
+
+    fuzzDoCheck(input.toString(), analyzer);
+  }
+
+  private void fuzzDoCheck(String input, Analyzer analyzer) throws IOException {
+    try (TokenStream ts1 = analyzer.tokenStream("fieldName", input)) {
+      ts1.reset();
+      while (ts1.incrementToken()) { // modified WDF sorter compare() contains assertion check
+        //do-nothing
+      }
+      ts1.end();
+    } catch (AssertionError e) {
+      System.out.println("failed input: " + input);
+      throw e;
+    }
+  }
+*/
+
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;