You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2020/10/02 02:34:08 UTC
[lucene-solr] branch branch_8x updated: LUCENE-9458: WDGF should
tie-break by endOffset (#1740)
This is an automated email from the ASF dual-hosted git repository.
dsmiley pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 587f730 LUCENE-9458: WDGF should tie-break by endOffset (#1740)
587f730 is described below
commit 587f7302b9de62f840d85ca325a660a9fbe1e3b0
Author: David Smiley <ds...@apache.org>
AuthorDate: Thu Oct 1 22:27:45 2020 -0400
LUCENE-9458: WDGF should tie-break by endOffset (#1740)
This can happen with catenateAll when generating word parts xor number parts (one but not the other) and the input ends with the non-generated sub-token.
Fuzzing revealed that only start & end offsets are needed to order sub-tokens.
(cherry picked from commit 0303063e12049d9b470757c5ef2fb9afa6c5cd18)
---
lucene/CHANGES.txt | 3 ++
.../miscellaneous/WordDelimiterGraphFilter.java | 24 ++++------
.../miscellaneous/WordDelimiterIterator.java | 14 +++++-
.../TestWordDelimiterGraphFilter.java | 56 ++++++++++++++++++++--
4 files changed, 75 insertions(+), 22 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f55d61a..36cf2d7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -73,6 +73,9 @@ Improvements
* LUCENE-9539: Use more compact datastructures to represent sorted doc-values in memory when
sorting a segment before flush and in SortingCodecReader. (Simon Willnauer)
+* LUCENE-9458: WordDelimiterGraphFilter should order tokens at the same position by endOffset to
+ emit longer tokens first. The same graph is produced. (David Smiley)
+
Optimizations
---------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 9d03c7e..70e1e1e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -447,26 +447,18 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private class PositionSorter extends InPlaceMergeSorter {
@Override
protected int compare(int i, int j) {
- // smaller start position
- int iPosStart = bufferedParts[4*i];
- int jPosStart = bufferedParts[4*j];
- int cmp = Integer.compare(iPosStart, jPosStart);
- if (cmp != 0) {
- return cmp;
- }
-
- // longest pos length:
- int iPosEnd = bufferedParts[4*i+1];
- int jPosEnd = bufferedParts[4*j+1];
- cmp = Integer.compare(jPosEnd, iPosEnd);
+ // smaller start offset
+ int iOff = bufferedParts[4 * i + 2];
+ int jOff = bufferedParts[4 * j + 2];
+ int cmp = Integer.compare(iOff, jOff);
if (cmp != 0) {
return cmp;
}
- // smaller start offset
- int iOff = bufferedParts[4*i + 2];
- int jOff = bufferedParts[4*j + 2];
- return Integer.compare(iOff, jOff);
+ // longer end offset
+ iOff = bufferedParts[4 * i + 3];
+ jOff = bufferedParts[4 * j + 3];
+ return Integer.compare(jOff, iOff);
}
@Override
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
index f3541ac..b7435d6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
@@ -16,6 +16,8 @@
*/
package org.apache.lucene.analysis.miscellaneous;
+import java.util.Locale;
+
/**
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
* @lucene.internal
@@ -113,7 +115,17 @@ public final class WordDelimiterIterator {
this.splitOnNumerics = splitOnNumerics;
this.stemEnglishPossessive = stemEnglishPossessive;
}
-
+
+ @Override
+ public String toString() {
+ if (end == DONE) {
+ return "DONE";
+ }
+ return new String(text, current, end - current)
+ + " [" + current + "-" + end + "]"
+ + " type=" + String.format(Locale.ROOT, "%#02x", type());
+ }
+
/**
* Advance to the next subword in the string.
*
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 648c366..5f26a16 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -401,6 +401,8 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
public void testCatenateAllEmittedBeforeParts() throws Exception {
// no number parts
final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | CATENATE_ALL;
+ final boolean useCharFilter = true;
+ final boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
//not using getAnalyzer because we want adjustInternalOffsets=true
Analyzer a = new Analyzer() {
@@ -414,17 +416,61 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
// input starts with a number, but we don't generate numbers.
// Nonetheless preserve-original and concatenate-all show up first.
assertTokenStreamContents(a.tokenStream("dummy", "8-other"),
- new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7});
-
- boolean useCharFilter = true;
- boolean graphOffsetsAreCorrect = false; // note: could solve via always incrementing wordPos on first word ('8')
+ new String[] { "8-other", "8other", "other" }, new int[]{0, 0, 2}, new int[]{7, 7, 7}, new int[]{1, 0, 0});
checkAnalysisConsistency(random(), a, useCharFilter, "8-other", graphOffsetsAreCorrect);
-
verify("8-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+ // input ends with a number, but we don't generate numbers
+ assertTokenStreamContents(a.tokenStream("dummy", "other-9"),
+ new String[] { "other-9", "other9", "other" }, new int[]{0, 0, 0}, new int[]{7, 7, 5}, new int[]{1, 0, 0});
+ checkAnalysisConsistency(random(), a, useCharFilter, "other-9", graphOffsetsAreCorrect);
+ verify("9-other", flags); // uses getAnalyzer which uses adjustInternalOffsets=false which works
+
a.close();
}
+ /*
+ static char[] fuzzDict = {'-', 'H', 'w', '4'};
+ public void testFuzz() throws IOException {
+ //System.out.println(getGraphStrings(getAnalyzer(GENERATE_WORD_PARTS | CATENATE_WORDS), "H-H")); // orig:[H H, HH H] orig; fixed posInc:"[HH H H]"
+ //System.out.println(getGraphStrings(getAnalyzer(CATENATE_WORDS | CATENATE_ALL), "H-4")); // fixPos:[H H4] final:"[H4 H]"
+
+ StringBuilder input = new StringBuilder("000000"); // fill with arbitrary chars; not too long or too short
+
+ for (int flags = 0; flags < IGNORE_KEYWORDS; flags++) { // all interesting bit flags precede IGNORE_KEYWORDS
+ System.out.println("Flags: " + flags + " " + WordDelimiterGraphFilter.flagsToString(flags));
+ final Analyzer analyzer = getAnalyzer(flags);
+ fuzzLoop(input, 0, analyzer);
+ }
+ }
+
+ public void fuzzLoop(StringBuilder input, int inputPrefixLenFuzzed, Analyzer analyzer) throws IOException {
+ if (inputPrefixLenFuzzed < input.length()) {
+ for (char c : fuzzDict) {
+ input.setCharAt(inputPrefixLenFuzzed, c);
+ fuzzLoop(input, inputPrefixLenFuzzed + 1, analyzer); // recursive
+ }
+ return;
+ }
+
+ fuzzDoCheck(input.toString(), analyzer);
+ }
+
+ private void fuzzDoCheck(String input, Analyzer analyzer) throws IOException {
+ try (TokenStream ts1 = analyzer.tokenStream("fieldName", input)) {
+ ts1.reset();
+ while (ts1.incrementToken()) { // modified WDF sorter compare() contains assertion check
+ //do-nothing
+ }
+ ts1.end();
+ } catch (AssertionError e) {
+ System.out.println("failed input: " + input);
+ throw e;
+ }
+ }
+*/
+
+
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;