Posted to commits@lucene.apache.org by ro...@apache.org on 2019/04/02 08:09:46 UTC

[lucene-solr] branch branch_8x updated: LUCENE-8730: WordDelimiterGraphFilter always emits its original token first

This is an automated email from the ASF dual-hosted git repository.

romseygeek pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 9591052  LUCENE-8730: WordDelimiterGraphFilter always emits its original token first
9591052 is described below

commit 9591052fede6dda95fc26113bb22ab79b5405a75
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Mon Apr 1 18:21:06 2019 +0100

    LUCENE-8730: WordDelimiterGraphFilter always emits its original token first
---
 lucene/CHANGES.txt                                    |  5 +++++
 .../miscellaneous/WordDelimiterGraphFilter.java       | 13 ++++++++++---
 .../miscellaneous/TestWordDelimiterGraphFilter.java   | 19 ++++++++++++++++++-
 3 files changed, 33 insertions(+), 4 deletions(-)
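
For context, a minimal sketch (not part of the commit) of what the new behaviour means for a consumer of the filter: with PRESERVE_ORIGINAL set, the unsplit original token is now always the first token returned from the stream, matching the test added below. The class name and setup here are illustrative only; the sketch uses the three-argument constructor, which falls back to the default word delimiter table.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WdgfOriginalFirstSketch {
      public static void main(String[] args) throws Exception {
        int flags = WordDelimiterGraphFilter.PRESERVE_ORIGINAL
                  | WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;

        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("abc-def"));

        // three-argument constructor: uses the default word delimiter table, no protected words
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, null);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
          // with this change the original "abc-def" is printed first,
          // followed by the generated parts "abc" and "def"
          System.out.println(term.toString());
        }
        ts.end();
        ts.close();
      }
    }
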

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5ba9c4f..0a99deb 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -75,6 +75,11 @@ Changes in Runtime Behavior
 * LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
   from an IndexWriter. (Simon Willnauer)
 
+* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first.  This
+  brings its behaviour into line with the deprecated WordDelimiterFilter, so that
+  the only difference in output between the two is in the position length
+  attribute.  (Alan Woodward, Jim Ferenczi)
+
 Other
 
 * LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 00ace5b..a04eaff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
     lastConcatCount = 0;
     wordPos = 0;
 
+    if (has(PRESERVE_ORIGINAL)) {
+      // add the original token now so that it is always emitted first
+      // we will edit the term length after all other parts have been buffered
+      buffer(0, 1, 0, savedTermLength);
+    }
+
     if (iterator.isSingleWord()) {
       buffer(wordPos, wordPos+1, iterator.current, iterator.end);
       wordPos++;
@@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
     }
 
     if (has(PRESERVE_ORIGINAL)) {
+      // we now know how many tokens need to be injected, so we can set the original
+      // token's position length
       if (wordPos == 0) {
         // can happen w/ strange flag combos and inputs :)
         wordPos++;
       }
-      // add the original token now so that we can set the correct end position
-      buffer(0, wordPos, 0, savedTermLength);
+      bufferedParts[1] = wordPos;
     }
             
-    sorter.sort(0, bufferedLen);
+    sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
     wordPos = 0;
 
     // set back to 0 for iterating from the buffer
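
To make the buffering change above easier to follow, here is a simplified standalone sketch of the pattern (a simplified model, not the filter's actual int-array buffer): the original token is buffered first with a placeholder position length of 1, the split parts are buffered after it, the placeholder end position is then patched to the final wordPos, and only the entries after the original are sorted, so the original is always emitted first.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    // Simplified model of the change: buffer the original token first with a
    // placeholder end position, patch it once all parts are known, and sort
    // only the entries after it so the original always comes out first.
    public class BufferOriginalFirstSketch {
      public static void main(String[] args) {
        boolean preserveOriginal = true;
        List<int[]> buffered = new ArrayList<>();   // each entry: {startPos, endPos}

        if (preserveOriginal) {
          // original token goes in first; endPos is a placeholder for now
          buffered.add(new int[] {0, 1});
        }

        // buffer the generated parts, e.g. "abc" and "def" for input "abc-def"
        int wordPos = 0;
        buffered.add(new int[] {wordPos, wordPos + 1}); wordPos++;
        buffered.add(new int[] {wordPos, wordPos + 1}); wordPos++;

        if (preserveOriginal) {
          // we now know how many positions were consumed, so fix the
          // original token's end position (its position length)
          buffered.get(0)[1] = wordPos;
        }

        // sort everything except the original, mirroring
        // sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen)
        int from = preserveOriginal ? 1 : 0;
        buffered.subList(from, buffered.size()).sort(Comparator.comparingInt(e -> e[0]));

        for (int[] entry : buffered) {
          System.out.println("startPos=" + entry[0] + ", endPos=" + entry[1]);
        }
      }
    }
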
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index e3f3f65..41109b8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     };
   }
 
+  public void testOriginalTokenEmittedFirst() throws Exception {
+    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+    /* analyzer that uses whitespace + wdf */
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStreamComponents createComponents(String field) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
+      }
+    };
+
+    assertAnalyzesTo(a, "abc-def abcDEF abc123",
+        new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" });
+    a.close();
+  }
+
   /** concat numbers + words + all */
   public void testLotsOfConcatenating() throws Exception {
     final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;    
@@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
     };
     
     assertAnalyzesTo(a, "abc-def-123-456", 
-                     new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" }, 
+                     new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
                      new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
                      new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
                      null,