You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ro...@apache.org on 2019/04/02 08:09:47 UTC
[lucene-solr] branch master updated: LUCENE-8730:
WordDelimiterGraphFilter always emits its original token first
This is an automated email from the ASF dual-hosted git repository.
romseygeek pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/master by this push:
new 3de0b36 LUCENE-8730: WordDelimiterGraphFilter always emits its original token first
3de0b36 is described below
commit 3de0b3671998cc9bc723d10f1b31ce48cbd4fa64
Author: Alan Woodward <ro...@apache.org>
AuthorDate: Mon Apr 1 18:21:06 2019 +0100
LUCENE-8730: WordDelimiterGraphFilter always emits its original token first
---
lucene/CHANGES.txt | 5 +++++
.../miscellaneous/WordDelimiterGraphFilter.java | 13 ++++++++++---
.../miscellaneous/TestWordDelimiterGraphFilter.java | 19 ++++++++++++++++++-
3 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index b54fa3f..c9be635 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -90,6 +90,11 @@ Changes in Runtime Behavior
* LUCENE-8671: Load FST off-heap also for ID-like fields if reader is not opened
from an IndexWriter. (Simon Willnauer)
+* LUCENE-8730: WordDelimiterGraphFilter always emits its original token first. This
+ brings its behaviour into line with the deprecated WordDelimiterFilter, so that
+ the only difference in output between the two is in the position length
+ attribute. (Alan Woodward, Jim Ferenczi)
+
Other
* LUCENE-8680: Refactor EdgeTree#relateTriangle method. (Ignacio Vera)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 00ace5b..a04eaff 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -268,6 +268,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
lastConcatCount = 0;
wordPos = 0;
+ if (has(PRESERVE_ORIGINAL)) {
+ // add the original token now so that it is always emitted first
+ // we will edit its end position (position length) after all other parts have been buffered
+ buffer(0, 1, 0, savedTermLength);
+ }
+
if (iterator.isSingleWord()) {
buffer(wordPos, wordPos+1, iterator.current, iterator.end);
wordPos++;
@@ -320,15 +326,16 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
}
if (has(PRESERVE_ORIGINAL)) {
+ // we now know how many tokens need to be injected, so we can set the original
+ // token's position length
if (wordPos == 0) {
// can happen w/ strange flag combos and inputs :)
wordPos++;
}
- // add the original token now so that we can set the correct end position
- buffer(0, wordPos, 0, savedTermLength);
+ bufferedParts[1] = wordPos;
}
- sorter.sort(0, bufferedLen);
+ sorter.sort(has(PRESERVE_ORIGINAL) ? 1 : 0, bufferedLen);
wordPos = 0;
// set back to 0 for iterating from the buffer
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index e3f3f65..41109b8 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -380,6 +380,23 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
}
+ public void testOriginalTokenEmittedFirst() throws Exception {
+ final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+
+ /* analyzer that uses a whitespace tokenizer + WordDelimiterGraphFilter */
+ Analyzer a = new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, true, DEFAULT_WORD_DELIM_TABLE, flags, null));
+ }
+ };
+
+ assertAnalyzesTo(a, "abc-def abcDEF abc123",
+ new String[] { "abc-def", "abcdef", "abc", "def", "abcDEF", "abcDEF", "abc", "DEF", "abc123", "abc123", "abc", "123" });
+ a.close();
+ }
+
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -418,7 +435,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
};
assertAnalyzesTo(a, "abc-def-123-456",
- new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
+ new String[] { "abc-def-123-456", "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
new int[] { 0, 0, 0, 0, 0, 0, 0, 0 },
new int[] { 15, 15, 15, 15, 15, 15, 15, 15 },
null,