You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2018/04/28 13:58:07 UTC
[1/2] lucene-solr:master: LUCENE-8265: WordDelimiter*Filter ignores
keywords
Repository: lucene-solr
Updated Branches:
refs/heads/master 4fba55c86 -> 70abbe743
LUCENE-8265: WordDelimiter*Filter ignores keywords
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/fc0878cc
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/fc0878cc
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/fc0878cc
Branch: refs/heads/master
Commit: fc0878cc2f97fdaa5206796ca5e0efa4988e7609
Parents: 4fba55c
Author: Michael Sokolov <so...@amazon.com>
Authored: Sun Apr 22 20:41:08 2018 +0000
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Apr 28 09:47:06 2018 -0400
----------------------------------------------------------------------
.../miscellaneous/WordDelimiterFilter.java | 13 +++++-
.../miscellaneous/WordDelimiterGraphFilter.java | 18 ++++++--
.../miscellaneous/TestWordDelimiterFilter.java | 43 +++++++++++++++-----
.../TestWordDelimiterGraphFilter.java | 32 +++++++++++++++
4 files changed, 90 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fc0878cc/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
index 313386b..16edb3d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
@@ -164,7 +165,12 @@ public final class WordDelimiterFilter extends TokenFilter {
* "O'Neil's" => "O", "Neil"
*/
public static final int STEM_ENGLISH_POSSESSIVE = 256;
-
+
+ /**
+ * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true.
+ */
+ public static final int IGNORE_KEYWORDS = 512;
+
/**
* If not null is the set of tokens to protect from being delimited
*
@@ -174,6 +180,7 @@ public final class WordDelimiterFilter extends TokenFilter {
private final int flags;
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
@@ -243,7 +250,9 @@ public final class WordDelimiterFilter extends TokenFilter {
if (!input.incrementToken()) {
return false;
}
-
+ if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+ return true;
+ }
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fc0878cc/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index 7949fa2..7d021c5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
@@ -39,7 +40,7 @@ import org.apache.lucene.util.RamUsageEstimator;
* work correctly when this filter is used in the search-time analyzer. Unlike
* the deprecated {@link WordDelimiterFilter}, this token filter produces a
* correct token graph as output. However, it cannot consume an input token
- * graph correctly.
+ * graph correctly. Processing is suppressed by {@link KeywordAttribute#isKeyword()}=true.
*
* <p>
* Words are split into subwords with the following rules:
@@ -156,7 +157,12 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
* "O'Neil's" => "O", "Neil"
*/
public static final int STEM_ENGLISH_POSSESSIVE = 256;
-
+
+ /**
+ * Suppresses processing terms with {@link KeywordAttribute#isKeyword()}=true.
+ */
+ public static final int IGNORE_KEYWORDS = 512;
+
/**
* If not null is the set of tokens to protect from being delimited
*
@@ -174,6 +180,7 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private char[][] bufferedTermParts = new char[4][];
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class);
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
@@ -225,7 +232,8 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
PRESERVE_ORIGINAL |
SPLIT_ON_CASE_CHANGE |
SPLIT_ON_NUMERICS |
- STEM_ENGLISH_POSSESSIVE)) != 0) {
+ STEM_ENGLISH_POSSESSIVE |
+ IGNORE_KEYWORDS)) != 0) {
throw new IllegalArgumentException("flags contains unrecognized flag: " + configurationFlags);
}
this.flags = configurationFlags;
@@ -335,7 +343,9 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
if (input.incrementToken() == false) {
return false;
}
-
+ if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
+ return true;
+ }
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fc0878cc/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 2804bfd..f945cd6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.IOUtils;
-import org.junit.Test;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
@@ -57,7 +56,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}
***/
- @Test
public void testOffsets() throws IOException {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
// test that subwords and catenated subwords have
@@ -77,7 +75,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 6, 6, 6 });
}
- @Test
public void testOffsetChange() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("übelkeit)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -88,7 +85,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 15 });
}
- @Test
public void testOffsetChange2() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -99,7 +95,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 17 });
}
- @Test
public void testOffsetChange3() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -110,7 +105,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
new int[] { 16 });
}
- @Test
public void testOffsetChange4() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
@@ -129,7 +123,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
assertTokenStreamContents(wdf, output);
}
- @Test
public void testSplits() throws Exception {
doSplit("basic-split","basic","split");
doSplit("camelCase","camel","Case");
@@ -175,7 +168,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
/*
* Test option that allows disabling the special "'s" stemming, instead treating the single quote like other delimiters.
*/
- @Test
public void testPossessives() throws Exception {
doSplitPossessive(1, "ra's", "ra");
doSplitPossessive(0, "ra's", "ra", "s");
@@ -204,7 +196,6 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
}
}
- @Test
public void testPositionIncrements() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
@@ -323,6 +314,38 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
IOUtils.close(a, a2, a3);
}
+ public void testKeywordFilter() throws Exception {
+ assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+ "abc-def klm-nop kpop",
+ new String[] {"abc", "def", "klm", "nop", "kpop"});
+ assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+ "abc-def klm-nop kpop",
+ new String[] {"abc", "def", "klm-nop", "kpop"},
+ new int[]{0, 4, 8, 16},
+ new int[]{3, 7, 15, 20},
+ null,
+ new int[]{1, 1, 1, 1},
+ null,
+ false);
+ }
+
+ private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+ return new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+ private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+ @Override public boolean isKeyword() {
+ // Marks terms starting with the letter 'k' as keywords
+ return term.toString().charAt(0) == 'k';
+ }
+ };
+ return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(kFilter, flags, null));
+ }
+ };
+ }
+
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
@@ -346,7 +369,7 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
false);
a.close();
}
-
+
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/fc0878cc/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 7516a23..61ae6c0 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -309,6 +309,38 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
IOUtils.close(a, a2, a3);
}
+ public void testKeywordFilter() throws Exception {
+ assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS),
+ "abc-def klm-nop kpop",
+ new String[] {"abc", "def", "klm", "nop", "kpop"});
+ assertAnalyzesTo(keywordTestAnalyzer(GENERATE_WORD_PARTS | IGNORE_KEYWORDS),
+ "abc-def klm-nop kpop",
+ new String[] {"abc", "def", "klm-nop", "kpop"},
+ new int[]{0, 4, 8, 16},
+ new int[]{3, 7, 15, 20},
+ null,
+ new int[]{1, 1, 1, 1},
+ null,
+ false);
+ }
+
+ private Analyzer keywordTestAnalyzer(int flags) throws Exception {
+ return new Analyzer() {
+ @Override
+ public TokenStreamComponents createComponents(String field) {
+ Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+ KeywordMarkerFilter kFilter = new KeywordMarkerFilter(tokenizer) {
+ private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
+ @Override public boolean isKeyword() {
+ // Marks terms starting with the letter 'k' as keywords
+ return term.toString().charAt(0) == 'k';
+ }
+ };
+ return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(kFilter, flags, null));
+ }
+ };
+ }
+
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
[2/2] lucene-solr:master: LUCENE-8265: add CHANGES entry
Posted by mi...@apache.org.
LUCENE-8265: add CHANGES entry
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/70abbe74
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/70abbe74
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/70abbe74
Branch: refs/heads/master
Commit: 70abbe7433fc205e4abd05ebfc0fcf9399bf0f46
Parents: fc0878c
Author: Mike McCandless <mi...@apache.org>
Authored: Sat Apr 28 09:57:58 2018 -0400
Committer: Mike McCandless <mi...@apache.org>
Committed: Sat Apr 28 09:57:58 2018 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 3 +++
1 file changed, 3 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/70abbe74/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4d902fe..a5d0816 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -150,6 +150,9 @@ New Features
but to handle Korean using mecab-ko-dic and morphological analysis.
(Robert Muir, Jim Ferenczi)
+* LUCENE-8265: WordDelimiter/GraphFilter now have an option to skip tokens
+ marked with KeywordAttribute (Mike Sokolov via Mike McCandless)
+
Bug Fixes
* LUCENE-8266: Detect bogus tiles when creating a standard polygon and