You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2017/01/31 16:56:21 UTC
lucene-solr:master: LUCENE-7668: add new test case; remove dead code;
improve CannedTokenStream to copy all Token attributes
Repository: lucene-solr
Updated Branches:
refs/heads/master a43ef8f48 -> 72eaeab71
LUCENE-7668: add new test case; remove dead code; improve CannedTokenStream to copy all Token attributes
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/72eaeab7
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/72eaeab7
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/72eaeab7
Branch: refs/heads/master
Commit: 72eaeab7151d421a28ecec1634b8c48599e524f5
Parents: a43ef8f
Author: Mike McCandless <mi...@apache.org>
Authored: Tue Jan 31 11:56:07 2017 -0500
Committer: Mike McCandless <mi...@apache.org>
Committed: Tue Jan 31 11:56:07 2017 -0500
----------------------------------------------------------------------
.../miscellaneous/WordDelimiterGraphFilter.java | 2 --
.../TestWordDelimiterGraphFilter.java | 15 ++++++++++-
.../lucene/analysis/CannedTokenStream.java | 28 ++++++--------------
3 files changed, 22 insertions(+), 23 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72eaeab7/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
index fe8ed72..a6ade19 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
@@ -173,7 +172,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
- private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
// used for iterating word delimiter breaks
private final WordDelimiterIterator iterator;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72eaeab7/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
index 2daf886..f4e8b79 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@@ -155,6 +155,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
doSplitPossessive(0, "ra's", "ra", "s");
}
+ public void testTokenType() throws Exception {
+ int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    // test that subwords and catenated subwords have
+    // the correct type.
+ Token token = new Token("foo-bar", 5, 12);
+ token.setType("mytype");
+ WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+ assertTokenStreamContents(wdf,
+ new String[] {"foobar", "foo", "bar"},
+ new String[] {"mytype", "mytype", "mytype"});
+ }
+
/*
* Set a large position increment gap of 10 if the token is "largegap" or "/"
*/
@@ -177,7 +190,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
}
}
}
-
+
public void testPositionIncrements() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/72eaeab7/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
index 8250799..8323882 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
@@ -18,11 +18,9 @@ package org.apache.lucene.analysis;
import java.io.IOException;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
/**
* TokenStream from a canned list of Tokens.
@@ -30,23 +28,19 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
public final class CannedTokenStream extends TokenStream {
private final Token[] tokens;
private int upto = 0;
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private final int finalOffset;
private final int finalPosInc;
public CannedTokenStream(Token... tokens) {
- this.tokens = tokens;
- finalOffset = 0;
- finalPosInc = 0;
+ this(0, 0, tokens);
}
/** If you want trailing holes, pass a non-zero
* finalPosInc. */
public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
+ super(Token.TOKEN_ATTRIBUTE_FACTORY);
this.tokens = tokens;
this.finalOffset = finalOffset;
this.finalPosInc = finalPosInc;
@@ -62,16 +56,10 @@ public final class CannedTokenStream extends TokenStream {
@Override
public boolean incrementToken() {
if (upto < tokens.length) {
- final Token token = tokens[upto++];
- // TODO: can we just capture/restoreState so
- // we get all attrs...?
- clearAttributes();
- termAtt.setEmpty();
- termAtt.append(token.toString());
- posIncrAtt.setPositionIncrement(token.getPositionIncrement());
- posLengthAtt.setPositionLength(token.getPositionLength());
- offsetAtt.setOffset(token.startOffset(), token.endOffset());
- payloadAtt.setPayload(token.getPayload());
+ clearAttributes();
+ // NOTE: this looks weird, casting offsetAtt to Token, but because we are using the Token class's AttributeFactory, all attributes are
+ // in fact backed by the Token class, so we just copy the current token into our Token:
+ tokens[upto++].copyTo((Token) offsetAtt);
return true;
} else {
return false;