You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by dw...@apache.org on 2013/07/03 12:31:50 UTC
svn commit: r1499313 - in /lucene/dev/branches/branch_4x: ./ lucene/
lucene/analysis/
lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/
lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/
Author: dweiss
Date: Wed Jul 3 10:31:50 2013
New Revision: 1499313
URL: http://svn.apache.org/r1499313
Log:
LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords.
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1499313&r1=1499312&r2=1499313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Wed Jul 3 10:31:50 2013
@@ -7,6 +7,9 @@ http://s.apache.org/luceneversions
Changes in backwards compatibility policy
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+ (Dawid Weiss, Grzegorz Sobczyk)
+
* LUCENE-4955: NGramTokenFilter now emits all n-grams for the same token at the
same position and preserves the position length and the offsets of the
original token. (Simon Willnauer, Adrien Grand)
@@ -179,6 +182,9 @@ Optimizations
New Features
+* LUCENE-5085: MorfologikFilter will no longer stem words marked as keywords
+ (Dawid Weiss, Grzegorz Sobczyk)
+
* LUCENE-5064: Added PagedMutable (internal), a paged extension of
PackedInts.Mutable which allows for storing more than 2B values. (Adrien Grand)
Modified: lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java?rev=1499313&r1=1499312&r2=1499313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java Wed Jul 3 10:31:50 2013
@@ -27,6 +27,7 @@ import morfologik.stemming.PolishStemmer
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.*;
@@ -44,6 +45,7 @@ public class MorfologikFilter extends To
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final MorphosyntacticTagsAttribute tagsAtt = addAttribute(MorphosyntacticTagsAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharsRef scratch = new CharsRef(0);
private final CharacterUtils charUtils;
@@ -140,7 +142,8 @@ public class MorfologikFilter extends To
popNextLemma();
return true;
} else if (this.input.incrementToken()) {
- if (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt))) {
+ if (!keywordAttr.isKeyword() &&
+ (lookupSurfaceForm(termAtt) || lookupSurfaceForm(toLowercase(termAtt)))) {
current = captureState();
popNextLemma();
} else {
Modified: lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java?rev=1499313&r1=1499312&r2=1499313&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java Wed Jul 3 10:31:50 2013
@@ -18,11 +18,22 @@ package org.apache.lucene.analysis.morfo
*/
import java.io.IOException;
+import java.io.Reader;
import java.io.StringReader;
import java.util.TreeSet;
-import org.apache.lucene.analysis.*;
+import morfologik.stemming.PolishStemmer.DICTIONARY;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.Version;
/**
* TODO: The tests below rely on the order of returned lemmas, which is probably not good.
@@ -144,6 +155,35 @@ public class TestMorfologikAnalyzer exte
ts.close();
}
+ /** */
+ public final void testKeywordAttrTokens() throws IOException {
+ final Version version = TEST_VERSION_CURRENT;
+ final DICTIONARY dictionary = DICTIONARY.COMBINED;
+
+ Analyzer a = new MorfologikAnalyzer(version, dictionary) {
+ @Override
+ protected TokenStreamComponents createComponents(String field, Reader reader) {
+ final CharArraySet keywords = new CharArraySet(version, 1, false);
+ keywords.add("liście");
+
+ final Tokenizer src = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+ TokenStream result = new StandardFilter(TEST_VERSION_CURRENT, src);
+ result = new SetKeywordMarkerFilter(result, keywords);
+ result = new MorfologikFilter(result, dictionary, TEST_VERSION_CURRENT);
+
+ return new TokenStreamComponents(src, result);
+ }
+ };
+
+ assertAnalyzesToReuse(
+ a,
+ "liście danych",
+ new String[] { "liście", "dany", "dana", "dane", "dać" },
+ new int[] { 0, 7, 7, 7, 7 },
+ new int[] { 6, 13, 13, 13, 13 },
+ new int[] { 1, 1, 0, 0, 0 });
+ }
+
/** blast some random strings through the analyzer */
public void testRandom() throws Exception {
checkRandomData(random(), getTestAnalyzer(), 1000 * RANDOM_MULTIPLIER);