You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by xe...@apache.org on 2016/06/26 08:52:26 UTC
cassandra git commit: Correct English word stemming test and add a
test for French
Repository: cassandra
Updated Branches:
refs/heads/trunk f1cabcade -> eb82861c8
Correct English word stemming test and add a test for French
patch by doanduyhai; reviewed by xedin for CASSANDRA-12078
Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo
Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/eb82861c
Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/eb82861c
Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/eb82861c
Branch: refs/heads/trunk
Commit: eb82861c8d4c497d64b5e61a1606bdd270e8e109
Parents: f1cabca
Author: Pavel Yaskevich <xe...@apache.org>
Authored: Sun Jun 26 01:48:23 2016 -0700
Committer: Pavel Yaskevich <xe...@apache.org>
Committed: Sun Jun 26 01:50:32 2016 -0700
----------------------------------------------------------------------
.../sasi/analyzer/filter/StemmingFilters.java | 2 +-
.../french_skip_stop_words_before_stemming.txt | 1 +
.../sasi/analyzer/StandardAnalyzerTest.java | 33 +++++++++++++++++++-
3 files changed, 34 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
----------------------------------------------------------------------
diff --git a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
index 9e098d1..cb840a8 100644
--- a/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
+++ b/src/java/org/apache/cassandra/index/sasi/analyzer/filter/StemmingFilters.java
@@ -37,7 +37,7 @@ public class StemmingFilters
public String process(String input) throws Exception
{
- if (stemmer == null)
+ if (input == null || stemmer == null)
return input;
stemmer.setCurrent(input);
return (stemmer.stem()) ? stemmer.getCurrent() : input;
http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
----------------------------------------------------------------------
diff --git a/test/resources/tokenization/french_skip_stop_words_before_stemming.txt b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
new file mode 100644
index 0000000..59a1c23
--- /dev/null
+++ b/test/resources/tokenization/french_skip_stop_words_before_stemming.txt
@@ -0,0 +1 @@
+"La danse sous la pluie" est une chanson connue
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/cassandra/blob/eb82861c/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
----------------------------------------------------------------------
diff --git a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
index e307512..7a88a3d 100644
--- a/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/analyzer/StandardAnalyzerTest.java
@@ -25,6 +25,8 @@ import java.util.Locale;
import org.junit.Test;
+import org.apache.cassandra.serializers.UTF8Serializer;
+
import static org.junit.Assert.assertEquals;
public class StandardAnalyzerTest
@@ -151,7 +153,36 @@ public class StandardAnalyzerTest
while (tokenizer.hasNext())
tokens.add(tokenizer.next());
- assertEquals(40249, tokens.size());
+ assertEquals(37739, tokens.size());
+ }
+
+ @Test
+ public void testSkipStopWordBeforeStemmingFrench() throws Exception
+ {
+ InputStream is = StandardAnalyzerTest.class.getClassLoader()
+ .getResourceAsStream("tokenization/french_skip_stop_words_before_stemming.txt");
+
+ StandardTokenizerOptions options = new StandardTokenizerOptions.OptionsBuilder().stemTerms(true)
+ .ignoreStopTerms(true).useLocale(Locale.FRENCH)
+ .alwaysLowerCaseTerms(true).build();
+ StandardAnalyzer tokenizer = new StandardAnalyzer();
+ tokenizer.init(options);
+
+ List<ByteBuffer> tokens = new ArrayList<>();
+ List<String> words = new ArrayList<>();
+ tokenizer.reset(is);
+ while (tokenizer.hasNext())
+ {
+ final ByteBuffer nextToken = tokenizer.next();
+ tokens.add(nextToken);
+ words.add(UTF8Serializer.instance.deserialize(nextToken.duplicate()));
+ }
+
+ assertEquals(4, tokens.size());
+ assertEquals("dans", words.get(0));
+ assertEquals("plui", words.get(1));
+ assertEquals("chanson", words.get(2));
+ assertEquals("connu", words.get(3));
}
@Test