You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/19 08:34:03 UTC

[incubator-nlpcraft] 02/03: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 4b1d67b7910107371da42bb2b8d641dcced52ab8
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 19 11:19:10 2022 +0400

    WIP.
---
 .../main/resources/stopwords/first_words.txt.gz    | Bin 4024880 -> 0 bytes
 .../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    |  23 ++++++++++++++-------
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz
deleted file mode 100644
index e92748b4..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz
deleted file mode 100644
index bfeb6fac..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index 142c16b4..b81ee116 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -56,43 +56,50 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
             false
         )
         test(
-            new NCEnStopWordsTokenEnricher(Set("test"), Set("the")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")),
             "the test",
             false,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman")),
             "woman women",
             true,
             true
         )
         // The synonym is defined in some form => only in the same form input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women")),
             "woman women",
             false,
             true
         )
         // The synonym is defined in some form, but stemmer is very rough =>  all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)),
             "woman women",
             true,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")),
             "woman women",
             true,
             false
         )
-        // Very rough stemmer defined.
+        // Very rough stemmers defined.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
-            "weather windows",
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString),
+            "weather windows noun",
+            true,
+            true,
+            false
+        )
+        test(
+            new NCEnStopWordsTokenEnricher(stemmer = _ => ""),
+            "weather noun",
             true,
             true
         )