You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/19 08:34:03 UTC
[incubator-nlpcraft] 02/03: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 4b1d67b7910107371da42bb2b8d641dcced52ab8
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 19 11:19:10 2022 +0400
WIP.
---
.../main/resources/stopwords/first_words.txt.gz | Bin 4024880 -> 0 bytes
.../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes
.../nlp/enrichers/NCStopWordsEnricherSpec.scala | 23 ++++++++++++++-------
3 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz
deleted file mode 100644
index e92748b4..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz
deleted file mode 100644
index bfeb6fac..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index 142c16b4..b81ee116 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -56,43 +56,50 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
false
)
test(
- new NCEnStopWordsTokenEnricher(Set("test"), Set("the")),
+ new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")),
"the test",
false,
true
)
// The synonym is defined as a lemma => all kinds of input words should be found.
test(
- new NCEnStopWordsTokenEnricher(Set("woman")),
+ new NCEnStopWordsTokenEnricher(addSet = Set("woman")),
"woman women",
true,
true
)
// The synonym is defined in some form => only input words in the same form should be found.
test(
- new NCEnStopWordsTokenEnricher(Set("women")),
+ new NCEnStopWordsTokenEnricher(addSet = Set("women")),
"woman women",
false,
true
)
// The synonym is defined in some form, but the stemmer is very rough => all kinds of input words should be found.
test(
- new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+ new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)),
"woman women",
true,
true
)
// The synonym is defined as a lemma => all kinds of input words should be found, but an excluded set is defined.
test(
- new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+ new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")),
"woman women",
true,
false
)
- // Very rough stemmer defined.
+ // Very rough stemmers defined.
test(
- new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
- "weather windows",
+ new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString),
+ "weather windows noun",
+ true,
+ true,
+ false
+ )
+ test(
+ new NCEnStopWordsTokenEnricher(stemmer = _ => ""),
+ "weather noun",
true,
true
)