You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/09 08:12:46 UTC
[incubator-nlpcraft] 01/01: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 6d63a451ed2628aa2f3f5718606ed814ac6a52a6
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Dec 9 12:12:49 2022 +0400
WIP.
---
.../nlpcraft/examples/time/CalculatorModel.scala | 4 +-
.../entity/parser/NCFrSemanticEntityParser.scala | 3 +-
.../entity/parser/NCRuSemanticEntityParser.scala | 3 +-
.../components/PizzeriaModelPipeline.scala | 5 +-
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 54 ++++++++++++++--------
.../NCStemmer.scala} | 11 ++---
...nricher.scala => NCBracketsTokenEnricher.scala} | 4 +-
...icher.scala => NCDictionaryTokenEnricher.scala} | 14 +++---
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 18 ++++----
...nEnricher.scala => NCQuotesTokenEnricher.scala} | 6 +--
...icher.scala => NCSwearWordsTokenEnricher.scala} | 17 +++----
.../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 13 +++---
.../nlp/parsers/NCOpenNLPEntityParser.scala | 16 +++----
.../nlp/parsers/NCOpenNLPTokenParser.scala | 10 ++--
.../nlpcraft/nlp/parsers/NCSemanticElement.scala | 3 +-
.../nlp/parsers/NCSemanticEntityParser.scala | 32 ++++++-------
.../parsers/impl/NCSemanticSynonymsProcessor.scala | 5 +-
.../apache/nlpcraft/nlp/NCTokenEnricherSpec.scala | 2 +-
.../enrichers/NCBracketsTokenEnricherSpec.scala | 4 +-
.../enrichers/NCDictionaryTokenEnricherSpec.scala | 6 +--
.../nlp/enrichers/NCQuotesTokenEnricherSpec.scala | 2 +-
.../enrichers/NCSwearWordsTokenEnricherSpec.scala | 15 ++++--
.../parsers/NCSemanticEntityParserLemmaSpec.scala | 3 +-
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 7 +--
25 files changed, 144 insertions(+), 115 deletions(-)
diff --git a/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala b/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala
index 0aecbc86..e1eb0a9c 100644
--- a/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala
+++ b/nlpcraft-examples/calculator/src/main/scala/org/apache/nlpcraft/examples/time/CalculatorModel.scala
@@ -65,7 +65,7 @@ class CalculatorModel extends NCModel(NCModelConfig("nlpcraft.calculator.ex", "C
@NCIntent(
"intent=calc options={ 'ordered': true }" +
" term(x)={# == 'stanford:number'}" +
- " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:token:text')) == true}" +
+ " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:entity:text')) == true}" +
" term(y)={# == 'stanford:number'}"
)
@unused def onMatch(
@@ -78,7 +78,7 @@ class CalculatorModel extends NCModel(NCModelConfig("nlpcraft.calculator.ex", "C
@NCIntent(
"intent=calcMem options={ 'ordered': true }" +
- " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:token:text')) == true}" +
+ " term(op)={has(list('+', '-', '*', '/'), meta_ent('nlp:entity:text')) == true}" +
" term(y)={# == 'stanford:number'}"
)
@unused def onMatchMem(
diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
index 55350bf1..c13251f3 100644
--- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.examples.lightswitch.nlp.entity.parser
import opennlp.tools.stemmer.snowball.SnowballStemmer
import org.apache.nlpcraft.examples.lightswitch.nlp.token.parser.NCFrTokenParser
import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.parsers.*
/**
@@ -27,7 +28,7 @@ import org.apache.nlpcraft.nlp.parsers.*
* @param src
*/
class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser(
- new NCSemanticStemmer:
+ new NCStemmer:
private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.FRENCH)
override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
,
diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
index 695a118d..e4c48b94 100644
--- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
@@ -21,13 +21,14 @@ import opennlp.tools.stemmer.snowball.SnowballStemmer
import org.apache.nlpcraft.examples.lightswitch.nlp.token.parser.NCRuTokenParser
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.common.NCStemmer
/**
*
* @param src
*/
class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser(
- new NCSemanticStemmer:
+ new NCStemmer:
private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.RUSSIAN)
override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
,
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index 046cf159..c9e86301 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -6,8 +6,9 @@ import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.entity.parser.stanford.NCStanfordNLPEntityParser
import org.apache.nlpcraft.nlp.token.parser.stanford.NCStanfordNLPTokenParser
import org.apache.nlpcraft.*
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher
-import org.apache.nlpcraft.nlp.parsers.{NCSemanticEntityParser, NCSemanticStemmer}
+import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser
import java.util.Properties
@@ -20,7 +21,7 @@ object PizzeriaModelPipeline:
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
new StanfordCoreNLP(props)
val tokParser = new NCStanfordNLPTokenParser(stanford)
- val stemmer = new NCSemanticStemmer():
+ val stemmer = new NCStemmer():
private val ps = new PorterStemmer
override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index fb21dcce..371b96e5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft
import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCResourceReader
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.enrichers.*
@@ -39,8 +40,8 @@ class NCPipelineBuilder:
private val entMappers: Buf[NCEntityMapper] = Buf.empty
private val varFilters: Buf[NCVariantFilter] = Buf.empty
- private def mkEnStemmer: NCSemanticStemmer =
- new NCSemanticStemmer:
+ private def mkEnStemmer: NCStemmer =
+ new NCStemmer:
final private val ps: PorterStemmer = new PorterStemmer
override def stem(txt: String): String = ps.stem(txt)
@@ -219,10 +220,13 @@ class NCPipelineBuilder:
tokParser = mkEnOpenNLPTokenParser.?
tokEnrichers += new NCOpenNLPTokenEnricher(NCResourceReader.getPath("opennlp/en-pos-maxent.bin"), NCResourceReader.getPath("opennlp/en-lemmatizer.dict"))
tokEnrichers += new NCEnStopWordsTokenEnricher
- tokEnrichers += new NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt"))
- tokEnrichers += new NCEnQuotesTokenEnricher
- tokEnrichers += new NCEnDictionaryTokenEnricher
- tokEnrichers += new NCEnBracketsTokenEnricher
+ tokEnrichers += new NCSwearWordsTokenEnricher(
+ NCResourceReader.getPath("badfilter/swear_words.txt"),
+ mkEnStemmer
+ )
+ tokEnrichers += new NCQuotesTokenEnricher
+ tokEnrichers += new NCDictionaryTokenEnricher("moby/354984si.ngl")
+ tokEnrichers += new NCBracketsTokenEnricher
/**
* Shortcut to configure pipeline with [[NCSemanticEntityParser]].
@@ -238,11 +242,15 @@ class NCPipelineBuilder:
* [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for
* [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]].
* - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
- * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by
+ * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by
* [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary.
- * - [[NCEnQuotesTokenEnricher Quotes]] token enricher.
- * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher.
- * - [[NCEnBracketsTokenEnricher Brackets]] token enricher.
+ * - [[NCQuotesTokenEnricher Quotes]] token enricher.
+ * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary,
+ * see the [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]] for more details.
+ * - [[NCBracketsTokenEnricher Brackets]] token enricher.
+ *
+ * The pipeline also uses the [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]],
+ * based on the [[https://opennlp.apache.org/ OpenNLP]] library.
*
* @param lang ISO 639-1 language code. Currently, only "en" (English) is supported.
* @param macros Macros to use with [[NCSemanticEntityParser]].
@@ -276,11 +284,15 @@ class NCPipelineBuilder:
* [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for
* [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]].
* - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
- * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by
+ * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by
* [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary.
- * - [[NCEnQuotesTokenEnricher Quotes]] token enricher.
- * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher.
- * - [[NCEnBracketsTokenEnricher Brackets]] token enricher.
+ * - [[NCQuotesTokenEnricher Quotes]] token enricher.
+ * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary,
+ * see the [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]] for more details.
+ * - [[NCBracketsTokenEnricher Brackets]] token enricher.
+ *
+ * The pipeline also uses the [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]],
+ * based on the [[https://opennlp.apache.org/ OpenNLP]] library.
*
* @param lang ISO 639-1 language code. Currently, only "en" (English) is supported.
* @param elms Semantic elements to use with [[NCSemanticEntityParser]].
@@ -301,13 +313,17 @@ class NCPipelineBuilder:
* [[https://raw.githubusercontent.com/richardwilly98/elasticsearch-opennlp-auto-tagging/master/src/main/resources/models/en-lemmatizer.dict en-lemmatizer.dict]] model for
* [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]].
* - [[NCEnStopWordsTokenEnricher Stop-word]] token enricher.
- * - [[NCEnSwearWordsTokenEnricher Swear-word]] token enricher initialized by
+ * - [[NCSwearWordsTokenEnricher Swear-word]] token enricher initialized by
* [[https://raw.githubusercontent.com/apache/incubator-nlpcraft/external_config/external/badfilter/swear_words.txt swear_words.txt]] dictionary.
- * - [[NCEnQuotesTokenEnricher Quotes]] token enricher.
- * - [[NCEnDictionaryTokenEnricher Known-word]] token enricher.
- * - [[NCEnBracketsTokenEnricher Brackets]] token enricher.
+ * - [[NCQuotesTokenEnricher Quotes]] token enricher.
+ * - [[NCDictionaryTokenEnricher Known-word]] token enricher initialized by "moby/354984si.ngl" dictionary,
+ * see the [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]] for more details.
+ * - [[NCBracketsTokenEnricher Brackets]] token enricher.
*
- * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported.
+ * The pipeline also uses the [[https://en.wikipedia.org/wiki/Stemming Porter stemmer]] implementation of [[NCStemmer]],
+ * based on the [[https://opennlp.apache.org/ OpenNLP]] library.
+ *
+ * @param lang ISO 639-1 language code. Currently, only "en" (English) is supported.
* @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file.
*/
def withSemantic(lang: String, mdlSrc: String): NCPipelineBuilder =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
similarity index 79%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
index 27490eda..b68d1986 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticStemmer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
@@ -15,19 +15,18 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.nlp.parsers
+package org.apache.nlpcraft.nlp.common
+
+import org.apache.nlpcraft.nlp.parsers.*
/**
*
* `Stemmer` trait. Stems are used for finding words by their reduced form.
+ * `Stemmer` trait implementation depends on language.
* Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
*
- * See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]].
- *
- * @see [[NCSemanticEntityParser]]
- * @see [[NCSemanticElement]]
*/
-trait NCSemanticStemmer:
+trait NCStemmer:
/**
* Gets text's stem.
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
similarity index 94%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
index 29e562e7..cf3563c5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnBracketsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
@@ -24,7 +24,7 @@ import java.io.*
import scala.collection.mutable
/**
- * Brackets [[NCTokenEnricher enricher]] for English language.
+ * Brackets [[NCTokenEnricher enricher]].
*
* This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`,
@@ -33,7 +33,7 @@ import scala.collection.mutable
* **NOTE:** invalid enclosed brackets are ignored.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
-class NCEnBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
+class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
val stack = new java.util.Stack[String]()
val map = mutable.HashMap.empty[NCToken, Boolean]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
similarity index 77%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 67615aa1..241adf0c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -21,26 +21,26 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
/**
- * "Known-word" [[NCTokenEnricher enricher]] for English language.
+ * "Known-word" [[NCTokenEnricher enricher]].
*
* This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
- * instance if word it represents is a known English word, i.e. the English dictionary contains this word's
+ * instance if word it represents is a known dictionary word, i.e. the configured dictionary contains this word's
* lemma. The value `true` of the metadata property indicates that this word's lemma is found in the dictionary,
* `false` value indicates otherwise.
*
- * Implementation uses the [[https://en.wikipedia.org/wiki/Moby_Project Moby Project]] English dictionary.
- *
* **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains
- * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before
+ * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for the required language, which provides this metadata property, before
* this enricher in your [[NCPipeline pipeline]].
+ *
+ * @param dictRes Path to the dictionary. This dictionary should have a simple plain text format with one dictionary word per line.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
-class NCEnDictionaryTokenEnricher extends NCTokenEnricher:
+class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
private var dict: Set[String] = _
init()
- private def init(): Unit = dict = NCUtils.readResource("moby/354984si.ngl", "iso-8859-1").toSet
+ private def init(): Unit = dict = NCUtils.readResource(dictRes, "UTF-8").toSet
private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("Lemma not found in token."))
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index f0ffb1a7..b5b0c762 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -172,7 +172,7 @@ import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
* Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word here]].
*
* **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that contains
- * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before
+ * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language that provides this metadata property before
* this enricher in your [[NCPipeline pipeline]].
*
* @param addStopsSet User defined collection of additional stop-words.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index af8d6f10..7ba30164 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -39,10 +39,10 @@ import scala.concurrent.ExecutionContext
*
* Some of OpenNLP prepared models can be found [[https://opennlp.sourceforge.net/models-1.5/ here]].
*
- * @param posMdlSrc Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html POSTaggerME]] model.
- * @param lemmaDicSrc Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]] model.
+ * @param posMdlRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html POSTaggerME]] model.
+ * @param lemmaDicRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/lemmatizer/DictionaryLemmatizer.html DictionaryLemmatizer]] model.
*/
-class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String = null) extends NCTokenEnricher with LazyLogging:
+class NCOpenNLPTokenEnricher(posMdlRes: String = null, lemmaDicRes: String = null) extends NCTokenEnricher with LazyLogging:
private var tagger: POSTaggerME = _
private var lemmatizer: DictionaryLemmatizer = _
@@ -52,15 +52,15 @@ class NCOpenNLPTokenEnricher(posMdlSrc: String = null, lemmaDicSrc: String = nul
NCUtils.execPar(
Seq(
() => {
- if posMdlSrc != null then
- tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc)))
- logger.trace(s"Loaded resource: $posMdlSrc")
+ if posMdlRes != null then
+ tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlRes)))
+ logger.trace(s"Loaded resource: $posMdlRes")
else logger.warn("POS tagger is not configured.")
},
() => {
- if lemmaDicSrc != null then
- lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
- logger.trace(s"Loaded resource: $lemmaDicSrc")
+ if lemmaDicRes != null then
+ lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicRes))
+ logger.trace(s"Loaded resource: $lemmaDicRes")
else logger.warn("Lemmatizer is not configured.")
}
)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
similarity index 92%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index ea9bd28a..6f82ca76 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnQuotesTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -21,18 +21,18 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
/**
- * Quotes [[NCTokenEnricher enricher]] for English language.
+ * Quotes [[NCTokenEnricher enricher]].
*
* This enricher adds `quoted` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if word it represents is in quotes. The value `true` of the metadata property indicates that this word is in quotes,
* `false` value indicates otherwise.
*
* **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains
- * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] that provides this metadata property before
+ * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for the required language, which provides this metadata property, before
* this enricher in your [[NCPipeline pipeline]].
*/
//noinspection ScalaWeakerAccess
-class NCEnQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
+class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
private final val Q_POS: Set[String] = Set("``", "''")
private def getPos(t: NCToken): String = t.get("pos").getOrElse(throw new NCException("POS not found in token."))
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
similarity index 71%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index c4fa7d8b..f0d282c7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnSwearWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -18,15 +18,15 @@
package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.common.NCStemmer
import java.io.*
import java.util.Objects
/**
- * "Swear-word" [[NCTokenEnricher enricher]] for English language.
+ * "Swear-word" [[NCTokenEnricher enricher]].
*
* This enricher adds `swear` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if word it represents is a swear word dictionary, i.e. the swear dictionary contains this word's
@@ -34,22 +34,23 @@ import java.util.Objects
* `false` value indicates otherwise.
*
* Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
+ * Stemming is used here because the more accurate `lemma`-based approach is too difficult to apply to swear words.
*
- * @param res Path to English swear dictionary. English swear dictionary has simple plain text format with one word on one line.
+ * @param dictRes Path to the swear dictionary. This swear dictionary should have a simple plain text format with one dictionary word per line.
+ * @param stemmer Stemmer implementation for the dictionary language.
*/
//noinspection ScalaWeakerAccess
-class NCEnSwearWordsTokenEnricher(res: String) extends NCTokenEnricher with LazyLogging:
- require(res != null, "Swear words model file cannot be null.")
+class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCTokenEnricher with LazyLogging:
+ require(dictRes != null, "Swear words model file cannot be null.")
- private final val stemmer = new PorterStemmer
private var swearWords: Set[String] = _
init()
private def init(): Unit =
- swearWords = NCUtils.readTextStream(NCUtils.getStream(res), "UTF-8").
+ swearWords = NCUtils.readTextStream(NCUtils.getStream(dictRes), "UTF-8").
map(p => stemmer.stem(p.toLowerCase)).toSet
- logger.trace(s"Loaded resource: $res")
+ logger.trace(s"Loaded resource: $dictRes")
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
toks.foreach(t => t.put("swear", swearWords.contains(stemmer.stem(t.getText.toLowerCase))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index d23d42a0..b84d3c18 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -26,22 +26,23 @@ import java.util.stream.Collectors
* [[NCNLPEntityParser]] helper.
*/
object NCNLPEntityParser:
- private val id: String = "nlp:token"
+ private val id: String = "nlp:entity"
import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
/**
* NLP data [[NCEntityParser parser]].
*
- * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID `nlp:token`.
+ * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID `nlp:entity`.
* All [[NCEntity]] instances contain following mandatory [[NCPropertyMap metadata]] properties:
- * - nlp:token:text
- * - nlp:token:index
- * - nlp:token:startCharIndex
- * - nlp:token:endCharIndex
+ * - nlp:entity:text
+ * - nlp:entity:index
+ * - nlp:entity:startCharIndex
+ * - nlp:entity:endCharIndex
*
* Also created [[NCEntity]] instances receive all another [[NCPropertyMap metadata]] properties
* which were added by configured in [[NCPipeline pipeline]] token [[org.apache.nlpcraft.NCTokenEnricher enrichers]].
+ * The identifiers of these properties are prefixed with `nlp:entity:`.
*
* @param predicate Predicate which allows to filter list of converted [[NCToken]] instances.
* By default all [[NCToken]] instances converted.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index e40e8ff2..7613e237 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -39,12 +39,12 @@ object NCOpenNLPEntityParser:
/**
* Creates [[NCOpenNLPEntityParser]] instance.
*
- * @param src Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
+ * @param mdl Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html model]].
* @return [[NCOpenNLPEntityParser]] instance.
*/
- def apply(src: String): NCOpenNLPEntityParser =
- require(src != null, "Model source cannot be null.")
- new NCOpenNLPEntityParser(List(src))
+ def apply(mdl: String): NCOpenNLPEntityParser =
+ require(mdl != null, "Model source cannot be null.")
+ new NCOpenNLPEntityParser(List(mdl))
/**
* [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser parser]] configured by
@@ -59,10 +59,10 @@ object NCOpenNLPEntityParser:
*
* **NOTE:** that each input [[NCToken]] can be included into several output [[NCEntity]] instances.
*
- * @param srcs Paths to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html models]].
+ * @param findersMdlsRes Paths to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/namefind/TokenNameFinderModel.html models]].
*/
-class NCOpenNLPEntityParser(srcs: List[String]) extends NCEntityParser with LazyLogging:
- require(srcs != null, "Models source cannot be null.")
+class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends NCEntityParser with LazyLogging:
+ require(findersMdlsRes != null, "Models sources cannot be null.")
private var finders: Seq[NameFinderME] = _
private case class Holder(start: Int, end: Int, name: String, probability: Double)
@@ -74,7 +74,7 @@ class NCOpenNLPEntityParser(srcs: List[String]) extends NCEntityParser with Lazy
private def init(): Unit =
val finders = mutable.ArrayBuffer.empty[NameFinderME]
NCUtils.execPar(
- srcs.map(res => () => {
+ findersMdlsRes.map(res => () => {
val f = new NameFinderME(new TokenNameFinderModel(NCUtils.getStream(res)))
logger.trace(s"Loaded resource: $res")
finders.synchronized { finders += f }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
index dbc6657e..82c4b120 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
@@ -32,19 +32,19 @@ import java.util.Objects
*
* Some of OpenNLP prepared models can be found [[https://opennlp.sourceforge.net/models-1.5/ here]].
*
- * @param tokMdl Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html model]].
+ * @param tokMdlRes Path to [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/tokenize/TokenizerModel.html model]].
*/
-class NCOpenNLPTokenParser(tokMdl: String) extends NCTokenParser with LazyLogging:
- require(tokMdl != null, "Tokenizer model path cannot be null.")
+class NCOpenNLPTokenParser(tokMdlRes: String) extends NCTokenParser with LazyLogging:
+ require(tokMdlRes != null, "Tokenizer model path cannot be null.")
@volatile private var tokenizer: TokenizerME = _
init()
private def init(): Unit =
- tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdl)))
+ tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdlRes)))
- logger.trace(s"Loaded resource: $tokMdl")
+ logger.trace(s"Loaded resource: $tokMdlRes")
override def tokenize(text: String): List[NCToken] =
this.synchronized {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
index b9768e59..e8d43aa1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
@@ -17,6 +17,8 @@
package org.apache.nlpcraft.nlp.parsers
+import org.apache.nlpcraft.nlp.common.NCStemmer
+
/**
*
* Configuration element which helps to detect [[org.apache.nlpcraft.NCEntity NCEntity]] for
@@ -25,7 +27,6 @@ package org.apache.nlpcraft.nlp.parsers
* See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]].
*
* @see [[NCSemanticEntityParser]]
- * @see [[NCSemanticStemmer]]
*/
trait NCSemanticElement:
/**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index 3942584e..e96a257e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -21,6 +21,7 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.parsers.impl.*
@@ -38,13 +39,13 @@ object NCSemanticEntityParser:
/**
* Creates [[NCSemanticEntityParser]] instance.
*
- * @param stemmer [[NCSemanticStemmer]] implementation.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param macros Macros map. Empty by default.
* @param elements [[NCSemanticElement]] list.
*/
def apply(
- stemmer: NCSemanticStemmer,
+ stemmer: NCStemmer,
parser: NCTokenParser,
macros: Map[String, String],
elements: List[NCSemanticElement]
@@ -60,12 +61,12 @@ object NCSemanticEntityParser:
*
* Creates [[NCSemanticEntityParser]] instance.
*
- * @param stemmer [[NCSemanticStemmer]] implementation.
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
* @param parser [[NCTokenParser]] implementation.
* @param elements [[NCSemanticElement]] list.
*/
def apply(
- stemmer: NCSemanticStemmer,
+ stemmer: NCStemmer,
parser: NCTokenParser,
elements: List[NCSemanticElement]
): NCSemanticEntityParser =
@@ -79,11 +80,11 @@ object NCSemanticEntityParser:
*
* Creates [[NCSemanticEntityParser]] instance.
*
- * @param stemmer [[NCSemanticStemmer]] implementation.
- * @param parser [[NCTokenParser]] implementation.
- * @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file.
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param mdlSrc Classpath resource, file path or URL for YAML or JSON semantic model definition file.
*/
- def apply(stemmer: NCSemanticStemmer, parser: NCTokenParser, mdlSrc: String): NCSemanticEntityParser =
+ def apply(stemmer: NCStemmer, parser: NCTokenParser, mdlSrc: String): NCSemanticEntityParser =
require(stemmer != null, "Stemmer cannot be null.")
require(parser != null, "Parser cannot be null.")
require(mdlSrc != null, "Model source cannot be null.")
@@ -181,18 +182,15 @@ import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
*
* See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]].
*
- *
* @see [[NCSemanticElement]]
- * @see [[NCSemanticStemmer]]
- *
- * @param stemmer [[NCSemanticStemmer]] implementation.
- * @param parser [[NCTokenParser]] implementation.
- * @param macros Macros map. Empty by default.
+ * @param stemmer [[NCStemmer]] implementation for synonyms language.
+ * @param parser [[NCTokenParser]] implementation.
+ * @param macros Macros map. Empty by default.
* @param elements [[NCSemanticElement]] list.
* @param mdlSrcOpt Optional classpath resource, file path or URL for YAML or JSON semantic model definition file.
*/
class NCSemanticEntityParser(
- stemmer: NCSemanticStemmer,
+ stemmer: NCStemmer,
parser: NCTokenParser,
macros: Map[String, String] = Map.empty,
elements: List[NCSemanticElement] = List.empty,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala
index 7c3992e4..e5c0b09d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/impl/NCSemanticSynonymsProcessor.scala
@@ -24,6 +24,7 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.makro.NCMacroParser
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.parsers.impl.NCSemanticChunkKind.*
@@ -144,7 +145,7 @@ private[parsers] object NCSemanticSynonymsProcessor extends LazyLogging:
* @param syns
*/
private def convertSynonyms(
- stemmer: NCSemanticStemmer,
+ stemmer: NCStemmer,
tokParser: NCTokenParser,
macroParser: NCMacroParser,
elemId: String,
@@ -205,7 +206,7 @@ private[parsers] object NCSemanticSynonymsProcessor extends LazyLogging:
* @param elements
*/
def prepare(
- stemmer: NCSemanticStemmer,
+ stemmer: NCStemmer,
tokParser: NCTokenParser,
macros: Map[String, String],
elements: Seq[NCSemanticElement]
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala
index 4712b55e..2562e317 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/NCTokenEnricherSpec.scala
@@ -31,7 +31,7 @@ import scala.util.Using
class NCTokenEnricherSpec extends AnyFunSuite:
private def test0(pipeline: NCPipeline, ok: Boolean): Unit =
val mdl: NCModel = new NCModel(NCModelConfig("test.id", "Test model", "1.0"), pipeline):
- @NCIntent("intent=i term(any)={meta_ent('nlp:token:k1') == 'v1'}")
+ @NCIntent("intent=i term(any)={meta_ent('nlp:entity:k1') == 'v1'}")
def onMatch(ctx: NCContext, im: NCIntentMatch): NCResult = TEST_RESULT
NCTestUtils.askSomething(mdl, ok)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
index 480edd24..6739a703 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala
@@ -18,7 +18,7 @@
package org.apache.nlpcraft.nlp.enrichers
import org.apache.nlpcraft.*
-import nlp.enrichers.NCEnBracketsTokenEnricher
+import nlp.enrichers.NCBracketsTokenEnricher
import nlp.util.*
import org.scalatest.funsuite.AnyFunSuite
@@ -26,7 +26,7 @@ import org.scalatest.funsuite.AnyFunSuite
*
*/
class NCBracketsTokenEnricherSpec extends AnyFunSuite:
- private val bracketsEnricher = new NCEnBracketsTokenEnricher()
+ private val bracketsEnricher = new NCBracketsTokenEnricher()
/**
*
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala
index f6f945b7..537ec5cb 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricherSpec.scala
@@ -25,14 +25,14 @@ import internal.util.NCResourceReader
import org.scalatest.funsuite.AnyFunSuite
class NCDictionaryTokenEnricherSpec extends AnyFunSuite:
- private val dictEnricher = new NCEnDictionaryTokenEnricher()
+ private val dictEnricher = new NCDictionaryTokenEnricher("moby/354984si.ngl")
test("test") {
val txt = "milk XYZ"
val toks = EN_TOK_PARSER.tokenize(txt)
- require(toks.head.get[Boolean]("dict:en").isEmpty)
- require(toks.last.get[Boolean]("dict:en").isEmpty)
+ require(toks.head.get[Boolean]("dict").isEmpty)
+ require(toks.last.get[Boolean]("dict").isEmpty)
val req = NCTestRequest(txt)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
index 3f87f757..ee3ad403 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricherSpec.scala
@@ -28,7 +28,7 @@ import org.scalatest.funsuite.AnyFunSuite
*
*/
class NCQuotesTokenEnricherSpec extends AnyFunSuite:
- private val quoteEnricher = new NCEnQuotesTokenEnricher
+ private val quoteEnricher = new NCQuotesTokenEnricher
/**
*
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
index 86303dea..fcea197c 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
@@ -17,8 +17,10 @@
package org.apache.nlpcraft.nlp.enrichers
+import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCResourceReader
-import org.apache.nlpcraft.nlp.enrichers.NCEnSwearWordsTokenEnricher
+import org.apache.nlpcraft.nlp.common.NCStemmer
+import org.apache.nlpcraft.nlp.enrichers.NCSwearWordsTokenEnricher
import org.apache.nlpcraft.nlp.enrichers.*
import org.apache.nlpcraft.nlp.util.*
import org.scalatest.funsuite.AnyFunSuite
@@ -26,13 +28,18 @@ import org.scalatest.funsuite.AnyFunSuite
*
*/
class NCSwearWordsTokenEnricherSpec extends AnyFunSuite:
- private val swEnricher = new NCEnSwearWordsTokenEnricher(NCResourceReader.getPath("badfilter/swear_words.txt"))
+ private val swEnricher = new NCSwearWordsTokenEnricher(
+ NCResourceReader.getPath("badfilter/swear_words.txt"),
+ new NCStemmer:
+ final private val ps: PorterStemmer = new PorterStemmer
+ override def stem(txt: String): String = ps.stem(txt)
+ )
test("test") {
val toks = EN_TOK_PARSER.tokenize("english ass")
- require(toks.head.get[Boolean]("swear:en").isEmpty)
- require(toks.last.get[Boolean]("swear:en").isEmpty)
+ require(toks.head.get[Boolean]("swear").isEmpty)
+ require(toks.last.get[Boolean]("swear").isEmpty)
swEnricher.enrich(null, null, toks)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
index 299a8fdf..cb134ef3 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
@@ -22,6 +22,7 @@ import annotations.*
import nlp.parsers.*
import internal.impl.*
import nlp.util.*
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.scalatest.funsuite.AnyFunSuite
import java.util
@@ -32,7 +33,7 @@ import scala.collection.mutable
*/
class NCSemanticEntityParserLemmaSpec extends AnyFunSuite:
private val lemmaStemmer =
- new NCSemanticStemmer():
+ new NCStemmer():
override def stem(txt: String): String = if wrapped(txt) then unwrap(txt) else UUID.randomUUID().toString
case class Data(text: String, elemId: String)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index fd1e0b07..a23b0f89 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -21,9 +21,10 @@ import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ascii.NCAsciiTable
import org.apache.nlpcraft.internal.util.NCResourceReader
+import org.apache.nlpcraft.nlp.common.NCStemmer
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.parsers
-import org.apache.nlpcraft.nlp.parsers.{NCOpenNLPTokenParser, NCSemanticElement, NCSemanticEntityParser, NCSemanticStemmer}
+import org.apache.nlpcraft.nlp.parsers.{NCOpenNLPTokenParser, NCSemanticElement, NCSemanticEntityParser}
import java.util
import scala.util.Using
@@ -122,8 +123,8 @@ object NCTestUtils:
/**
*
*/
- private def mkSemanticStemmer: NCSemanticStemmer =
- new NCSemanticStemmer():
+ private def mkSemanticStemmer: NCStemmer =
+ new NCStemmer():
private val ps = new PorterStemmer
override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }