You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/12 13:38:02 UTC
[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 2205ae2f WIP.
2205ae2f is described below
commit 2205ae2f691a33f69f5f6fd572e2418435f9e0d4
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 12 17:38:08 2022 +0400
WIP.
---
.../components/PizzeriaModelPipeline.scala | 8 ++----
.../org/apache/nlpcraft/NCPipelineBuilder.scala | 14 +++-------
.../apache/nlpcraft/nlp/common/NCEnStemmer.scala | 32 ++++++++++++++++++++++
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 2 +-
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 20 ++++++++++----
.../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 4 +--
.../nlp/enrichers/NCQuotesTokenEnricher.scala | 3 +-
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 3 +-
.../nlpcraft/nlp/parsers/NCNLPEntityParser.scala | 7 +++--
.../nlp/parsers/NCOpenNLPEntityParser.scala | 5 ++--
.../nlp/parsers/NCOpenNLPTokenParser.scala | 5 ++--
.../nlpcraft/nlp/parsers/NCSemanticElement.scala | 6 ++--
.../nlp/parsers/NCSemanticEntityParser.scala | 3 +-
.../enrichers/NCSwearWordsTokenEnricherSpec.scala | 8 ++----
.../org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 18 +++---------
15 files changed, 80 insertions(+), 58 deletions(-)
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index e21066a7..655072d6 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -1,12 +1,11 @@
package org.apache.nlpcraft.examples.pizzeria.components
import edu.stanford.nlp.pipeline.StanfordCoreNLP
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.entity.parser.stanford.NCStanfordNLPEntityParser
import org.apache.nlpcraft.nlp.token.parser.stanford.NCStanfordNLPTokenParser
import org.apache.nlpcraft.*
-import org.apache.nlpcraft.nlp.common.NCStemmer
+import org.apache.nlpcraft.nlp.common.{NCEnStemmer, NCStemmer}
import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher
import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser
@@ -21,9 +20,6 @@ object PizzeriaModelPipeline:
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner")
new StanfordCoreNLP(props)
val tokParser = new NCStanfordNLPTokenParser(stanford)
- val stemmer = new NCStemmer():
- private val ps = new PorterStemmer
- override def stem(word: String): String = ps.synchronized { ps.stem(word) }
import PizzeriaOrderMapperDesc as D
@@ -31,7 +27,7 @@ object PizzeriaModelPipeline:
withTokenParser(tokParser).
withTokenEnricher(new NCEnStopWordsTokenEnricher()).
withEntityParser(new NCStanfordNLPEntityParser(stanford, Set("number"))).
- withEntityParser(NCSemanticEntityParser(stemmer, tokParser, "pizzeria_model.yaml")).
+ withEntityParser(NCSemanticEntityParser(new NCEnStemmer, tokParser, "pizzeria_model.yaml")).
withEntityMapper(PizzeriaOrderMapper(extra = D("ord:pizza:size", "ord:pizza:size:value"), dests = D("ord:pizza", "ord:pizza:size"))).
withEntityMapper(PizzeriaOrderMapper(extra = D("stanford:number", "stanford:number:nne"), dests = D("ord:pizza", "ord:pizza:qty"), D("ord:drink", "ord:drink:qty"))).
withEntityValidator(new PizzeriaOrderValidator()).
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 08672995..6452cabc 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -17,9 +17,8 @@
package org.apache.nlpcraft
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCResourceReader
-import org.apache.nlpcraft.nlp.common.NCStemmer
+import org.apache.nlpcraft.nlp.common.{NCEnStemmer, NCStemmer}
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.enrichers.*
@@ -40,11 +39,6 @@ class NCPipelineBuilder:
private val entMappers: Buf[NCEntityMapper] = Buf.empty
private val varFilters: Buf[NCVariantFilter] = Buf.empty
- private def mkEnStemmer: NCStemmer =
- new NCStemmer:
- final private val ps: PorterStemmer = new PorterStemmer
- override def stem(word: String): String = ps.stem(word)
-
private def mkEnOpenNLPTokenParser: NCOpenNLPTokenParser =
new NCOpenNLPTokenParser(NCResourceReader.getPath("opennlp/en-token.bin"))
@@ -222,7 +216,7 @@ class NCPipelineBuilder:
tokEnrichers += new NCEnStopWordsTokenEnricher
tokEnrichers += new NCSwearWordsTokenEnricher(
NCResourceReader.getPath("badfilter/swear_words.txt"),
- mkEnStemmer
+ new NCEnStemmer
)
tokEnrichers += new NCQuotesTokenEnricher
tokEnrichers += new NCDictionaryTokenEnricher("moby/354984si.ngl")
@@ -266,7 +260,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- entParsers += NCSemanticEntityParser(mkEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
+ entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, macros, elms)
case _ => require(false, s"Unsupported language: $lang")
this
@@ -332,7 +326,7 @@ class NCPipelineBuilder:
lang.toUpperCase match
case "EN" =>
setEnComponents()
- this.entParsers += NCSemanticEntityParser(mkEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
+ this.entParsers += NCSemanticEntityParser(new NCEnStemmer, mkEnOpenNLPTokenParser, mdlSrc)
case _ => require(false, s"Unsupported language: $lang")
this
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCEnStemmer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCEnStemmer.scala
new file mode 100644
index 00000000..7e098e76
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCEnStemmer.scala
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.nlp.common
+
+import opennlp.tools.stemmer.PorterStemmer
+import org.apache.nlpcraft.nlp.parsers.*
+
+/**
+ * English language [[NCStemmer]] implementation, based on the [[https://opennlp.apache.org/ OpenNLP]]
+ * Porter Stemmer. See [[https://tartarus.org/martin/PorterStemmer here]] for details on the algorithm.
+ * Calls to `stem` are synchronized on the wrapped stemmer, so one instance can be shared between threads.
+ */
+class NCEnStemmer extends NCStemmer:
+ private val stemmer = new PorterStemmer
+
+ /** @inheritdoc */
+ def stem(word: String): String = stemmer.synchronized { stemmer.stem(word) }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index a394da37..cf17817e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -42,7 +42,7 @@ class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
init()
private def init(): Unit = dict = NCUtils.readResource(dictRes).toSet
- private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("'lemma'' property not found in token."))
+ private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("Lemma not found in token."))
/** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 8de3f03d..cfd383ef 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -18,9 +18,9 @@
package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+import org.apache.nlpcraft.nlp.common.{NCEnStemmer, NCStemmer}
import java.io.*
import java.util
@@ -177,12 +177,19 @@ import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
* contain token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language
* that provides this metadata properties before this enricher in your [[NCPipeline pipeline]].
*
+ * @see [[NCEnStemmer]]
+ *
* @param addStopsSet User defined collection of additional stop-words.
+ * These words are matched using the `stemmer` implementation.
* @param exclStopsSet User defined collection of exceptions, that is words which should not be marked as stop-words during processing.
+ * These words are matched using the `stemmer` implementation.
+ * @param stemmer English stemmer implementation.
*/
-class NCEnStopWordsTokenEnricher(addStopsSet: Set[String] = Set.empty, exclStopsSet: Set[String] = Set.empty) extends NCTokenEnricher with LazyLogging:
- private final val stemmer = new PorterStemmer
-
+class NCEnStopWordsTokenEnricher(
+ addStopsSet: Set[String] = Set.empty,
+ exclStopsSet: Set[String] = Set.empty,
+ stemmer: NCStemmer = new NCEnStemmer
+) extends NCTokenEnricher with LazyLogging:
private var addStems: Set[String] = _
private var exclStems: Set[String] = _
private var percents: Set[String] = _
@@ -324,7 +331,7 @@ class NCEnStopWordsTokenEnricher(addStopsSet: Set[String] = Set.empty, exclStops
* Parses configuration template.
*
* @param lines Configuration file content.
- * @return Holder and `is-exception` flag.
+ * @return Holder and is-exception flag.
*/
private def readStopWords(lines: Seq[String]): Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
@@ -461,7 +468,7 @@ class NCEnStopWordsTokenEnricher(addStopsSet: Set[String] = Set.empty, exclStops
* @param ns Sentence.
* @param stopPoses Stop POSes.
* @param lastIdx Last index.
- * @param isException Function which return `stop word exception` flag.
+ * @param isException Function which returns stop word exception flag.
* @param stops Stopwords tokens.
*/
@tailrec
@@ -517,6 +524,7 @@ class NCEnStopWordsTokenEnricher(addStopsSet: Set[String] = Set.empty, exclStops
processCommonStops0(ns)
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
// Stop words and exceptions caches for this sentence.
val cacheSw = mutable.HashMap.empty[Seq[NCToken], Boolean]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index 7ba30164..270ca60a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -20,7 +20,6 @@ package org.apache.nlpcraft.nlp.enrichers
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.lemmatizer.DictionaryLemmatizer
import opennlp.tools.postag.*
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.*
@@ -28,7 +27,7 @@ import java.io.*
import scala.concurrent.ExecutionContext
/**
- * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCTokenEnricher enricher]].
+ * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCTokenEnricher token enricher]].
*
* This enricher adds `lemma` and `pos` (part-of-speech) string [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance.
@@ -66,6 +65,7 @@ class NCOpenNLPTokenEnricher(posMdlRes: String = null, lemmaDicRes: String = nul
)
)(ExecutionContext.Implicits.global)
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
val txts = toks.map(_.getText).toArray
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index 6f82ca76..695c27c7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -21,7 +21,7 @@ import com.typesafe.scalalogging.LazyLogging
import org.apache.nlpcraft.*
/**
- * Quotes [[NCTokenEnricher enricher]].
+ * Quotes [[NCTokenEnricher token enricher]].
*
* This enricher adds `quoted` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if word it represents is in quotes. The value `true` of the metadata property indicates that this word is in quotes,
@@ -38,6 +38,7 @@ class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
//noinspection DuplicatedCode
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
val quotes = toks.filter(isQuote)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index f0d282c7..98ca9113 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -26,7 +26,7 @@ import java.io.*
import java.util.Objects
/**
- * "Swear-word" [[NCTokenEnricher enricher]].
+ * "Swear-word" [[NCTokenEnricher token enricher]].
*
* This enricher adds `swear` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if word it represents is a swear word dictionary, i.e. the swear dictionary contains this word's
@@ -52,6 +52,7 @@ class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCT
map(p => stemmer.stem(p.toLowerCase)).toSet
logger.trace(s"Loaded resource: $dictRes")
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
toks.foreach(t => t.put("swear", swearWords.contains(stemmer.stem(t.getText.toLowerCase))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
index b84d3c18..cc624432 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCNLPEntityParser.scala
@@ -31,9 +31,9 @@ object NCNLPEntityParser:
import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
/**
- * NLP data [[NCEntityParser parser]].
+ * NLP data [[NCEntityParser entity parser]].
*
- * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID `nlp:entity`.
+ * This parser converts list of input [[NCToken]] instances to list of [[NCEntity]] instances with ID **nlp:entity**.
* All [[NCEntity]] instances contain following mandatory [[NCPropertyMap metadata]] properties:
* - nlp:entity:text
* - nlp:entity:index
@@ -42,12 +42,13 @@ import org.apache.nlpcraft.nlp.parsers.NCNLPEntityParser.*
*
* Also created [[NCEntity]] instances receive all another [[NCPropertyMap metadata]] properties
* which were added by configured in [[NCPipeline pipeline]] token [[org.apache.nlpcraft.NCTokenEnricher enrichers]].
- * These properties identifiers will be prefixed by `nlp:entity:`.
+ * These properties identifiers will be prefixed by **nlp:entity:**.
*
* @param predicate Predicate which allows to filter list of converted [[NCToken]] instances.
* By default all [[NCToken]] instances converted.
*/
class NCNLPEntityParser(predicate: NCToken => Boolean = _ => true) extends NCEntityParser:
+ /** @inheritdoc */
override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
toks.filter(predicate).map(t =>
new NCPropertyMapAdapter with NCEntity:
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
index 7613e237..a9244535 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPEntityParser.scala
@@ -47,8 +47,8 @@ object NCOpenNLPEntityParser:
new NCOpenNLPEntityParser(List(mdl))
/**
- * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser parser]] configured by
- * paths to [[https://opennlp.apache.org/ OpenNLP]] `name finders` models.
+ * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCEntityParser entity parser]] configured by
+ * paths to [[https://opennlp.apache.org/ OpenNLP]] **name finders** models.
*
* This parser prepares [[NCEntity]] instances which are detected by given models.
* These entities are created with ID `opennlp:modelId`, where `modelId` is [[https://opennlp.apache.org/ OpenNLP]] model ID.
@@ -92,6 +92,7 @@ class NCOpenNLPEntityParser(findersMdlsRes: List[String]) extends NCEntityParser
finally finder.clearAdaptiveData()
}
+ /** @inheritdoc */
override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
val txtArr = toks.map(_.getText).toArray
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
index 82c4b120..a148b3bb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCOpenNLPTokenParser.scala
@@ -27,8 +27,8 @@ import java.util
import java.util.Objects
/**
- * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCTokenParser parser]] configured
- * by path to [[https://opennlp.apache.org/ OpenNLP]] `tokenizers` model.
+ * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCTokenParser token parser]] configured
+ * by path to [[https://opennlp.apache.org/ OpenNLP]] **tokenizers** model.
*
* Some of OpenNLP prepared models can be found [[https://opennlp.sourceforge.net/models-1.5/ here]].
*
@@ -46,6 +46,7 @@ class NCOpenNLPTokenParser(tokMdlRes: String) extends NCTokenParser with LazyLog
logger.trace(s"Loaded resource: $tokMdlRes")
+ /** @inheritdoc */
override def tokenize(text: String): List[NCToken] =
this.synchronized {
tokenizer.tokenizePos(text).zipWithIndex.map { (p, idx) =>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
index e8d43aa1..dd157cc9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticElement.scala
@@ -22,7 +22,7 @@ import org.apache.nlpcraft.nlp.common.NCStemmer
/**
*
* Configuration element which helps to detect [[org.apache.nlpcraft.NCEntity NCEntity]] for
- * `Semantic` implementation of [[org.apache.nlpcraft.NCEntityParser NCEntityParser]].
+ * **Semantic** implementation of [[org.apache.nlpcraft.NCEntityParser NCEntityParser]].
*
* See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]].
*
@@ -30,14 +30,14 @@ import org.apache.nlpcraft.nlp.common.NCStemmer
*/
trait NCSemanticElement:
/**
- * Gets `id` for created [[org.apache.nlpcraft.NCEntity NCEntity]] instance.
+ * Gets **id** for created [[org.apache.nlpcraft.NCEntity NCEntity]] instance.
* Representation of [[org.apache.nlpcraft.NCEntity.getId NCEntity.getId()]] method.
* @return Element ID.
*/
def getId: String
/**
- * Gets `groups` for created [[org.apache.nlpcraft.NCEntity NCEntity]] instance.
+ * Gets **groups** for created [[org.apache.nlpcraft.NCEntity NCEntity]] instance.
* Representation of [[org.apache.nlpcraft.NCEntity.getGroups NCEntity.getGroups()]] method.
* @return Groups.
*/
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
index e96a257e..76e5fd83 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParser.scala
@@ -178,7 +178,7 @@ object NCSemanticEntityParser:
import org.apache.nlpcraft.nlp.parsers.NCSemanticEntityParser.*
/**
- * `Semantic` [[NCEntityParser parser]] implementation.
+ * **Semantic** [[NCEntityParser entity parser]] implementation.
*
* See detailed description on the website [[https://nlpcraft.apache.org/built-in-entity-parser.html#parser-semantic Semantic Parser]].
*
@@ -233,6 +233,7 @@ class NCSemanticEntityParser(
*/
private def warnMissedProperty(name: String): Unit = logger.warn(s"'$name' property not found. Is proper token enricher configured?")
+ /** @inheritdoc */
override def parse(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): List[NCEntity] =
if toks.exists(_.get[String]("stopword").isEmpty) then warnMissedProperty("stopword")
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
index c48ef94b..78d8b5e4 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
@@ -17,9 +17,8 @@
package org.apache.nlpcraft.nlp.enrichers
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.internal.util.NCResourceReader
-import org.apache.nlpcraft.nlp.common.NCStemmer
+import org.apache.nlpcraft.nlp.common.{NCEnStemmer, NCStemmer}
import org.apache.nlpcraft.nlp.enrichers.NCSwearWordsTokenEnricher
import org.apache.nlpcraft.nlp.enrichers.*
import org.apache.nlpcraft.nlp.util.*
@@ -29,10 +28,7 @@ import org.scalatest.funsuite.AnyFunSuite
*/
class NCSwearWordsTokenEnricherSpec extends AnyFunSuite:
private val swEnricher = new NCSwearWordsTokenEnricher(
- NCResourceReader.getPath("badfilter/swear_words.txt"),
- new NCStemmer:
- final private val ps: PorterStemmer = new PorterStemmer
- override def stem(word: String): String = ps.stem(word)
+ NCResourceReader.getPath("badfilter/swear_words.txt"), new NCEnStemmer
)
test("test") {
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index 2ea44e91..cdc51d87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -17,11 +17,10 @@
package org.apache.nlpcraft.nlp.util
-import opennlp.tools.stemmer.PorterStemmer
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.ascii.NCAsciiTable
import org.apache.nlpcraft.internal.util.NCResourceReader
-import org.apache.nlpcraft.nlp.common.NCStemmer
+import org.apache.nlpcraft.nlp.common.{NCEnStemmer, NCStemmer}
import org.apache.nlpcraft.nlp.parsers.*
import org.apache.nlpcraft.nlp.parsers
import org.apache.nlpcraft.nlp.parsers.{NCOpenNLPTokenParser, NCSemanticElement, NCSemanticEntityParser}
@@ -120,33 +119,24 @@ object NCTestUtils:
catch case e: Exception => println(s"Expected error: ${e.getMessage}")
}
- /**
- *
- */
- private def mkSemanticStemmer: NCStemmer =
- new NCStemmer():
- private val ps = new PorterStemmer
- override def stem(word: String): String = ps.synchronized { ps.stem(word) }
-
-
/**
*
* @param elms
* @param macros
*/
def mkEnSemanticParser(elms: List[NCSemanticElement], macros: Map[String, String] = Map.empty): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(mkSemanticStemmer, EN_TOK_PARSER, macros, elms)
+ parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, macros, elms)
/**
*
* @param elms
*/
def mkEnSemanticParser(elms: NCSemanticElement*): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(mkSemanticStemmer, EN_TOK_PARSER, elms.toList)
+ parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, elms.toList)
/**
*
* @param mdlSrc
*/
def mkEnSemanticParser(mdlSrc: String): NCSemanticEntityParser =
- parsers.NCSemanticEntityParser(mkSemanticStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file
+ parsers.NCSemanticEntityParser(new NCEnStemmer, EN_TOK_PARSER, mdlSrc)
\ No newline at end of file