You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2022/12/09 19:57:11 UTC
[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new 6416404d WIP
6416404d is described below
commit 6416404d06df9762bc8141f72e8542fcd141abb7
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Fri Dec 9 11:57:05 2022 -0800
WIP
---
.../nlp/entity/parser/NCFrSemanticEntityParser.scala | 2 +-
.../nlp/entity/parser/NCRuSemanticEntityParser.scala | 2 +-
.../pizzeria/components/PizzeriaModelPipeline.scala | 2 +-
.../scala/org/apache/nlpcraft/NCPipelineBuilder.scala | 2 +-
.../scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala | 15 ++++++++-------
.../nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala | 3 ++-
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 14 ++++++++------
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 16 +++++++++-------
.../nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala | 2 +-
.../nlp/parsers/NCSemanticEntityParserLemmaSpec.scala | 2 +-
.../scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala | 2 +-
11 files changed, 34 insertions(+), 28 deletions(-)
diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
index c13251f3..89f3a696 100644
--- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
@@ -30,7 +30,7 @@ import org.apache.nlpcraft.nlp.parsers.*
class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser(
new NCStemmer:
private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.FRENCH)
- override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
+ override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
,
new NCFrTokenParser(),
mdlSrcOpt = src.?
diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
index e4c48b94..955a9677 100644
--- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
@@ -30,7 +30,7 @@ import org.apache.nlpcraft.nlp.common.NCStemmer
class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser(
new NCStemmer:
private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.RUSSIAN)
- override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
+ override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
,
new NCRuTokenParser(),
mdlSrcOpt = src.?
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index c9e86301..e21066a7 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -23,7 +23,7 @@ object PizzeriaModelPipeline:
val tokParser = new NCStanfordNLPTokenParser(stanford)
val stemmer = new NCStemmer():
private val ps = new PorterStemmer
- override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
+ override def stem(word: String): String = ps.synchronized { ps.stem(word) }
import PizzeriaOrderMapperDesc as D
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 371b96e5..08672995 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -43,7 +43,7 @@ class NCPipelineBuilder:
private def mkEnStemmer: NCStemmer =
new NCStemmer:
final private val ps: PorterStemmer = new PorterStemmer
- override def stem(txt: String): String = ps.stem(txt)
+ override def stem(word: String): String = ps.stem(word)
private def mkEnOpenNLPTokenParser: NCOpenNLPTokenParser =
new NCOpenNLPTokenParser(NCResourceReader.getPath("opennlp/en-token.bin"))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
index b68d1986..b909018e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
@@ -20,16 +20,17 @@ package org.apache.nlpcraft.nlp.common
import org.apache.nlpcraft.nlp.parsers.*
/**
+ * Trait defining a general stemmer. Stemming is the process of reducing inflected (or sometimes derived)
+ * words to their word stem, base or root form—generally a written word form. Stemmer is used by some of the
+ * built-in pipeline components.
*
- * `Stemmer` trait. Stems are used for finding words by their reduced form.
- * `Stemmer` trait implementation depends on language.
- * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
- *
+ * Read more about stemming at [[https://en.wikipedia.org/wiki/Stemming]].
*/
trait NCStemmer:
/**
- * Gets text's stem.
+ * Gets a stem for a given word. Note that, unlike lemmatization, the stemming process does not
+ * require a context for the given word.
*
- * @param txt Stem.
+ * @param word Word to stem.
*/
- def stem(txt: String): String
+ def stem(word: String): String
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
index cf3563c5..c0e692a3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
@@ -24,7 +24,7 @@ import java.io.*
import scala.collection.mutable
/**
- * Brackets [[NCTokenEnricher enricher]].
+ * Brackets [[NCTokenEnricher token enricher]].
*
* This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`,
@@ -34,6 +34,7 @@ import scala.collection.mutable
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
val stack = new java.util.Stack[String]()
val map = mutable.HashMap.empty[NCToken, Boolean]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 241adf0c..a394da37 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -21,7 +21,7 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
/**
- * "Known-word" [[NCTokenEnricher enricher]].
+ * "Known-word" [[NCTokenEnricher token enricher]].
*
* This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if word it represents is a known dictionary word, i.e. the configured dictionary contains this word's
@@ -29,10 +29,11 @@ import org.apache.nlpcraft.internal.util.NCUtils
* `false` value indicates otherwise.
*
* **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains
- * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this metadata property before
- * this enricher in your [[NCPipeline pipeline]].
+ * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this
+ * metadata property before this enricher in your [[NCPipeline pipeline]].
*
- * @param dictRes Path to the dictionary. This dictionary should has a simple plain text format with one dictionary word on one line.
+ * @param dictRes Relative path, absolute path or URL to the dictionary file. The dictionary should have a simple
+ * plain text format with *one lemma per line* with no empty lines, headers or comments allowed.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
@@ -40,8 +41,9 @@ class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
init()
- private def init(): Unit = dict = NCUtils.readResource(dictRes, "UTF-8").toSet
- private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("Lemma not found in token."))
+ private def init(): Unit = dict = NCUtils.readResource(dictRes).toSet
+ private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("'lemma'' property not found in token."))
+ /** @inheritdoc */
override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
toks.foreach(t => t.put("dict", dict.contains(getLemma(t))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index b5b0c762..8de3f03d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -163,17 +163,19 @@ object NCEnStopWordsTokenEnricher:
import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
/**
- * "Stop-word" [[NCTokenEnricher enricher]] for English language.
+ * "Stop-word" [[NCTokenEnricher token enricher]] for English (EN) language. Stop words are the words
+ * which are filtered out (i.e. stopped) before processing of natural language text because they are
+ * insignificant.
*
* This enricher adds `stopword` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
- * instance if word it represents is an English stop-word. The value `true` of the metadata property indicates that this word is detected as stop-word,
- * `false` value indicates otherwise.
+ * instance if word it represents is an English stop-word. The value `true` of the metadata property indicates that
+ * this word is detected as a stop-word, `false` value indicates otherwise.
*
- * Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word here]].
+ * More information about stop-words can be found at [[https://en.wikipedia.org/wiki/Stop_word]].
*
- * **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that contains
- * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language that provides this metadata property before
- * this enricher in your [[NCPipeline pipeline]].
+ * **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that
+ * contain token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language
+ * that provides these metadata properties before this enricher in your [[NCPipeline pipeline]].
*
* @param addStopsSet User defined collection of additional stop-words.
* @param exclStopsSet User defined collection of exceptions, that is words which should not be marked as stop-words during processing.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
index fcea197c..c48ef94b 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
@@ -32,7 +32,7 @@ class NCSwearWordsTokenEnricherSpec extends AnyFunSuite:
NCResourceReader.getPath("badfilter/swear_words.txt"),
new NCStemmer:
final private val ps: PorterStemmer = new PorterStemmer
- override def stem(txt: String): String = ps.stem(txt)
+ override def stem(word: String): String = ps.stem(word)
)
test("test") {
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
index cb134ef3..01c365ea 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
@@ -34,7 +34,7 @@ import scala.collection.mutable
class NCSemanticEntityParserLemmaSpec extends AnyFunSuite:
private val lemmaStemmer =
new NCStemmer():
- override def stem(txt: String): String = if wrapped(txt) then unwrap(txt) else UUID.randomUUID().toString
+ override def stem(word: String): String = if wrapped(word) then unwrap(word) else UUID.randomUUID().toString
case class Data(text: String, elemId: String)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index a23b0f89..2ea44e91 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -126,7 +126,7 @@ object NCTestUtils:
private def mkSemanticStemmer: NCStemmer =
new NCStemmer():
private val ps = new PorterStemmer
- override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
+ override def stem(word: String): String = ps.synchronized { ps.stem(word) }
/**