You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2022/12/09 19:57:11 UTC

[incubator-nlpcraft] branch NLPCRAFT-520 updated: WIP

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
     new 6416404d WIP
6416404d is described below

commit 6416404d06df9762bc8141f72e8542fcd141abb7
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Fri Dec 9 11:57:05 2022 -0800

    WIP
---
 .../nlp/entity/parser/NCFrSemanticEntityParser.scala     |  2 +-
 .../nlp/entity/parser/NCRuSemanticEntityParser.scala     |  2 +-
 .../pizzeria/components/PizzeriaModelPipeline.scala      |  2 +-
 .../scala/org/apache/nlpcraft/NCPipelineBuilder.scala    |  2 +-
 .../scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala | 15 ++++++++-------
 .../nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala |  3 ++-
 .../nlp/enrichers/NCDictionaryTokenEnricher.scala        | 14 ++++++++------
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala       | 16 +++++++++-------
 .../nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala    |  2 +-
 .../nlp/parsers/NCSemanticEntityParserLemmaSpec.scala    |  2 +-
 .../scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala |  2 +-
 11 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
index c13251f3..89f3a696 100644
--- a/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-fr/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCFrSemanticEntityParser.scala
@@ -30,7 +30,7 @@ import org.apache.nlpcraft.nlp.parsers.*
 class NCFrSemanticEntityParser(src: String) extends NCSemanticEntityParser(
     new NCStemmer:
         private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.FRENCH)
-        override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
+        override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCFrTokenParser(),
     mdlSrcOpt = src.?
diff --git a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
index e4c48b94..955a9677 100644
--- a/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
+++ b/nlpcraft-examples/lightswitch-ru/src/main/scala/org/apache/nlpcraft/examples/lightswitch/nlp/entity/parser/NCRuSemanticEntityParser.scala
@@ -30,7 +30,7 @@ import org.apache.nlpcraft.nlp.common.NCStemmer
 class NCRuSemanticEntityParser(src: String) extends NCSemanticEntityParser(
     new NCStemmer:
         private val stemmer = new SnowballStemmer(SnowballStemmer.ALGORITHM.RUSSIAN)
-        override def stem(txt: String): String = stemmer.synchronized { stemmer.stem(txt.toLowerCase).toString }
+        override def stem(word: String): String = stemmer.synchronized { stemmer.stem(word.toLowerCase).toString }
     ,
     new NCRuTokenParser(),
     mdlSrcOpt = src.?
diff --git a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
index c9e86301..e21066a7 100644
--- a/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
+++ b/nlpcraft-examples/pizzeria/src/main/scala/org/apache/nlpcraft/examples/pizzeria/components/PizzeriaModelPipeline.scala
@@ -23,7 +23,7 @@ object PizzeriaModelPipeline:
         val tokParser = new NCStanfordNLPTokenParser(stanford)
         val stemmer = new NCStemmer():
             private val ps = new PorterStemmer
-            override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
+            override def stem(word: String): String = ps.synchronized { ps.stem(word) }
 
         import PizzeriaOrderMapperDesc as D
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
index 371b96e5..08672995 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCPipelineBuilder.scala
@@ -43,7 +43,7 @@ class NCPipelineBuilder:
     private def mkEnStemmer: NCStemmer =
         new NCStemmer:
             final private val ps: PorterStemmer = new PorterStemmer
-            override def stem(txt: String): String = ps.stem(txt)
+            override def stem(word: String): String = ps.stem(word)
 
     private def mkEnOpenNLPTokenParser: NCOpenNLPTokenParser =
         new NCOpenNLPTokenParser(NCResourceReader.getPath("opennlp/en-token.bin"))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
index b68d1986..b909018e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/common/NCStemmer.scala
@@ -20,16 +20,17 @@ package org.apache.nlpcraft.nlp.common
 import org.apache.nlpcraft.nlp.parsers.*
 
 /**
+  * Trait defining a general stemmer. Stemming is the process of reducing inflected (or sometimes derived)
+  * words to their word stem, base or root form—generally a written word form. Stemmer is used by some of the
+  * built-in pipeline components.
   *
-  * `Stemmer` trait. Stems are used for finding words by their reduced form.
-  * `Stemmer` trait implementation depends on language.
-  * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
-  *
+  * Read more about stemming at [[https://en.wikipedia.org/wiki/Stemming]].
   */
 trait NCStemmer:
     /**
-      * Gets text's stem.
+      * Gets the stem for a given word. Note that, unlike lemmatization, stemming does not
+      * require a context for the given word.
       *
-      * @param txt Stem.
+      * @param word Word to stem.
       */
-    def stem(txt: String): String
+    def stem(word: String): String
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
index cf3563c5..c0e692a3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
@@ -24,7 +24,7 @@ import java.io.*
 import scala.collection.mutable
 
 /**
-  * Brackets [[NCTokenEnricher enricher]].
+  * Brackets [[NCTokenEnricher token enricher]].
   *
   * This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
   * instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`,
@@ -34,6 +34,7 @@ import scala.collection.mutable
   */
 //noinspection DuplicatedCode,ScalaWeakerAccess
 class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging:
+    /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
         val stack = new java.util.Stack[String]()
         val map = mutable.HashMap.empty[NCToken, Boolean]
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 241adf0c..a394da37 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -21,7 +21,7 @@ import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.util.NCUtils
 
 /**
-  * "Known-word" [[NCTokenEnricher enricher]].
+  * "Known-word" [[NCTokenEnricher token enricher]].
   *
   * This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
   * instance if word it represents is a known dictionary word, i.e. the configured dictionary contains this word's
@@ -29,10 +29,11 @@ import org.apache.nlpcraft.internal.util.NCUtils
   * `false` value indicates otherwise.
   *
   * **NOTE:** this implementation requires `lemma` string [[NCPropertyMap metadata]] property that contains
-  * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this metadata property before
-  * this enricher in your [[NCPipeline pipeline]].
+  * token's lemma. You can configure [[NCOpenNLPTokenEnricher]] for required language that provides this
+  * metadata property before this enricher in your [[NCPipeline pipeline]].
   *
-  * @param dictRes Path to the dictionary. This dictionary should has a simple plain text format with one dictionary word on one line.
+  * @param dictRes Relative path, absolute path or URL to the dictionary file. The dictionary should have a simple
+  *         plain text format with *one lemma per line* with no empty lines, headers or comments allowed.
   */
 //noinspection DuplicatedCode,ScalaWeakerAccess
 class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
@@ -40,8 +41,9 @@ class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher:
 
     init()
 
-    private def init(): Unit = dict = NCUtils.readResource(dictRes, "UTF-8").toSet
-    private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("Lemma not found in token."))
+    private def init(): Unit = dict = NCUtils.readResource(dictRes).toSet
+    private def getLemma(t: NCToken): String = t.get("lemma").getOrElse(throw new NCException("'lemma'' property not found in token."))
 
+    /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
         toks.foreach(t => t.put("dict", dict.contains(getLemma(t))))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index b5b0c762..8de3f03d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -163,17 +163,19 @@ object NCEnStopWordsTokenEnricher:
 import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
 
 /**
-  * "Stop-word" [[NCTokenEnricher enricher]] for English language.
+  * "Stop-word" [[NCTokenEnricher token enricher]] for English (EN) language. Stop words are the words
+  * which are filtered out (i.e. stopped) before processing of natural language text because they are
+  * insignificant.
   *
   * This enricher adds `stopword` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
-  * instance if word it represents is an English stop-word. The value `true` of the metadata property indicates that this word is detected as stop-word,
-  * `false` value indicates otherwise.
+  * instance if word it represents is an English stop-word. The value `true` of the metadata property indicates that
+  * this word is detected as a stop-word, `false` value indicates otherwise.
   *
-  * Look more about stop-words [[https://en.wikipedia.org/wiki/Stop_word here]].
+  * More information about stop-words can be found at [[https://en.wikipedia.org/wiki/Stop_word]].
   *
-  * **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that contains
-  * token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language that provides this metadata property before
-  * this enricher in your [[NCPipeline pipeline]].
+  * **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that
+  * contain token's lemma and part of speech. You can configure [[NCOpenNLPTokenEnricher]] for English language
+  * that provides these metadata properties before this enricher in your [[NCPipeline pipeline]].
   *
   * @param addStopsSet User defined collection of additional stop-words.
   * @param exclStopsSet User defined collection of exceptions, that is words which should not be marked as stop-words during processing.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
index fcea197c..c48ef94b 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricherSpec.scala
@@ -32,7 +32,7 @@ class NCSwearWordsTokenEnricherSpec extends AnyFunSuite:
         NCResourceReader.getPath("badfilter/swear_words.txt"),
         new NCStemmer:
             final private val ps: PorterStemmer = new PorterStemmer
-            override def stem(txt: String): String = ps.stem(txt)
+            override def stem(word: String): String = ps.stem(word)
     )
 
     test("test") {
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
index cb134ef3..01c365ea 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/parsers/NCSemanticEntityParserLemmaSpec.scala
@@ -34,7 +34,7 @@ import scala.collection.mutable
 class NCSemanticEntityParserLemmaSpec extends AnyFunSuite:
     private val lemmaStemmer =
         new NCStemmer():
-            override def stem(txt: String): String = if wrapped(txt) then unwrap(txt) else UUID.randomUUID().toString
+            override def stem(word: String): String = if wrapped(word) then unwrap(word) else UUID.randomUUID().toString
 
     case class Data(text: String, elemId: String)
 
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index a23b0f89..2ea44e91 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -126,7 +126,7 @@ object NCTestUtils:
     private def mkSemanticStemmer: NCStemmer =
         new NCStemmer():
             private val ps = new PorterStemmer
-            override def stem(txt: String): String = ps.synchronized { ps.stem(txt) }
+            override def stem(word: String): String = ps.synchronized { ps.stem(word) }
 
 
     /**