You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/14 10:32:30 UTC

[incubator-nlpcraft] 01/03: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit b9f683a1d77a5e54052a58e9f8f0c3dd45fe57d4
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Dec 14 14:19:39 2022 +0400

    WIP.
---
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 54 ++++++++++++++--------
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    | 36 +++++++++++++++
 2 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3fc1e57a..3ac14f84 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -198,11 +198,14 @@ class NCEnStopWordsTokenEnricher(
     private var stopWords: StopWordHolder = _
     private var exceptions: StopWordHolder = _
 
+    private case class TokenExtra(lemma: String, stem: String):
+        val lemmaStem: String = getStem(lemma)
+
     init()
 
     private def read(path: String): Set[String] = NCUtils.readTextGzipResource(path, "UTF-8", logger).toSet
-    private def stem(s: String): String = stemmer.stem(s.toLowerCase)
-    private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(stem).mkString(" ")
+    private def getStem(s: String): String = stemmer.stem(s.toLowerCase)
+    private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getText).map(getStem).mkString(" ")
 
     /**
       * Stop words holder, used for hash search.
@@ -293,11 +296,11 @@ class NCEnStopWordsTokenEnricher(
                 wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
 
     /**
-      * 
+      *
       */
     private def init(): Unit =
-        addStems = if addStopsSet == null then Set.empty else addStopsSet.map(stem)
-        exclStems = if exclStopsSet == null then Set.empty else exclStopsSet.map(stem)
+        addStems = if addStopsSet == null then Set.empty else addStopsSet.map(getStem)
+        exclStems = if exclStopsSet == null then Set.empty else exclStopsSet.map(getStem)
 
         def check(name: String, set: Set[String]): Unit =
             if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.")
@@ -308,7 +311,7 @@ class NCEnStopWordsTokenEnricher(
         val dups = addStems.intersect(exclStems)
         if dups.nonEmpty then E(s"Duplicate stems detected between additional and excluded stopwords [dups=${dups.mkString(",")}]")
 
-        percents = PERCENTS.map(stem)
+        percents = PERCENTS.map(getStem)
 
         // Stemmatization is done already by generator.
         NCUtils.execPar(
@@ -429,7 +432,7 @@ class NCEnStopWordsTokenEnricher(
                 val (word, form) =
                     if isCase then (s, ORIG)
                     else
-                        if !hasPoses then (stem(s), STEM) else (stem(s), LEM)
+                        if !hasPoses then (getStem(s), STEM) else (getStem(s), LEM)
                 mHash((isExc, form)).addCondition(word, poses)
             else
                 val b = s.take(idxWild)
@@ -506,23 +509,29 @@ class NCEnStopWordsTokenEnricher(
     /**
      * Marks as stopwords those words with a POS from the configured list which are also placed before other stop words.
       */
-    private def processCommonStops(ns: Seq[NCToken], stops: mutable.HashSet[NCToken]): Unit =
+    private def processCommonStops(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra], stops: mutable.HashSet[NCToken]): Unit =
         /**
          * Marks as stopwords those words with a POS from the configured list which are also placed before other stop words.
           */
         @tailrec
-        def processCommonStops0(ns: Seq[NCToken]): Unit =
+        def processCommonStops0(ns: Seq[NCToken], extraToks: Map[NCToken, TokenExtra]): Unit =
             val max = ns.size - 1
             var stop = true
 
-            for ((tok, idx) <- ns.zipWithIndex if idx != max && !isStopWord(tok) && !exclStems.contains(stem(tok.getText)) &&
-                POSES.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
+            for (
+                (tok, idx) <- ns.zipWithIndex;
+                extra = extraToks(tok)
+                if
+                    idx != max && !isStopWord(tok) &&
+                    !exclStems.contains(extra.stem) &&
+                    !exclStems.contains(extra.lemmaStem) &&
+                    POSES.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
                 stops += tok
                 stop = false
 
-            if !stop then processCommonStops0(ns)
+            if !stop then processCommonStops0(ns, extraToks)
 
-        processCommonStops0(ns)
+        processCommonStops0(ns, extraToks)
 
     /** @inheritdoc */
     override def enrich(req: NCRequest, cfg: NCModelConfig, toks: List[NCToken]): Unit =
@@ -535,11 +544,15 @@ class NCEnStopWordsTokenEnricher(
 
         val stops = mutable.HashSet.empty[NCToken]
 
-        for (tok <- toks)
+        val extraToks =
+            scala.collection.mutable.LinkedHashMap.empty[NCToken, TokenExtra] ++=
+            toks.map(t => t -> TokenExtra(getLemma(t), getStem(t.getText)))
+
+        for ((tok, extra) <- extraToks)
             val idx = tok.getIndex
             val pos = getPos(tok)
-            val lemma = getLemma(tok)
-            val st = stem(tok.getText)
+            val lemma = extra.lemma
+            val st = extra.stem
 
             def isFirst: Boolean = idx == 0
             def isLast: Boolean = idx == toks.length - 1
@@ -615,10 +628,13 @@ class NCEnStopWordsTokenEnricher(
         // | Pass #6.                                        |
         // | Processing additional and excluded stop words.  |
         // +-------------------------------------------------+
-        for (t <- toks if addStems.contains(stem(t.getText)))
+        for ((t, extra) <- extraToks if addStems.contains(extra.stem) || addStems.contains(extra.lemmaStem))
             stops += t
 
-        for (t <- stops.filter(t => exclStems.contains(stem(t.getText))))
+        for (t <- stops.filter( t =>
+            val extra = extraToks(t)
+            exclStems.contains(extra.stem) || exclStems.contains(extra.lemmaStem))
+        )
             stops -= t
 
         // +-------------------------------------------------+
@@ -627,7 +643,7 @@ class NCEnStopWordsTokenEnricher(
         // | configured list, which also placed before       |
         // | another stop words.                             |
         // +-------------------------------------------------+
-        processCommonStops(toks, stops)
+        processCommonStops(toks, extraToks, stops)
 
         // +-------------------------------------------------+
         // | Pass #8.                                        |
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index d0c92d40..142c16b4 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -21,6 +21,7 @@ import org.apache.nlpcraft.*
 import internal.util.NCResourceReader
 import nlp.util.*
 import nlp.enrichers.NCEnStopWordsTokenEnricher
+import org.apache.nlpcraft.nlp.stemmer.NCStemmer
 import org.scalatest.funsuite.AnyFunSuite
 
 /**
@@ -60,4 +61,39 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
             false,
             true
         )
+        // The synonym is defined as a lemma => all kinds of input words should be found.
+        test(
+            new NCEnStopWordsTokenEnricher(Set("woman")),
+            "woman women",
+            true,
+            true
+        )
+        // The synonym is defined in some form => only input words in the same form should be found.
+        test(
+            new NCEnStopWordsTokenEnricher(Set("women")),
+            "woman women",
+            false,
+            true
+        )
+        // The synonym is defined in some form, but the stemmer is very rough => all kinds of input words should be found.
+        test(
+            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+            "woman women",
+            true,
+            true
+        )
+        // The synonym is defined as a lemma => all kinds of input words should be found, but an excluded set is defined.
+        test(
+            new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+            "woman women",
+            true,
+            false
+        )
+        // Very rough stemmer defined.
+        test(
+            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
+            "weather windows",
+            true,
+            true
+        )
     }
\ No newline at end of file