You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/14 10:32:32 UTC
[incubator-nlpcraft] 03/03: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 96e574aeb832e6f36c5012be62cb2d817abd84fe
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Dec 14 14:32:39 2022 +0400

    WIP.
---
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 28 ++++++++++------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 108fe3aa..367d96be 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -201,8 +201,8 @@ class NCEnStopWordsTokenEnricher(
     private var stopWords: StopWordHolder = _
     private var exceptions: StopWordHolder = _
 
-    private case class TokenExtra(lemma: String, stem: String):
-        val lemmaStem: String = getStem(lemma)
+    private case class TokenExtra(lemma: String, stemTxt: String):
+        val stemLemma: String = getStem(lemma)
 
     init()
 
@@ -521,8 +521,8 @@ class NCEnStopWordsTokenEnricher(
                 extra = extraToks(tok)
                 if
                     idx != max && !isStopWord(tok) &&
-                    !exclStems.contains(extra.stem) &&
-                    !exclStems.contains(extra.lemmaStem) &&
+                    !exclStems.contains(extra.stemTxt) &&
+                    !exclStems.contains(extra.stemLemma) &&
                     POSES.contains(getPos(tok)) && isStopWord(ns(idx + 1)))
                 stops += tok
                 stop = false
@@ -550,7 +550,7 @@ class NCEnStopWordsTokenEnricher(
             val idx = tok.getIndex
             val pos = getPos(tok)
             val lemma = extra.lemma
-            val st = extra.stem
+            val st = extra.stemTxt
 
             def isFirst: Boolean = idx == 0
             def isLast: Boolean = idx == toks.length - 1
@@ -618,32 +618,30 @@ class NCEnStopWordsTokenEnricher(
 
         // +-------------------------------------------------+
         // | Pass #5.                                        |
-        // | Mark words with POSes before stopwords.        |
+        // | Mark words with POSes before stopwords.         |
         // +-------------------------------------------------+
         markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
 
         // +-------------------------------------------------+
         // | Pass #6.                                        |
-        // | Processing additional and excluded stopword.  |
+        // | Processing additional and excluded stopwords.   |
         // +-------------------------------------------------+
-        for ((t, extra) <- extraToks if addStems.contains(extra.stem) || addStems.contains(extra.lemmaStem)) stops += t
+        def has(set: Set[String], extra: TokenExtra) = set.contains(extra.stemTxt) || set.contains(extra.stemLemma)
 
-        for (t <- stops.filter( t =>
-            val extra = extraToks(t)
-            exclStems.contains(extra.stem) || exclStems.contains(extra.lemmaStem))
-        ) stops -= t
+        for ((t, extra) <- extraToks if has(addStems, extra)) stops += t
+        for ((t, _) <- stops.map(t => t -> extraToks(t)).filter { (_, extra) => has(exclSet, extra)}) stops -= t
 
         // +-------------------------------------------------+
         // | Pass #7.                                        |
         // | Marks as stopwords, words with POS from         |
         // | configured list, which also placed before       |
-        // | another stopword.                             |
+        // | another stopword.                               |
         // +-------------------------------------------------+
         processCommonStops(toks, extraToks, stops)
 
         // +-------------------------------------------------+
         // | Pass #8.                                        |
-        // | Deletes stopword if they are marked as quoted.|
+        // | Deletes stopwords if they are marked as quoted. |
         // +-------------------------------------------------+
         var quotes = toks.filter(isQuote)
 
@@ -663,7 +661,7 @@ class NCEnStopWordsTokenEnricher(
 
         // +-------------------------------------------------+
         // | Pass #9.                                        |
-        // | Deletes stopword if they are brackets.        |
+        // | Deletes stopwords if they are brackets.         |
         // +-------------------------------------------------+
         val stack = new java.util.Stack[String]()
         val set = mutable.HashSet.empty[NCToken]