Posted to commits@nlpcraft.apache.org by se...@apache.org on 2022/12/19 08:34:01 UTC

[incubator-nlpcraft] branch NLPCRAFT-520 updated (ac03dc8e -> 8f8e9cf4)

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a change to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


    from ac03dc8e WIP.
     new 2427be97 WIP.
     new 4b1d67b7 WIP.
     new 8f8e9cf4 WIP.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../main/resources/stopwords/first_words.txt.gz    | Bin 4024880 -> 0 bytes
 .../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes
 .../apache/nlpcraft/internal/util/NCUtils.scala    |  18 -----
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala |  24 ++++---
 .../{tools => impl}/NCEnStopWordGenerator.scala    |  46 ++++--------
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    |  78 ++++++++++++++-------
 6 files changed, 81 insertions(+), 85 deletions(-)
 delete mode 100644 nlpcraft/src/main/resources/stopwords/first_words.txt.gz
 delete mode 100644 nlpcraft/src/main/resources/stopwords/noun_words.txt.gz
 rename nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/{tools => impl}/NCEnStopWordGenerator.scala (87%)
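
The summary above removes the pre-generated stopword archives (first_words.txt.gz, noun_words.txt.gz) and moves NCEnStopWordGenerator from the tools package to impl; in the commit diffs below the enricher builds these word sets at runtime instead. A minimal usage sketch, assuming only the constructor parameters (addSet, exclSet, stemmer) that appear in the diffs below; everything else is illustrative:

    import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher

    // Named parameters as exercised by NCStopWordsEnricherSpec on this branch:
    // "test" is always marked as a stopword, "the" never is. The default
    // English stemmer is used when no stemmer argument is passed.
    val enricher = new NCEnStopWordsTokenEnricher(
        addSet = Set("test"),
        exclSet = Set("the")
    )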


[incubator-nlpcraft] 02/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 4b1d67b7910107371da42bb2b8d641dcced52ab8
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 19 11:19:10 2022 +0400

    WIP.
---
 .../main/resources/stopwords/first_words.txt.gz    | Bin 4024880 -> 0 bytes
 .../src/main/resources/stopwords/noun_words.txt.gz | Bin 862 -> 0 bytes
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    |  23 ++++++++++++++-------
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz b/nlpcraft/src/main/resources/stopwords/first_words.txt.gz
deleted file mode 100644
index e92748b4..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/first_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz b/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz
deleted file mode 100644
index bfeb6fac..00000000
Binary files a/nlpcraft/src/main/resources/stopwords/noun_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index 142c16b4..b81ee116 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -56,43 +56,50 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
             false
         )
         test(
-            new NCEnStopWordsTokenEnricher(Set("test"), Set("the")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")),
             "the test",
             false,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman")),
             "woman women",
             true,
             true
         )
         // The synonym is defined in some form => only in the same form input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women")),
             "woman women",
             false,
             true
         )
         // The synonym is defined in some form, but stemmer is very rough =>  all kind of input words should be found.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.take(3)),
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)),
             "woman women",
             true,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined.
         test(
-            new NCEnStopWordsTokenEnricher(Set("woman"), Set("women")),
+            new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")),
             "woman women",
             true,
             false
         )
-        // Very rough stemmer defined.
+        // Very rough stemmers defined.
         test(
-            new NCEnStopWordsTokenEnricher(addStopsSet = Set("women"), stemmer = _.head.toString),
-            "weather windows",
+            new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString),
+            "weather windows noun",
+            true,
+            true,
+            false
+        )
+        test(
+            new NCEnStopWordsTokenEnricher(stemmer = _ => ""),
+            "weather noun",
             true,
             true
         )
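
The hunk above switches the spec to named arguments (addSet/exclSet replacing the former addStopsSet) and adds a case with a deliberately degenerate stemmer. A short sketch of why the rough stemmer changes matching, assuming NCStemmer accepts a Scala 3 lambda exactly as in the test above; the assertion line is illustrative:

    // With a stemmer that keeps only the first three characters, "woman" and
    // "women" both stem to "wom", so adding "women" also catches "woman".
    val rough: String => String = _.take(3)
    assert(rough("woman") == rough("women"))  // both are "wom"

    // Mirrors the test case above: the stemmer is supplied as a lambda.
    val enricher = new NCEnStopWordsTokenEnricher(
        addSet = Set("women"),
        stemmer = _.take(3)
    )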


[incubator-nlpcraft] 03/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 8f8e9cf487f250ae9b2ec5d3b47d0cbc8a38c165
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 19 12:34:14 2022 +0400

    WIP.
---
 .../nlp/enrichers/impl/NCEnStopWordGenerator.scala |  4 +-
 .../nlp/enrichers/NCStopWordsEnricherSpec.scala    | 57 ++++++++++++++--------
 2 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
index b90e0567..3295738a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
@@ -156,7 +156,7 @@ import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator.*
   */
 private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer):
     def mkNounWords(): Set[String] =
-        val buf = new mutable.HashSet[String]()
+        val buf = new mutable.ArrayBuffer[String]()
 
         for (w1 <- NOUN_WORDS)
             buf += s"$w1"
@@ -167,7 +167,7 @@ private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer):
         buf.map(stem).toSet
 
     def mkFirstWords(): Set[String] =
-        val buf = new mutable.HashSet[String]()
+        val buf = new mutable.ArrayBuffer[String]()
 
         // is there
         for (w1 <- QWORDS2)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
index b81ee116..b5b0ee25 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCStopWordsEnricherSpec.scala
@@ -18,11 +18,13 @@
 package org.apache.nlpcraft.nlp.enrichers
 
 import org.apache.nlpcraft.*
-import internal.util.NCResourceReader
 import nlp.util.*
 import nlp.enrichers.NCEnStopWordsTokenEnricher
-import org.apache.nlpcraft.nlp.stemmer.NCStemmer
 import org.scalatest.funsuite.AnyFunSuite
+import org.apache.nlpcraft.internal.util.NCUtils
+
+import scala.collection.*
+import scala.concurrent.ExecutionContext
 
 /**
   *
@@ -34,73 +36,90 @@ class NCStopWordsEnricherSpec extends AnyFunSuite:
       * @param txt
       * @param boolVals
       */
-    private def test(stopEnricher: NCEnStopWordsTokenEnricher, txt: String, boolVals: Boolean*): Unit =
-        val toks = EN_TOK_PARSER.tokenize(txt)
-        require(toks.size == boolVals.size)
+    private def add(stopEnricher: => NCEnStopWordsTokenEnricher, txt: String, boolVals: Boolean*)
+        (using bodies: mutable.ArrayBuffer[() => Unit], errs: mutable.ArrayBuffer[Throwable]): Unit =
+        val body: () => Unit = () =>
+            try
+                val toks = EN_TOK_PARSER.tokenize(txt)
+                require(toks.size == boolVals.size)
+
+                toks.foreach(tok => require(tok.get[Boolean]("stopword").isEmpty))
 
-        toks.foreach(tok => require(tok.get[Boolean]("stopword").isEmpty))
+                val req = NCTestRequest(txt)
 
-        val req = NCTestRequest(txt)
+                EN_TOK_LEMMA_POS_ENRICHER.enrich(req, CFG, toks)
+                stopEnricher.enrich(req, CFG, toks)
 
-        EN_TOK_LEMMA_POS_ENRICHER.enrich(req, CFG, toks)
-        stopEnricher.enrich(req, CFG, toks)
+                NCTestUtils.printTokens(toks)
+                toks.zip(boolVals).foreach { (tok, boolVal) => require(tok[Boolean]("stopword") == boolVal) }
+            catch
+                case e: Throwable => errs.synchronized { errs += e }
 
-        NCTestUtils.printTokens(toks)
-        toks.zip(boolVals).foreach { (tok, boolVal) => require(tok[Boolean]("stopword") == boolVal) }
+        bodies += body
 
     test("test") {
-        test(
+        val errs = mutable.ArrayBuffer.empty[Throwable]
+        val bodies = mutable.ArrayBuffer.empty[() => Unit]
+
+        given mutable.ArrayBuffer[Throwable] = errs
+        given mutable.ArrayBuffer[() => Unit] = bodies
+
+        add(
             EN_TOK_STOP_ENRICHER,
             "the test",
             true,
             false
         )
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("test"), exclSet = Set("the")),
             "the test",
             false,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found.
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("woman")),
             "woman women",
             true,
             true
         )
         // The synonym is defined in some form => only in the same form input words should be found.
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("women")),
             "woman women",
             false,
             true
         )
         // The synonym is defined in some form, but stemmer is very rough =>  all kind of input words should be found.
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.take(3)),
             "woman women",
             true,
             true
         )
         // The synonym is defined as lemma => all kind of input words should be found, but excluded set is defined.
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("woman"), exclSet = Set("women")),
             "woman women",
             true,
             false
         )
         // Very rough stemmers defined.
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(addSet = Set("women"), stemmer = _.head.toString),
             "weather windows noun",
             true,
             true,
             false
         )
-        test(
+        add(
             new NCEnStopWordsTokenEnricher(stemmer = _ => ""),
             "weather noun",
             true,
             true
         )
+
+        NCUtils.execPar(bodies)(ExecutionContext.Implicits.global)
+        errs.foreach(_.printStackTrace)
+        require(errs.isEmpty)
     }
\ No newline at end of file
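
The spec rewrite above collects each former test(...) call as a deferred body and runs them in parallel through NCUtils.execPar, gathering failures into a synchronized buffer that is checked once at the end. A self-contained sketch of the same pattern using plain scala.concurrent Futures rather than the project's execPar helper; the two stand-in bodies are illustrative:

    import scala.collection.mutable
    import scala.concurrent.{Await, ExecutionContext, Future}
    import scala.concurrent.duration.*

    @main def runAll(): Unit =
        given ExecutionContext = ExecutionContext.global

        val errs = mutable.ArrayBuffer.empty[Throwable]
        val bodies = mutable.ArrayBuffer.empty[() => Unit]

        // Stand-ins for the individual enricher test cases.
        bodies += (() => require(1 + 1 == 2))
        bodies += (() => require("women".take(3) == "wom"))

        // Run every body in parallel, collecting (not throwing) failures.
        val futs = bodies.toSeq.map(b => Future(b()).recover { case e => errs.synchronized { errs += e } })
        Await.ready(Future.sequence(futs), 1.minute)

        errs.foreach(_.printStackTrace())
        require(errs.isEmpty, s"${errs.size} test bodies failed.")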


[incubator-nlpcraft] 01/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 2427be979d3d87e9fe94e752ea3c73f9ee376b5a
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Dec 19 11:08:24 2022 +0400

    WIP.
---
 .../apache/nlpcraft/internal/util/NCUtils.scala    | 18 --------
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 24 ++++++-----
 .../{tools => impl}/NCEnStopWordGenerator.scala    | 50 ++++++++--------------
 3 files changed, 31 insertions(+), 61 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6791429d..1b81acd0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -424,24 +424,6 @@ object NCUtils extends LazyLogging:
             out
         catch case e: IOException => E(s"Failed to read stream: $res", e)
 
-    /**
-      * @param res Gzip resource, file absolute or relative path.
-      * @param res
-      * @param enc        Encoding. Default value is "UTF-8".
-      * @param strip      Strip flag. If `true` it strips all read lines. Default value is `true`.
-      * @param convert    Line conversion method. Applied after `strip`. By default it passes lines as is.
-      * @param filterText . Filtering text flag. If `true` it skips empty lines and lines with headers (# symbol). Default value is `false`.
-      * @param log Logger.
-      */
-    def readGzipLines(
-        res: String,
-        enc: String = "UTF-8",
-        strip: Boolean = true,
-        convert: String => String = s => s,
-        filterText: Boolean = false,
-        log: Logger = logger
-    ): Iterator[String] = readLines(new GZIPInputStream(getStream(res)), enc, strip, convert, filterText, log)
-
     /**
       *
       * @param bodies
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 6dfb1b2c..cb1baae2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.nlp.enrichers
 import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.util.NCUtils as U
+import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator
 import org.apache.nlpcraft.nlp.stemmer.*
 
 import java.io.*
@@ -82,11 +83,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
         "--" // Synthetic POS.
     )
 
-    // Stemmatization is done already by generator.
-    // It is initialized in the companion for test performance reasons.
-    private val FIRST_WORDS: Set[String] = read("stopwords/first_words.txt.gz")
-    private val NOUN_WORDS: Set[String] = read("stopwords/noun_words.txt.gz")
-
     private val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT", "WP", "WP$", "WRB")
     private val Q_POS = Set("``", "''")
     private val PERCENTS = Set(
@@ -100,7 +96,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
         "percent"
     )
 
-    private def read(path: String): Set[String] = U.readGzipLines(path, convert = _.toLowerCase, filterText = true, log = logger).toSet
     private def getPos(t: NCToken): String = U.getProperty(t, "pos")
     private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
     private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
@@ -199,6 +194,8 @@ class NCEnStopWordsTokenEnricher(
     exclSet: Set[String] = Set.empty,
     stemmer: NCStemmer = new NCEnStemmer
 ) extends NCTokenEnricher with LazyLogging:
+    require(addSet != null, "Additional stopwords cannot be null.")
+    require(exclSet != null, "Exceptions stopwords cannot be null.")
     require(stemmer != null, "Stemmer cannot be null.")
 
     private var addStems: Set[String] = _
@@ -206,6 +203,8 @@ class NCEnStopWordsTokenEnricher(
     private var percents: Set[String] = _
     private var stopWords: StopWordHolder = _
     private var exceptions: StopWordHolder = _
+    private var firstWords: Set[String] = _
+    private var nounWords: Set[String] = _
 
     private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String)
     private object TokenExtra:
@@ -309,8 +308,8 @@ class NCEnStopWordsTokenEnricher(
       *
       */
     private def init(): Unit =
-        addStems = if addSet == null then Set.empty else addSet.map(getStem)
-        exclStems = if exclSet == null then Set.empty else exclSet.map(getStem)
+        addStems = addSet.map(getStem)
+        exclStems = exclSet.map(getStem)
 
         def check(name: String, set: Set[String]): Unit =
             if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.")
@@ -329,6 +328,11 @@ class NCEnStopWordsTokenEnricher(
         stopWords = m(false)
         exceptions = m(true)
 
+        val gen = new NCEnStopWordGenerator(stemmer)
+
+        firstWords = gen.mkFirstWords()
+        nounWords = gen.mkNounWords()
+
     /**
       * Parses configuration template.
       *
@@ -601,7 +605,7 @@ class NCEnStopWordsTokenEnricher(
 
         // All sentence first stopword + first non stop word.
         val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p)
-        for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if FIRST_WORDS.contains(key) && !isException(tup._1))
+        for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
             tup._1.foreach(tok => stops += tok)
             foundKeys += key
 
@@ -611,7 +615,7 @@ class NCEnStopWordsTokenEnricher(
         // +-------------------------------------------------+
         for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
             foundKeys.find(key.startsWith) match
-                case Some(s) => if NOUN_WORDS.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
+                case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
                 case None => ()
 
         // +-------------------------------------------------+
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
similarity index 86%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
index adb66e4a..b90e0567 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
@@ -15,23 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.enrichers.tools
+package org.apache.nlpcraft.nlp.enrichers.impl
 
 import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer
+import org.apache.nlpcraft.nlp.stemmer.*
 
 import scala.collection.mutable
 
-/**
-  * Generates first word sequences.
-  */
-object NCEnStopWordGenerator:
-    private final lazy val stemmer = new NCEnStemmer
-
-    // Output files.
-    private val FIRST_WORDS_FILE = "first_words.txt"
-    private val NOUN_WORDS_FILE = "noun_words.txt"
-
+private[enrichers] object NCEnStopWordGenerator:
+    // All string data should be in lowercase.
     private final val QWORDS = Seq(
         "what",
         "when",
@@ -157,13 +149,14 @@ object NCEnStopWordGenerator:
         "couple of"
     )
 
-    private def mkGzip(path: String, lines: Iterable[Any]): Unit =
-        val p = NCUtils.mkPath(s"nlpcraft/src/main/resources/stopwords/$path")
-        NCUtils.mkTextFile(p, lines)
-        NCUtils.gzipPath(p)
+import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator.*
 
-    private[tools] def mkNounWords(): Unit =
-        val buf = new mutable.ArrayBuffer[String]()
+/**
+  * Generates first word sequences.
+  */
+private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer):
+    def mkNounWords(): Set[String] =
+        val buf = new mutable.HashSet[String]()
 
         for (w1 <- NOUN_WORDS)
             buf += s"$w1"
@@ -171,13 +164,10 @@ object NCEnStopWordGenerator:
         for (w1 <- NOUN_WORDS; w2 <- NOUN_WORDS2)
             buf += s"$w1 $w2"
 
-        mkGzip(NOUN_WORDS_FILE, stem(buf.toSeq))
-
-    private def stem(s: String): String = s.split(" ").map(p => stemmer.stem(p.toLowerCase)).mkString(" ")
-    private def stem(seq: Seq[String]): Seq[String] = seq.map(stem)
+        buf.map(stem).toSet
 
-    private[tools] def mkFirstWords(): Unit =
-        val buf = new mutable.ArrayBuffer[String]()
+    def mkFirstWords(): Set[String] =
+        val buf = new mutable.HashSet[String]()
 
         // is there
         for (w1 <- QWORDS2)
@@ -307,14 +297,8 @@ object NCEnStopWordGenerator:
         for (w0 <- DWORDS_PRE; w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- QWORDS)
             buf += s"$w0 $w1 $w2 $w3"
 
-        mkGzip(FIRST_WORDS_FILE, stem(buf.toSeq))
+        buf.map(stem).toSet
 
-    /**
-      *
-      * @param args
-      */
-    def main(args: Array[String]): Unit =
-        mkFirstWords()
-        mkNounWords()
+    // All data already in lowercase.
+    private def stem(s: String): String = s.split(" ").map(stemmer.stem).mkString(" ")
 
-        sys.exit()
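
The patch above turns NCEnStopWordGenerator from a one-off tool that wrote gzip files into a class parameterized by an NCStemmer that returns the stemmed phrase sets directly. A stripped-down sketch of that shape; the trait, word lists, and suffix-stripping stemmer are stand-ins, and only the split/stem/rejoin structure of stem() mirrors the diff:

    import scala.collection.mutable

    trait MiniStemmer:
        def stem(w: String): String

    // Crude suffix stripping, purely for illustration.
    object StubStemmer extends MiniStemmer:
        def stem(w: String): String = w.stripSuffix("s")

    class MiniPhraseGenerator(stemmer: MiniStemmer):
        private val WORDS1 = Seq("list", "weather")
        private val WORDS2 = Seq("of", "in")

        // Stem a multi-word phrase word by word, then rejoin with spaces.
        private def stem(s: String): String = s.split(" ").map(stemmer.stem).mkString(" ")

        def mkPhrases(): Set[String] =
            val buf = mutable.ArrayBuffer.empty[String]
            for w1 <- WORDS1 do buf += w1
            for w1 <- WORDS1; w2 <- WORDS2 do buf += s"$w1 $w2"
            buf.map(stem).toSet

    // Usage: MiniPhraseGenerator(StubStemmer).mkPhrases()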