You are viewing a plain text version of this content. The canonical link to it was not preserved in this plain-text copy.

Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/06/28 14:09:42 UTC

[incubator-nlpcraft] branch NLPCRAFT-70_NEW updated (cc40c20 -> 3c5fafb)

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a change to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git.


    from cc40c20  WIP.
     new 544d8a4  WIP.
     new 3c5fafb  WIP.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 nlpcraft/pom.xml                                   |   4 +
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |   2 +-
 .../nlpcraft/common/nlp/pos/NCPennTreebank.scala   |   4 +-
 .../enrichers/ctxword/NCContextWordEnricher.scala  | 102 ++++++++++++++++-----
 .../nlpcraft/model/ctxword/NCContextWordSpec.scala |  25 +++--
 pom.xml                                            |   7 ++
 6 files changed, 108 insertions(+), 36 deletions(-)

[incubator-nlpcraft] 01/02: WIP.

Posted by se...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 544d8a40860393698ec80b5c992e890770469c16
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jun 28 14:33:32 2021 +0300

    WIP.
---
 nlpcraft/pom.xml                                   |  4 ++++
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  2 +-
 .../enrichers/ctxword/NCContextWordEnricher.scala  | 26 ++++++++++++++++++++--
 pom.xml                                            |  7 ++++++
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index 4d8c292..62d3683 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -232,6 +232,10 @@
             <groupId>org.jline</groupId>
             <artifactId>jline</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.jibx</groupId>
+            <artifactId>jibx-tools</artifactId>
+        </dependency>
 
         <!-- Test dependencies. -->
         <dependency>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index d237558..ed22935 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -52,7 +52,7 @@ class NCNlpSentence(
     val srvReqId: String,
     val text: String,
     val enabledBuiltInToks: Set[String],
-    val mlConfig: Option[NCModelMLConfigMdo],
+    val mlConfig: Option[NCModelMLConfigMdo] = None,
     var mlData: Map[Int, Map[String, Double]] = Map.empty,
     override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
     var firstProbePhase: Boolean = true,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 4982e29..79c970e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -18,12 +18,16 @@
 package org.apache.nlpcraft.server.nlp.enrichers.ctxword
 
 import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer.stem
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
 import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
 import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest, NCWordSuggestion}
+import org.jibx.schema.codegen.extend.DefaultNameConverter
 
 import scala.collection.mutable
 import scala.concurrent.Await
@@ -33,9 +37,14 @@ import scala.concurrent.duration.Duration
   * ContextWord enricher.
   */
 object NCContextWordEnricher extends NCServerEnricher {
+    private final val POS_PLURALS = Set("NNS", "NNPS")
+    private final val POS_SINGULAR = Set("NN", "NNP")
+
     private final val MAX_CTXWORD_SCORE = 2
     private final val EXCLUSIVE_MIN_SCORE = -1.0
 
+    private final val CONVERTER = new DefaultNameConverter
+
     private case class ModelProbeKey(probeId: String, modelId: String)
     private case class ElementScore(elementId: String, averageScore: Double, senScore: Double, sampleScore: Double)
 
@@ -171,6 +180,19 @@ object NCContextWordEnricher extends NCServerEnricher {
     @throws[NCE]
     private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = {
         val sampleWords = cfg.samples.map(spaceTokenize).toSeq
+
+
+        sampleWords.map(s => {
+            val sampleSen = new NCNlpSentence("sampleReqId", sampleWords.mkString(" "), Set.empty)
+
+            NCBaseNlpEnricher.enrich(sampleSen)
+
+            sampleSen.
+        })
+
+
+
+
         val sampleWordsStems = sampleWords.map(_.map(stem))
 
         val recs: Map[String, Seq[NCSuggestionRequest]] =
@@ -227,7 +249,7 @@ object NCContextWordEnricher extends NCServerEnricher {
                     }
                 }
 
-                val nounToks = ns.tokens.filter(_.pos.startsWith("N"))
+                val nounToks = ns.tokens.filter(t => NCPennTreebank.NOUNS_POS.contains(t.pos))
 
                 if (nounToks.nonEmpty) {
                     val key = ModelProbeKey(cfg.probeId, cfg.modelId)
diff --git a/pom.xml b/pom.xml
index fd0d687..59e871a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -154,6 +154,7 @@
         <lightstep.grpc.ver>0.15.8</lightstep.grpc.ver>
         <junit.ver>5.5.1</junit.ver>
         <jsonpath.ver>2.4.0</jsonpath.ver>
+        <jibx.tools.ver>1.3.3</jibx.tools.ver>
 
         <!-- Force specific encoding on text resources. -->
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -474,6 +475,12 @@
             </dependency>
 
             <dependency>
+                <groupId>org.jibx</groupId>
+                <artifactId>jibx-tools</artifactId>
+                <version>${jibx.tools.ver}</version>
+            </dependency>
+
+            <dependency>
                 <groupId>edu.stanford.nlp</groupId>
                 <artifactId>stanford-corenlp</artifactId>
                 <version>${stanford.corenlp.ver}</version>

[incubator-nlpcraft] 02/02: WIP.

Posted by se...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 3c5fafb57f142c597b40332aaa561721ef8e469f
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jun 28 17:07:54 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/pos/NCPennTreebank.scala   |   4 +-
 .../enrichers/ctxword/NCContextWordEnricher.scala  | 114 +++++++++++++--------
 .../nlpcraft/model/ctxword/NCContextWordSpec.scala |  25 +++--
 3 files changed, 91 insertions(+), 52 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
index a61c63a..0c6e0de 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
@@ -68,7 +68,9 @@ object NCPennTreebank {
     final val SYNTH_POS_DESC = "Synthetic tag"
 
     // Useful POS tags sets.
-    final val NOUNS_POS = Seq("NN", "NNS", "NNP", "NNPS")
+    final val NOUNS_POS_PLURALS = Seq("NNS", "NNPS")
+    final val NOUNS_POS_SINGULAR = Seq("NN", "NNP")
+    final val NOUNS_POS = NOUNS_POS_PLURALS ++ NOUNS_POS_SINGULAR
     final val VERBS_POS = Seq("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
     final val WHS_POS = Seq("WDT", "WP", "WP$", "WRB")
     final val JJS_POS = Seq("JJ", "JJR", "JJS")
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 79c970e..4d83ab7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -18,14 +18,13 @@
 package org.apache.nlpcraft.server.nlp.enrichers.ctxword
 
 import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
-import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
+import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager, NCNlpWord}
 import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
-import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
 import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest, NCWordSuggestion}
 import org.jibx.schema.codegen.extend.DefaultNameConverter
 
@@ -37,9 +36,6 @@ import scala.concurrent.duration.Duration
   * ContextWord enricher.
   */
 object NCContextWordEnricher extends NCServerEnricher {
-    private final val POS_PLURALS = Set("NNS", "NNPS")
-    private final val POS_SINGULAR = Set("NN", "NNP")
-
     private final val MAX_CTXWORD_SCORE = 2
     private final val EXCLUSIVE_MIN_SCORE = -1.0
 
@@ -47,17 +43,24 @@ object NCContextWordEnricher extends NCServerEnricher {
 
     private case class ModelProbeKey(probeId: String, modelId: String)
     private case class ElementScore(elementId: String, averageScore: Double, senScore: Double, sampleScore: Double)
+    private case class ValuesHolder(
+        values: Map[/** Stem */String, /** Element ID */Set[String]],
+        valuesStems: Map[/** Value */String, /** Element ID */Set[String]],
+    )
 
     private type ElementStemScore = Map[/** Element ID */String, Map[/** Stem */String,/** Score */Double]]
 
-    @volatile private var values: mutable.HashMap[ModelProbeKey, Map[/** Stem */String, /** Element ID */Set[String]]] = _
+    @volatile private var valuesStems: mutable.HashMap[ModelProbeKey, ValuesHolder] = _
     @volatile private var samples: mutable.HashMap[ModelProbeKey, ElementStemScore] = _
 
+    @volatile private var parser: NCNlpParser = _
+
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
         ackStarting()
 
-        values = mutable.HashMap.empty
+        valuesStems = mutable.HashMap.empty
         samples = mutable.HashMap.empty
+        parser = NCNlpServerManager.getParser
 
         ackStarted()
     }
@@ -65,8 +68,9 @@ object NCContextWordEnricher extends NCServerEnricher {
     override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
         ackStopping()
 
+        parser = null
         samples = null
-        values = null
+        valuesStems = null
 
         ackStopped()
     }
@@ -96,6 +100,7 @@ object NCContextWordEnricher extends NCServerEnricher {
 
     /**
       *
+      * @param nlpWords
       * @param sampleWords
       * @param sampleWordsStems
       * @param elemValuesSyns
@@ -103,27 +108,56 @@ object NCContextWordEnricher extends NCServerEnricher {
       * @return
       */
     private def parseSample(
+        nlpWords: Seq[Seq[NCNlpWord]],
         sampleWords: Seq[Seq[String]],
         sampleWordsStems: Seq[Seq[String]],
         elemValuesSyns: Set[String],
         elemValuesSynsStems: Set[String]
     ): Iterable[NCSuggestionRequest] = {
+        require(nlpWords.size == sampleWords.size)
         require(sampleWords.size == sampleWordsStems.size)
         require(elemValuesSyns.size == elemValuesSynsStems.size)
 
-        sampleWordsStems.zip(sampleWords).flatMap { case (sampleWordsStem, sampleWord) =>
+        sampleWordsStems.zip(sampleWords).zip(nlpWords).flatMap { case ((sampleWordsStem, sampleWords), nlpWords) =>
             val idxs = elemValuesSynsStems.flatMap(valSynsStem => {
                 val i = sampleWordsStem.indexOf(valSynsStem)
 
                 if (i >= 0) Some(i) else None
             })
 
+            def mkRequest(idx: Int, syn: String): NCSuggestionRequest = {
+                def mkSentence(syn: String): String =
+                    sampleWords.zipWithIndex.map { case (w, i) => if (i != idx) w else syn }.mkString(" ")
+
+                var newSen = mkSentence(syn)
+
+                val nlpWordsNew = parser.parse(newSen)
+
+                require(nlpWords.size == nlpWordsNew.size)
+
+                val pos = nlpWords(idx).pos
+                val posNew = nlpWordsNew(idx).pos
+
+                if (NOUNS_POS_SINGULAR.contains(pos) && NOUNS_POS_PLURALS.contains(posNew)) {
+                    println(s"newSen1=$newSen")
+
+                    newSen = mkSentence(CONVERTER.depluralize(syn))
+
+                    println(s"newSen2=$newSen")
+                }
+                else if (NOUNS_POS_PLURALS.contains(pos) && NOUNS_POS_SINGULAR.contains(posNew)) {
+                    println(s"newSen1=$newSen")
+
+                    newSen = mkSentence(CONVERTER.pluralize(syn))
+
+                    println(s"newSen3=$newSen")
+                }
+
+                NCSuggestionRequest(newSen, idx)
+            }
+
             for (idx <- idxs; syn <- elemValuesSyns)
-                yield
-                    NCSuggestionRequest(
-                        sampleWord.zipWithIndex.map { case (w, i) => if (i != idx) w else syn }.mkString(" "),
-                        idx
-                    )
+                yield mkRequest(idx, syn)
         }
     }
 
@@ -157,17 +191,20 @@ object NCContextWordEnricher extends NCServerEnricher {
       * @param key
       * @return
       */
-    private def getValuesData(cfg: NCModelMLConfigMdo, key: ModelProbeKey): Map[String, Set[String]] =
-        values.synchronized { values.get(key) } match {
+    private def getValuesData(cfg: NCModelMLConfigMdo, key: ModelProbeKey): ValuesHolder =
+        valuesStems.synchronized { valuesStems.get(key) } match {
             case Some(cache) => cache
             case None =>
-                val res = cfg.values.
-                    flatMap { case (elemId, vals) => vals.map { case (_, vals) => vals.map(stem(_) -> elemId) } }.
-                    flatten.
-                    groupBy { case (stem, _) => stem }.
-                    map { case (stem, map) => stem -> map.map {case (_, elemId) => elemId }.toSet }
+                def mkMap(convert: String => String): Map[String, Set[String]] =
+                    cfg.values.
+                        flatMap { case (elemId, vals) => vals.map { case (_, vals) => vals.map(convert(_) -> elemId) } }.
+                        flatten.
+                        groupBy { case (converted, _) => converted }.
+                        map { case (converted, map) => converted -> map.map {case (_, elemId) => elemId }.toSet }
+
+                val res = ValuesHolder(mkMap(stem), mkMap(_.toLowerCase))
 
-                values.synchronized { values += key -> res }
+                valuesStems.synchronized { valuesStems += key -> res }
 
                 res
         }
@@ -179,29 +216,20 @@ object NCContextWordEnricher extends NCServerEnricher {
       */
     @throws[NCE]
     private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = {
-        val sampleWords = cfg.samples.map(spaceTokenize).toSeq
-
-
-        sampleWords.map(s => {
-            val sampleSen = new NCNlpSentence("sampleReqId", sampleWords.mkString(" "), Set.empty)
-
-            NCBaseNlpEnricher.enrich(sampleSen)
-
-            sampleSen.
-        })
-
-
-
+        val samplesSeq = cfg.samples.toSeq
+        val sampleWords = samplesSeq.map(spaceTokenize)
+        val nlpWords = samplesSeq.map(s => parser.parse(s))
 
         val sampleWordsStems = sampleWords.map(_.map(stem))
 
-        val recs: Map[String, Seq[NCSuggestionRequest]] =
+        val recs =
             (
                 for (
                     (elemId, elemValues) <- cfg.values;
-                    elemValuesSyns = elemValues.flatMap(_._2).toSet;
+                    // Uses single words synonyms only.
+                    elemValuesSyns = elemValues.flatMap(_._2).toSet.filter(!_.contains(' '));
                     elemValuesSynsStems = elemValuesSyns.map(stem);
-                    suggReq <- parseSample(sampleWords, sampleWordsStems, elemValuesSyns, elemValuesSynsStems)
+                    suggReq <- parseSample(nlpWords, sampleWords, sampleWordsStems, elemValuesSyns, elemValuesSynsStems)
                 )
                     yield (elemId, suggReq)
             ).
@@ -249,7 +277,7 @@ object NCContextWordEnricher extends NCServerEnricher {
                     }
                 }
 
-                val nounToks = ns.tokens.filter(t => NCPennTreebank.NOUNS_POS.contains(t.pos))
+                val nounToks = ns.tokens.filter(t => NOUNS_POS.contains(t.pos))
 
                 if (nounToks.nonEmpty) {
                     val key = ModelProbeKey(cfg.probeId, cfg.modelId)
@@ -257,7 +285,11 @@ object NCContextWordEnricher extends NCServerEnricher {
                     // 1. Values. Direct.
                     val valuesData = getValuesData(cfg, key)
 
-                    for (nounTok <- nounToks; elemId <- valuesData.getOrElse(nounTok.stem, Set.empty))
+                    for (nounTok <- nounToks; elemId <- valuesData.values.getOrElse(nounTok.lemma.toLowerCase, Set.empty))
+                        add(nounTok, elemId, 1, 1, 1)
+                    for (nounTok <- nounToks; elemId <- valuesData.values.getOrElse(nounTok.normText, Set.empty))
+                        add(nounTok, elemId, 1, 1, 1)
+                    for (nounTok <- nounToks; elemId <- valuesData.valuesStems.getOrElse(nounTok.stem, Set.empty))
                         add(nounTok, elemId, 1, 1, 1)
 
                     // 2. Via examples.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
index cbe7ce2..f8d3f12 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -106,8 +106,12 @@ class NCContextWordSpecModel extends NCModel {
   */
 @NCTestEnvironment(model = classOf[NCContextWordSpecModel], startClient = true)
 class NCContextWordSpec extends NCTestContext {
-    private def check(txt: String, elemId: String, words: String*): Unit =
-        require(s"$elemId ${words.mkString(" ")}" == getClient.ask(txt).getResult.get())
+    private def check(txt: String, elemId: String, words: String*): Unit = {
+        val res = getClient.ask(txt).getResult.get()
+        val exp = s"$elemId ${words.mkString(" ")}"
+
+        require(exp == res, s"Expected: $exp, result: $res")
+    }
 
     @BeforeEach
     private[ctxword] def before(): Unit = testsData.clear()
@@ -117,13 +121,14 @@ class NCContextWordSpec extends NCTestContext {
 
     @Test
     private[ctxword] def test(): Unit = {
-        check("I want to have a dog and fox", "class:animal", "dog", "fox")
-        check("I fed your fish", "class:animal", "fish")
-
-        check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
-        check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
-
-        check("The frost is possible today", "class:weather", "frost")
-        check("There's a very strong wind from the east now", "class:weather", "wind")
+        check("I want to have a dogs and foxes", "class:animal", "dogs", "foxes")
+        //check("I want to have a dog and fox", "class:animal", "dog", "fox")
+//        check("I fed your fish", "class:animal", "fish")
+//
+//        check("I like to drive my Porsche and Volkswagen", "class:cars", "Porsche", "Volkswagen")
+//        check("Peugeot added motorcycles to its range in 1901", "class:cars", "Peugeot", "motorcycles")
+//
+//        check("The frost is possible today", "class:weather", "frost")
+//        check("There's a very strong wind from the east now", "class:weather", "wind")
     }
 }