Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/09 11:26:33 UTC

[incubator-nlpcraft] 01/03: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit a349b6fe1baa5d6069f725f232b32146efb21873
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Apr 9 14:09:52 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/NCNlpSentenceNote.scala    |  25 ++--
 .../apache/nlpcraft/model/impl/NCTokenImpl.scala   |   8 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 144 +++++++++++----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  15 ++-
 .../nlpcraft/model/sparse/NCSparseSpec.scala       |  15 ++-
 5 files changed, 118 insertions(+), 89 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index c0923ae..9adbe01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -23,12 +23,13 @@ import org.apache.nlpcraft.common.ascii._
 import scala.collection.JavaConverters._
 import scala.collection.{Seq, Set, mutable}
 import scala.language.implicitConversions
+import java.io.{Serializable ⇒ JSerializable}
 
 /**
   * Sentence token note is a typed map of KV pairs.
   *
   */
-class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) extends java.io.Serializable with NCAsciiLike {
+class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends JSerializable with NCAsciiLike {
     import NCNlpSentenceNote._
 
     @transient
@@ -75,7 +76,7 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
         )
 
     override def clone(): NCNlpSentenceNote = {
-        val m = mutable.Map.empty[String, java.io.Serializable] ++ values
+        val m = mutable.Map.empty[String, JSerializable] ++ values
 
         new NCNlpSentenceNote(m.toMap)
     }
@@ -91,20 +92,20 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
       *
       * @return
       */
-    def skipNlp(): Map[String, java.io.Serializable] =
+    def skipNlp(): Map[String, JSerializable] =
         values.filter { case (key, _) ⇒ !SKIP_CLONE.contains(key) && key != "noteType" }
 
     /**
       *
       */
-    def asMetadata(): Map[String, java.io.Serializable] =
+    def asMetadata(): Map[String, JSerializable] =
         if (isUser)
             values.get("meta") match {
-                case Some(meta) ⇒ meta.asInstanceOf[Map[String, java.io.Serializable]]
-                case None ⇒ Map.empty[String, java.io.Serializable]
+                case Some(meta) ⇒ meta.asInstanceOf[Map[String, JSerializable]]
+                case None ⇒ Map.empty[String, JSerializable]
             }
         else {
-            val md = mutable.Map.empty[String, java.io.Serializable]
+            val md = mutable.Map.empty[String, JSerializable]
 
             val m = if (noteType != "nlpcraft:nlp") skipNlp() else values
 
@@ -117,8 +118,8 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
      *
      * @param kvs
      */
-    def clone(kvs : (String, java.io.Serializable)*): NCNlpSentenceNote = {
-        val m = mutable.HashMap.empty[String, java.io.Serializable] ++ values
+    def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote = {
+        val m = mutable.HashMap.empty[String, JSerializable] ++ values
 
         kvs.foreach(kv ⇒ m += kv._1 → kv._2)
 
@@ -206,7 +207,7 @@ object NCNlpSentenceNote {
     /**
      * To immutable map.
      */
-    implicit def values(note: NCNlpSentenceNote): Map[String, java.io.Serializable] = note.values
+    implicit def values(note: NCNlpSentenceNote): Map[String, JSerializable] = note.values
 
     /**
       * Creates new note with given parameters.
@@ -228,7 +229,7 @@ object NCNlpSentenceNote {
         val (sparsity, tokMinIndex, tokMaxIndex, tokWordIndexes, len) = calc(wordIndexesOpt.getOrElse(indexes))
 
         new NCNlpSentenceNote(
-            mutable.HashMap[String, java.io.Serializable]((
+            mutable.HashMap[String, JSerializable]((
             params.filter(_._2 != null) :+
                ("noteType" → typ) :+
                ("tokMinIndex" → indexes.min) :+
@@ -240,7 +241,7 @@ object NCNlpSentenceNote {
                ("wordLength" → len) :+
                ("sparsity" → sparsity) :+
                ("contiguous" → (sparsity == 0))
-            ).map(p ⇒ p._1 → p._2.asInstanceOf[java.io.Serializable]): _*).toMap
+            ).map(p ⇒ p._1 → p._2.asInstanceOf[JSerializable]): _*).toMap
         )
     }
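
Side note on the rename import introduced above: aliasing java.io.Serializable as JSerializable is the usual Scala idiom for disambiguating it from scala.Serializable while keeping signatures short. A minimal self-contained sketch of the same pattern (names here are illustrative, not from the repository):

    import java.io.{Serializable ⇒ JSerializable}

    import scala.collection.mutable

    object AliasSketch {
        // Builds a typed KV map in the style of NCNlpSentenceNote: every value
        // must be a java.io.Serializable, and the alias keeps signatures short.
        def mkNote(params: (String, JSerializable)*): Map[String, JSerializable] = {
            val m = mutable.HashMap.empty[String, JSerializable]

            params.foreach(kv ⇒ m += kv._1 → kv._2)

            m.toMap
        }

        def main(args: Array[String]): Unit =
            // String and boxed Int both implement java.io.Serializable.
            println(mkNote("noteType" → "nlpcraft:nlp", "tokMinIndex" → Int.box(0)))
    }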
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
index 017ead1..8c5005a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
@@ -17,7 +17,7 @@
 
 package org.apache.nlpcraft.model.impl
 
-import java.io.Serializable
+import java.io.{Serializable ⇒ JSerializable}
 import java.util.Collections
 
 import org.apache.nlpcraft.common._
@@ -50,7 +50,7 @@ private[nlpcraft] class NCTokenImpl(
     endCharIndex: Int,
     meta: Map[String, Object],
     isAbstractProp: Boolean
-) extends NCToken with Serializable {
+) extends NCToken with JSerializable {
     require(mdl != null)
     require(srvReqId != null)
     require(id != null)
@@ -106,7 +106,7 @@ private[nlpcraft] object NCTokenImpl {
         // nlpcraft:nlp and some optional (after collapsing).
         require(tok.size <= 2, s"Unexpected token [size=${tok.size}, token=$tok]")
 
-        val md = mutable.HashMap.empty[String, java.io.Serializable]
+        val md = mutable.HashMap.empty[String, JSerializable]
 
         tok.foreach(n ⇒ {
             val id = n.noteType.toLowerCase
@@ -142,7 +142,7 @@ private[nlpcraft] object NCTokenImpl {
                 // Special synthetic meta data element.
                 md.put("nlpcraft:nlp:freeword", false)
 
-                elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[java.io.Serializable]) }
+                elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[JSerializable]) }
 
                 new NCTokenImpl(
                     mdl.model,
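
The metadata copy above narrows java.lang.Object values to java.io.Serializable with a runtime cast, so a non-serializable metadata value fails fast at this point rather than later during serialization. A small standalone sketch of that pattern (hypothetical names, not the actual API):

    import java.io.{Serializable ⇒ JSerializable}
    import java.util.{HashMap ⇒ JHashMap, Map ⇒ JMap}

    import scala.collection.JavaConverters._
    import scala.collection.mutable

    object MetaCopySketch {
        // Copies Java metadata into a Scala map, narrowing Object values to
        // java.io.Serializable; a non-serializable value throws ClassCastException.
        def copyMeta(meta: JMap[String, Object]): mutable.Map[String, JSerializable] = {
            val md = mutable.HashMap.empty[String, JSerializable]

            meta.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[JSerializable]) }

            md
        }

        def main(args: Array[String]): Unit = {
            val m = new JHashMap[String, Object]()

            m.put("nlpcraft:nlp:freeword", java.lang.Boolean.FALSE)

            println(copyMeta(m))
        }
    }
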
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0ec40cd..d668c02 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -130,13 +130,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
     case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
 
-    // Found-by-synonym model element.
+    /**
+      * Found-by-synonym model element.
+      *
+      * @param element Element.
+      * @param tokens Element tokens.
+      * @param synonym Synonym.
+      * @param parts Parts for DSL synonyms.
+      * @param allToksIdxs All token indexes (the whole token slice; relevant for sparse tokens).
+      */
     case class ElementMatch(
         element: NCElement,
         tokens: Seq[NlpToken],
         synonym: Synonym,
         parts: Seq[TokType],
-        tokIdxs: Seq[Int]
+        allToksIdxs: Seq[Int]
     ) extends Ordered[ElementMatch] {
         // Tokens sparsity.
         lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index))
@@ -206,7 +214,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
       * @param syn
       * @param metaOpt
       * @param parts
-      * @param toksIdxs
+      * @param allToksIdxs
       */
     private def mark(
         ns: NCNlpSentence,
@@ -216,16 +224,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         syn: Option[Synonym],
         metaOpt: Option[Map[String, Object]],
         parts: Seq[TokType],
-        toksIdxs: Seq[Int]
+        allToksIdxs: Seq[Int]
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
         // For system elements.
         params += "direct" → direct.asInstanceOf[AnyRef]
 
-        val toksIdxsJava: JList[Int] = toksIdxs.asJava
-
-        params += "allToksIndexes" → toksIdxsJava
+        // Internal usage.
+        params += "allToksIndexes" → allToksIdxs.asJava
 
         syn match {
             case Some(s) ⇒
@@ -334,6 +341,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
                         )
 
+                        // Checks element's tokens.
                         if (!alreadyMarked(matchedToks, elemId))
                             mark(
                                 ns,
@@ -379,17 +387,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
       * @param toks
       * @param elemId
       */
-    private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean =
-        toks.forall(_.isTypeOf(elemId)) ||
-        toks.flatten.exists(n ⇒
-            n.noteType == elemId &&
-            (
-                n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
-                    case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
-                    case None ⇒ false
-                }
-            )
-        )
+    private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = {
+        def hasIndex(n: NCNlpSentenceNote): Boolean =
+            n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
+                case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
+                case None ⇒ false
+            }
+
+        toks.flatten.exists(n ⇒ n.noteType == elemId && hasIndex(n))
+    }
 
     /**
       *
@@ -519,39 +525,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             }).seq
     }
 
-    /**
-      *
-      * @param ns
-      * @param mdlId
-      * @param matches
-      */
-    private def processMatches(ns: NCNlpSentence, mdlId: String, matches: Seq[ElementMatch]): Unit = {
-        // TODO:matchesNorm
-        // Add notes for all remaining (non-intersecting) matches.
-        for ((m, idx) ← matches.zipWithIndex) {
-            if (DEEP_DEBUG)
-                logger.trace(
-                    s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
-                        s"elementId=${m.element.getId}, " +
-                        s"synonym=${m.synonym}, " +
-                        s"tokens=${tokString(m.tokens)}" +
-                        s"]"
-                )
-
-            val tokIdxs = m.tokens.map(_.index)
-            val direct = m.synonym.isDirect && (tokIdxs == tokIdxs.sorted)
-
-            // TODO:
-            if (!alreadyMarked(m.tokens, m.element.getId)) {
-                mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.tokIdxs)
-
-                println(s"SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
-            }
-            else
-                println(s"NOT SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
-        }
-    }
-
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -561,7 +534,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
         startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
             val req = NCRequestImpl(senMeta, srvReqId)
-            val h = mkComplexes(mdl, ns)
+            lazy val h = mkComplexes(mdl, ns)
 
             startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
                 var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
@@ -571,9 +544,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 val combosToks = combos(ns)
 
                 def go(): Unit = {
-                    println
-                    println(s"GO $state")
-
                     val matches = mutable.ArrayBuffer.empty[ElementMatch]
 
                     val cacheSparse = mkCache(mdl)
@@ -582,22 +552,30 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
                     var found = false
 
-                    def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+                    def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
                         var added = false
 
                         if (!matchExist(elm.getId, res)) {
-                            matches += ElementMatch(elm, res, s, parts, tokIdxs)
+                            matches += ElementMatch(elm, res, s, parts, allToksIdxs)
 
                             added = true
                         }
 
-                        cache(elm.getId) += tokIdxs
+                        cache(elm.getId) += allToksIdxs
                         found = true
 
-                        println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added")
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Found element [" +
+                                    s"id=${elm.getId}, " +
+                                    s"type=$typ, " +
+                                    s"indexes=${res.map(_.index).mkString("|")}, " +
+                                    s"allTokensIndexes=${allToksIdxs.mkString("|")}, " +
+                                    s"added=$added" +
+                                    s"]"
+                            )
                     }
 
-                    // TODO:
                     def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean =
                         matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet))
 
@@ -607,15 +585,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         lazy val tokStems = toks.map(_.stem).mkString(" ")
 
                         // Attempt to match each element.
-                        // TODO: alreadyMarked - the same thing can be found but marked with fewer tokens (how do we avoid considering it right away?)
                         for (
                             elm ← mdl.elements.values;
                             elemId = elm.getId;
                             dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
                             sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
-                            if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
+                            if
+                                (!dirProc || !sparseProc) &&
+                                // Checks the whole token slice.
+                                !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
                         ) {
-                            //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)="+cacheSparse(elemId).mkString("|"))
                             // 1. SIMPLE.
                             found = false
 
@@ -662,9 +641,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                     }
 
                             // 2. DSL.
-                            val dslEnabled = state != SIMPLE
-
-                            if (dslEnabled && mdl.synonymsDsl.nonEmpty) {
+                            if (state != SIMPLE && mdl.synonymsDsl.nonEmpty) {
                                 found = false
 
                                 // 2.1 Sparse.
@@ -691,9 +668,42 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         }
                     }
 
-                    processMatches(ns, mdlId, matches)
+                    for ((m, idx) ← matches.zipWithIndex) {
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
+                                    s"elementId=${m.element.getId}, " +
+                                    s"synonym=${m.synonym}, " +
+                                    s"tokens=${tokString(m.tokens)}" +
+                                    s"]"
+                            )
+
+                        val tokIdxs = m.tokens.map(_.index)
+                        val direct = m.synonym.isDirect && !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y }
+
+                        var added = false
+
+                        // Checks element's tokens.
+                        if (!alreadyMarked(m.tokens, m.element.getId)) {
+                            mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.allToksIdxs)
+
+                            added = true
+                        }
+
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Element ${if (added) "added" else "skipped"} [" +
+                                    s"id=${m.element.getId}, " +
+                                    s"indexes=${m.tokens.map(_.index).mkString("|")}, " +
+                                    s"allTokensIndexes=${m.allToksIdxs.mkString("|")}, " +
+                                    s"]"
+                            )
+                    }
                 }
 
+                if (DEEP_DEBUG)
+                    logger.trace(s"Exexucution started with state: $state")
+
                 go()
 
                 if (state == SIMPLE) {
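
The behavioral core of this change is the reworked alreadyMarked: a token span now counts as already marked only if some existing note of the same element covers it as a contiguous slice of the note's stored "allToksIndexes". A simplified standalone model of that check (notes reduced to just the two fields involved; names are illustrative):

    import java.util.{List ⇒ JList}

    import scala.collection.JavaConverters._

    object AlreadyMarkedSketch {
        // A note reduced to its type and the indexes it covers.
        case class Note(noteType: String, allToksIdxs: JList[Int])

        // True if some note of the given element covers `tokIdxs` as a contiguous slice.
        def alreadyMarked(notes: Seq[Note], tokIdxs: Seq[Int], elemId: String): Boolean =
            notes.exists(n ⇒ n.noteType == elemId && n.allToksIdxs.asScala.containsSlice(tokIdxs))

        def main(args: Array[String]): Unit = {
            val notes = Seq(Note("x", Seq(1, 2, 3).asJava))

            assert(alreadyMarked(notes, Seq(2, 3), "x"))  // Contiguous sub-slice → marked.
            assert(!alreadyMarked(notes, Seq(1, 3), "x")) // Gap, not a slice → not marked.
            assert(!alreadyMarked(notes, Seq(2), "y"))    // Different element id → not marked.
        }
    }
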
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 2776677..541966a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -733,9 +733,22 @@ object NCSentenceManager extends NCService {
             )
         )
 
+        def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
+
+        // Drops similar sentences (with the same notes structure), keeping the ones with the most notes found.
+        sens = sens.groupBy(notNlpNotes(_).groupBy(_.noteType).keys.toSeq.sorted.distinct).
+            flatMap(p ⇒ {
+                val m: Map[NCNlpSentence, Int] = p._2.map(p ⇒ p → notNlpNotes(p).size).toMap
+
+                val max = m.values.max
+
+                m.filter(_._2 == max).keys
+            }).
+            toSeq
+
         // Drops similar sentences (with same tokens structure).
         // Among similar sentences we prefer one with minimal free words count.
-        sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))).
+        sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
             map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq
     }
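
The new pre-filter groups sentence variants by their sorted, distinct set of non-NLP note types and, within each group, keeps only the variants carrying the maximum number of such notes. The same groupBy/max shape on toy data, with a sentence modeled as the plain list of its note types (illustrative only):

    object DedupSketch {
        // Per note-type signature, keeps only the variants with the most notes.
        def dedup(sens: Seq[Seq[String]]): Seq[Seq[String]] =
            sens.groupBy(_.sorted.distinct).
                flatMap(p ⇒ {
                    val m: Map[Seq[String], Int] = p._2.map(s ⇒ s → s.size).toMap

                    val max = m.values.max

                    m.filter(_._2 == max).keys
                }).
                toSeq

        def main(args: Array[String]): Unit =
            // Both {a, b} variants compete; the one carrying 3 notes survives.
            println(dedup(Seq(Seq("a", "a", "b"), Seq("a", "b"), Seq("c"))))
    }
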
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
index 37df085..8441532 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
@@ -36,17 +36,20 @@ class NCSparseModel extends NCAbstractTokensModel {
         val variants = ctx.getVariants.asScala
 
         def checkOneVariant(sparsity: Int): Unit = {
-            require(variants.size == 1)
+            require(variants.size == 1, "There should be a single variant.")
 
             val toks = variants.head.asScala.filter(_.getId == "xyz")
 
-            require(toks.size == 3)
+            require(toks.size == 3, "There should be 3 `xyz` tokens.")
 
             checkSparsity(sparsity, toks)
         }
 
         def checkSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Unit =
-            require(toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity))
+            require(
+                toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity),
+                s"Sparsity of each tokens should be: $sparsity."
+            )
 
         def checkExists(sparsity: Int): Unit =
             require(
@@ -58,9 +61,11 @@ class NCSparseModel extends NCAbstractTokensModel {
                             checkSparsity(sparsity, toks)
 
                             true
-                        case _ ⇒ false
+                        case _ ⇒
+                            false
                     }
-                })
+                }),
+                s"Variant with 3 `xyz` tokens should be exists."
             )
 
         ctx.getRequest.getNormalizedText match {