You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/08 19:48:40 UTC

[incubator-nlpcraft] 05/07: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 1f47a982c25c0b82802b8881277bb51e1a6f3442
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Apr 8 18:57:21 2021 +0300

    WIP.
---
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   9 +-
 .../probe/mgrs/deploy/NCDeployManager.scala        |  15 +-
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |  24 ++-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 161 ++++++++++-----------
 4 files changed, 103 insertions(+), 106 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 0e418b3..31fa627 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -30,8 +30,7 @@ import scala.collection.{Map, Seq}
   * @param intents
   * @param directSynonyms
   * @param sparseSynonyms
-  * @param directSynonymsDsl
-  * @param addStopWordsStems
+  * @param synonymsDsl
   * @param exclStopWordsStems
   * @param suspWordsStems
   * @param elements
@@ -42,13 +41,13 @@ case class NCProbeModel(
     intents: Seq[NCIdlIntent],
     directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
     sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
-    directSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
-    sparseSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
+    synonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
     addStopWordsStems: Set[String],
     exclStopWordsStems: Set[String],
     suspWordsStems: Set[String],
     elements: Map[String /*Element ID*/ , NCElement],
     samples: Set[(String, Seq[Seq[String]])]
 ) {
-    def hasDslSynonyms(elemId: String): Boolean = directSynonymsDsl.contains(elemId) || sparseSynonymsDsl.contains(elemId)
+    def hasDslSynonyms(elemId: String): Boolean = synonymsDsl.contains(elemId)
+    def hasDslSynonyms: Boolean = synonymsDsl.nonEmpty
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index aa3b99e..04ed091 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -197,8 +197,10 @@ object NCDeployManager extends NCService with DecorateAsScala {
 
         // TODO: Sparse for nonDSL
         def ok(b: Boolean, exp: Boolean): Boolean = if (exp) b else !b
-        def filter(dsl: Boolean, sparse: Boolean): Set[SynonymHolder] =
-            syns.toSet.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl) && ok(s.sparse && s.syn.size > 1, sparse))
+        def filterDsl(syns: Set[SynonymHolder], dsl: Boolean): Set[SynonymHolder] =
+            syns.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl))
+        def filterSparse(syns: Set[SynonymHolder], sparse: Boolean): Set[SynonymHolder] =
+            syns.filter(s ⇒ ok(s.sparse && s.syn.size > 1, sparse))
 
         var cnt = 0
         val maxCnt = mdl.getMaxTotalSynonyms
@@ -506,14 +508,15 @@ object NCDeployManager extends NCService with DecorateAsScala {
         def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
             set.groupBy(_.elmId).map(p ⇒ p._1 → p._2.map(_.syn).toSeq.sortBy(-_.size))
 
+        val notDsl = filterDsl(syns.toSet, dsl = false)
+
         NCProbeModel(
             model = mdl,
             solver = solver,
             intents = intents.map(_._1).toSeq,
-            directSynonyms = mkFastAccessMap(filter(dsl = false, sparse = false), NCProbeSynonymsWrapper(_)),
-            sparseSynonyms = toMap(filter(dsl = false, sparse = true)),
-            directSynonymsDsl = toMap(filter(dsl = true, sparse = false)),
-            sparseSynonymsDsl = toMap(filter(dsl = true, sparse = true)),
+            directSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = false), NCProbeSynonymsWrapper(_)),
+            sparseSynonyms = toMap(filterSparse(notDsl, sparse = true)),
+            synonymsDsl = toMap(filterDsl(syns.toSet, dsl = true)),
             addStopWordsStems = addStopWords,
             exclStopWordsStems = exclStopWords,
             suspWordsStems = suspWords,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 457bf35..ff0cb78 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,26 +58,24 @@ object NCModelManager extends NCService with DecorateAsScala {
             data.values.foreach(w ⇒ {
                 val mdl = w.model
 
-                val synCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
-                val synDslCnt = w.directSynonymsDsl.map(_._2.size).sum
+                val synDirectCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
                 val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum
-                val synSparseDslCnt = w.sparseSynonymsDsl.map(_._2.size).sum
+                val synDslCnt = w.synonymsDsl.map(_._2.size).sum
                 val elmCnt = w.elements.keySet.size
                 val intentCnt = w.intents.size
 
                 def withWarn(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString
 
                 tbl += Seq(
-                    s"Name:                  ${bo(c(mdl.getName))}",
-                    s"ID:                    ${bo(mdl.getId)}",
-                    s"Version:               ${mdl.getVersion}",
-                    s"Origin:                ${mdl.getOrigin}",
-                    s"Elements:              ${withWarn(elmCnt)}",
-                    s"Synonyms(Direct)       $synCnt",
-                    s"Synonyms(Direct, DSL): $synDslCnt",
-                    s"Synonyms(Sparse):      $synSparseCnt",
-                    s"Synonyms(Sparse, DSL): $synSparseDslCnt",
-                    s"Intents:               ${withWarn(intentCnt)}"
+                    s"Name:             ${bo(c(mdl.getName))}",
+                    s"ID:               ${bo(mdl.getId)}",
+                    s"Version:          ${mdl.getVersion}",
+                    s"Origin:           ${mdl.getOrigin}",
+                    s"Elements:         ${withWarn(elmCnt)}",
+                    s"Synonyms(Direct)  $synDirectCnt",
+                    s"Synonyms(Sparse): $synSparseCnt",
+                    s"Synonyms(DSL):    $synDslCnt",
+                    s"Intents:          ${withWarn(intentCnt)}"
                 )
             })
         }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0542174..5169afe 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -373,30 +373,31 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
     /**
       *
-      * @param comb
-      * @param syn
+      * @param seq
+      * @param s
       */
-    private def getPartsComplex(comb: Seq[Complex], syn: Synonym): Seq[TokType] =
-        comb.zip(syn.map(_.kind)).flatMap {
+    private def toPartsComplex(seq: Seq[Complex], s: Synonym): Seq[TokType] =
+        seq.zip(s.map(_.kind)).flatMap {
             case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
             else None
         }
 
     /**
       *
-      * @param comb
-      * @param syn
+      * @param seq
+      * @param s
       */
-    private def toParts(comb: Seq[NCDslContent], syn: Synonym): Seq[TokType] =
-        comb.zip(syn.map(_.kind)).flatMap {
+    private def toParts(seq: Seq[NCDslContent], s: Synonym): Seq[TokType] =
+        seq.zip(s.map(_.kind)).flatMap {
             case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None
         }
 
     /**
       *
       */
-    private def mkCache(): Cache =
-        mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
+    private def mkCache(mdl: NCProbeModel): Cache =
+        mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].empty ++
+            mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]])
 
     /**
       *
@@ -478,24 +479,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
       * @param h
       * @param toks
       */
-    private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = {
-        val idxsSeq = toks.flatMap(_.wordIndexes)
-//        val idxsSorted = idxsSeq.sorted
-        val idxs = idxsSeq.toSet
-//        val idxMin = idxsSorted.head
-//        val idxMax = idxsSorted.last
+    private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = {
+        val idxs = toks.flatMap(_.wordIndexes).toSet
 
         h.complexes.par.
             flatMap(complexSeq ⇒ {
                 //val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
-                val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxsSeq.contains))
+                val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
 
                 // Drops without tokens (IDL part works with tokens).
-                if (rec.nonEmpty)
-                    Some(
-                        rec ++
+                if (rec.nonEmpty) {
+                    val data = rec ++
                         (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
-                    )
+
+                    if (!cache.contains(data)) Some(data) else None
+                }
                 else
                     None
             }).seq
@@ -569,31 +567,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
         startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
             val req = NCRequestImpl(senMeta, srvReqId)
-            val matches = mutable.ArrayBuffer.empty[ElementMatch]
-            val cacheSparse = mkCache()
-            val cacheDirect = mkCache()
             val h = mkComplexes(mdl, ns)
 
-            var found = false
-
-            def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType]): Unit = {
-                val toksSet = res.toSet
-
-                var added = false
-
-                // TODO:
-                if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet))) {
-                    matches += ElementMatch(elm, res, s, parts)
-
-                    added = true
-                }
-
-                cache(elm.getId) += tokIdxs
-                found = true
-
-                println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, tokIdxs=${tokIdxs.mkString("|")}, added=$added")
-            }
-
             startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
                 var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
                 ns.firstProbePhase = false
@@ -603,9 +578,36 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                     println
                     println(s"GO $state")
 
+                    val matches = mutable.ArrayBuffer.empty[ElementMatch]
+
+                    val cacheSparse = mkCache(mdl)
+                    val cacheDirect = mkCache(mdl)
+                    val dslCache = mutable.HashSet.empty[Seq[Complex]]
+
+                    var found = false
+
+                    def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+                        var added = false
+
+                        if (!matchExist(elm.getId, res)) {
+                            matches += ElementMatch(elm, res, s, parts)
+
+                            added = true
+                        }
+
+                        cache(elm.getId) += tokIdxs
+                        found = true
+
+                        println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added")
+                    }
+
+                    // TODO:
+                    def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean =
+                        matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet))
+
                     for (toks ← combosToks) {
                         val tokIdxs = toks.map(_.index)
-                        lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks)
+                        lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks, dslCache.toSet)
                         lazy val tokStems = toks.map(_.stem).mkString(" ")
 
                         // Attempt to match each element.
@@ -613,12 +615,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         for (
                             elm ← mdl.elements.values;
                             elemId = elm.getId;
-                            if
-                                !alreadyMarked(toks, elm.getId)
+                            dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
+                            sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
+                            if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
                         ) {
-                            val directProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs))
-                            val sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
-
+                            //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)="+cacheSparse(elemId).mkString("|"))
                             // 1. SIMPLE.
                             found = false
 
@@ -630,19 +631,19 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                 }
 
                             // 1.1 Direct.
-                            if (simpleEnabled && !directProc && !found)
+                            if (simpleEnabled && !dirProc && !found)
                                 fastAccess(mdl.directSynonyms, elemId, toks.length) match {
                                     case Some(h) ⇒
                                         def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit =
                                             syns.get(tokStems) match {
-                                                case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+                                                case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s)
                                                 case None ⇒ notFound()
                                             }
 
                                         def tryScan(syns: Seq[Synonym]): Unit =
                                             for (s ← syns if !found)
                                                 if (s.isMatch(toks))
-                                                    add("direct simple2", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+                                                    add("direct simple2", elm, cacheDirect, toks, tokIdxs, s)
 
                                         tryMap(
                                             h.txtDirectSynonyms,
@@ -660,34 +661,37 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             if (simpleEnabled && !sparseProc && !found)
                                 for (s ← get(mdl.sparseSynonyms, elemId) if !found)
                                     s.trySparseMatch(toks) match {
-                                        case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s, Seq.empty)
+                                        case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s)
                                         case None ⇒ // No-op.
                                     }
 
                             // 2. DSL.
-                            found = false
                             val dslEnabled = state != SIMPLE
 
-                            // 2.1 Direct.
-                            if (dslEnabled && mdl.directSynonymsDsl.nonEmpty && !directProc && !found)
-                                for (s ← get(mdl.directSynonymsDsl, elemId); comb ← dslCombs if !found) {
-                                    if (s.isMatch(comb.map(_.data), req)) {
-                                        println(s"OK $elemId for s=$s for toks:${toks.map(_.origText)}")
-
-                                        add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, getPartsComplex(comb, s))
-                                    }
-                                    println {
-                                        println(s"NOT OK $elemId for s=$s for toks:${toks.map(_.origText)}")
-                                    }
+                            if (dslEnabled && mdl.synonymsDsl.nonEmpty) {
+                                found = false
+
+                                // 2.1 Sparse.
+                                if (mdl.hasDslSynonyms) {
+                                    if (!sparseProc)
+                                        for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
+                                            s.trySparseMatch(comb.map(_.data), req) match {
+                                                case Some(res) ⇒
+                                                    add("DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
+                                                    dslCache += comb
+                                                case None ⇒ // No-op.
+                                            }
                                 }
-
-                            // 2.2 Sparse.
-                            if (dslEnabled && mdl.sparseSynonymsDsl.nonEmpty && !sparseProc && !found)
-                                for (s ← get(mdl.sparseSynonymsDsl, elemId); comb ← dslCombs if !found)
-                                    s.trySparseMatch(comb.map(_.data), req) match {
-                                        case Some(res) ⇒ add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
-                                        case None ⇒ // No-op.
-                                    }
+                                // 2.2 Direct.
+                                else {
+                                    if (!dirProc)
+                                        for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
+                                            if (s.isMatch(comb.map(_.data), req)) {
+                                                add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, toPartsComplex(comb, s))
+                                                dslCache += comb
+                                            }
+                                }
+                            }
                         }
                     }
 
@@ -701,18 +705,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
                     go()
                 }
-
-
             }
 
-
-
             processParsers(mdl, ns, span, req)
         }
     }
 
-    def isComplex(mdl: NCProbeModel): Boolean =
-        mdl.directSynonymsDsl.nonEmpty ||
-        mdl.sparseSynonymsDsl.nonEmpty ||
-        !mdl.model.getParsers.isEmpty
+    def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty
 }
\ No newline at end of file