You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/06 09:15:17 UTC
[incubator-nlpcraft] 03/05: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit c4d2d15cb4ff94c96105cedfb12a70e4845dd113
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Tue Apr 6 11:25:59 2021 +0300

    WIP.
---
 .../nlpcraft/probe/mgrs/NCProbeSynonym.scala       |  51 ++++++-----
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |  22 ++---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 100 ++++++++++-----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  22 +++--
 4 files changed, 106 insertions(+), 89 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 5324304..95c526f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -92,37 +92,44 @@ class NCProbeSynonym(
         require(toks != null)
         require(toks.nonEmpty)
 
-        lazy val res = mutable.ArrayBuffer.empty[T]
-        lazy val all = mutable.HashSet.empty[T]
+        if (toks.size >= this.size) {
+            lazy val res = mutable.ArrayBuffer.empty[T]
+            lazy val all = mutable.HashSet.empty[T]
 
-        var state = 0
+            var state = 0
 
-        for (chunk ← this if state != -1) {
-            val seq =
-                if (state == 0) {
-                    state = 1
+            for (chunk ← this if state != -1) {
+                val seq =
+                    if (state == 0) {
+                        state = 1
 
-                    toks.filter(t ⇒ isMatch(t, chunk))
-                }
-                else
-                    toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk))
+                        toks.filter(t ⇒ isMatch(t, chunk))
+                    }
+                    else
+                        toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk))
 
-            if (seq.nonEmpty) {
-                val head = seq.head
+                if (seq.nonEmpty) {
+                    val head = seq.head
 
-                if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last))
-                    state = -1
-                else {
-                    res += head
-                    all ++= seq
+                    if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+                        state = -1
+                    else {
+                        res += head
+                        all ++= seq
+
+                        if (all.size > res.size)
+                            state = -1
+                    }
                 }
+                else
+                    state = -1
             }
+
+            if (state != -1 && all.size == res.size)
+                Some(res)
             else
-                state = -1
+                None
         }
-
-        if (state != -1 && all.size == res.size)
-            Some(res)
         else
             None
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index cdfdf89..03c59ff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -65,17 +65,19 @@ object NCModelManager extends NCService with DecorateAsScala {
                 val elmCnt = w.elements.keySet.size
                 val intentCnt = w.intents.size
 
+                def getWithWarning(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString
+
                 tbl += Seq(
-                    s"Name:                  ${bo(c(mdl.getName))}",
-                    s"ID:                    ${bo(mdl.getId)}",
-                    s"Version:               ${mdl.getVersion}",
-                    s"Origin:                ${mdl.getOrigin}",
-                    s"Elements:              $elmCnt" + (if (elmCnt == 0) s" ${r("(!)")}" else ""),
-                    s"Synonyms:              $synCnt" + (if (synCnt == 0) s" ${r("(!)")}" else ""),
-                    s"Synonyms(DSL):         $synDslCnt" + (if (synDslCnt == 0) s" ${r("(!)")}" else ""),
-                    s"Synonyms(Sparse):      $synSparseCnt" + (if (synSparseCnt == 0) s" ${r("(!)")}" else ""),
-                    s"Synonyms(Sparse, DSL): $synSparseDslCnt" + (if (synSparseDslCnt == 0) s" ${r("(!)")}" else ""),
-                    s"Intents:               $intentCnt" + (if (intentCnt == 0) s" ${r("(!)")}" else "")
+                    s"Name:                      ${bo(c(mdl.getName))}",
+                    s"ID:                        ${bo(mdl.getId)}",
+                    s"Version:                   ${mdl.getVersion}",
+                    s"Origin:                    ${mdl.getOrigin}",
+                    s"Elements:                  ${getWithWarning(elmCnt)}",
+                    s"Synonyms(Continuous):      $synCnt",
+                    s"Synonyms(Continuous, DSL): $synDslCnt",
+                    s"Synonyms(Sparse):          $synSparseCnt",
+                    s"Synonyms(Sparse, DSL):     $synSparseDslCnt",
+                    s"Intents:                   ${getWithWarning(intentCnt)}"
                 )
             })
         }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 46506fd..f9acd95 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -296,11 +296,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             else None
         }
 
-    private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[NlpToken]]] =
+    private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[Int]]] =
         mutable.HashMap.empty[
             String,
-            mutable.ArrayBuffer[Seq[NlpToken]]
-        ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[NlpToken]])
+            mutable.ArrayBuffer[Seq[Int]]
+        ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
 
     private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
         (
@@ -388,15 +388,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             ) {
                 _ ⇒
                 for (toks ← combos(ns)) {
-                    val idxsSeq = toks.flatMap(tokIdxs)
-                    val idxsSorted = idxsSeq.sorted
-                    val idxs = idxsSeq.toSet
-                    val idxMin = idxsSorted.head
-                    val idxMax = idxsSorted.last
+                    val indexes = toks.map(_.index)
 
-                    lazy val sorted = idxsSorted.zipWithIndex.toMap
+                    lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] = {
+                        val idxsSeq = toks.flatMap(tokIdxs)
+                        val idxsSorted = idxsSeq.sorted
+                        val idxs = idxsSeq.toSet
+                        val idxMin = idxsSorted.head
+                        val idxMax = idxsSorted.last
+
+                        lazy val sorted = idxsSorted.zipWithIndex.toMap
 
-                    lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
                         complexes.par.
                             flatMap(complexSeq ⇒ {
                                 val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
@@ -412,54 +414,41 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                     None
                             }).
                             map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+                    }
 
                     lazy val tokStems = toks.map(_.stem).mkString(" ")
 
                     // Attempt to match each element.
                     for (elm ← mdl.elements.values) {
                         val elemId = elm.getId
-                        val sparseEnabled = !cacheSparse(elemId).exists(_.contains(toks))
-                        val notSparseEnabled = !cacheNotSparse(elemId).exists(_.contains(toks))
-                        var foundSparse = false
-                        var foundNotSparse = false
+                        val sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes))
+                        val notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes))
+                        var found = false
 
                         def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
                             addMatch(elm, res, syn, parts)
-                            cacheSparse(elemId) += toks
-                            foundSparse = true
+                            cacheSparse(elemId) += indexes
+                            found = true
                         }
 
                         def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
                             addMatch(elm, toks, syn, parts)
-                            cacheNotSparse(elemId) += toks
-                            foundNotSparse = true
+                            cacheNotSparse(elemId) += indexes
+                            found = true
                         }
 
-
-                        // 1. Simple, sparse.
-                        if (firstPhase && sparseEnabled)
-                            for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !foundSparse)
-                                syn.trySparseMatch(toks) match {
-                                    case Some(res) ⇒ addSparse(res, syn, Seq.empty)
-                                    case None ⇒ // No-op.
-                                }
-
-                        // 2. Simple, not sparse.
-                        // Optimization - plain synonyms can be used only on first iteration
-                        if (firstPhase && notSparseEnabled)
+                        // 1. Simple, not sparse.
+                        if (firstPhase && notSparseEnabled && !found)
                             fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match {
                                 case Some(h) ⇒
                                     def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
                                         synsMap.get(tokStems) match {
                                             case Some(syn) ⇒ addNotSparse(syn, Seq.empty)
-                                                // TODO:
-                                                //if (!found)
-                                                //   notFound()
                                             case None ⇒ notFound()
                                         }
 
                                     def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
-                                        for (syn ← synsSeq if !foundNotSparse)
+                                        for (syn ← synsSeq if !found)
                                             if (syn.isMatch(toks))
                                                 addNotSparse(syn, Seq.empty)
 
@@ -468,7 +457,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                         () ⇒ {
                                             tryScan(h.notTxtDirectSynonyms)
 
-                                            if (!foundNotSparse)
+                                            if (!found)
                                                 tryMap(
                                                     h.txtNotDirectSynonyms,
                                                     () ⇒ tryScan(h.notTxtNotDirectSynonyms)
@@ -478,30 +467,38 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                 case None ⇒ // No-op.
                             }
 
-                        // 3. DSL, sparse.
-                        if (sparseEnabled)
-                            for (
-                                (_, seq) ← dslCombs;
-                                syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty);
-                                comb ← seq if !foundSparse
-                            ) {
-                                syn.trySparseMatch(comb.map(_.data), req) match {
-                                    case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn))
-                                    case None ⇒ // No-op.
-                                }
-                            }
-
-                        // 4. DSL, non sparse.
-                        if (notSparseEnabled) {
+                        // 2. DSL, not sparse.
+                        if (notSparseEnabled && mdl.nonSparseSynonymsDsl.nonEmpty && !found) {
                             for (
                                 (len, seq) ← dslCombs;
                                 syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty);
-                                comb ← seq if !foundNotSparse
+                                comb ← seq if !found
                             ) {
                                 if (syn.isMatch(comb.map(_.data), req))
                                     addNotSparse(syn, getPartsComplex(comb, syn))
                             }
                         }
+
+                        // 3. Simple, sparse.
+                        if (firstPhase && sparseEnabled && !found)
+                            for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !found)
+                                syn.trySparseMatch(toks) match {
+                                    case Some(res) ⇒ addSparse(res, syn, Seq.empty)
+                                    case None ⇒ // No-op.
+                                }
+
+                        // 4. DSL, sparse.
+                        if (sparseEnabled && mdl.sparseSynonymsDsl.nonEmpty && !found)
+                            for (
+                                syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty);
+                                (_, seq) ← dslCombs;
+                                comb ← seq if !found
+                            ) {
+                                syn.trySparseMatch(comb.map(_.data), req) match {
+                                    case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn))
+                                    case None ⇒ // No-op.
+                                }
+                            }
                     }
                 }
             }
@@ -529,6 +526,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
             val matchCnt = matchesNorm.size
 
+
             // TODO:matchesNorm
             // Add notes for all remaining (non-intersecting) matches.
             for ((m, idx) ← matches.zipWithIndex) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index ad66b8f..a938f59 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -24,9 +24,9 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.model.NCModel
 
-import java.io.{Serializable ⇒ JSerializable}
+import java.io.{Serializable => JSerializable}
 import java.util
-import java.util.{List ⇒ JList}
+import java.util.{List => JList}
 import scala.collection.JavaConverters.{asScalaBufferConverter, _}
 import scala.collection.{Map, Seq, mutable}
 import scala.language.implicitConversions
@@ -37,6 +37,8 @@ import scala.language.implicitConversions
 object NCSentenceManager extends NCService {
     @volatile private var pool: java.util.concurrent.ForkJoinPool = _
 
+    private val cache = U.mkLRUMap[Seq[Set[NCNlpSentenceNote]], util.List[util.List[NCNlpSentenceNote]]]("sentence-combinations-cache", 500)
+
     case class PartKey(id: String, start: Int, end: Int) {
         require(start <= end)
 
@@ -197,7 +199,7 @@ object NCSentenceManager extends NCService {
       * @param noteField
       * @param ns
       */
-    private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = {
+    private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
         ns.flatMap(_.getNotes(note)).foreach(rel ⇒
             rel.dataOpt[JList[JList[Int]]](idxsField) match {
                 case Some(idxsList) ⇒
@@ -211,7 +213,6 @@ object NCSentenceManager extends NCService {
                 case None ⇒ // No-op.
             }
         )
-    }
 
     /**
       * Copies token.
@@ -679,14 +680,23 @@ object NCSentenceManager extends NCService {
 
         var sens =
             if (delCombs.nonEmpty) {
-                val toksByIdx =
+                val toksByIdx: Seq[Set[NCNlpSentenceNote]] =
                     delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
                         groupBy { case (idx, _) ⇒ idx }.
                         map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
                         toSeq.sortBy(-_.size)
 
+
+                var combs: JList[JList[NCNlpSentenceNote]] = cache.get(toksByIdx)
+
+                if (combs == null) {
+                    combs = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool)
+
+                    cache.put(toksByIdx, combs)
+                }
+
                 val seqSens =
-                    NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala).
+                    combs.asScala.map(_.asScala).
                         par.
                         flatMap(delComb ⇒ {
                             val nsClone = sen.clone()