You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/18 09:05:11 UTC

[incubator-nlpcraft] branch NLPCRAFT-443 updated (5fb0f77 -> 25f6015)

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a change to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git.


    from 5fb0f77  Merge branch 'master' into NLPCRAFT-443
     new 426b8e8  WIP.
     add 4297ad9  Default `nlpEngine` fixed.
     new 1856f6c  Merge branch 'master' into NLPCRAFT-443
     new 1ccf5c5  WIP.
     new 1891c2c  WIP.
     add 78386a0  Update alarm_intents.idl
     add fbb64bc  Fix for NLPCRAFT-448
     new 054bc51  Merge branch 'master' into NLPCRAFT-443
     new c0d3b7c  WIP.
     new f471a8b  WIP.
     new 25f6015  WIP.

The 8 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../alarm/src/main/resources/alarm_intents.idl     |   2 +-
 nlpcraft-examples/minecraft-mod/build.gradle       |   4 +-
 .../minecraft/src/main/resources/minecraft.yaml    |   4 +-
 .../minecraft/src/main/resources/probe.conf        |   2 +-
 .../examples/minecraft/NCModelValidationSpec.scala |   2 +-
 nlpcraft/src/main/resources/nlpcraft.conf          |   2 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 152 ++++++++++++++---
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 187 ++++++++++-----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |   7 +-
 .../model/stm/indexes/NCSpecModelAdapter.scala     |   4 +-
 .../nlpcraft/model/stop/NCStopWordsBaseSpec.scala  |  73 ++++++++
 .../model/stop/NCStopWordsInsideSpec.scala         |  11 +-
 .../model/NCEnricherNestedModelSpec.scala          |   3 +-
 .../model/NCEnricherNestedModelSpec4.scala         |  81 +++++++--
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    |   3 +-
 15 files changed, 381 insertions(+), 156 deletions(-)
 create mode 100644 nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala

[incubator-nlpcraft] 06/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit c0d3b7cf8bf2c6974a2631d4c1a02a5db44583d9
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Sep 17 12:11:29 2021 +0300

    WIP.
---
 .../nlpcraft/model/stop/NCStopWordsBaseSpec.scala  | 73 ++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala
new file mode 100644
index 0000000..07ca216
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.language.implicitConversions
+
+/**
+  * Test model declaring elements `a`, `b` and `xy`, plus the intents `twoWords` (terms `a`, `b`) and `oneWord` (one term matching `xy`).
+  */
+class NCStopWordsBaseModel extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+    override def getElements: util.Set[NCElement] = Set(
+        NCTestElement("a"),
+        NCTestElement("b"),
+        NCTestElement("xy", "x y"),
+    )
+
+    @NCIntent(
+        "intent=twoWords " +
+        "    term(a)~{# == 'a'}" +
+        "    term(b)~{# == 'b'}"
+    )
+    def onTwoWords(): NCResult = NCResult.text("OK")
+
+    @NCIntent(
+        "intent=oneWord " +
+        "    term(xt)~{# == 'xy'}"
+    )
+    def onOneWord(): NCResult = NCResult.text("OK")
+}
+
+/**
+  * Expects, via `checkIntent`, the `twoWords` / `oneWord` intent for inputs with `the` inserted before, between and after the words.
+  */
+@NCTestEnvironment(model = classOf[NCStopWordsBaseModel], startClient = true)
+class NCStopWordsBaseSpec extends NCTestContext {
+    @Test
+    def testTwoWords(): Unit = {
+        checkIntent("a b", "twoWords")
+        checkIntent("a the b", "twoWords")
+        checkIntent("a the the b", "twoWords")
+        checkIntent("the a the b", "twoWords")
+        checkIntent("the a the b the the", "twoWords")
+    }
+
+    @Test
+    def testOneWord(): Unit = {
+        checkIntent("x y", "oneWord")
+        checkIntent("x the y", "oneWord")
+        checkIntent("x the the y", "oneWord")
+        checkIntent("the x the y", "oneWord")
+        checkIntent("the x the y the the", "oneWord")
+    }
+}
\ No newline at end of file

[incubator-nlpcraft] 03/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 1ccf5c5cc5a489b2565500d31209e4af19aa8187
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Sep 16 12:10:22 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 25 ++++++++++++++++++----
 .../model/NCEnricherNestedModelSpec.scala          |  3 +--
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    |  3 +--
 3 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 6908265..22af412 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -450,6 +450,22 @@ object NCModelEnricher extends NCProbeEnricher {
             )
     }
 
+    /**
+      * Extends sparse-matched tokens with the stop words of `toks2Match` lying strictly between the first and last match.
+      * @param matched Non-empty matched tokens, expected to be already sorted by index.
+      * @param toks2Match Tokens the match was searched in; only its stop words not yet in `matched` can be added.
+      */
+    private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
+        require(matched.nonEmpty)
+
+        // Relies on `matched` being sorted: its head and last define the borders of the matched span.
+        val stopsInside = toks2Match.filter(t =>
+            t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
+        )
+
+        if (stopsInside.nonEmpty) (matched ++ stopsInside).sortBy(_.index) else matched
+    }
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -526,9 +542,10 @@ object NCModelEnricher extends NCProbeEnricher {
                                 for (s <- get(mdl.sparseSynonyms, eId))
                                     s.sparseMatch(toks) match {
                                         case Some(res) =>
-                                            println("!!!toks="+toks.map(_.origText))
-                                            println("!!!res="+res.map(_.origText))
-                                            add("simple sparse", ns, contCache, eId, greedy, res, idxs, s)
+//                                            println("!!!toks="+toks.map(_.origText))
+//                                            println("!!!res="+res.map(_.origText))
+//                                            println
+                                            add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
                                         case None => // No-op.
                                     }
                         }
@@ -566,7 +583,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                         case Some(res) =>
                                             val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
 
-                                            add(typ, ns, contCache, eId, greedy, toTokens(res, ns), idxs, s, toParts(res, s))
+                                            add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, toParts(res, s))
 
                                             idlCache += comb
                                         case None => // No-op.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index 4d5d991..8b25e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -94,8 +94,7 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
             ),
             _ => checkExists(
                 "y the y",
-                usr(text = "y y", id = "y3"),
-                nlp(text = "the", isStop = true)
+                usr(text = "y the y", id = "y3")
             ),
             _ => checkExists(
                 "y xxx y",
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index 228885d..7b8d858 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -224,8 +224,7 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
             _ => checkExists(
                 "sort A the A the A",
                 srt(text = "sort", typ = SUBJ_ONLY, note = "wrapperA", index = 1),
-                usr("A A A", "wrapperA"),
-                nlp("the the", isStop = true)
+                usr("A the A the A", "wrapperA")
             )
         )
 }

[incubator-nlpcraft] 07/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit f471a8be8f5210d4d1c1ad9905e3021fc04673d6
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Sat Sep 18 12:00:24 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 135 ++++-----------
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 189 ++++++++++-----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  11 +-
 3 files changed, 130 insertions(+), 205 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index d83ab05..f8457e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
 
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.model.impl.NCTokenImpl
@@ -330,7 +331,6 @@ object NCModelEnricher extends NCProbeEnricher {
             val bigSlides = slides.filter(_.size > 2)
 
             var stops4Delete: Seq[Seq[NlpToken]] =
-
                 if (bigSlides.nonEmpty) {
                     val allBig = bigSlides.flatMap(p => p)
                     val stops4AllCombs = stops.filter(p => !allBig.contains(p))
@@ -534,8 +534,6 @@ object NCModelEnricher extends NCProbeEnricher {
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
 
-        //logger.info("ENRICH111")
-
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { span =>
@@ -547,17 +545,16 @@ object NCModelEnricher extends NCProbeEnricher {
 //
 //            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
 //
-//            ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+//            ns.tokens.flatten.filter(!_.isNlp).distinct.sortBy(p => (p.noteType, -p.tokenIndexes.size)).foreach(n => {
 //                val parts =
 //                    n.get("parts") match {
 //                        case Some(v) =>
 //                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
 //
-//                            "all parts=" + parts.size + "  " +
 //                            parts.map(p => {
 //                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
 //
-//                                "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+//                                "part=" + p.id + " (idx=" + ref.index + "), text=" + ref.origText
 //                            }).mkString(" | ")
 //                        case None => "NO"
 //                    }
@@ -588,11 +585,6 @@ object NCModelEnricher extends NCProbeEnricher {
                             !greedy ||
                             !contCache(eId).exists(_.containsSlice(idxs))  && !alreadyMarked(ns, eId, toks, idxs)
                     ) {
-//                        println("!!!toks="+toks.map(_.origText).mkString(" "))
-//                        println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
-//                        println()
-
-
                         // 1. SIMPLE.
                         if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
                             lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -699,7 +691,38 @@ object NCModelEnricher extends NCProbeEnricher {
             processParsers(mdl, ns, span, req)
         }
 
-        //logger.info("ENRICH222")
+        normalize(ns)
+    }
+
+    /**
+      * Removes user notes (not referenced by links or parts) for which another note of the same type and `parts` covers a subset of their words, all extra words being stop words.
+      * @param ns Sentence cleaned up in place. NOTE(review): two distinct notes with equal type, `parts` and word sets would remove each other - confirm this is intended.
+      */
+    private def normalize(ns: Sentence): Unit = {
+        val usrNotes = ns.flatten.filter(_.isUser).distinct
+        val links = NCSentenceManager.getLinks(usrNotes)
+        val parts = NCSentenceManager.getPartKeys(usrNotes: _*)
+
+        usrNotes.
+            filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
+            filter(n => !parts.contains(NCTokenPartKey(n, ns))).
+            foreach(n => {
+                val hasBetter =
+                    usrNotes.exists(candidate =>
+                        candidate != n &&
+                        candidate.noteType == n.noteType &&
+                        candidate.dataOpt("parts") == n.dataOpt("parts") &&
+                        candidate.wordIndexes.toSet.subsetOf(n.wordIndexes.toSet) &&
+                        n.wordIndexes.filter(n => !candidate.wordIndexes.contains(n)).
+                            forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord)))
+
+                if (hasBetter) {
+                    ns.removeNote(n)
+
+                    // TODO: trace.
+                    logger.info(s"Element removed: ${n}")
+                }
+            })
     }
 
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -730,91 +753,3 @@ object NCModelEnricher extends NCProbeEnricher {
         ))
     }
 }
-
-object x extends App {
-    case class T(index: Int, isStopWord: Boolean = false) {
-        override def toString: String = index.toString
-    }
-
-    private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] =
-        combos(toks).flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
-        }).
-            toMap.
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
-            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
-
-    private def combos[T](toks: Seq[T]): Seq[Seq[T]] = {
-        val x  = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
-
-        println("size=" + x.size)
-
-        x
-    }
-
-    private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = {
-        val value = combos(toks)
-
-        value.flatMap(combo => {
-            val stops = combo.filter(_.isStopWord)
-
-            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]]
-
-            for (stop <- stops)
-                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
-                    slides.last += stop
-                else
-                    slides += mutable.ArrayBuffer.empty :+ stop
-
-            val bigSlides = slides.filter(_.size >= 3)
-
-            var stops4Delete: Seq[Seq[T]] =
-                if (bigSlides.nonEmpty) {
-                    val allBig = bigSlides.flatten
-                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
-
-                    if (stops4AllCombs.nonEmpty)
-                        for (
-                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
-                                seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
-                        )
-                        yield seq1 ++ seq2.flatMap(p => p)
-                    else
-                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
-                            yield seq.flatMap(p => p)
-                }
-                else
-                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
-
-            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
-
-            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-        }).
-            filter(_._1.nonEmpty).
-            groupBy(_._1).
-            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
-            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
-    }
-
-    def go(): Unit = {
-        val combs = combosTokens(
-//            Seq(
-//                T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true),  T(9),  T(10, true),  T(11, true),  T(12)
-//            )
-
-            Range.inclusive(0, 12).map(T(_, true))
-        )
-
-        println("All=" + combs.size)
-
-        combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|"))  }
-    }
-
-    go()
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..1e31ab0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -17,7 +17,6 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
 
-import java.io.Serializable
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.NCService
 import org.apache.nlpcraft.common.makro.NCMacroParser
@@ -26,6 +25,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
+import java.io.Serializable
 import java.util.{List => JList}
 import scala.collection.mutable
 import scala.jdk.CollectionConverters._
@@ -187,59 +187,50 @@ object NCSortEnricher extends NCProbeEnricher {
       *
       * @param toksNoteData
       */
-    private def split(toks: Seq[NCNlpSentenceToken], othersRefs: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
-        val res =
-            if (toksNoteData.nonEmpty) {
-                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-                /**
-                  * Returns flag which indicates are token contiguous or not.
-                  *
-                  * @param tok1Idx First token index.
-                  * @param tok2Idx Second token index.
-                  */
-                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                    val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
-
-                    between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
-                }
+    private def split(
+        toks: Seq[NCNlpSentenceToken],
+        othersRefs: Seq[NCNlpSentenceToken],
+        toksNoteData: Seq[NoteData]
+    ): Seq[Seq[NoteData]] =
+        if (toksNoteData.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+            /**
+              * Returns flag which indicates are token contiguous or not.
+              *
+              * @param tok1Idx First token index.
+              * @param tok2Idx Second token index.
+              */
+            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
+
+                between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
+            }
 
-                val toks2 = toks.filter(othersRefs.contains)
+            val toks2 = toks.filter(othersRefs.contains)
 
-                val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
-                val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
+            val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
+            val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
 
-                require(minIdx <= maxIdx)
+            require(minIdx <= maxIdx)
 
-                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                    seq += nd
+            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                    toksNoteData.
-                        filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                toksNoteData.
+                    filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                    foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
 
-                    if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
-                        res += seq
-                }
+                if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                    res += seq
+            }
 
-                toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
+            toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
 
-                res
-            }
-            else
-                Seq.empty
-
-        if (res.isEmpty && !nullable)
-            throw new AssertionError(s"Invalid empty result " +
-                s"[tokensTexts=[${toks.map(_.origText).mkString("|")}]" +
-                s", notes=[${toks.flatten.map(n => s"${n.noteType}:[${n.tokenIndexes.mkString(",")}]").mkString("|")}]" +
-                s", tokensIndexes=[${toks.map(_.index).mkString("|")}]" +
-                s", allData=[${toksNoteData.mkString("|")}]" +
-                s"]"
-            )
-
-        res.toSeq
-    }
+            res
+        }
+        else
+            Seq.empty
 
     /**
       *
@@ -346,72 +337,78 @@ object NCSortEnricher extends NCProbeEnricher {
                             if (data1.nonEmpty || data2.nonEmpty) {
                                 val seq1 =
                                     if (data1.nonEmpty)
-                                        split(part1, othersRefs, data1, nullable = false)
-                                    else
-                                        split(part2, othersRefs, data2, nullable = false)
-                                val seq2 =
-                                    if (data1.nonEmpty && data2.nonEmpty)
-                                        split(part2, othersRefs, data2, nullable = true)
+                                        split(part1, othersRefs, data1)
                                     else
-                                        Seq.empty
-                                val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
-
-                                typ match {
-                                    case TYPE_SUBJ =>
-                                        require(seq1.nonEmpty)
-                                        require(seq2.isEmpty)
-                                        require(sortToks.nonEmpty)
-
-                                        // Ignores invalid cases.
-                                        if (byToks.isEmpty)
-                                            res =
-                                                Some(
+                                        split(part2, othersRefs, data2)
+
+                                if (seq1.nonEmpty) {
+                                    val seq2 =
+                                        if (data1.nonEmpty && data2.nonEmpty)
+                                            split(part2, othersRefs, data2)
+                                        else
+                                            Seq.empty
+
+                                    val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
+
+                                    typ match {
+                                        case TYPE_SUBJ =>
+                                            require(seq1.nonEmpty)
+                                            require(seq2.isEmpty)
+                                            require(sortToks.nonEmpty)
+
+                                            // Ignores invalid cases.
+                                            if (byToks.isEmpty)
+                                                res =
+                                                    Some(
+                                                        Match(
+                                                            asc = asc,
+                                                            main = sortToks,
+                                                            stop = orderToks,
+                                                            subjSeq = seq1,
+                                                            bySeq = Seq.empty
+                                                        )
+                                                    )
+
+                                        case TYPE_SUBJ_BY =>
+                                            require(seq1.nonEmpty)
+                                            require(sortToks.nonEmpty)
+                                            require(byToks.nonEmpty)
+
+                                            if (seq2.isEmpty)
+                                                res = None
+                                            else
+                                                res = Some(
                                                     Match(
                                                         asc = asc,
                                                         main = sortToks,
-                                                        stop = orderToks,
+                                                        stop = byToks ++ orderToks,
                                                         subjSeq = seq1,
-                                                        bySeq = Seq.empty
+                                                        bySeq = seq2
                                                     )
                                                 )
 
-                                    case TYPE_SUBJ_BY =>
-                                        require(seq1.nonEmpty)
-                                        require(sortToks.nonEmpty)
-                                        require(byToks.nonEmpty)
+                                        case TYPE_BY =>
+                                            require(seq1.nonEmpty)
+                                            require(seq2.isEmpty)
+                                            require(sortToks.nonEmpty)
+                                            require(byToks.nonEmpty)
 
-                                        if (seq2.isEmpty)
-                                            res = None
-                                        else
+                                            // `Sort by` as one element, see validation.
                                             res = Some(
                                                 Match(
                                                     asc = asc,
-                                                    main = sortToks,
-                                                    stop = byToks ++ orderToks,
-                                                    subjSeq = seq1,
-                                                    bySeq = seq2
+                                                    main = sortToks ++ byToks,
+                                                    stop = orderToks,
+                                                    subjSeq = Seq.empty,
+                                                    bySeq = seq1
                                                 )
                                             )
 
-                                    case TYPE_BY =>
-                                        require(seq1.nonEmpty)
-                                        require(seq2.isEmpty)
-                                        require(sortToks.nonEmpty)
-                                        require(byToks.nonEmpty)
-
-                                        // `Sort by` as one element, see validation.
-                                        res = Some(
-                                            Match(
-                                                asc = asc,
-                                                main = sortToks ++ byToks,
-                                                stop = orderToks,
-                                                subjSeq = Seq.empty,
-                                                bySeq = seq1
-                                            )
-                                        )
-
-                                    case _ => throw new AssertionError(s"Unexpected type: $typ")
+                                        case _ => throw new AssertionError(s"Unexpected type: $typ")
+                                    }
                                 }
+                                else
+                                    None
                             }
                         case None => // No-op.
                     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 41fc484..d85c9d6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -48,7 +48,7 @@ object NCSentenceManager extends NCService {
       *
       * @param notes
       */
-    private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
+    def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
         val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
 
         for (n <- notes.filter(n => n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
@@ -79,7 +79,7 @@ object NCSentenceManager extends NCService {
       *
       * @param notes
       */
-    private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
+    def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
         notes.
             filter(_.isUser).
             flatMap(n => {
@@ -677,15 +677,9 @@ object NCSentenceManager extends NCService {
                         map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
                         toSeq.sortBy(-_.size)
 
-//                println("!!!!!toksByIdx.size="+toksByIdx.size)
-//                println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
-//                println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))
-                        
                 def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
                     val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
 
-//                    println("!!! combinations=" + res.size)
-
                     res
                 }
 
@@ -742,7 +736,6 @@ object NCSentenceManager extends NCService {
             )
         )
 
-
         def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
 
         // Drops similar sentences (with same notes structure). Keeps with more found.

[incubator-nlpcraft] 05/08: Merge branch 'master' into NLPCRAFT-443

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 054bc51789c6287a8beebcbbe61ef8c3f9f685f5
Merge: 1891c2c fbb64bc
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Sep 17 11:56:06 2021 +0300

    Merge branch 'master' into NLPCRAFT-443

 nlpcraft-examples/alarm/src/main/resources/alarm_intents.idl          | 2 +-
 nlpcraft-examples/minecraft-mod/build.gradle                          | 4 ++--
 nlpcraft-examples/minecraft/src/main/resources/minecraft.yaml         | 4 ++--
 nlpcraft-examples/minecraft/src/main/resources/probe.conf             | 2 +-
 .../apache/nlpcraft/examples/minecraft/NCModelValidationSpec.scala    | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

[incubator-nlpcraft] 02/08: Merge branch 'master' into NLPCRAFT-443

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 1856f6cb77c9609c951911c0be8bfef601560003
Merge: 426b8e8 4297ad9
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Sep 16 11:14:07 2021 +0300

    Merge branch 'master' into NLPCRAFT-443

 nlpcraft/src/main/resources/nlpcraft.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

[incubator-nlpcraft] 08/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 25f6015fb5aabe7af651c355e081f72d846c6a28
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Sat Sep 18 12:05:02 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 27 +---------------------
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |  2 --
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  7 ++----
 .../model/stm/indexes/NCSpecModelAdapter.scala     |  4 ++--
 4 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index f8457e8..058d713 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -196,11 +196,7 @@ object NCModelEnricher extends NCProbeEnricher {
         toks.foreach(_.add(note))
 
         // For NLP elements.
-        toks.foreach(t => {
-            ns.fixNote(t.getNlpNote, "direct" -> direct)
-            // TODO:
-            //ns.fixNote(t.getNlpNote, "stopWord" -> false)
-        })
+        toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
     }
 
     /**
@@ -541,27 +537,6 @@ object NCModelEnricher extends NCProbeEnricher {
             val combToks = combosTokens(ns.toSeq)
             lazy val ch = mkComplexes(mdl, ns)
 
-//            logger.info("combToks="+combToks.size)
-//
-//            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
-//
-//            ns.tokens.flatten.filter(!_.isNlp).distinct.sortBy(p => (p.noteType, -p.tokenIndexes.size)).foreach(n => {
-//                val parts =
-//                    n.get("parts") match {
-//                        case Some(v) =>
-//                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
-//
-//                            parts.map(p => {
-//                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
-//
-//                                "part=" + p.id + " (idx=" + ref.index + "), text=" + ref.origText
-//                            }).mkString(" | ")
-//                        case None => "NO"
-//                    }
-//                logger.info(s"${n.noteType}  [${n.wordIndexes.mkString(",")}], parts=$parts")
-//            })
-//
-//            logger.info("---")
 
             def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
                 startScopedSpan(
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 1e31ab0..6e0780e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -407,8 +407,6 @@ object NCSortEnricher extends NCProbeEnricher {
                                         case _ => throw new AssertionError(s"Unexpected type: $typ")
                                     }
                                 }
-                                else
-                                    None
                             }
                         case None => // No-op.
                     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index d85c9d6..415e6ae 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -677,11 +677,8 @@ object NCSentenceManager extends NCService {
                         map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
                         toSeq.sortBy(-_.size)
 
-                def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
-                    val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
-
-                    res
-                }
+                def findCombinations(): Seq[Seq[NCNlpSentenceNote]] =
+                    NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
 
                 val seqSens =
                     combCache.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
index 2328e7c..f9911f6 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
@@ -23,8 +23,8 @@ import org.apache.nlpcraft.NCTestElement
 import org.apache.nlpcraft.model.{NCElement, NCModelAdapter}
 
 import java.util
-import java.util.{Collections, Optional}
-import scala.jdk.CollectionConverters.{SeqHasAsJava, SetHasAsJava}
+import java.util.Optional
+import scala.jdk.CollectionConverters.SetHasAsJava
 
 object NCSpecModelAdapter {
     val mapper = new ObjectMapper()

[incubator-nlpcraft] 04/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 1891c2c2d8e2441d0ad60a2a75f3fafee0dbaa04
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Sep 17 11:55:48 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 216 +++++++++++++++++++--
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  13 +-
 .../model/stop/NCStopWordsInsideSpec.scala         |  11 +-
 .../model/NCEnricherNestedModelSpec4.scala         |  81 ++++++--
 4 files changed, 280 insertions(+), 41 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 22af412..d83ab05 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -21,6 +21,7 @@ import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
 import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCTokenImpl
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
@@ -285,11 +286,7 @@ object NCModelEnricher extends NCProbeEnricher {
         }
     }
 
-    /**
-      *
-      * @param toks
-      */
-    private def combosNlpTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+    private def combosTokens1(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
         combos(toks).flatMap(combo => {
             val stops = combo.filter(_.isStopWord)
 
@@ -303,6 +300,64 @@ object NCModelEnricher extends NCProbeEnricher {
             map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
             sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
 
+
+    /**
+      *
+      * 1. Prepares all contiguous sub-sequences of the tokens (sliding windows of every size).
+      *  Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
+      *  One sentence is converted to 6 pieces.
+      *
+      * 2. Additionally, each piece is expanded into a set of variants obtained by removing combinations of its inner stopwords.
+      *  Example: the piece 'x1, x2(stopword), x3(stopword), x4' will be expanded into
+      *  {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+      *
+      * 3. All variants are collected, duplicates are removed and the result is sorted.
+      *
+      * @param toks
+      */
+    private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+        combos(toks).flatMap(combo => {
+            val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
+
+            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]]
+
+            for (stop <- stops)
+                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+                    slides.last += stop
+                else
+                    slides += mutable.ArrayBuffer.empty :+ stop
+
+            val bigSlides = slides.filter(_.size > 2)
+
+            var stops4Delete: Seq[Seq[NlpToken]] =
+
+                if (bigSlides.nonEmpty) {
+                    val allBig = bigSlides.flatMap(p => p)
+                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+                    if (stops4AllCombs.nonEmpty)
+                        for (
+                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                        )
+                        yield seq1 ++ seq2.flatMap(p => p)
+                    else
+                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+                            yield seq.flatMap(p => p)
+                }
+                else
+                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+
+        }).
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+
     /**
       *
       * @param toks
@@ -315,9 +370,18 @@ object NCModelEnricher extends NCProbeEnricher {
       * @param seq
       * @param s
       */
-    private def toParts(seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
+    private def toParts(mdl: NCProbeModel, stvReqId: String, seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
         seq.zip(s.map(_.kind)).flatMap {
-            case (complex, kind) => if (complex.isLeft) Some(complex.swap.toOption.get -> kind) else None
+            case (complex, kind) =>
+                if (complex.isLeft)
+                    Some(complex.swap.toOption.get -> kind)
+                else {
+                    val clone = complex.toOption.get.clone()
+
+                    clone.filter(!_.isNlp).foreach(clone.remove)
+
+                    Some(NCTokenImpl(mdl, stvReqId, clone) -> kind)
+                }
         }
 
     /**
@@ -457,8 +521,8 @@ object NCModelEnricher extends NCProbeEnricher {
       */
     private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
         require(matched.nonEmpty)
-
         // Matched tokens should be already sorted.
+
         val stopsInside = toks2Match.filter(t =>
             t.isStopWord && !matched.contains(matched) && t.index > matched.head.index && t.index < matched.last.index
         )
@@ -470,13 +534,38 @@ object NCModelEnricher extends NCProbeEnricher {
     override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
 
+        //logger.info("ENRICH111")
+
         startScopedSpan(
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
         ) { span =>
             val req = NCRequestImpl(senMeta, ns.srvReqId)
-            val combToks = combosNlpTokens(ns.toSeq)
+            val combToks = combosTokens(ns.toSeq)
             lazy val ch = mkComplexes(mdl, ns)
 
+//            logger.info("combToks="+combToks.size)
+//
+//            logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
+//
+//            ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+//                val parts =
+//                    n.get("parts") match {
+//                        case Some(v) =>
+//                            val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
+//
+//                            "all parts=" + parts.size + "  " +
+//                            parts.map(p => {
+//                                val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
+//
+//                                "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+//                            }).mkString(" | ")
+//                        case None => "NO"
+//                    }
+//                logger.info(s"${n.noteType}  [${n.wordIndexes.mkString(",")}], parts=$parts")
+//            })
+//
+//            logger.info("---")
+
             def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
                 startScopedSpan(
                     "execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
@@ -489,8 +578,7 @@ object NCModelEnricher extends NCProbeEnricher {
                     lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
 
                     for (
-                        // toksExt is part of sentence.
-                        // toks is toksExt or toksExt without some stopwords set. All stopwords combinations are taking into account.
+                        // 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
                         (toks, toksExt) <- combToks;
                         idxs = toks.map(_.index);
                         e <- mdl.elements.values;
@@ -500,6 +588,11 @@ object NCModelEnricher extends NCProbeEnricher {
                             !greedy ||
                             !contCache(eId).exists(_.containsSlice(idxs))  && !alreadyMarked(ns, eId, toks, idxs)
                     ) {
+//                        println("!!!toks="+toks.map(_.origText).mkString(" "))
+//                        println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
+//                        println()
+
+
                         // 1. SIMPLE.
                         if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
                             lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -542,9 +635,6 @@ object NCModelEnricher extends NCProbeEnricher {
                                 for (s <- get(mdl.sparseSynonyms, eId))
                                     s.sparseMatch(toks) match {
                                         case Some(res) =>
-//                                            println("!!!toks="+toks.map(_.origText))
-//                                            println("!!!res="+res.map(_.origText))
-//                                            println
                                             add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
                                         case None => // No-op.
                                     }
@@ -566,7 +656,9 @@ object NCModelEnricher extends NCProbeEnricher {
                                     data = comb.map(_.data)
                                 )
                                     if (s.isMatch(data, req)) {
-                                        add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, toParts(data, s))
+                                        val parts = toParts(mdl, ns.srvReqId, data, s)
+
+                                        add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts)
 
                                         idlCache += comb
 
@@ -583,7 +675,9 @@ object NCModelEnricher extends NCProbeEnricher {
                                         case Some(res) =>
                                             val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
 
-                                            add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, toParts(res, s))
+                                            val parts = toParts(mdl, ns.srvReqId, res, s)
+
+                                            add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, parts)
 
                                             idlCache += comb
                                         case None => // No-op.
@@ -604,6 +698,8 @@ object NCModelEnricher extends NCProbeEnricher {
 
             processParsers(mdl, ns, span, req)
         }
+
+        //logger.info("ENRICH222")
     }
 
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -633,4 +729,92 @@ object NCModelEnricher extends NCProbeEnricher {
             )
         ))
     }
+}
+
+object x extends App {
+    case class T(index: Int, isStopWord: Boolean = false) {
+        override def toString: String = index.toString
+    }
+
+    private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] =
+        combos(toks).flatMap(combo => {
+            val stops = combo.filter(_.isStopWord)
+
+            val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
+        }).
+            toMap.
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
+            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
+
+    private def combos[T](toks: Seq[T]): Seq[Seq[T]] = {
+        val x  = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
+
+        println("size=" + x.size)
+
+        x
+    }
+
+    private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = {
+        val value = combos(toks)
+
+        value.flatMap(combo => {
+            val stops = combo.filter(_.isStopWord)
+
+            val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]]
+
+            for (stop <- stops)
+                if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+                    slides.last += stop
+                else
+                    slides += mutable.ArrayBuffer.empty :+ stop
+
+            val bigSlides = slides.filter(_.size >= 3)
+
+            var stops4Delete: Seq[Seq[T]] =
+                if (bigSlides.nonEmpty) {
+                    val allBig = bigSlides.flatten
+                    val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+                    if (stops4AllCombs.nonEmpty)
+                        for (
+                            seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+                                seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                        )
+                        yield seq1 ++ seq2.flatMap(p => p)
+                    else
+                        for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+                            yield seq.flatMap(p => p)
+                }
+                else
+                    Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+            stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+        }).
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+    }
+
+    def go(): Unit = {
+        val combs = combosTokens(
+//            Seq(
+//                T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true),  T(9),  T(10, true),  T(11, true),  T(12)
+//            )
+
+            Range.inclusive(0, 12).map(T(_, true))
+        )
+
+        println("All=" + combs.size)
+
+        combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|"))  }
+    }
+
+    go()
 }
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index f6855ea..41fc484 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -676,9 +676,18 @@ object NCSentenceManager extends NCService {
                         groupBy { case (idx, _) => idx }.
                         map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
                         toSeq.sortBy(-_.size)
+
+//                println("!!!!!toksByIdx.size="+toksByIdx.size)
+//                println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
+//                println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))
                         
-                def findCombinations(): Seq[Seq[NCNlpSentenceNote]] =
-                    NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+                def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = {
+                    val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+
+//                    println("!!! combinations=" + res.size)
+
+                    res
+                }
 
                 val seqSens =
                     combCache.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index 9e3e911..3cc26f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -32,9 +32,7 @@ class NCStopWordsInsideModel extends NCModelAdapter("nlpcraft.test", "Test Model
     override def getElements: util.Set[NCElement] = Set(NCTestElement("complex", "a b"))
 
     @NCIntent("intent=i term={# == 'complex'}")
-    def onI(
-        ctx: NCIntentMatch
-    ): NCResult = {
+    def onI(ctx: NCIntentMatch): NCResult = {
         require(ctx.getContext.getVariants.size() == 1)
         require(ctx.getContext.getVariants.asScala.head.asScala.size == 1)
         require(ctx.getContext.getVariants.asScala.head.asScala.head.getNormalizedText == ctx.getContext.getRequest.getNormalizedText)
@@ -68,12 +66,7 @@ class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
 class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec {
     @Test
     def test2(): Unit = {
-        //checkIntent("a b", "i")
-        checkIntent("a the b", "i")
-//        checkIntent("a , b", "i")
-//        checkIntent("a, b", "i")
-//        checkIntent("a, the b", "i")
-//        checkIntent("a, the, b", "i")
+        // TODO:
     }
 }
 
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..825e4a2 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -24,10 +24,8 @@ import org.junit.jupiter.api.Test
 import java.util
 import scala.jdk.CollectionConverters.SetHasAsJava
 
-/**
-  * Nested Elements test model.
-  */
-class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
+// It shouldn't be too slow.
+class NCNestedTestModel4Adapter extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
     override def getElements: util.Set[NCElement] =
         Set(
             NCTestElement("e1", "//[a-zA-Z0-9]+//"),
@@ -36,34 +34,89 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
 
     override def getAbstractTokens: util.Set[String] = Set("e1").asJava
     override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+}
 
-    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[8, 100]")
-    def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+/**
+  * Greedy(one element expected) + not permuted.
+  */
+class NCNestedTestModel41 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}")
+    def onAB(): NCResult = NCResult.text("OK")
 
+    override def isGreedy: Boolean = true
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
 }
 
 /**
-  * It shouldn't be too slow.
+  *
   */
 @NCTestEnvironment(model = classOf[NCNestedTestModel41], startClient = true)
 class NCEnricherNestedModelSpec41 extends NCTestContext {
-    @Test
+    // @Test
+    def test(): Unit = checkIntent("the a " * 11, "onE2")
+}
+
+/**
+  * Not-greedy(few elements expected) + not permuted.
+  */
+class NCNestedTestModel42 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = false
+    override def isPermutateSynonyms: Boolean = false
+    override def isSparse: Boolean = false
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCNestedTestModel41], startClient = true)
+class NCEnricherNestedModelSpec42 extends NCTestContext {
+    // @Test
     def test(): Unit = checkIntent("the a " * 11, "onE2")
 }
 
-class NCNestedTestModel42 extends NCNestedTestModel41 {
+/**
+  * Greedy(one element expected) + permuted.
+  */
+class NCNestedTestModel43 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[1, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = true
     override def isPermutateSynonyms: Boolean = true
     override def isSparse: Boolean = true
 }
 
 /**
-  * It shouldn't be too slow.
+  *
   */
-@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
-class NCEnricherNestedModelSpec42 extends NCTestContext {
-    @Test
-    def test(): Unit = checkIntent("the a " * 8, "onE2")
+@NCTestEnvironment(model = classOf[NCNestedTestModel43], startClient = true)
+class NCEnricherNestedModelSpec43 extends NCTestContext {
+    // @Test
+    def test(): Unit = checkIntent("the a " * 4, "onE2")
+}
+
+/**
+  * Not-greedy(few elements expected) + permuted.
+  */
+class NCNestedTestModel44 extends NCNestedTestModel4Adapter {
+    @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+    def onAB(): NCResult = NCResult.text("OK")
+
+    override def isGreedy: Boolean = false
+    override def isPermutateSynonyms: Boolean = true
+    override def isSparse: Boolean = true
+}
+
+/**
+  *
+  */
+@NCTestEnvironment(model = classOf[NCNestedTestModel44], startClient = true)
+class NCEnricherNestedModelSpec44 extends NCTestContext {
+    // @Test
+    def test(): Unit = checkIntent("the a " * 2, "onE2")
 }
 

[incubator-nlpcraft] 01/08: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 426b8e8bdedb569634ea657b3f6f859a0045b467
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Sep 16 11:13:17 2021 +0300

    WIP.
---
 .../probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala       |  5 ++++-
 .../org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala | 10 +++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index ca388ed..6908265 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -525,7 +525,10 @@ object NCModelEnricher extends NCProbeEnricher {
                             if (!found && mdl.hasSparseSynonyms)
                                 for (s <- get(mdl.sparseSynonyms, eId))
                                     s.sparseMatch(toks) match {
-                                        case Some(res) => add("simple sparse", ns, contCache, eId, greedy, res, idxs, s)
+                                        case Some(res) =>
+                                            println("!!!toks="+toks.map(_.origText))
+                                            println("!!!res="+res.map(_.origText))
+                                            add("simple sparse", ns, contCache, eId, greedy, res, idxs, s)
                                         case None => // No-op.
                                     }
                         }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index b17cea8..9e3e911 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -68,12 +68,12 @@ class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
 class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec {
     @Test
     def test2(): Unit = {
-        checkIntent("a b", "i")
+        //checkIntent("a b", "i")
         checkIntent("a the b", "i")
-        checkIntent("a , b", "i")
-        checkIntent("a, b", "i")
-        checkIntent("a, the b", "i")
-        checkIntent("a, the, b", "i")
+//        checkIntent("a , b", "i")
+//        checkIntent("a, b", "i")
+//        checkIntent("a, the b", "i")
+//        checkIntent("a, the, b", "i")
     }
 }