You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/02/12 20:23:52 UTC

[incubator-nlpcraft] branch NLPCRAFT-236 updated: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-236
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-236 by this push:
     new cf19be3  WIP.
cf19be3 is described below

commit cf19be35609cb2c51cebc7f276411c09d4562b7b
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Feb 12 23:23:38 2021 +0300

    WIP.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  57 +++++++-----
 .../nlpcraft/probe/mgrs/NCProbeVariants.scala      | 103 +++++++++++++++------
 .../abstract/NCAbstractTokensVariantsSpec.scala    |  97 +++++++++++++++++++
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    |   5 -
 4 files changed, 207 insertions(+), 55 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 0d65199..a446204 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -368,7 +368,6 @@ object NCNlpSentence {
         t
     }
 
-
     /**
       * Fixes notes with references list to other notes indexes.
       *
@@ -495,7 +494,8 @@ class NCNlpSentence(
     val srvReqId: String,
     val text: String,
     val enabledBuiltInToks: Set[String],
-    override val tokens: ArrayBuffer[NCNlpSentenceToken] = new ArrayBuffer[NCNlpSentenceToken](32)
+    override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
+    val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty
 ) extends NCNlpSentenceTokenBuffer(tokens) with java.io.Serializable {
     @transient
     private var hash: java.lang.Integer = _
@@ -505,7 +505,13 @@ class NCNlpSentence(
 
     // Deep copy.
     override def clone(): NCNlpSentence =
-        new NCNlpSentence(srvReqId, text, enabledBuiltInToks, tokens.map(_.clone()))
+        new NCNlpSentence(
+            srvReqId,
+            text,
+            enabledBuiltInToks,
+            tokens.map(_.clone()),
+            deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone()))
+        )
 
     /**
       * Utility method that gets set of notes for given note type collected from
@@ -548,7 +554,7 @@ class NCNlpSentence(
 
         case class Key(id: String, start: Int, end: Int) {
             private def in(i: Int): Boolean = i >= start && i <= end
-            def intersect(id: String, start: Int, end: Int): Boolean = in(start) || in(end)
+            def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
         }
 
         val keys: Seq[Key] =
@@ -583,6 +589,13 @@ class NCNlpSentence(
       */
     @throws[NCE]
     def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+        def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
+            if (lastPhase)
+                dropAbstract(mdl, ns)
+
+            if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None
+        }
+
         // Always deletes `similar` notes.
         // Some words with same note type can be detected various ways.
         // We keep only one variant -  with `best` direct and sparsity parameters,
@@ -625,7 +638,7 @@ class NCNlpSentence(
 
         val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
 
-        val sens =
+        var sens =
             if (delCombs.nonEmpty) {
                 val deleted = mutable.ArrayBuffer.empty[Seq[NCNlpSentenceNote]]
 
@@ -641,6 +654,19 @@ class NCNlpSentence(
                             if (!deleted.exists(_.forall(delComb.contains))) {
                                 val nsClone = this.clone()
 
+                                nsClone.deletedNotes ++= delComb.map(n ⇒ {
+                                    val savedDelNote = n.clone()
+                                    val savedDelToks = n.tokenIndexes.map(idx ⇒ nsClone(idx).clone())
+
+                                    val mainNotes =
+                                        savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
+
+                                    for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
+                                        savedDelTok.remove(mainNote)
+
+                                    savedDelNote → savedDelToks
+                                })
+
                                 delComb.foreach(nsClone.removeNote)
 
                                 // Has overlapped notes for some tokens.
@@ -648,13 +674,7 @@ class NCNlpSentence(
 
                                 deleted += delComb
 
-                                if (lastPhase)
-                                    dropAbstract(mdl, nsClone)
-
-                                if (collapseSentence(nsClone, getNotNlpNotes(nsClone).map(_.noteType).distinct))
-                                    Some(nsClone)
-                                else
-                                    None
+                                collapse0(nsClone)
                             }
                             else
                                 None
@@ -697,15 +717,10 @@ class NCNlpSentence(
 
                 m.values.map(_.sentence).toSeq
             }
-            else {
-                if (lastPhase)
-                    dropAbstract(mdl, this)
-
-                if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct))
-                    Seq(this)
-                else
-                    Seq.empty
-            }.distinct
+            else
+                collapse0(this).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
+
+        sens = sens.distinct
 
         sens.foreach(sen ⇒
             sen.foreach(tok ⇒
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index caa5a4a..f5bcfbf 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -17,14 +17,13 @@
 
 package org.apache.nlpcraft.probe.mgrs
 
-import java.io.Serializable
-import java.util
-
-import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
-import org.apache.nlpcraft.common.nlp.NCNlpSentence
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
 import org.apache.nlpcraft.model.NCVariant
 import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCVariantImpl}
 
+import java.io.Serializable
+import java.util
 import scala.collection.JavaConverters._
 import scala.collection.{Seq, mutable}
 
@@ -49,6 +48,12 @@ object NCProbeVariants {
         val keys2Toks = toks.flatten.map(t ⇒ Key(t.getId, t.getStartCharIndex, t.getEndCharIndex) → t).toMap
         val partsKeys = mutable.HashSet.empty[Key]
 
+        val nlpTok2nlpSen: Map[NCNlpSentenceToken, Seq[NCNlpSentence]] =
+            sens.
+            flatMap(sen ⇒ sen.map(_ → sen)).
+            groupBy { case (tok, _) ⇒ tok }.
+            map { case (tok, seq) ⇒ tok → seq.map { case (_, sen) ⇒ sen } }
+
         seq.flatten.foreach { case (tok, tokNlp) ⇒
             if (tokNlp.isUser) {
                 val userNotes = tokNlp.filter(_.isUser)
@@ -72,14 +77,43 @@ object NCProbeVariants {
                             keys2Toks.get(key) match {
                                 case Some(tok) ⇒ tok
                                 case None ⇒
-                                    val toks =
-                                        keys2Toks.filter { case (k, _) ⇒ k.from == key.from && k.to == key.to }.values
-
-                                    require(toks.size == 1, s"Unexpected state [key=$key, tokens=${toks.mkString(",")}]")
-
-                                    val tok = toks.head
-
-                                    tok
+                                    val deletedNotes = nlpTok2nlpSen(tokNlp).flatMap(_.deletedNotes).distinct
+
+                                    def find(noteTypeFilter: String ⇒ Boolean): Option[NCNlpSentenceToken] =
+                                        deletedNotes.toStream.
+                                            flatMap { case (delNote, delNoteToks) ⇒
+                                                if (noteTypeFilter(delNote.noteType)) {
+                                                    val toks =
+                                                        delNoteToks.
+                                                            dropWhile(_.startCharIndex != key.from).
+                                                            reverse.
+                                                            dropWhile(_.endCharIndex != key.to).
+                                                            reverse
+
+                                                    toks.size match {
+                                                        case 0 ⇒ None
+                                                        case 1 ⇒
+                                                            // TODO:
+                                                            Some(toks.head)
+                                                        case _ ⇒
+                                                            // TODO:
+                                                            Some(toks.last)
+                                                    }
+                                                }
+                                                else
+                                                    None
+                                            }.headOption
+
+                                    var nlpTokOpt = find(_ == key.id)
+
+                                    if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
+                                        nlpTokOpt = find(_ ⇒ true)
+
+                                    NCTokenImpl(
+                                        mdl,
+                                        srvReqId,
+                                        nlpTokOpt.getOrElse(throw new NCE(s"Part not found for: $key"))
+                                    )
                             }
                         })
 
@@ -103,27 +137,38 @@ object NCProbeVariants {
         var vars = toks.filter(sen ⇒
             !sen.exists(t ⇒
                 t.getId != "nlpcraft:nlp" &&
-                    partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex))
+                partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex))
             )
         ).map(p ⇒ new NCVariantImpl(p.asJava))
 
         if (lastPhase) {
-            if (vars.size > 1)
+            if (vars.size > 1) {
                 vars = vars.filter(v ⇒ !v.asScala.forall(_.getId == "nlpcraft:nlp"))
-//
-//            // Assertions.
-//            for (v ← vars;
-//                toks = v.asScala;
-//                (tok, idx) ← toks.filter(_.isUserDefined).zipWithIndex;
-//                part ← tok.getPartTokens.asScala
-//            ) {
-//                require(part.getIndex < toks.size, s"Part has unexpected index [tokens=${toks.mkString(", ")}, token=$idx, part=$part]}")
-//
-//                require(
-//                    toks(part.getIndex).getId == part.getId,
-//                    s"Part has unexpected ID [tokens=${toks.mkString(", ")}, token=$idx, part=$part]}"
-//                )
-//            }
+
+                val sorted = vars.sortBy(p ⇒ -p.asScala.count(_.getId != "nlpcraft:nlp"))
+
+                val saved = mutable.ArrayBuffer.empty :+ sorted.head
+
+                for (v ← sorted.tail) {
+                    val toks = v.asScala
+
+                    if (!saved.exists(s ⇒
+                        s.size == toks.size &&
+                        s.asScala.zip(toks).forall {
+                            case (savedTok, tok) ⇒
+                                savedTok.getStartCharIndex == tok.getStartCharIndex &&
+                                savedTok.getEndCharIndex == tok.getEndCharIndex &&
+                                (
+                                    savedTok.getId == tok.getId && savedTok.getMetadata == tok.getMetadata ||
+                                    tok.getId == "nlpcraft:nlp"
+                                )
+                        }
+                    ))
+                        saved += v
+                }
+
+                vars = saved.sortBy(sorted.indexOf)
+            }
         }
 
         vars
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
new file mode 100644
index 0000000..29abbaf
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.`abstract`
+
+import org.apache.nlpcraft.model.{NCContext, NCResult, NCToken}
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import scala.collection.JavaConverters._
+
+class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
+    private def checkId(t: NCToken, id: String): Unit =
+        require(t.getId == id, s"Expected ID: $id, token: $t")
+    private def checkText(t: NCToken, txt: String): Unit =
+        require(t.getOriginalText == txt, s"Expected text: $txt, token: $t")
+
+    override def onContext(ctx: NCContext): NCResult = {
+        val variants = ctx.getVariants.asScala
+
+        ctx.getRequest.getNormalizedText match {
+            case "word the word" ⇒
+                require(variants.size == 1)
+
+                val toks = variants.head.asScala
+
+                require(toks.size == 2)
+
+                val t1 = toks.head
+
+                checkId(t1, "nlpcraft:nlp")
+                checkText(t1, "word")
+
+                val t2 = toks.last
+
+                checkId(t2, "wrapAnyWord")
+                checkText(t2, "the word")
+
+                val t2Parts = t2.getPartTokens.asScala
+
+                require(t2Parts.size == 2)
+
+                checkId(t2Parts.head,"anyWord")
+                checkId(t2Parts.last, "anyWord")
+            case "10 w1 10 w2" ⇒
+                require(variants.size == 1)
+
+                val toks = variants.head.asScala
+
+                require(toks.size == 2)
+
+                val t1 = toks.head
+
+                checkId(t1, "nlpcraft:nlp")
+                checkText(t1, "10")
+
+                val t2 = toks.last
+
+                checkId(t2,"wrapNum")
+                checkText(t2,"w1 10 w2")
+
+                val t2Parts = t2.getPartTokens.asScala
+
+                require(t2Parts.size == 3)
+
+                checkId(t2Parts.head,"nlpcraft:nlp")
+                checkId(t2Parts(1),"nlpcraft:num")
+                checkId(t2Parts.last,"nlpcraft:nlp")
+            case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+@NCTestEnvironment(model = classOf[NCAbstractTokensModelVariants], startClient = true)
+class NCAbstractTokensVariantsSpec extends NCTestContext {
+    @Test
+    def test(): Unit = {
+        checkResult("word the word", "OK")
+        checkResult("10 w1 10 w2", "OK")
+    }
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index 3317331..cc03066 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -153,11 +153,6 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
                     usr(text = "A", id = "A"),
                     usr(text = "B", id = "B"),
                     srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
-                ),
-                Seq(
-                    usr(text = "A", id = "A"),
-                    usr(text = "B", id = "B"),
-                    nlp(text = "classify")
                 )
             ),
             _ ⇒ checkAll(