You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/01/25 19:12:48 UTC

[incubator-nlpcraft] branch master updated: NLP sentence processing minor fix.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/master by this push:
     new f63c9d0  NLP sentence processing minor fix.
f63c9d0 is described below

commit f63c9d04c5b06381a80a63a647969d1efea99c20
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jan 25 22:12:33 2021 +0300

    NLP sentence processing minor fix.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 131 ++++++++++++++-------
 .../nlp/enrichers/limit/NCEnricherLimitSpec.scala  |  21 +++-
 .../nlp/enrichers/sort/NCEnricherSortSpec.scala    |   5 +
 3 files changed, 113 insertions(+), 44 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index e1614c6..d1aeb60 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -17,12 +17,11 @@
 
 package org.apache.nlpcraft.common.nlp
 
-import java.util
-import java.util.Collections
-
 import org.apache.nlpcraft.common.NCE
 import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 
+import java.util
+import java.util.Collections
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.{Map, Seq, Set, mutable}
@@ -40,16 +39,7 @@ object NCNlpSentence {
       * @return
       */
     private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = {
-        val types =
-            idxs.flatMap(idx ⇒ {
-                val types = ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)
-
-                types.size match {
-                    case 0 ⇒ None
-                    case 1 ⇒ Some(types.head)
-                    case _ ⇒ throw new AssertionError(s"Unexpected tokes: ${ns(idx)}")
-                }
-            }).distinct
+        val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct
 
         /**
           * Example:
@@ -84,15 +74,23 @@ object NCNlpSentence {
       * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references.
       *
       * @param noteType Note type.
+      * @param idxsField Indexes field.
+      * @param noteField Note field.
       * @param ns Sentence.
       * @param history Indexes transformation history.
       * @return Valid flag.
       */
-    private def fixIndexesReferences(noteType: String, ns: NCNlpSentence, history: Seq[(Int, Int)]): Boolean = {
+    private def fixIndexesReferences(
+        noteType: String,
+        idxsField: String,
+        noteField: String,
+        ns: NCNlpSentence,
+        history: Seq[(Int, Int)]
+    ): Boolean = {
         ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
-            tok.getNoteOpt(noteType, "indexes") match {
+            tok.getNoteOpt(noteType, idxsField) match {
                 case Some(n) ⇒
-                    val idxs: Seq[Int] = n.data[java.util.List[Int]]("indexes").asScala
+                    val idxs: Seq[Int] = n.data[java.util.List[Int]](idxsField).asScala
                     var fixed = idxs
 
                     history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) }
@@ -106,11 +104,46 @@ object NCNlpSentence {
         )
 
         ns.flatMap(_.getNotes(noteType)).forall(
-            n ⇒ checkRelation(ns, n.data[java.util.List[Int]]("indexes").asScala, n.data[String]("note"), n)
+            n ⇒ checkRelation(ns, n.data[java.util.List[Int]]("indexes").asScala, n.data[String](noteField), n)
         )
     }
 
     /**
+      * Calls `checkRelation` for every note of type `note` found in the sentence; the Boolean results are discarded.
+      * @param note Note type whose notes are collected from the sentence tokens via `getNotes`.
+      * @param idxsField Name of the note data field read as `java.util.List[Int]` and passed as `idxs`.
+      * @param noteField Name of the note data field read as `String` and passed as `notesType`.
+      * @param ns Sentence supplying the notes; also passed to `checkRelation`.
+      */
+    private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
+        ns.flatMap(_.getNotes(note)).foreach(
+            n ⇒ checkRelation(ns, n.data[java.util.List[Int]](idxsField).asScala, n.data[String](noteField), n)
+        )
+
+    /**
+      * For each note of type `note` that has `idxsField`, calls `checkRelation` once per (index list, note type) pair.
+      * @param note Note type whose notes are collected from the sentence tokens via `getNotes`.
+      * @param idxsField Name of the optional data field read as a list of `Int` lists; notes without it are skipped.
+      * @param noteField Name of the data field read as `util.List[String]`; `require` enforces equal size with `idxsField`.
+      * @param ns Sentence supplying the notes; also passed to `checkRelation`.
+      */
+    private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
+        ns.flatMap(_.getNotes(note)).foreach(rel ⇒
+            rel.dataOpt[java.util.List[java.util.List[Int]]](idxsField) match {
+                case Some(idxsList) ⇒
+                    val notesTypes = rel.data[util.List[String]](noteField)
+
+                    require(idxsList.size() == notesTypes.size())
+
+                    idxsList.asScala.zip(notesTypes.asScala).foreach {
+                        case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel)
+                    }
+                case None ⇒ // No-op.
+            }
+        )
+
+
+    /**
       * Copies token.
       *
       * @param ns Sentence.
@@ -357,20 +390,22 @@ object NCNlpSentence {
         for (tok ← ns.filter(_.isTypeOf(noteType)) if ok)
             tok.getNoteOpt(noteType, idxsField) match {
                 case Some(n) ⇒
-                    val idxs: Seq[Seq[Int]] = n.data[java.util.List[java.util.List[Int]]](idxsField).asScala.map(_.asScala)
+                    val idxs: Seq[Seq[Int]] =
+                        n.data[java.util.List[java.util.List[Int]]](idxsField).asScala.map(_.asScala)
                     var fixed = idxs
 
-                    history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct) }
+                    history.foreach {
+                        case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct)
+                    }
 
                     if (fixed.forall(_.size == 1))
-                    // Fix double dimension array to one dimension,
-                    // so it should be called always in spite of 'fixIndexesReferences' method.
-                    ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[java.io.Serializable])
+                        // Fix double dimension array to one dimension,
+                        // so it should be called always in spite of 'fixIndexesReferences' method.
+                        ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[java.io.Serializable])
                     else
-                    ok = false
+                        ok = false
                 case None ⇒ // No-op.
             }
-
         ok &&
             ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
                 rel.dataOpt[java.util.List[Int]](idxsField) match {
@@ -412,26 +447,33 @@ object NCNlpSentence {
 
         val history = mutable.ArrayBuffer.empty[(Int, Int)]
 
-        notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
+        fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns)
+        fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns)
+        fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns)
+        fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns)
 
+        notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
         unionStops(ns, notNlpTypes, history)
 
         val res =
-            Seq("nlpcraft:relation", "nlpcraft:limit").forall(t ⇒ fixIndexesReferences(t, ns, history)) &&
-                fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
-                fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
+            fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
+            fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
+            fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
+            fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
 
-        if (res)
+        if (res) {
             // Validation (all indexes calculated well)
             require(
-                !ns.flatten.
-                exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
-                    s"Invalid sentence:\n" +
+                !res ||
+                    !ns.flatten.
+                        exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
+                s"Invalid sentence:\n" +
                     ns.map(t ⇒
-                    // Human readable invalid sentence for debugging.
-                    s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
+                        // Human readable invalid sentence for debugging.
+                        s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
                     ).mkString("\n")
             )
+        }
 
         res
     }
@@ -522,11 +564,17 @@ class NCNlpSentence(
                 (
                     // System notes don't have such flags.
                     if (p.isUser) {
-                        if (p.isDirect) 0 else 1
+                        if (p.isDirect)
+                            0
+                        else
+                            1
                     }
                     else
                         0,
-                    if (p.isUser) p.sparsity else 0
+                    if (p.isUser)
+                        p.sparsity
+                    else
+                        0
                 )
             )).
             flatMap(_.drop(1)).
@@ -619,7 +667,10 @@ class NCNlpSentence(
                 m.values.map(_.sentence).toSeq
             }
             else {
-                if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct)) Seq(this) else Seq.empty
+                if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct))
+                    Seq(this)
+                else
+                    Seq.empty
             }.distinct
 
         sens.foreach(sen ⇒
@@ -703,9 +754,9 @@ class NCNlpSentence(
     override def equals(obj: Any): Boolean = obj match {
         case x: NCNlpSentence ⇒
             tokens == x.tokens &&
-            srvReqId == x.srvReqId &&
-            text == x.text &&
-            enabledBuiltInToks == x.enabledBuiltInToks
+                srvReqId == x.srvReqId &&
+                text == x.text &&
+                enabledBuiltInToks == x.enabledBuiltInToks
 
         case _ ⇒ false
     }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
index e983030..624fbe4 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
@@ -18,7 +18,7 @@
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit
 
 import org.apache.nlpcraft.NCTestEnvironment
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestLimitToken => lim, NCTestUserToken => usr}
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestLimitToken ⇒ lim, NCTestUserToken ⇒ usr,  NCTestNlpToken ⇒ nlp}
 import org.junit.jupiter.api.Test
 
 /**
@@ -58,10 +58,23 @@ class NCEnricherLimitSpec extends NCEnricherBaseSpec {
                 lim(text = "top 10", limit = 10, index = 1, note = "D1", asc = false),
                 usr(text = "D1", id = "D1")
             ),
-            _ ⇒ checkExists(
+            _ ⇒ checkAll(
                 "handful of A B",
-                lim(text = "handful of", limit = 5, index = 1, note = "AB", asc = false),
-                usr(text = "A B", id = "AB")
+                Seq(
+                    lim(text = "handful of", limit = 5, index = 1, note = "AB", asc = false),
+                    usr(text = "A B", id = "AB")
+                ),
+                Seq(
+                    lim(text = "handful of", limit = 5, index = 1, note = "A", asc = false),
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B")
+                ),
+                Seq(
+                    nlp("handful"),
+                    nlp("of"),
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B")
+                )
             )
         )
 }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index cc03066..3317331 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -153,6 +153,11 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
                     usr(text = "A", id = "A"),
                     usr(text = "B", id = "B"),
                     srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
+                ),
+                Seq(
+                    usr(text = "A", id = "A"),
+                    usr(text = "B", id = "B"),
+                    nlp(text = "classify")
                 )
             ),
             _ ⇒ checkAll(