You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/01/25 19:12:48 UTC
[incubator-nlpcraft] branch master updated: NLP sentence processing minor fix.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new f63c9d0 NLP sentence processing minor fix.
f63c9d0 is described below
commit f63c9d04c5b06381a80a63a647969d1efea99c20
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jan 25 22:12:33 2021 +0300
NLP sentence processing minor fix.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 131 ++++++++++++++-------
.../nlp/enrichers/limit/NCEnricherLimitSpec.scala | 21 +++-
.../nlp/enrichers/sort/NCEnricherSortSpec.scala | 5 +
3 files changed, 113 insertions(+), 44 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index e1614c6..d1aeb60 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -17,12 +17,11 @@
package org.apache.nlpcraft.common.nlp
-import java.util
-import java.util.Collections
-
import org.apache.nlpcraft.common.NCE
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
+import java.util
+import java.util.Collections
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, Set, mutable}
@@ -40,16 +39,7 @@ object NCNlpSentence {
* @return
*/
private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = {
- val types =
- idxs.flatMap(idx ⇒ {
- val types = ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)
-
- types.size match {
- case 0 ⇒ None
- case 1 ⇒ Some(types.head)
- case _ ⇒ throw new AssertionError(s"Unexpected tokes: ${ns(idx)}")
- }
- }).distinct
+ val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct
/**
* Example:
@@ -84,15 +74,23 @@ object NCNlpSentence {
* Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references.
*
* @param noteType Note type.
+ * @param idxsField Indexes field.
+ * @param noteField Note field.
* @param ns Sentence.
* @param history Indexes transformation history.
* @return Valid flag.
*/
- private def fixIndexesReferences(noteType: String, ns: NCNlpSentence, history: Seq[(Int, Int)]): Boolean = {
+ private def fixIndexesReferences(
+ noteType: String,
+ idxsField: String,
+ noteField: String,
+ ns: NCNlpSentence,
+ history: Seq[(Int, Int)]
+ ): Boolean = {
ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
- tok.getNoteOpt(noteType, "indexes") match {
+ tok.getNoteOpt(noteType, idxsField) match {
case Some(n) ⇒
- val idxs: Seq[Int] = n.data[java.util.List[Int]]("indexes").asScala
+ val idxs: Seq[Int] = n.data[java.util.List[Int]](idxsField).asScala
var fixed = idxs
history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) }
@@ -106,11 +104,46 @@ object NCNlpSentence {
)
ns.flatMap(_.getNotes(noteType)).forall(
- n ⇒ checkRelation(ns, n.data[java.util.List[Int]]("indexes").asScala, n.data[String]("note"), n)
+ n ⇒ checkRelation(ns, n.data[java.util.List[Int]]("indexes").asScala, n.data[String](noteField), n)
)
}
/**
+ *
+ * @param note
+ * @param idxsField
+ * @param noteField
+ * @param ns
+ */
+ private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
+ ns.flatMap(_.getNotes(note)).foreach(
+ n ⇒ checkRelation(ns, n.data[java.util.List[Int]](idxsField).asScala, n.data[String](noteField), n)
+ )
+
+ /**
+ *
+ * @param note
+ * @param idxsField
+ * @param noteField
+ * @param ns
+ */
+ private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
+ ns.flatMap(_.getNotes(note)).foreach(rel ⇒
+ rel.dataOpt[java.util.List[java.util.List[Int]]](idxsField) match {
+ case Some(idxsList) ⇒
+ val notesTypes = rel.data[util.List[String]](noteField)
+
+ require(idxsList.size() == notesTypes.size())
+
+ idxsList.asScala.zip(notesTypes.asScala).foreach {
+ case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel)
+ }
+ case None ⇒ // No-op.
+ }
+ )
+
+
+ /**
* Copies token.
*
* @param ns Sentence.
@@ -357,20 +390,22 @@ object NCNlpSentence {
for (tok ← ns.filter(_.isTypeOf(noteType)) if ok)
tok.getNoteOpt(noteType, idxsField) match {
case Some(n) ⇒
- val idxs: Seq[Seq[Int]] = n.data[java.util.List[java.util.List[Int]]](idxsField).asScala.map(_.asScala)
+ val idxs: Seq[Seq[Int]] =
+ n.data[java.util.List[java.util.List[Int]]](idxsField).asScala.map(_.asScala)
var fixed = idxs
- history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct) }
+ history.foreach {
+ case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct)
+ }
if (fixed.forall(_.size == 1))
- // Fix double dimension array to one dimension,
- // so it should be called always in spite of 'fixIndexesReferences' method.
- ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[java.io.Serializable])
+ // Fix double dimension array to one dimension,
+ // so it should be called always in spite of 'fixIndexesReferences' method.
+ ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[java.io.Serializable])
else
- ok = false
+ ok = false
case None ⇒ // No-op.
}
-
ok &&
ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
rel.dataOpt[java.util.List[Int]](idxsField) match {
@@ -412,26 +447,33 @@ object NCNlpSentence {
val history = mutable.ArrayBuffer.empty[(Int, Int)]
- notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
+ fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns)
+ fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns)
+ fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns)
+ fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns)
+ notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
unionStops(ns, notNlpTypes, history)
val res =
- Seq("nlpcraft:relation", "nlpcraft:limit").forall(t ⇒ fixIndexesReferences(t, ns, history)) &&
- fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
+ fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
+ fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
- if (res)
+ if (res) {
// Validation (all indexes calculated well)
require(
- !ns.flatten.
- exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
- s"Invalid sentence:\n" +
+ !res ||
+ !ns.flatten.
+ exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
+ s"Invalid sentence:\n" +
ns.map(t ⇒
- // Human readable invalid sentence for debugging.
- s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
+ // Human readable invalid sentence for debugging.
+ s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
).mkString("\n")
)
+ }
res
}
@@ -522,11 +564,17 @@ class NCNlpSentence(
(
// System notes don't have such flags.
if (p.isUser) {
- if (p.isDirect) 0 else 1
+ if (p.isDirect)
+ 0
+ else
+ 1
}
else
0,
- if (p.isUser) p.sparsity else 0
+ if (p.isUser)
+ p.sparsity
+ else
+ 0
)
)).
flatMap(_.drop(1)).
@@ -619,7 +667,10 @@ class NCNlpSentence(
m.values.map(_.sentence).toSeq
}
else {
- if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct)) Seq(this) else Seq.empty
+ if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct))
+ Seq(this)
+ else
+ Seq.empty
}.distinct
sens.foreach(sen ⇒
@@ -703,9 +754,9 @@ class NCNlpSentence(
override def equals(obj: Any): Boolean = obj match {
case x: NCNlpSentence ⇒
tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
+ srvReqId == x.srvReqId &&
+ text == x.text &&
+ enabledBuiltInToks == x.enabledBuiltInToks
case _ ⇒ false
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
index e983030..624fbe4 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
@@ -18,7 +18,7 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit
import org.apache.nlpcraft.NCTestEnvironment
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestLimitToken => lim, NCTestUserToken => usr}
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.{NCDefaultTestModel, NCEnricherBaseSpec, NCTestLimitToken ⇒ lim, NCTestUserToken ⇒ usr, NCTestNlpToken ⇒ nlp}
import org.junit.jupiter.api.Test
/**
@@ -58,10 +58,23 @@ class NCEnricherLimitSpec extends NCEnricherBaseSpec {
lim(text = "top 10", limit = 10, index = 1, note = "D1", asc = false),
usr(text = "D1", id = "D1")
),
- _ ⇒ checkExists(
+ _ ⇒ checkAll(
"handful of A B",
- lim(text = "handful of", limit = 5, index = 1, note = "AB", asc = false),
- usr(text = "A B", id = "AB")
+ Seq(
+ lim(text = "handful of", limit = 5, index = 1, note = "AB", asc = false),
+ usr(text = "A B", id = "AB")
+ ),
+ Seq(
+ lim(text = "handful of", limit = 5, index = 1, note = "A", asc = false),
+ usr(text = "A", id = "A"),
+ usr(text = "B", id = "B")
+ ),
+ Seq(
+ nlp("handful"),
+ nlp("of"),
+ usr(text = "A", id = "A"),
+ usr(text = "B", id = "B")
+ )
)
)
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index cc03066..3317331 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -153,6 +153,11 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
usr(text = "A", id = "A"),
usr(text = "B", id = "B"),
srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
+ ),
+ Seq(
+ usr(text = "A", id = "A"),
+ usr(text = "B", id = "B"),
+ nlp(text = "classify")
)
),
_ ⇒ checkAll(