Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/02/26 06:58:54 UTC
[incubator-nlpcraft] branch master updated: Probe sentences variants processing bugfixes.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 0f5ee6c Probe sentences variants processing bugfixes.
0f5ee6c is described below
commit 0f5ee6c0c9826ed5c7bfd2c3173bcf68f4a1e8bc
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Feb 26 09:58:39 2021 +0300
Probe sentences variants processing bugfixes.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 120 ++++---
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 375 ++++++++++++---------
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 2 +
.../abstract/NCAbstractTokensVariantsSpec.scala | 20 +-
...pec3.scala => NCEnricherNestedModelSpec2.scala} | 48 +--
.../model/NCEnricherNestedModelSpec3.scala | 12 +-
6 files changed, 325 insertions(+), 252 deletions(-)
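
The NCNlpSentence.scala changes below centralize part-key construction in a PartKey companion object instead of repeating the cast-heavy map extraction at each call site. A minimal standalone sketch of that factory pattern, assuming the same "id"/"startcharindex"/"endcharindex" map layout used in the diff (PartKeySketch and the sample map are illustrative only, not project code):

    import java.io.{Serializable ⇒ JSerializable}
    import java.util

    object PartKeySketch extends App {
        case class PartKey(id: String, start: Int, end: Int) {
            require(start <= end)
        }

        object PartKey {
            // Builds a key from the raw "parts" metadata map carried by a user note.
            def apply(m: util.HashMap[String, JSerializable]): PartKey = {
                def get[T](name: String): T = m.get(name).asInstanceOf[T]

                PartKey(get("id"), get("startcharindex"), get("endcharindex"))
            }
        }

        // One factory call replaces three unchecked casts at every call site.
        val m = new util.HashMap[String, JSerializable]()

        m.put("id", "nlpcraft:nlp")
        m.put("startcharindex", Int.box(0))
        m.put("endcharindex", Int.box(5))

        println(PartKey(m)) // PartKey(nlpcraft:nlp,0,5)
    }
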
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 0583a3d..113e088 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -34,17 +34,32 @@ import scala.language.implicitConversions
object NCNlpSentence extends LazyLogging {
implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
- private case class NoteLink(note: String, indexes: Seq[Int])
- private case class PartKey(id: String, start: Int, end: Int) {
+ case class NoteKey(start: Int, end: Int)
+ case class TokenKey(id: String, start: Int, end: Int)
+ case class NoteLink(note: String, indexes: Seq[Int])
+
+ case class PartKey(id: String, start: Int, end: Int) {
+ require(start <= end)
+
private def in(i: Int): Boolean = i >= start && i <= end
def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
}
+ object PartKey {
+ def apply(m: util.HashMap[String, JSerializable]): PartKey = {
+ def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+ PartKey(get("id"), get("startcharindex"), get("endcharindex"))
+ }
+
+ def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
+ PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
+ }
private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
for (n ← notes.filter(n ⇒ n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
- noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala)
+ noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala.sorted)
for (n ← notes.filter(_.noteType == "nlpcraft:sort")) {
def add(noteName: String, idxsName: String): Unit = {
@@ -55,7 +70,7 @@ object NCNlpSentence extends LazyLogging {
noteLinks ++=
(for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
- yield NoteLink(name, idxs)
+ yield NoteLink(name, idxs.sorted)
)
}
@@ -73,14 +88,7 @@ object NCNlpSentence extends LazyLogging {
val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
optList
- }).flatMap(_.asScala).
- map(map ⇒
- PartKey(
- map.get("id").asInstanceOf[String],
- map.get("startcharindex").asInstanceOf[Int],
- map.get("endcharindex").asInstanceOf[Int]
- )
- ).distinct
+ }).flatMap(_.asScala).map(m ⇒ PartKey(m)).distinct
/**
*
@@ -549,7 +557,9 @@ class NCNlpSentence(
val text: String,
val enabledBuiltInToks: Set[String],
override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
- val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty
+ private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
+ private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
+ private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
@transient
private var hash: java.lang.Integer = _
@@ -578,7 +588,8 @@ class NCNlpSentence(
text,
enabledBuiltInToks,
tokens.map(_.clone()),
- deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone()))
+ deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
+ initNlpNotes = initNlpNotes
)
/**
@@ -621,7 +632,6 @@ class NCNlpSentence(
if (!mdl.getAbstractTokens.isEmpty) {
val notes = ns.flatten
-
val keys = getPartKeys(notes :_*)
val noteLinks = getLinks(notes)
@@ -630,7 +640,7 @@ class NCNlpSentence(
mdl.getAbstractTokens.contains(n.noteType) &&
!keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes))
+ !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
}).foreach(ns.removeNote)
}
@@ -685,7 +695,7 @@ class NCNlpSentence(
var delCombs: Seq[NCNlpSentenceNote] =
getNotNlpNotes(this).
- flatMap(note ⇒ getNotNlpNotes(this.slice(note.tokenFrom, note.tokenTo + 1)).filter(_ != note)).
+ flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ this(i))).filter(_ != note)).
distinct
// Optimization. Deletes all wholly swallowed notes.
@@ -693,38 +703,18 @@ class NCNlpSentence(
val swallowed =
delCombs.
- filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes))).
+ // The note is not referenced by any links.
+ filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
+ // The note doesn't have parts.
filter(getPartKeys(_).isEmpty).
- flatMap(n ⇒ {
- val wIdxs = n.wordIndexes.toSet
-
- val owners =
- delCombs.
- filter(_ != n).
- flatMap(n1 ⇒
- if (getPartKeys(n1).contains(
- PartKey(
- n.noteType,
- this(n.tokenFrom).startCharIndex,
- this(n.tokenTo).endCharIndex)
- )
- )
- Some(n1)
- else
- None
- )
+ flatMap(note ⇒ {
+ val noteWordsIdxs = note.wordIndexes.toSet
+ val key = PartKey(note, this)
+ val delCombOthers =
+ delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
- if (owners.exists(
- o ⇒ {
- val oWIdxs = o.wordIndexes.toSet
-
- wIdxs == oWIdxs || wIdxs.subsetOf(oWIdxs)
- })
- )
- Some(n)
- else
- None
+ if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
})
delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
@@ -903,4 +893,42 @@ class NCNlpSentence(
case _ ⇒ false
}
+
+ /**
+ * Saves a snapshot of the sentence's NLP notes keyed by token character span.
+ */
+ def saveNlpNotes(): Unit =
+ initNlpNotes = this.map(t ⇒ NoteKey(t.startCharIndex, t.endCharIndex) → t.getNlpNote).toMap
+
+ /**
+ * Looks up the originally saved NLP note for the given character span.
+ * @return Initial NLP note covering the span, if a snapshot entry exists.
+ */
+ def findInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
+ initNlpNotes.get(NoteKey(startCharIndex, endCharIndex))
+
+ /**
+ * Registers an NLP token so it can later be found by note type and character span.
+ * @param nlp NLP token to register (expected to carry at most two notes).
+ */
+ def addNlpToken(nlp: NCNlpSentenceToken): Unit = {
+ require(nlp.size <= 2)
+
+ nlp.foreach(n ⇒ nlpTokens += TokenKey(n.noteType, nlp.startCharIndex, nlp.endCharIndex) → nlp)
+ }
+
+ /**
+ * Finds a previously registered NLP token.
+ * @param noteType Note type of the token.
+ * @param startCharIndex Start character index of the token.
+ * @param endCharIndex End character index of the token.
+ * @return Matching token, if any.
+ */
+ def findNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
+ nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex))
+
+ /**
+ * Gets an immutable copy of the deleted notes with their tokens.
+ */
+ def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
}
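
The saveNlpNotes, findInitialNlpNote, addNlpToken and findNlpToken members added above let a sentence snapshot its original NLP notes by character span and register tokens for later lookup. A stripped-down sketch of the same bookkeeping idea, with simplified stand-in types (NlpNoteSnapshotSketch, Note and Sentence are hypothetical, not the committed API):

    import scala.collection.mutable

    object NlpNoteSnapshotSketch extends App {
        // Simplified stand-ins for NCNlpSentenceNote and its character span key.
        final case class Note(noteType: String, text: String)
        final case class NoteKey(start: Int, end: Int)

        final class Sentence(tokens: Seq[(Int, Int, Note)]) {
            private var initNotes: Map[NoteKey, Note] = Map.empty
            private val index = mutable.HashMap.empty[(String, Int, Int), Note]

            // Snapshots the original NLP notes before enrichers start rewriting the sentence.
            def saveNlpNotes(): Unit =
                initNotes = tokens.map { case (s, e, n) ⇒ NoteKey(s, e) → n }.toMap

            // Restores the original note covering a character span, if one was saved.
            def findInitialNlpNote(start: Int, end: Int): Option[Note] =
                initNotes.get(NoteKey(start, end))

            // Registers and looks up notes produced later, keyed by type and span.
            def addNlpToken(noteType: String, start: Int, end: Int, n: Note): Unit = {
                index.update((noteType, start, end), n)
            }

            def findNlpToken(noteType: String, start: Int, end: Int): Option[Note] =
                index.get((noteType, start, end))
        }

        val sen = new Sentence(Seq((0, 4, Note("nlpcraft:nlp", "test"))))

        sen.saveNlpNotes()

        println(sen.findInitialNlpNote(0, 4)) // Some(Note(nlpcraft:nlp,test))
        println(sen.findNlpToken("nlpcraft:nlp", 0, 4)) // None - nothing registered yet.
    }
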
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index 16138c6..aefdbd1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -18,10 +18,10 @@
package org.apache.nlpcraft.probe.mgrs
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
-import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence ⇒ NlpSentence, NCNlpSentenceNote ⇒ NlpNote, NCNlpSentenceToken ⇒ NlpToken}
import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
-import org.apache.nlpcraft.model.NCVariant
-import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCVariantImpl}
+import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
+import org.apache.nlpcraft.model.{NCToken, NCVariant}
import java.io.{Serializable ⇒ JSerializable}
import java.util
@@ -33,39 +33,130 @@ import scala.collection.{Seq, mutable}
* Sentence to variants converter.
*/
object NCProbeVariants {
- private final val IDXS_SER: JSerializable = singletonList(-1).asInstanceOf[JSerializable]
- private final val IDXS2_SER: JSerializable = singletonList(singletonList(-1)).asInstanceOf[JSerializable]
- private final val IDXS_OBJ: Object = IDXS_SER.asInstanceOf[Object]
- private final val IDXS2_OBJ: Object = IDXS2_SER.asInstanceOf[Object]
- private final val IDX_OBJ = (-1).asInstanceOf[Object]
+ private final val IDX: java.lang.Integer = -1
+ private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
+ private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
- private def mkNlpNote(srcToks: Seq[NCNlpSentenceToken]): NCNlpSentenceNote = {
+ case class Key(id: String, from: Int, to: Int)
+
+ object Key {
+ def apply(m: util.HashMap[String, JSerializable]): Key = {
+ def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+ Key(get("id"), get("startcharindex"), get("endcharindex"))
+ }
+
+ def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
+ }
+
+ /**
+ * Resets the token's index-related metadata to artificial (-1) values for use as a part token.
+ * @param t Token to convert.
+ * @return The same token with converted metadata.
+ */
+ private def convertTokenMetaIndexes(t: NCTokenImpl) : NCTokenImpl = {
+ val meta = mutable.HashMap.empty[String, Any] ++
+ Map(
+ "nlpcraft:nlp:index" → IDX,
+ s"${t.getId}:tokenindexes" → IDXS,
+ s"${t.getId}:wordindexes" → IDXS
+ )
+
+ t.getId match {
+ case "nlpcraft:relation" | "nlpcraft:limit" ⇒ meta += "nlpcraft:relation:indexes" → IDXS
+ case "nlpcraft:sort" ⇒ meta += "nlpcraft:sort:subjindexes" → IDXS2; meta += "nlpcraft:sort:byindexes" → IDXS2
+ case _ ⇒ // No-op.
+ }
+
+ t.getMetadata.putAll(meta.map(p ⇒ p._1 → p._2.asInstanceOf[Object]).asJava)
+
+ t
+ }
+
+ /**
+ * Tries to rebuild a part token from the deleted notes covering the given key's character span.
+ * @param key Part key to look for.
+ * @param delNotes Deleted notes with their tokens.
+ * @param noteTypePred Predicate on the deleted note's type.
+ * @return Artificial token built from the matching deleted note, if any.
+ */
+ private def findDeletedToken(
+ key: Key,
+ delNotes: Map[NlpNote, Seq[NlpToken]],
+ noteTypePred: String ⇒ Boolean
+ ): Option[NlpToken] =
+ delNotes.toStream.
+ flatMap { case (delNote, delNoteToks) ⇒
+ if (noteTypePred(delNote.noteType)) {
+ val toks =
+ delNoteToks.
+ dropWhile(_.startCharIndex != key.from).
+ reverse.
+ dropWhile(_.endCharIndex != key.to).
+ reverse
+
+ toks.size match {
+ case 0 ⇒ None
+ case _ ⇒
+ val artTok = NlpToken(IDX)
+
+ artTok.add(mkNote(toks))
+
+ if (key.id != "nlpcraft:nlp") {
+ val ps = mkNlpNoteParams()
+
+ delNote.noteType match {
+ case "nlpcraft:relation" | "nlpcraft:limit" ⇒ ps += "indexes" → IDXS
+ case "nlpcraft:sort" ⇒ ps += "subjindexes" → IDXS2; ps += "byindexes" → IDXS2
+ case _ ⇒ // No-op.
+ }
+
+ artTok.add(delNote.clone(ps :_*))
+ }
+
+ Some(artTok)
+ }
+ }
+ else
+ None
+ }.headOption
+
+ /**
+ * Makes default NLP note parameters with artificial (-1) token indexes.
+ * @return Mutable buffer of note parameters.
+ */
+ private def mkNlpNoteParams(): mutable.ArrayBuffer[(String, JSerializable)] =
+ mutable.ArrayBuffer.empty[(String, JSerializable)] ++ Seq("tokMinIndex" → IDX, "tokMaxIndex" → IDX)
+
+ /**
+ * Makes a synthetic 'nlpcraft:nlp' note spanning the given source tokens.
+ * @param srcToks Source tokens (stop-words included).
+ * @return New NLP note.
+ */
+ private def mkNote(srcToks: Seq[NlpToken]): NlpNote = {
// Note, it adds stop-words too.
- def mkValue(get: NCNlpSentenceToken ⇒ String): String = {
+ def mkValue(get: NlpToken ⇒ String): String = {
val buf = mutable.Buffer.empty[String]
val n = srcToks.size - 1
- srcToks.zipWithIndex.foreach(p ⇒ {
- val t = p._1
- val idx = p._2
-
+ srcToks.zipWithIndex.foreach { case (t, idx) ⇒
buf += get(t)
if (idx < n && t.endCharIndex != srcToks(idx + 1).startCharIndex)
buf += " "
- })
+ }
buf.mkString
}
- def all(is: NCNlpSentenceToken ⇒ Boolean): Boolean = srcToks.forall(is)
- def exists(is: NCNlpSentenceToken ⇒ Boolean): Boolean = srcToks.exists(is)
+ def all(is: NlpToken ⇒ Boolean): Boolean = srcToks.forall(is)
+ def exists(is: NlpToken ⇒ Boolean): Boolean = srcToks.exists(is)
- val origText = mkValue((t: NCNlpSentenceToken) ⇒ t.origText)
+ val origText = mkValue((t: NlpToken) ⇒ t.origText)
val params = Seq(
- "index" → -1,
+ "index" → IDX,
"pos" → NCPennTreebank.SYNTH_POS,
"posDesc" → NCPennTreebank.SYNTH_POS_DESC,
"lemma" → mkValue(_.lemma),
@@ -84,7 +175,7 @@ object NCProbeVariants {
"swear" → exists(_.isSwearWord)
)
- NCNlpSentenceNote(Seq(-1), srcToks.flatMap(_.wordIndexes).distinct.sorted, "nlpcraft:nlp", params: _*)
+ NlpNote(Seq(IDX.intValue()), srcToks.flatMap(_.wordIndexes).distinct.sorted, "nlpcraft:nlp", params: _*)
}
/**
@@ -92,154 +183,99 @@ object NCProbeVariants {
*
* @param mdl Probe model.
* @param srvReqId Server request ID.
- * @param sens Sentences.
+ * @param nlpSens Sentences.
* @param lastPhase Flag.
*/
- def convert(srvReqId: String, mdl: NCProbeModel, sens: Seq[NCNlpSentence], lastPhase: Boolean = false): Seq[NCVariant] = {
- val seq = sens.map(_.toSeq.map(nlpTok ⇒ NCTokenImpl(mdl, srvReqId, nlpTok) → nlpTok))
- val toks = seq.map(_.map { case (tok, _) ⇒ tok })
-
- case class Key(id: String, from: Int, to: Int)
+ def convert(srvReqId: String, mdl: NCProbeModel, nlpSens: Seq[NlpSentence], lastPhase: Boolean = false): Seq[NCVariant] = {
+ var vars =
+ nlpSens.flatMap(nlpSen ⇒ {
+ var ok = true
- val keys2Toks = toks.flatten.map(t ⇒ Key(t.getId, t.getStartCharIndex, t.getEndCharIndex) → t).toMap
- val partsKeys = mutable.HashSet.empty[Key]
+ def mkToken(nlpTok: NlpToken): NCTokenImpl = {
+ val ncTok = NCTokenImpl(mdl, srvReqId, nlpTok)
- val nlpTok2nlpSen: Map[NCNlpSentenceToken, Seq[NCNlpSentence]] =
- sens.
- flatMap(sen ⇒ sen.map(_ → sen)).
- groupBy { case (tok, _) ⇒ tok }.
- map { case (tok, seq) ⇒ tok → seq.map { case (_, sen) ⇒ sen } }
+ nlpSen.addNlpToken(nlpTok)
- seq.flatten.foreach { case (tok, tokNlp) ⇒
- if (tokNlp.isUser) {
- val userNotes = tokNlp.filter(_.isUser)
-
- require(userNotes.size == 1)
+ ncTok
+ }
- val optList: Option[util.List[util.HashMap[String, JSerializable]]] = userNotes.head.dataOpt("parts")
+ val toks = nlpSen.map(mkToken)
+ val keys2Toks = toks.map(t ⇒ Key(t) → t).toMap
- optList match {
- case Some(list) ⇒
- val keys =
- list.asScala.map(m ⇒
- Key(
- m.get("id").asInstanceOf[String],
- m.get("startcharindex").asInstanceOf[Integer],
- m.get("endcharindex").asInstanceOf[Integer]
- )
- )
+ def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
+ val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+ tokNlp.find(_.isUser) match {
+ case Some(u) ⇒ u.dataOpt("parts")
+ case None ⇒ None
+ }
- val parts = keys.map(key ⇒ {
- keys2Toks.get(key) match {
- // Notes for sentence.
- case Some(t) ⇒
- val meta = mutable.HashMap.empty[String, Object]
-
- meta += "nlpcraft:nlp:index" → IDX_OBJ
-
- meta += s"${t.getId}:tokenindexes" → IDXS_OBJ
- meta += s"${t.getId}:wordindexes" → IDXS_OBJ
-
- t.getId match {
- case "nlpcraft:relation" ⇒
- meta += "nlpcraft:relation:indexes" → IDXS_OBJ
- case "nlpcraft:limit" ⇒
- meta += "nlpcraft:limit:indexes" → IDXS_OBJ
- case "nlpcraft:sort" ⇒
- meta += "nlpcraft:sort:subjindexes" → IDXS2_OBJ
- meta += "nlpcraft:sort:byindexes" → IDXS2_OBJ
- case _ ⇒ // No-op.
- }
-
- t.getMetadata.putAll(meta.asJava)
-
- t
- case None ⇒
- // Tries to find between deleted notes.
- val delNotes = nlpTok2nlpSen(tokNlp).flatMap(_.deletedNotes).distinct
-
- def find(noteTypePred: String ⇒ Boolean): Option[NCNlpSentenceToken] =
- delNotes.toStream.
- flatMap { case (delNote, delNoteToks) ⇒
- if (noteTypePred(delNote.noteType)) {
- val toks =
- delNoteToks.
- dropWhile(_.startCharIndex != key.from).
- reverse.
- dropWhile(_.endCharIndex != key.to).
- reverse
-
- toks.size match {
- case 0 ⇒ None
- case _ ⇒
- val artTok = NCNlpSentenceToken(-1)
-
- artTok.add(mkNlpNote(toks))
-
- if (key.id != "nlpcraft:nlp") {
- val ps =
- mutable.ArrayBuffer.empty[(String, JSerializable)]
-
- ps += "tokenIndexes" → IDXS_SER
- ps += "wordIndexes" → IDXS_SER
-
- delNote.noteType match {
- case "nlpcraft:relation" ⇒
- ps += "indexes" → IDXS_SER
- case "nlpcraft:limit" ⇒
- ps += "indexes" → IDXS_SER
- case "nlpcraft:sort" ⇒
- ps += "subjindexes" → IDXS2_SER
- ps += "byindexes" → IDXS2_SER
- case _ ⇒ // No-op.
- }
-
- artTok.add(delNote.clone(ps :_*))
- }
-
- Some(artTok)
- }
+ optList match {
+ case Some(list) ⇒
+ val keys = list.asScala.map(Key(_))
+
+ val parts = keys.map(key ⇒
+ keys2Toks.get(key) match {
+ // Notes for sentence.
+ case Some(t) ⇒ convertTokenMetaIndexes(t)
+ case None ⇒
+ val delNotes = nlpSen.getDeletedNotes
+
+ // Tries to find a deleted token with the same key.
+ var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+
+ // If an NLP note couldn't be found, try any note at the same position.
+ if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
+ nlpTokOpt = findDeletedToken(key, delNotes, _ ⇒ true)
+
+ nlpTokOpt match {
+ case Some(nlpTok) ⇒ mkToken(nlpTok)
+ case None ⇒
+ nlpSen.findInitialNlpNote(key.from, key.to) match {
+ case Some(nlpNote) ⇒
+ val artTok = NlpToken(IDX)
+
+ artTok.add(nlpNote.clone(mkNlpNoteParams(): _*))
+
+ mkToken(artTok)
+ case None ⇒
+ throw new NCE(
+ s"Part not found for: $key, " +
+ s"token: $tok, " +
+ s"lastPhase=$lastPhase"
+ )
}
- else
- None
- }.headOption
-
- // Tries to find with same key.
- var nlpTokOpt = find(_ == key.id)
-
- // If couldn't find nlp note, we can try to find any note on the same position.
- if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
- nlpTokOpt = find(_ ⇒ true)
-
- val nlpTok = nlpTokOpt.getOrElse(throw new NCE(s"Part not found for: $key"))
+ }
+ }
+ )
- NCTokenImpl(mdl, srvReqId, nlpTok)
+ parts.zip(list.asScala).foreach { case (part, map) ⇒
+ map.get(TOK_META_ALIASES_KEY) match {
+ case null ⇒ // No-op.
+ case aliases ⇒ part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+ }
}
- })
- parts.zip(list.asScala).foreach { case (part, map) ⇒
- map.get(TOK_META_ALIASES_KEY) match {
- case null ⇒ // No-op.
- case aliases ⇒ part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
- }
- }
+ tok.setParts(parts)
- tok.setParts(parts)
+ require(parts.nonEmpty)
- partsKeys ++= keys
+ for (tok ← parts)
+ process(tok,
+ nlpSen.
+ findNlpToken(tok.getId, tok.getStartCharIndex, tok.getEndCharIndex).
+ getOrElse(throw new NCE(s"Token not found for $tok"))
+ )
- case None ⇒ // No-op.
+ ok = ok && !toks.exists(t ⇒ t.getId != "nlpcraft:nlp" && keys.contains(Key(t)))
+ case None ⇒ // No-op.
+ }
}
- }
- }
- // We can't collapse parts earlier, because we need them here (setParts method, few lines above.)
- var vars = toks.filter(sen ⇒
- !sen.exists(t ⇒
- t.getId != "nlpcraft:nlp" &&
- partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex))
- )
- ).map(p ⇒ new NCVariantImpl(p.asJava))
+ for ((tok, tokNlp) ← toks.zip(nlpSen) if tokNlp.isUser)
+ process(tok, tokNlp)
+
+ if (ok) Some(new NCVariantImpl(toks.asJava)) else None
+ })
if (lastPhase && vars.size > 1) {
// Drops empty.
@@ -252,19 +288,19 @@ object NCProbeVariants {
for (
vrnt ← sortedVars.tail
- // Skips if the candidate has same structure that exists between already saved and
- // there is only one difference - some candidate's tokens are nlp tokens.
- if !bestVars.exists(savedVrnt ⇒
- savedVrnt.size == vrnt.size &&
- savedVrnt.asScala.zip(vrnt.asScala).forall { case (savedTok, tok) ⇒
- savedTok.getStartCharIndex == tok.getStartCharIndex &&
- savedTok.getEndCharIndex == tok.getEndCharIndex &&
- (
- savedTok.getId == tok.getId && savedTok.getMetadata == tok.getMetadata ||
- tok.getId == "nlpcraft:nlp"
- )
- }
- )
+ // Skips the candidate if an already saved variant has the same structure and
+ // the only difference is that some of the candidate's tokens are plain NLP tokens.
+ if !bestVars.exists(savedVrnt ⇒
+ savedVrnt.size == vrnt.size &&
+ savedVrnt.asScala.zip(vrnt.asScala).forall { case (savedTok, tok) ⇒
+ savedTok.getStartCharIndex == tok.getStartCharIndex &&
+ savedTok.getEndCharIndex == tok.getEndCharIndex &&
+ (
+ savedTok.getId == tok.getId && savedTok.getMetadata == tok.getMetadata ||
+ tok.getId == "nlpcraft:nlp"
+ )
+ }
+ )
)
bestVars += vrnt
@@ -273,6 +309,15 @@ object NCProbeVariants {
vars = bestVars.sortBy(sortedVars.indexOf)
}
+ for (v ← vars; t ← v.asScala)
+ require(
+ t.getIndex >= 0,
+ s"Invalid token: $t with index: ${t.getIndex}, " +
+ s"lastPhase: $lastPhase, " +
+ s"sentence:\n${NCTokenLogger.prepareTable(v.asScala)}" +
+ s""
+ )
+
vars
}
-}
+}
\ No newline at end of file
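
In NCProbeVariants.scala the inline deleted-note search is extracted into findDeletedToken, whose core step trims a deleted note's tokens to the part's character span: drop tokens until one starts at the span's first character, then drop from the end until one ends at its last character. A small self-contained sketch of just that trimming step (Tok and trimToSpan are simplified stand-ins; the real method also rebuilds an artificial token from the match):

    object SpanTrimSketch extends App {
        // Simplified stand-in for NCNlpSentenceToken: just the character span and text.
        final case class Tok(startCharIndex: Int, endCharIndex: Int, text: String)

        // Keeps the tokens lying exactly between the requested start and end character indexes.
        def trimToSpan(toks: Seq[Tok], from: Int, to: Int): Seq[Tok] =
            toks.
                dropWhile(_.startCharIndex != from).
                reverse.
                dropWhile(_.endCharIndex != to).
                reverse

        val toks = Seq(Tok(0, 3, "top"), Tok(4, 5, "5"), Tok(6, 11, "orders"))

        println(trimToSpan(toks, 4, 11)) // List(Tok(4,5,5), Tok(6,11,orders))
        println(trimToSpan(toks, 4, 99)) // List() - no token ends at 99, so nothing matches.
    }
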
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 1b26c43..368f0c4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -409,6 +409,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
NCSuspiciousNounsEnricher.enrich(mdl, nlpSen, senMeta, span)
NCStopWordEnricher.enrich(mdl, nlpSen, senMeta, span)
+ nlpSen.saveNlpNotes()
+
case class Holder(enricher: NCProbeEnricher, getNotes: () ⇒ Seq[NCNlpSentenceNote])
def get(name: String, e: NCProbeEnricher): Option[Holder] =
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 349cc61..8912bc8 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -34,20 +34,18 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
val variants = ctx.getVariants.asScala
def checkLimit(limitPart: NCToken): Unit = {
- require(limitPart.getIndex == -1, s"Unexpected limit token: $limitPart, meta: ${limitPart.getMetadata}")
+ require(limitPart.getIndex == -1, s"Unexpected limit token index: ${limitPart.getIndex}, token: $limitPart, meta: ${limitPart.getMetadata}")
checkId(limitPart, "nlpcraft:limit")
val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
- require(limNote == "wrapAnyWord", s"Unexpected limit token: $limitPart, meta: ${limitPart.getMetadata}")
+ require(limNote == "anyWord", s"Unexpected limit token note: '$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
- val limIdxs =
- limitPart.getMetadata.get("nlpcraft:limit:indexes").
- asInstanceOf[util.List[Integer]].asScala
+ val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala
require(
limIdxs.size == 1 && limIdxs.head == -1,
- s"Unexpected limit token: $limitPart, meta: ${limitPart.getMetadata}"
+ s"Unexpected limit token ref indexes: [${limIdxs.mkString(",")}], token: $limitPart, meta: ${limitPart.getMetadata}"
)
}
@@ -141,14 +139,14 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(wrap.size == 2)
- val part = wrap.last
+ val wrapLimit = wrap.last
- require(part.getIndex == -1, s"Unexpected limit token: $part, meta: ${part.getMetadata}")
- checkId(part,"wrapLimit")
+ require(wrapLimit.getIndex == -1, s"Unexpected limit token: $wrapLimit, meta: ${wrapLimit.getMetadata}")
+ checkId(wrapLimit,"wrapLimit")
- require(part.getPartTokens.size == 3)
+ require(wrapLimit.getPartTokens.size == 3, s"Parts count: ${wrapLimit.getPartTokens.size()}")
- checkLimit(part.getPartTokens.asScala.last)
+ checkLimit(wrapLimit.getPartTokens.asScala.last)
case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
similarity index 54%
copy from nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
copy to nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
index e1fedca..af8e2b2 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
@@ -22,40 +22,48 @@ import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
import org.junit.jupiter.api.Test
import java.util
-import scala.collection.JavaConverters._
/**
* Nested Elements test model.
*/
-class NCNestedTestModel3 extends NCModelAdapter(
- "nlpcraft.nested3.test.mdl", "Nested Data Test Model", "1.0"
-) {
+class NCNestedTestModel21 extends NCModelAdapter("nlpcraft.nested2.test.mdl", "Nested Test Model", "1.0") {
override def getElements: util.Set[NCElement] =
- Set(
- NCTestElement("e1", "//[a-zA-Z0-9]+//"),
- NCTestElement("e2", "^^(id == 'e1')^^"),
- )
+ Set(NCTestElement("e1", "{^^(id == 'nlpcraft:num')^^|_} word"))
- override def getAbstractTokens: util.Set[String] = Set("e1").asJava
- override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
-
- @NCIntent("intent=onE2 term(t1)={id == 'e2'}[12, 100]")
+ @NCIntent("intent=onE1 term(t1)={id == 'e1'}")
def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+
+ @NCIntent("intent=onNumAndE1 term(t1)={id == 'nlpcraft:num'} term(t1)={id == 'e1'}")
+ def onNumAndE1(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+
+ override def isPermutateSynonyms: Boolean = false
+ override def getJiggleFactor: Int = 0
}
/**
* Nested elements model enricher test.
*/
-@NCTestEnvironment(model = classOf[NCNestedTestModel3], startClient = true)
-class NCEnricherNestedModelSpec3 extends NCTestContext {
+@NCTestEnvironment(model = classOf[NCNestedTestModel21], startClient = true)
+class NCEnricherNestedModelSpec21 extends NCTestContext {
@Test
def test(): Unit = {
- println("Started")
+ checkIntent("word", "onE1")
+ checkIntent("10 word", "onE1")
+ checkIntent("11 12 word", "onNumAndE1")
+ }
+}
- val t = System.currentTimeMillis()
+/**
+ * Nested Elements test model.
+ */
+class NCNestedTestModel22 extends NCNestedTestModel21 {
+ override def isPermutateSynonyms: Boolean = true
+ override def getJiggleFactor: Int = 4
+}
- checkIntent("a " * 12, "onE2")
+/**
+ * Nested elements model enricher test.
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel22], startClient = true)
+class NCEnricherNestedModelSpec22 extends NCEnricherNestedModelSpec21
- println(s"Passed: ${System.currentTimeMillis() - t}")
- }
-}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
index e1fedca..8fc474b 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
@@ -44,18 +44,10 @@ class NCNestedTestModel3 extends NCModelAdapter(
}
/**
- * Nested elements model enricher test.
+ * Nested elements model performance test - it shouldn't be too slow.
*/
@NCTestEnvironment(model = classOf[NCNestedTestModel3], startClient = true)
class NCEnricherNestedModelSpec3 extends NCTestContext {
@Test
- def test(): Unit = {
- println("Started")
-
- val t = System.currentTimeMillis()
-
- checkIntent("a " * 12, "onE2")
-
- println(s"Passed: ${System.currentTimeMillis() - t}")
- }
+ def test(): Unit = checkIntent("a " * 18, "onE2")
}