Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/03/09 18:40:33 UTC
[incubator-nlpcraft] branch master updated: Sentences collapsing performance improvements.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 1b19154 Sentences collapsing performance improvements.
1b19154 is described below
commit 1b19154436d30602ef6a0174cb1f2148d9714bfc
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Tue Mar 9 21:40:17 2021 +0300
Sentences collapsing performance improvements.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 773 +--------------------
.../org/apache/nlpcraft/probe/NCProbeBoot.scala | 2 +
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 4 +-
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 3 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 9 +-
.../probe/mgrs/sentence/NCSentenceHelper.java | 199 ++++++
.../mgrs/sentence/NCSentenceManager.scala} | 351 +++-------
.../model/NCEnricherNestedModelSpec4.scala | 53 ++
8 files changed, 397 insertions(+), 997 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 113e088..91ca5a9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -17,527 +17,16 @@
package org.apache.nlpcraft.common.nlp
-import com.typesafe.scalalogging.LazyLogging
-import org.apache.nlpcraft.common.NCE
-import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
-import org.apache.nlpcraft.model.NCModel
-
-import java.util
-import java.util.{List ⇒ JList}
import java.io.{Serializable ⇒ JSerializable}
-import java.util.Collections
+import java.util.{Collections, List ⇒ JList}
import scala.collection.JavaConverters._
-import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, Set, mutable}
import scala.language.implicitConversions
-object NCNlpSentence extends LazyLogging {
- implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
-
+object NCNlpSentence {
case class NoteKey(start: Int, end: Int)
case class TokenKey(id: String, start: Int, end: Int)
case class NoteLink(note: String, indexes: Seq[Int])
-
- case class PartKey(id: String, start: Int, end: Int) {
- require(start <= end)
-
- private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
- }
- object PartKey {
- def apply(m: util.HashMap[String, JSerializable]): PartKey = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- PartKey(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
- PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
- }
-
- private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
- val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
-
- for (n ← notes.filter(n ⇒ n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
- noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala.sorted)
-
- for (n ← notes.filter(_.noteType == "nlpcraft:sort")) {
- def add(noteName: String, idxsName: String): Unit = {
- val names = n(noteName).asInstanceOf[JList[String]]
- val idxsSeq = n(idxsName).asInstanceOf[JList[JList[Int]]]
-
- require(names.size() == idxsSeq.size())
-
- noteLinks ++=
- (for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
- yield NoteLink(name, idxs.sorted)
- )
- }
-
- if (n.contains("subjnotes")) add("subjnotes", "subjindexes")
- if (n.contains("bynotes")) add("bynotes", "byindexes")
- }
-
- noteLinks
- }
-
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
- notes.
- filter(_.isUser).
- flatMap(n ⇒ {
- val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
-
- optList
- }).flatMap(_.asScala).map(m ⇒ PartKey(m)).distinct
-
- /**
- *
- * @param ns
- * @param idxs
- * @param notesType
- * @param note
- * @return
- */
- private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = {
- val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct
-
- /**
- * Example:
- * 1. Sentence 'maximum x' (single element related function)
- * - maximum is aggregate function linked to date element.
- * - x defined as 2 elements: date and num.
- * So, the variant 'maximum x (as num)' should be excluded.
- * *
- * 2. Sentence 'compare x and y' (multiple elements related function)
- * - compare is relation function linked to date element.
- * - x an y defined as 2 elements: date and num.
- * So, variants 'x (as num) and x (as date)' and 'x (as date) and x (as num)'
- * should not be excluded, but invalid relation should be deleted for these combinations.
- */
- types.size match {
- case 0 ⇒ false
- case 1 ⇒ types.head == notesType
- case _ ⇒
- // Equal elements should be processed together with function element.
- if (types.size == 1)
- false
- else {
- ns.removeNote(note)
-
- logger.trace(s"Removed note: $note")
-
- true
- }
- }
- }
-
- /**
- * Fixes notes with references to other notes indexes.
- * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references.
- *
- * @param noteType Note type.
- * @param idxsField Indexes field.
- * @param noteField Note field.
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @return Valid flag.
- */
- private def fixIndexesReferences(
- noteType: String,
- idxsField: String,
- noteField: String,
- ns: NCNlpSentence,
- history: Seq[(Int, Int)]
- ): Boolean = {
- ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
- tok.getNoteOpt(noteType, idxsField) match {
- case Some(n) ⇒
- val idxs: Seq[Int] = n.data[JList[Int]](idxsField).asScala
- var fixed = idxs
-
- history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) }
-
- fixed = fixed.distinct
-
- if (idxs != fixed)
- ns.fixNote(n, "indexes" → fixed.asJava.asInstanceOf[JSerializable])
- case None ⇒ // No-op.
- }
- )
-
- ns.flatMap(_.getNotes(noteType)).forall(
- n ⇒ checkRelation(ns, n.data[JList[Int]]("indexes").asScala, n.data[String](noteField), n)
- )
- }
-
- /**
- *
- * @param note
- * @param idxsField
- * @param noteField
- * @param ns
- */
- private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
- ns.flatMap(_.getNotes(note)).foreach(
- n ⇒ checkRelation(ns, n.data[JList[Int]](idxsField).asScala, n.data[String](noteField), n)
- )
-
- /**
- *
- * @param note
- * @param idxsField
- * @param noteField
- * @param ns
- */
- private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = {
- ns.flatMap(_.getNotes(note)).foreach(rel ⇒
- rel.dataOpt[JList[JList[Int]]](idxsField) match {
- case Some(idxsList) ⇒
- val notesTypes = rel.data[JList[String]](noteField)
-
- require(idxsList.size() == notesTypes.size())
-
- idxsList.asScala.zip(notesTypes.asScala).foreach {
- case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel)
- }
- case None ⇒ // No-op.
- }
- )
- }
-
- /**
- * Copies token.
- *
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @param toksCopy Copied tokens.
- * @param i Index.
- */
- private def simpleCopy(
- ns: NCNlpSentence,
- history: mutable.ArrayBuffer[(Int, Int)],
- toksCopy: NCNlpSentence, i: Int
- ): Seq[NCNlpSentenceToken] = {
- val tokCopy = toksCopy(i)
-
- history += tokCopy.index → ns.size
-
- ns += tokCopy.clone(ns.size)
- }
-
- /**
- * Glues stop words.
- *
- * @param ns Sentence.
- * @param userNoteTypes Notes types.
- * @param history Indexes transformation history.
- */
- private def unionStops(
- ns: NCNlpSentence,
- userNoteTypes: Seq[String],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): Unit = {
- // Java collection used because using scala collections (mutable.Buffer.empty[mutable.Buffer[Token]]) is reason
- // Of compilation errors which seems as scala compiler internal error.
- val bufs = new util.ArrayList[mutable.Buffer[NCNlpSentenceToken]]()
-
- def last[T](l: JList[T]): T = l.get(l.size() - 1)
-
- ns.filter(t ⇒ t.isStopWord && !t.isBracketed).foreach(t ⇒
- if (!bufs.isEmpty && last(bufs).last.index + 1 == t.index)
- last(bufs) += t
- else
- bufs.add(mutable.Buffer.empty[NCNlpSentenceToken] :+ t)
- )
-
- val idxsSeq = bufs.asScala.filter(_.lengthCompare(1) > 0).map(_.map(_.index))
-
- if (idxsSeq.nonEmpty) {
- val nsCopyToks = ns.clone()
- ns.clear()
-
- val buf = mutable.Buffer.empty[Int]
-
- for (i ← nsCopyToks.indices)
- idxsSeq.find(_.contains(i)) match {
- case Some(idxs) ⇒
- if (!buf.contains(idxs.head)) {
- buf += idxs.head
-
- ns += mkCompound(ns, nsCopyToks, idxs, stop = true, ns.size, None, history)
- }
- case None ⇒ simpleCopy(ns, history, nsCopyToks, i)
- }
-
- fixIndexes(ns, userNoteTypes)
- }
- }
-
- /**
- * Fixes indexes for all notes after recreating tokens.
- *
- * @param ns Sentence.
- * @param userNoteTypes Notes types.
- */
- private def fixIndexes(ns: NCNlpSentence, userNoteTypes: Seq[String]) {
- // Replaces other notes indexes.
- for (t ← userNoteTypes :+ "nlpcraft:nlp"; note ← ns.getNotes(t)) {
- val toks = ns.filter(_.contains(note)).sortBy(_.index)
-
- val newNote = note.clone(toks.map(_.index), toks.flatMap(_.wordIndexes).sorted)
-
- toks.foreach(t ⇒ {
- t.remove(note)
- t.add(newNote)
- })
- }
-
- // Special case - field index of core NLP note.
- ns.zipWithIndex.foreach { case (tok, idx) ⇒ ns.fixNote(tok.getNlpNote, "index" → idx) }
- }
-
- /**
- * Zip notes with same type.
- *
- * @param ns Sentence.
- * @param nType Notes type.
- * @param userNotesTypes Notes types.
- * @param history Indexes transformation history.
- */
- private def zipNotes(
- ns: NCNlpSentence,
- nType: String,
- userNotesTypes: Seq[String],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): Unit = {
- val nts = ns.getNotes(nType).filter(n ⇒ n.tokenFrom != n.tokenTo).sortBy(_.tokenFrom)
-
- val overlapped =
- nts.flatMap(n ⇒ n.tokenFrom to n.tokenTo).map(ns(_)).exists(
- t ⇒ userNotesTypes.map(pt ⇒ t.getNotes(pt).size).sum > 1
- )
-
- if (nts.nonEmpty && !overlapped) {
- val nsCopyToks = ns.clone()
- ns.clear()
-
- val buf = mutable.ArrayBuffer.empty[Int]
-
- for (i ← nsCopyToks.indices)
- nts.find(_.tokenIndexes.contains(i)) match {
- case Some(n) ⇒
- if (!buf.contains(n.tokenFrom)) {
- buf += n.tokenFrom
-
- ns += mkCompound(ns, nsCopyToks, n.tokenIndexes, stop = false, ns.size, Some(n), history)
- }
- case None ⇒ simpleCopy(ns, history, nsCopyToks, i)
- }
-
- fixIndexes(ns, userNotesTypes)
- }
- }
-
- /**
- * Makes compound note.
- *
- * @param ns Sentence.
- * @param nsCopyToks Tokens.
- * @param indexes Indexes.
- * @param stop Flag.
- * @param idx Index.
- * @param commonNote Common note.
- * @param history Indexes transformation history.
- */
- private def mkCompound(
- ns: NCNlpSentence,
- nsCopyToks: Seq[NCNlpSentenceToken],
- indexes: Seq[Int],
- stop: Boolean,
- idx: Int,
- commonNote: Option[NCNlpSentenceNote],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): NCNlpSentenceToken = {
- val t = NCNlpSentenceToken(idx)
-
- // Note, it adds stop-words too.
- val content = nsCopyToks.zipWithIndex.filter(p ⇒ indexes.contains(p._2)).map(_._1)
-
- content.foreach(t ⇒ history += t.index → idx)
-
- def mkValue(get: NCNlpSentenceToken ⇒ String): String = {
- val buf = mutable.Buffer.empty[String]
-
- val n = content.size - 1
-
- content.zipWithIndex.foreach(p ⇒ {
- val t = p._1
- val idx = p._2
-
- buf += get(t)
-
- if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex)
- buf += " "
- })
-
- buf.mkString
- }
-
- val origText = mkValue((t: NCNlpSentenceToken) ⇒ t.origText)
-
- val idxs = Seq(idx)
- val wordIdxs = content.flatMap(_.wordIndexes).sorted
-
- val direct =
- commonNote match {
- case Some(n) if n.isUser ⇒ n.isDirect
- case _ ⇒ content.forall(_.isDirect)
- }
-
- val params = Seq(
- "index" → idx,
- "pos" → NCPennTreebank.SYNTH_POS,
- "posDesc" → NCPennTreebank.SYNTH_POS_DESC,
- "lemma" → mkValue((t: NCNlpSentenceToken) ⇒ t.lemma),
- "origText" → origText,
- "normText" → mkValue((t: NCNlpSentenceToken) ⇒ t.normText),
- "stem" → mkValue((t: NCNlpSentenceToken) ⇒ t.stem),
- "start" → content.head.startCharIndex,
- "end" → content.last.endCharIndex,
- "charLength" → origText.length,
- "quoted" → false,
- "stopWord" → stop,
- "bracketed" → false,
- "direct" → direct,
- "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict") else false),
- "english" → nsCopyToks.forall(_.getNlpNote.data[Boolean]("english")),
- "swear" → nsCopyToks.exists(_.getNlpNote.data[Boolean]("swear"))
- )
-
- val nlpNote = NCNlpSentenceNote(idxs, wordIdxs, "nlpcraft:nlp", params: _*)
-
- t.add(nlpNote)
-
- // Adds processed note with fixed indexes.
- commonNote match {
- case Some(n) ⇒
- ns.removeNote(n)
- t.add(n.clone(idxs, wordIdxs))
- case None ⇒ // No-op.
- }
-
- t
- }
-
- /**
- * Fixes notes with references list to other notes indexes.
- *
- * @param noteType Note type.
- * @param idxsField Indexes field.
- * @param noteField Note field.
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @return Valid flag.
- */
- private def fixIndexesReferencesList(
- noteType: String,
- idxsField: String,
- noteField: String,
- ns: NCNlpSentence,
- history: Seq[(Int, Int)]
- ): Boolean = {
- var ok = true
-
- for (tok ← ns.filter(_.isTypeOf(noteType)) if ok)
- tok.getNoteOpt(noteType, idxsField) match {
- case Some(n) ⇒
- val idxs: Seq[Seq[Int]] =
- n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala)
- var fixed = idxs
-
- history.foreach {
- case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct)
- }
-
- if (fixed.forall(_.size == 1))
- // Fix double dimension array to one dimension,
- // so it should be called always in spite of 'fixIndexesReferences' method.
- ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable])
- else
- ok = false
- case None ⇒ // No-op.
- }
- ok &&
- ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
- rel.dataOpt[JList[Int]](idxsField) match {
- case Some(idxsList) ⇒
- val notesTypes = rel.data[JList[String]](noteField)
-
- require(idxsList.size() == notesTypes.size())
-
- idxsList.asScala.zip(notesTypes.asScala).forall {
- case (idxs, notesType) ⇒ checkRelation(ns, Seq(idxs), notesType, rel)
- }
- case None ⇒ true
- }
- )
- }
-
- /**
- * Fixes tokens positions.
- *
- * @param ns Sentence.
- * @param notNlpTypes Token types.
- */
- private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = {
- ns.
- filter(!_.isNlp).
- filter(_.isStopWord).
- flatten.
- filter(_.isNlp).
- foreach(n ⇒ ns.fixNote(n, "stopWord" → false))
-
- val all = ns.tokens.flatten
- val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap
-
- for (
- t ← ns.tokens; stopReason ← t.stopsReasons
- if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
- )
- ns.fixNote(t.getNlpNote, "stopWord" → true)
-
- val history = mutable.ArrayBuffer.empty[(Int, Int)]
-
- fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns)
- fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns)
- fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns)
- fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns)
-
- notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
- unionStops(ns, notNlpTypes, history)
-
- val res =
- fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
- fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
-
- if (res) {
- // Validation (all indexes calculated well)
- require(
- !res ||
- !ns.flatten.
- exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
- s"Invalid sentence:\n" +
- ns.map(t ⇒
- // Human readable invalid sentence for debugging.
- s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
- ).mkString("\n")
- )
- }
-
- res
- }
}
import org.apache.nlpcraft.common.nlp.NCNlpSentence._
@@ -567,29 +56,16 @@ class NCNlpSentence(
private def calcHash(): Int =
Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
- private def addDeleted(sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
- sen.deletedNotes ++= dels.map(n ⇒ {
- val savedDelNote = n.clone()
- val savedDelToks = n.tokenIndexes.map(idx ⇒ this(idx).clone())
-
- val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
-
- // Deleted note's tokens should contains only nlp data and deleted notes.
- for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
- savedDelTok.remove(mainNote)
-
- savedDelNote → savedDelToks
- })
-
// Deep copy.
override def clone(): NCNlpSentence =
new NCNlpSentence(
- srvReqId,
- text,
- enabledBuiltInToks,
- tokens.map(_.clone()),
- deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
- initNlpNotes = initNlpNotes
+ srvReqId = srvReqId,
+ text = text,
+ enabledBuiltInToks = enabledBuiltInToks,
+ tokens = tokens.map(_.clone()),
+ deletedNotes = deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
+ initNlpNotes = initNlpNotes,
+ nlpTokens = nlpTokens
)
/**
@@ -617,6 +93,21 @@ class NCNlpSentence(
hash
}
+ override def equals(obj: Any): Boolean = obj match {
+ case x: NCNlpSentence ⇒
+ tokens == x.tokens &&
+ srvReqId == x.srvReqId &&
+ text == x.text &&
+ enabledBuiltInToks == x.enabledBuiltInToks
+
+ case _ ⇒ false
+ }
+
+ /**
+ *
+ * @param note
+ * @param kvs
+ */
def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
val fixed = note.clone(kvs: _*)
@@ -628,201 +119,6 @@ class NCNlpSentence(
hash = null
}
- private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit =
- if (!mdl.getAbstractTokens.isEmpty) {
- val notes = ns.flatten
-
- val keys = getPartKeys(notes :_*)
- val noteLinks = getLinks(notes)
-
- notes.filter(n ⇒ {
- val noteToks = ns.tokens.filter(_.contains(n))
-
- mdl.getAbstractTokens.contains(n.noteType) &&
- !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
- }).foreach(ns.removeNote)
- }
-
- private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] =
- toks.flatten.filter(!_.isNlp).distinct
-
- /**
- * This collapser handles several tasks:
- * - "overall" collapsing after all other individual collapsers had their turn.
- * - Special further enrichment of tokens like linking, etc.
- *
- * In all cases of overlap (full or partial) - the "longest" note wins. In case of overlap and equal
- * lengths - the winning note is chosen based on this priority.
- */
- @throws[NCE]
- def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
- def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
- if (lastPhase)
- dropAbstract(mdl, ns)
-
- if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None
- }
-
- // Always deletes `similar` notes.
- // Some words with same note type can be detected various ways.
- // We keep only one variant - with `best` direct and sparsity parameters,
- // other variants for these words are redundant.
- val redundant: Seq[NCNlpSentenceNote] =
- this.flatten.filter(!_.isNlp).distinct.
- groupBy(_.getKey()).
- map(p ⇒ p._2.sortBy(p ⇒
- (
- // System notes don't have such flags.
- if (p.isUser) {
- if (p.isDirect)
- 0
- else
- 1
- }
- else
- 0,
- if (p.isUser)
- p.sparsity
- else
- 0
- )
- )).
- flatMap(_.drop(1)).
- toSeq
-
- redundant.foreach(this.removeNote)
-
- var delCombs: Seq[NCNlpSentenceNote] =
- getNotNlpNotes(this).
- flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ this(i))).filter(_ != note)).
- distinct
-
- // Optimization. Deletes all wholly swallowed notes.
- val links = getLinks(this.flatten)
-
- val swallowed =
- delCombs.
- // There aren't links on it.
- filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
- // It doesn't have links.
- filter(getPartKeys(_).isEmpty).
- flatMap(note ⇒ {
- val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, this)
-
- val delCombOthers =
- delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
-
- if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
- })
-
- delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
- addDeleted(this, swallowed)
- swallowed.foreach(this.removeNote)
-
- val toksByIdx: Seq[Seq[NCNlpSentenceNote]] =
- delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
- groupBy { case (idx, _) ⇒ idx }.
- map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }.
- toSeq.sortBy(-_.size)
-
- val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
-
- var sens =
- if (delCombs.nonEmpty) {
- val deleted = mutable.ArrayBuffer.empty[Set[NCNlpSentenceNote]]
-
- val sens =
- (minDelSize to delCombs.size).
- flatMap(i ⇒
- delCombs.combinations(i).
- filter(delComb ⇒
- !toksByIdx.exists(
- rec ⇒
- rec.size - delCombs.size <= 1 &&
- rec.count(note ⇒ !delComb.contains(note)) > 1
- )
- )
- ).
- sortBy(_.size).
- map(_.toSet).
- flatMap(delComb ⇒
- // Already processed with less subset of same deleted tokens.
- if (!deleted.exists(_.subsetOf(delComb))) {
- val nsClone = this.clone()
-
- // Saves deleted notes for sentence and their tokens.
- addDeleted(nsClone, delComb)
- delComb.foreach(nsClone.removeNote)
-
- // Has overlapped notes for some tokens.
- require(!nsClone.exists(_.count(!_.isNlp) > 1))
-
- deleted += delComb
-
- collapse0(nsClone)
- }
- else
- None
- )
-
- // It removes sentences which have only one difference - 'direct' flag of their user tokens.
- // `Direct` sentences have higher priority.
- case class Key(sysNotes: Seq[Map[String, JSerializable]], userNotes: Seq[Map[String, JSerializable]])
- case class Value(sentence: NCNlpSentence, directCount: Int)
-
- val m = mutable.HashMap.empty[Key, Value]
-
- sens.map(sen ⇒ {
- val notes = sen.flatten
-
- val sysNotes = notes.filter(_.isSystem)
- val nlpNotes = notes.filter(_.isNlp)
- val userNotes = notes.filter(_.isUser)
-
- def get(seq: Seq[NCNlpSentenceNote]): Seq[Map[String, JSerializable]] =
- seq.map(p ⇒
- // We have to delete some keys to have possibility to compare sentences.
- p.clone().filter(_._1 != "direct")
- )
-
- (Key(get(sysNotes), get(userNotes)), sen, nlpNotes.map(p ⇒ if (p.isDirect) 0 else 1).sum)
- }).
- foreach { case (key, sen, directCnt) ⇒
- m.get(key) match {
- case Some(v) ⇒
- // Best sentence is sentence with `direct` synonyms.
- if (v.directCount > directCnt)
- m += key → Value(sen, directCnt)
- case None ⇒ m += key → Value(sen, directCnt)
- }
- }
-
- m.values.map(_.sentence).toSeq
- }
- else
- collapse0(this).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
-
- sens = sens.distinct
-
- sens.foreach(sen ⇒
- sen.foreach(tok ⇒
- tok.size match {
- case 1 ⇒ require(tok.head.isNlp, s"Unexpected non-'nlpcraft:nlp' token: $tok")
- case 2 ⇒ require(tok.head.isNlp ^ tok.last.isNlp, s"Unexpected token notes: $tok")
- case _ ⇒ require(requirement = false, s"Unexpected token notes count: $tok")
- }
- )
- )
-
- // Drops similar sentences (with same tokens structure).
- // Among similar sentences we prefer one with minimal free words count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))).
- map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
- toSeq
- }
-
/**
* Returns flag are note notes equal (or similar) or not. Reason of ignored difference can be stopwords tokens.
*
@@ -884,16 +180,6 @@ class NCNlpSentence(
getUniqueKey0(n1) == getUniqueKey0(n2) && wordsEqualOrSimilar(n1, n2) && referencesEqualOrSimilar(n1, n2)
}
- override def equals(obj: Any): Boolean = obj match {
- case x: NCNlpSentence ⇒
- tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
-
- case _ ⇒ false
- }
-
/**
*
*/
@@ -904,7 +190,7 @@ class NCNlpSentence(
*
* @return
*/
- def findInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
+ def getInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
initNlpNotes.get(NoteKey(startCharIndex, endCharIndex))
/**
@@ -924,11 +210,18 @@ class NCNlpSentence(
* @param endCharIndex
* @return
*/
- def findNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
+ def getNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex))
/**
*
*/
def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
+
+ /***
+ *
+ * @param deletedNotes
+ */
+ def addDeletedNotes(deletedNotes: Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]]): Unit =
+ this.deletedNotes ++= deletedNotes
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
index 19308e3..da13b07 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
@@ -49,6 +49,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.validate.NCValidateManager
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import resource.managed
import java.io._
@@ -512,6 +513,7 @@ private [probe] object NCProbeBoot extends LazyLogging with NCOpenCensusTrace {
startedMgrs += NCProbeEnrichmentManager.start(span)
startedMgrs += NCConnectionManager.start(span)
startedMgrs += NCDialogFlowManager.start(span)
+ startedMgrs += NCSentenceManager.start(span)
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index aefdbd1..bbf7630 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -230,7 +230,7 @@ object NCProbeVariants {
nlpTokOpt match {
case Some(nlpTok) ⇒ mkToken(nlpTok)
case None ⇒
- nlpSen.findInitialNlpNote(key.from, key.to) match {
+ nlpSen.getInitialNlpNote(key.from, key.to) match {
case Some(nlpNote) ⇒
val artTok = NlpToken(IDX)
@@ -262,7 +262,7 @@ object NCProbeVariants {
for (tok ← parts)
process(tok,
nlpSen.
- findNlpToken(tok.getId, tok.getStartCharIndex, tok.getEndCharIndex).
+ getNlpToken(tok.getId, tok.getStartCharIndex, tok.getEndCharIndex).
getOrElse(throw new NCE(s"Token not found for $tok"))
)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 368f0c4..c328e57 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -43,6 +43,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeMessage, NCProbeVariants}
import java.io.Serializable
@@ -500,7 +501,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
s"]")
}
- nlpSen.clone().collapse(mdl.model, lastPhase = true).
+ NCSentenceManager.collapse(mdl.model, nlpSen.clone(), lastPhase = true).
// Sorted to support deterministic logs.
sortBy(p ⇒
p.map(p ⇒ {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 2a9dec0..0a11314 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -21,9 +21,12 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer, _}
import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCTokenLogger
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
import java.io.Serializable
@@ -355,7 +358,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
var permCnt = 0
- lazy val collapsedSens = NCProbeVariants.convert(ns.srvReqId, mdl, ns.clone().collapse(mdl.model)).map(_.asScala)
+ lazy val collapsedSens = NCProbeVariants.convert(
+ ns.srvReqId,
+ mdl,
+ NCSentenceManager.collapse(mdl.model, ns.clone())
+ ).map(_.asScala)
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceHelper.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceHelper.java
new file mode 100644
index 0000000..1e215ad
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceHelper.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs.sentence;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+
+import static java.util.stream.Collectors.toList;
+
+/**
+ * It is not converted to Scala because of performance problems with implicit conversions between Scala and Java long values.
+ */
+class NCSentenceHelper extends RecursiveTask<List<Long>> {
+ private static final long THRESHOLD = (long)Math.pow(2, 20);
+
+ private final long lo;
+ private final long hi;
+ private final long[] wordBits;
+ private final int[] wordCounts;
+
+ private NCSentenceHelper(long lo, long hi, long[] wordBits, int[] wordCounts) {
+ this.lo = lo;
+ this.hi = hi;
+ this.wordBits = wordBits;
+ this.wordCounts = wordCounts;
+ }
+
+ private List<Long> computeLocal() {
+ List<Long> res = new ArrayList<>();
+
+ for (long comboBits = lo; comboBits < hi; comboBits++) {
+ boolean match = true;
+
+ // For each input row we check if subtracting the current combination of words
+ // from the input row would give us the expected result.
+ for (int j = 0; j < wordBits.length; j++) {
+ // Get bitmask of how many words can be subtracted from the row.
+ // Check if there is more than 1 word remaining after subtraction.
+ if (wordCounts[j] - Long.bitCount(wordBits[j] & comboBits) > 1) {
+ // Skip this combination.
+ match = false;
+
+ break;
+ }
+ }
+
+ if (match && excludes(comboBits, res))
+ res.add(comboBits);
+ }
+
+ return res;
+ }
+
+ private List<Long> forkJoin() {
+ long mid = lo + hi >>> 1L;
+
+ NCSentenceHelper t1 = new NCSentenceHelper(lo, mid, wordBits, wordCounts);
+ NCSentenceHelper t2 = new NCSentenceHelper(mid, hi, wordBits, wordCounts);
+
+ t2.fork();
+
+ return merge(t1.compute(), t2.join());
+ }
+
+ private static List<Long> merge(List<Long> l1, List<Long> l2) {
+ if (l1.isEmpty())
+ return l2;
+ else if (l2.isEmpty())
+ return l1;
+
+ int size1 = l1.size();
+ int size2 = l2.size();
+
+ if (size1 == 1 && size2 > 1 || size2 == 1 && size1 > 1) {
+ // Minor optimization in case if one of the lists has only one element.
+ List<Long> res = size1 == 1 ? l2 : l1;
+ Long val = size1 == 1 ? l1.get(0) : l2.get(0);
+
+ if (excludes(val, res))
+ res.add(val);
+
+ return res;
+ }
+
+ List<Long> res = new ArrayList<>(size1 + size2);
+
+ for (int i = 0, max = Math.max(size1, size2); i < max; i++) {
+ Long v1 = i < size1 ? l1.get(i) : null;
+ Long v2 = i < size2 ? l2.get(i) : null;
+
+ if (v1 != null && v2 != null) {
+ if (containsAllBits(v1, v2))
+ v1 = null;
+ else if (containsAllBits(v2, v1))
+ v2 = null;
+ }
+
+ if (v1 != null && excludes(v1, res))
+ res.add(v1);
+
+ if (v2 != null && excludes(v2, res))
+ res.add(v2);
+ }
+
+ return res;
+ }
+
+ private static boolean excludes(long bits, List<Long> allBits) {
+ for (Long allBit : allBits)
+ if (containsAllBits(bits, allBit))
+ return false;
+
+ return true;
+ }
+
+ private static boolean containsAllBits(long bitSet1, long bitSet2) {
+ return (bitSet1 & bitSet2) == bitSet2;
+ }
+
+ private static <T> long wordsToBits(Set<T> words, List<T> dict) {
+ long bits = 0;
+
+ for (int i = 0, n = dict.size(); i < n; i++)
+ if (words.contains(dict.get(i)))
+ bits |= 1L << i;
+
+ return bits;
+ }
+
+ private static <T> List<T> bitsToWords(long bits, List<T> dict) {
+ List<T> words = new ArrayList<>(Long.bitCount(bits));
+
+ for (int i = 0, n = dict.size(); i < n; i++)
+ if ((bits & 1L << i) != 0)
+ words.add(dict.get(i));
+
+ return words;
+ }
+
+ @Override
+ protected List<Long> compute() {
+ return hi - lo <= THRESHOLD ? computeLocal() : forkJoin();
+ }
+
+ /**
+ *
+ * @param words
+ * @param pool
+ * @param <T>
+ * @return
+ */
+ static <T> List<List<T>> findCombinations(List<Set<T>> words, ForkJoinPool pool) {
+ assert words != null && !words.isEmpty();
+ assert pool != null;
+
+ if (words.stream().allMatch(p -> p.size() == 1))
+ return Collections.singletonList(Collections.emptyList());
+
+ // Build dictionary of unique words.
+ List<T> dict = words.stream().flatMap(Collection::stream).distinct().collect(toList());
+
+ if (dict.size() > Long.SIZE)
+ // Note: the power set of 64 words has 2^64 combinations, which exceeds the range of a signed long (9223372036854775807).
+ throw new IllegalArgumentException("Dictionary is too long: " + dict.size());
+
+ // Convert words to bitmasks (each bit corresponds to an index in the dictionary).
+ long[] wordBits =
+ words.stream().sorted(Comparator.comparingInt(Set::size)).mapToLong(row -> wordsToBits(row, dict)).toArray();
+
+ // Cache words count per row.
+ int[] wordCounts = words.stream().sorted(Comparator.comparingInt(Set::size)).mapToInt(Set::size).toArray();
+
+ // Prepare Fork/Join task to iterate over the power set of all combinations.
+ return
+ pool.invoke(new NCSentenceHelper(1, (long)Math.pow(2, dict.size()), wordBits, wordCounts)).
+ stream().map(bits -> bitsToWords(bits, dict)).collect(toList());
+ }
+}
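
For readers following the new helper above: the sketch below illustrates, in Scala, the same bitmask idea that NCSentenceHelper implements: notes are assigned bit positions in a dictionary, each row of candidate notes becomes a Long mask, and a combination of deletions is accepted when every row keeps at most one note after subtraction. This is a minimal illustration only (the name BitmaskComboSketch and the toy input are made up here); the shipped helper deliberately stays in Java, as its header comment explains, and additionally prunes non-minimal combinations and splits the scan across a Fork/Join pool.

    // Minimal Scala sketch (hypothetical, for illustration only) of the bitmask
    // combination search implemented by NCSentenceHelper above.
    object BitmaskComboSketch extends App {
        // Toy input: each row is the set of candidate notes covering one word index.
        val rows: Seq[Set[String]] = Seq(Set("A", "B"), Set("B", "C"), Set("C"))

        // Dictionary of distinct notes; each note gets a bit position in a Long.
        val dict: Seq[String] = rows.flatten.distinct
        require(dict.size <= java.lang.Long.SIZE, s"Dictionary is too long: ${dict.size}")

        def toBits(s: Set[String]): Long =
            dict.zipWithIndex.collect { case (w, i) if s.contains(w) ⇒ 1L << i }.foldLeft(0L)(_ | _)

        val rowBits = rows.map(toBits)
        val rowSizes = rows.map(_.size)

        // Brute-force scan of the power set (the real helper splits this range with Fork/Join
        // and also drops combinations that are supersets of already accepted ones).
        val valid =
            for {
                combo ← 1L until (1L << dict.size)
                // A combination is acceptable if removing its notes leaves at most one note per row.
                if rowBits.zip(rowSizes).forall { case (bits, size) ⇒ size - java.lang.Long.bitCount(bits & combo) <= 1 }
            } yield combo

        // Decode accepted bitmasks back into note lists.
        def toWords(bits: Long): Seq[String] =
            dict.zipWithIndex.collect { case (w, i) if (bits & (1L << i)) != 0 ⇒ w }

        valid.map(toWords).foreach(println)
    }

On this toy input the sketch prints, among others, List(B): deleting note B alone already leaves every word index with at most one candidate note.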
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
similarity index 70%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 113e088..470776c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -15,28 +15,27 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.common.nlp
+package org.apache.nlpcraft.probe.mgrs.sentence
-import com.typesafe.scalalogging.LazyLogging
-import org.apache.nlpcraft.common.NCE
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.{NCE, NCService, U}
import org.apache.nlpcraft.model.NCModel
+import java.io.{Serializable ⇒ JSerializable}
import java.util
import java.util.{List ⇒ JList}
-import java.io.{Serializable ⇒ JSerializable}
-import java.util.Collections
-import scala.collection.JavaConverters._
-import scala.collection.mutable.ArrayBuffer
-import scala.collection.{Map, Seq, Set, mutable}
+import scala.collection.JavaConverters.{asScalaBufferConverter, _}
+import scala.collection.{Map, Seq, mutable}
import scala.language.implicitConversions
-object NCNlpSentence extends LazyLogging {
- implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
-
- case class NoteKey(start: Int, end: Int)
- case class TokenKey(id: String, start: Int, end: Int)
- case class NoteLink(note: String, indexes: Seq[Int])
+/**
+ * Sentences processing manager.
+ */
+object NCSentenceManager extends NCService {
+ @volatile private var pool: java.util.concurrent.ForkJoinPool = _
case class PartKey(id: String, start: Int, end: Int) {
require(start <= end)
@@ -44,6 +43,7 @@ object NCNlpSentence extends LazyLogging {
private def in(i: Int): Boolean = i >= start && i <= end
def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
}
+
object PartKey {
def apply(m: util.HashMap[String, JSerializable]): PartKey = {
def get[T](name: String): T = m.get(name).asInstanceOf[T]
@@ -55,6 +55,10 @@ object NCNlpSentence extends LazyLogging {
PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
}
+ /**
+ *
+ * @param notes
+ */
private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
@@ -71,7 +75,7 @@ object NCNlpSentence extends LazyLogging {
noteLinks ++=
(for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
yield NoteLink(name, idxs.sorted)
- )
+ )
}
if (n.contains("subjnotes")) add("subjnotes", "subjindexes")
@@ -81,6 +85,10 @@ object NCNlpSentence extends LazyLogging {
noteLinks
}
+ /**
+ *
+ * @param notes
+ */
private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
notes.
filter(_.isUser).
@@ -460,13 +468,14 @@ object NCNlpSentence extends LazyLogging {
}
if (fixed.forall(_.size == 1))
- // Fix double dimension array to one dimension,
- // so it should be called always in spite of 'fixIndexesReferences' method.
+ // Fix double dimension array to one dimension,
+ // so it should be called always in spite of 'fixIndexesReferences' method.
ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable])
else
ok = false
case None ⇒ // No-op.
}
+
ok &&
ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
rel.dataOpt[JList[Int]](idxsField) match {
@@ -538,116 +547,57 @@ object NCNlpSentence extends LazyLogging {
res
}
-}
-
-import org.apache.nlpcraft.common.nlp.NCNlpSentence._
-
-/**
- * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and
- * each note is a collection of KV pairs.
- *
- * @param srvReqId Server request ID.
- * @param text Normalized text.
- * @param enabledBuiltInToks Enabled built-in tokens.
- * @param tokens Initial buffer.
- * @param deletedNotes Deleted overridden notes with their tokens.
- */
-class NCNlpSentence(
- val srvReqId: String,
- val text: String,
- val enabledBuiltInToks: Set[String],
- override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
- private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
- private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
- private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
-) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
- @transient
- private var hash: java.lang.Integer = _
-
- private def calcHash(): Int =
- Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
-
- private def addDeleted(sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
- sen.deletedNotes ++= dels.map(n ⇒ {
- val savedDelNote = n.clone()
- val savedDelToks = n.tokenIndexes.map(idx ⇒ this(idx).clone())
-
- val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
-
- // Deleted note's tokens should contains only nlp data and deleted notes.
- for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
- savedDelTok.remove(mainNote)
-
- savedDelNote → savedDelToks
- })
-
- // Deep copy.
- override def clone(): NCNlpSentence =
- new NCNlpSentence(
- srvReqId,
- text,
- enabledBuiltInToks,
- tokens.map(_.clone()),
- deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
- initNlpNotes = initNlpNotes
- )
/**
- * Utility method that gets set of notes for given note type collected from
- * tokens in this sentence. Notes are sorted in the same order they appear
- * in this sentence.
*
- * @param noteType Note type.
- */
- def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).distinct
-
- /**
- * Utility method that removes note with given ID from all tokens in this sentence.
- * No-op if such note wasn't found.
- *
- * @param note Note.
+ * @param mdl
+ * @param ns
*/
- def removeNote(note: NCNlpSentenceNote): Unit = this.foreach(_.remove(note))
-
- //noinspection HashCodeUsesVar
- override def hashCode(): Int = {
- if (hash == null)
- hash = calcHash()
-
- hash
- }
-
- def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
- val fixed = note.clone(kvs: _*)
-
- this.filter(t ⇒ t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t ⇒ {
- t.remove(note)
- t.add(fixed)
- })
-
- hash = null
- }
-
private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit =
if (!mdl.getAbstractTokens.isEmpty) {
val notes = ns.flatten
- val keys = getPartKeys(notes :_*)
+ val keys = getPartKeys(notes: _*)
val noteLinks = getLinks(notes)
notes.filter(n ⇒ {
val noteToks = ns.tokens.filter(_.contains(n))
mdl.getAbstractTokens.contains(n.noteType) &&
- !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
+ !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
+ !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
}).foreach(ns.removeNote)
}
+ /**
+ *
+ * @param toks
+ * @return
+ */
private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] =
toks.flatten.filter(!_.isNlp).distinct
/**
+ *
+ * @param thisSen
+ * @param sen
+ * @param dels
+ */
+ private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
+ sen.addDeletedNotes(dels.map(n ⇒ {
+ val savedDelNote = n.clone()
+ val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone())
+
+ val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
+
+ // Deleted note's tokens should contain only NLP data and deleted notes.
+ for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
+ savedDelTok.remove(mainNote)
+
+ savedDelNote → savedDelToks
+ }).toMap)
+
+ /**
* This collapser handles several tasks:
* - "overall" collapsing after all other individual collapsers had their turn.
* - Special further enrichment of tokens like linking, etc.
@@ -656,7 +606,7 @@ class NCNlpSentence(
* lengths - the winning note is chosen based on this priority.
*/
@throws[NCE]
- def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+ private def collapseSentence(sen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
if (lastPhase)
dropAbstract(mdl, ns)
@@ -669,7 +619,7 @@ class NCNlpSentence(
// We keep only one variant - with `best` direct and sparsity parameters,
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
- this.flatten.filter(!_.isNlp).distinct.
+ sen.flatten.filter(!_.isNlp).distinct.
groupBy(_.getKey()).
map(p ⇒ p._2.sortBy(p ⇒
(
@@ -691,15 +641,15 @@ class NCNlpSentence(
flatMap(_.drop(1)).
toSeq
- redundant.foreach(this.removeNote)
+ redundant.foreach(sen.removeNote)
var delCombs: Seq[NCNlpSentenceNote] =
- getNotNlpNotes(this).
- flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ this(i))).filter(_ != note)).
+ getNotNlpNotes(sen).
+ flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ sen(i))).filter(_ != note)).
distinct
// Optimization. Deletes all wholly swallowed notes.
- val links = getLinks(this.flatten)
+ val links = getLinks(sen.flatten)
val swallowed =
delCombs.
@@ -709,7 +659,7 @@ class NCNlpSentence(
filter(getPartKeys(_).isEmpty).
flatMap(note ⇒ {
val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, this)
+ val key = PartKey(note, sen)
val delCombOthers =
delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
@@ -717,55 +667,33 @@ class NCNlpSentence(
if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
})
- delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
- addDeleted(this, swallowed)
- swallowed.foreach(this.removeNote)
-
- val toksByIdx: Seq[Seq[NCNlpSentenceNote]] =
- delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
- groupBy { case (idx, _) ⇒ idx }.
- map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }.
- toSeq.sortBy(-_.size)
- val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
+ delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
+ addDeleted(sen, sen, swallowed)
+ swallowed.foreach(sen.removeNote)
var sens =
if (delCombs.nonEmpty) {
- val deleted = mutable.ArrayBuffer.empty[Set[NCNlpSentenceNote]]
+ val toksByIdx =
+ delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
+ groupBy { case (idx, _) ⇒ idx }.
+ map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
+ toSeq.sortBy(-_.size)
val sens =
- (minDelSize to delCombs.size).
- flatMap(i ⇒
- delCombs.combinations(i).
- filter(delComb ⇒
- !toksByIdx.exists(
- rec ⇒
- rec.size - delCombs.size <= 1 &&
- rec.count(note ⇒ !delComb.contains(note)) > 1
- )
- )
- ).
- sortBy(_.size).
- map(_.toSet).
- flatMap(delComb ⇒
- // Already processed with less subset of same deleted tokens.
- if (!deleted.exists(_.subsetOf(delComb))) {
- val nsClone = this.clone()
-
- // Saves deleted notes for sentence and their tokens.
- addDeleted(nsClone, delComb)
- delComb.foreach(nsClone.removeNote)
-
- // Has overlapped notes for some tokens.
- require(!nsClone.exists(_.count(!_.isNlp) > 1))
-
- deleted += delComb
-
- collapse0(nsClone)
- }
- else
- None
- )
+ NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala).
+ flatMap(delComb ⇒ {
+ val nsClone = sen.clone()
+
+ // Saves deleted notes for sentence and their tokens.
+ addDeleted(sen, nsClone, delComb)
+ delComb.foreach(nsClone.removeNote)
+
+ // Has overlapped notes for some tokens.
+ require(!nsClone.exists(_.count(!_.isNlp) > 1))
+
+ collapse0(nsClone)
+ })
// It removes sentences which have only one difference - 'direct' flag of their user tokens.
// `Direct` sentences have higher priority.
@@ -802,7 +730,7 @@ class NCNlpSentence(
m.values.map(_.sentence).toSeq
}
else
- collapse0(this).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
+ collapse0(sen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
sens = sens.distinct
@@ -823,112 +751,29 @@ class NCNlpSentence(
toSeq
}
- /**
- * Returns flag are note notes equal (or similar) or not. Reason of ignored difference can be stopwords tokens.
- *
- * @param n1 First note.
- * @param n2 Second note.
- */
- def notesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- if (n1.noteType != n2.noteType)
- false
- else {
- val stopIdxs = this.filter(_.isStopWord).map(_.index)
-
- // One possible difference - stopwords indexes.
- def wordsEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
- val set1 = n1.wordIndexes.toSet
- val set2 = n2.wordIndexes.toSet
-
- set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains)
- }
-
- def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- wordsEqualOrSimilar0(n1, n2) || wordsEqualOrSimilar0(n2, n1)
-
- def tokensEqualOrSimilar0(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
- set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(_.isStopWord)
-
- def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
- tokensEqualOrSimilar0(set1, set2) || tokensEqualOrSimilar0(set2, set1)
-
- def getList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
- n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[Int]].asScala.
- map(this (_)).toSet
+ override def start(parent: Span): NCService = {
+ ackStarting()
- def getListList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
- n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[JList[Int]]].asScala.
- flatMap(_.asScala.map(this (_))).toSet
+ pool = new java.util.concurrent.ForkJoinPool()
- def referencesEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
- require(n1.noteType == n2.noteType)
-
- n1.noteType match {
- case "nlpcraft:sort" ⇒
- tokensEqualOrSimilar(getListList(n1, "subjindexes"), getListList(n2, "subjindexes")) &&
- tokensEqualOrSimilar(getListList(n1, "byindexes"), getListList(n2, "byindexes"))
- case "nlpcraft:limit" ⇒
- tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes"))
- case "nlpcraft:reference" ⇒
- tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes"))
-
- case _ ⇒ true
- }
- }
-
- def referencesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- referencesEqualOrSimilar0(n1, n2) || referencesEqualOrSimilar0(n2, n1)
-
- def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = n.getKey(withIndexes = false, withReferences = false)
-
- getUniqueKey0(n1) == getUniqueKey0(n2) && wordsEqualOrSimilar(n1, n2) && referencesEqualOrSimilar(n1, n2)
- }
-
- override def equals(obj: Any): Boolean = obj match {
- case x: NCNlpSentence ⇒
- tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
-
- case _ ⇒ false
+ ackStarted()
}
- /**
- *
- */
- def saveNlpNotes(): Unit =
- initNlpNotes = this.map(t ⇒ NoteKey(t.startCharIndex, t.endCharIndex) → t.getNlpNote).toMap
+ override def stop(parent: Span): Unit = {
+ ackStopping()
- /**
- *
- * @return
- */
- def findInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
- initNlpNotes.get(NoteKey(startCharIndex, endCharIndex))
-
- /**
- *
- * @param nlp
- */
- def addNlpToken(nlp: NCNlpSentenceToken): Unit = {
- require(nlp.size <= 2)
+ U.shutdownPool(pool)
- nlp.foreach(n ⇒ nlpTokens += TokenKey(n.noteType, nlp.startCharIndex, nlp.endCharIndex) → nlp)
+ ackStopped()
}
/**
*
- * @param noteType
- * @param startCharIndex
- * @param endCharIndex
+ * @param mdl
+ * @param sen
+ * @param lastPhase
* @return
*/
- def findNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
- nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex))
-
- /**
- *
- */
- def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
+ def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] =
+ collapseSentence(sen, mdl, lastPhase)
}
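
To summarize the call-site changes above in one place, the hypothetical snippet below shows how collapsing is now driven through the new manager instead of the sentence itself. The helper name CollapseUsageSketch and its parameters are stand-ins; the start/collapse calls mirror the NCProbeBoot and NCProbeEnrichmentManager hunks earlier in this commit.

    // Hypothetical usage sketch condensed from the call sites changed in this commit.
    import io.opencensus.trace.Span
    import org.apache.nlpcraft.common.nlp.NCNlpSentence
    import org.apache.nlpcraft.model.NCModel
    import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager

    object CollapseUsageSketch {
        def collapseVariants(mdl: NCModel, sen: NCNlpSentence, span: Span): Seq[NCNlpSentence] = {
            // In the real probe this is done once at boot (see the NCProbeBoot hunk above),
            // so that the manager's Fork/Join pool exists before any request is processed.
            NCSentenceManager.start(span)

            // Old API: sen.clone().collapse(mdl, lastPhase = true)
            // New API: the manager collapses a cloned sentence and delegates the
            // combination search to NCSentenceHelper on its Fork/Join pool.
            NCSentenceManager.collapse(mdl, sen.clone(), lastPhase = true)
        }
    }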
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
new file mode 100644
index 0000000..b354533
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
+
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCIntentMatch, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.collection.JavaConverters._
+
+/**
+ * Nested Elements test model.
+ */
+class NCNestedTestModel4 extends NCModelAdapter(
+ "nlpcraft.nested3.test.mdl", "Nested Data Test Model", "1.0"
+) {
+ override def getElements: util.Set[NCElement] =
+ Set(
+ NCTestElement("e1", "//[a-zA-Z0-9]+//"),
+ NCTestElement("e2", "the ^^(id == 'e1')^^")
+ )
+
+ override def getAbstractTokens: util.Set[String] = Set("e1").asJava
+ override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+
+ @NCIntent("intent=onE2 term(t1)={id == 'e2'}[8, 100]")
+ def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+}
+
+/**
+ * Performance sanity check: this query should not take too long to process.
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel4], startClient = true)
+class NCEnricherNestedModelSpec4 extends NCTestContext {
+ @Test
+ def test(): Unit = checkIntent("the a " * 8, "onE2")
+}
\ No newline at end of file