Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/09 11:26:33 UTC
[incubator-nlpcraft] 01/03: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit a349b6fe1baa5d6069f725f232b32146efb21873
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Apr 9 14:09:52 2021 +0300
WIP.
---
.../nlpcraft/common/nlp/NCNlpSentenceNote.scala | 25 ++--
.../apache/nlpcraft/model/impl/NCTokenImpl.scala | 8 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 144 +++++++++++----------
.../probe/mgrs/sentence/NCSentenceManager.scala | 15 ++-
.../nlpcraft/model/sparse/NCSparseSpec.scala | 15 ++-
5 files changed, 118 insertions(+), 89 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index c0923ae..9adbe01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -23,12 +23,13 @@ import org.apache.nlpcraft.common.ascii._
import scala.collection.JavaConverters._
import scala.collection.{Seq, Set, mutable}
import scala.language.implicitConversions
+import java.io.{Serializable ⇒ JSerializable}
/**
* Sentence token note is a typed map of KV pairs.
*
*/
-class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) extends java.io.Serializable with NCAsciiLike {
+class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends JSerializable with NCAsciiLike {
import NCNlpSentenceNote._
@transient
@@ -75,7 +76,7 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
)
override def clone(): NCNlpSentenceNote = {
- val m = mutable.Map.empty[String, java.io.Serializable] ++ values
+ val m = mutable.Map.empty[String, JSerializable] ++ values
new NCNlpSentenceNote(m.toMap)
}
@@ -91,20 +92,20 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
*
* @return
*/
- def skipNlp(): Map[String, java.io.Serializable] =
+ def skipNlp(): Map[String, JSerializable] =
values.filter { case (key, _) ⇒ !SKIP_CLONE.contains(key) && key != "noteType" }
/**
*
*/
- def asMetadata(): Map[String, java.io.Serializable] =
+ def asMetadata(): Map[String, JSerializable] =
if (isUser)
values.get("meta") match {
- case Some(meta) ⇒ meta.asInstanceOf[Map[String, java.io.Serializable]]
- case None ⇒ Map.empty[String, java.io.Serializable]
+ case Some(meta) ⇒ meta.asInstanceOf[Map[String, JSerializable]]
+ case None ⇒ Map.empty[String, JSerializable]
}
else {
- val md = mutable.Map.empty[String, java.io.Serializable]
+ val md = mutable.Map.empty[String, JSerializable]
val m = if (noteType != "nlpcraft:nlp") skipNlp() else values
@@ -117,8 +118,8 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
*
* @param kvs
*/
- def clone(kvs : (String, java.io.Serializable)*): NCNlpSentenceNote = {
- val m = mutable.HashMap.empty[String, java.io.Serializable] ++ values
+ def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote = {
+ val m = mutable.HashMap.empty[String, JSerializable] ++ values
kvs.foreach(kv ⇒ m += kv._1 → kv._2)
@@ -206,7 +207,7 @@ object NCNlpSentenceNote {
/**
* To immutable map.
*/
- implicit def values(note: NCNlpSentenceNote): Map[String, java.io.Serializable] = note.values
+ implicit def values(note: NCNlpSentenceNote): Map[String, JSerializable] = note.values
/**
* Creates new note with given parameters.
@@ -228,7 +229,7 @@ object NCNlpSentenceNote {
val (sparsity, tokMinIndex, tokMaxIndex, tokWordIndexes, len) = calc(wordIndexesOpt.getOrElse(indexes))
new NCNlpSentenceNote(
- mutable.HashMap[String, java.io.Serializable]((
+ mutable.HashMap[String, JSerializable]((
params.filter(_._2 != null) :+
("noteType" → typ) :+
("tokMinIndex" → indexes.min) :+
@@ -240,7 +241,7 @@ object NCNlpSentenceNote {
("wordLength" → len) :+
("sparsity" → sparsity) :+
("contiguous" → (sparsity == 0))
- ).map(p ⇒ p._1 → p._2.asInstanceOf[java.io.Serializable]): _*).toMap
+ ).map(p ⇒ p._1 → p._2.asInstanceOf[JSerializable]): _*).toMap
)
}
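The hunks above swap every spelled-out java.io.Serializable for the JSerializable alias introduced by the new import. The alias uses Scala's import-renaming syntax; a minimal standalone sketch of the idiom (the class name Note is illustrative, not part of this commit):

    import java.io.{Serializable ⇒ JSerializable}

    // JSerializable is now a file-local alias for java.io.Serializable,
    // avoiding both the verbose fully-qualified name and any clash with
    // scala.Serializable.
    class Note(val values: Map[String, JSerializable]) extends JSerializable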
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
index 017ead1..8c5005a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
@@ -17,7 +17,7 @@
package org.apache.nlpcraft.model.impl
-import java.io.Serializable
+import java.io.{Serializable ⇒ JSerializable}
import java.util.Collections
import org.apache.nlpcraft.common._
@@ -50,7 +50,7 @@ private[nlpcraft] class NCTokenImpl(
endCharIndex: Int,
meta: Map[String, Object],
isAbstractProp: Boolean
-) extends NCToken with Serializable {
+) extends NCToken with JSerializable {
require(mdl != null)
require(srvReqId != null)
require(id != null)
@@ -106,7 +106,7 @@ private[nlpcraft] object NCTokenImpl {
// nlpcraft:nlp and some optional (after collapsing).
require(tok.size <= 2, s"Unexpected token [size=${tok.size}, token=$tok]")
- val md = mutable.HashMap.empty[String, java.io.Serializable]
+ val md = mutable.HashMap.empty[String, JSerializable]
tok.foreach(n ⇒ {
val id = n.noteType.toLowerCase
@@ -142,7 +142,7 @@ private[nlpcraft] object NCTokenImpl {
// Special synthetic meta data element.
md.put("nlpcraft:nlp:freeword", false)
- elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[java.io.Serializable]) }
+ elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[JSerializable]) }
new NCTokenImpl(
mdl.model,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0ec40cd..d668c02 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -130,13 +130,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
- // Found-by-synonym model element.
+ /**
+ * Found-by-synonym model element.
+ *
+ * @param element Element.
+ * @param tokens Element tokens.
+ * @param synonym Synonym.
+ * @param parts Parts for DSL synonyms.
+ * @param allToksIdxs All token indexes (the whole token slice; only meaningful for sparse tokens).
+ */
case class ElementMatch(
element: NCElement,
tokens: Seq[NlpToken],
synonym: Synonym,
parts: Seq[TokType],
- tokIdxs: Seq[Int]
+ allToksIdxs: Seq[Int]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index))
@@ -206,7 +214,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
* @param syn
* @param metaOpt
* @param parts
- * @param toksIdxs
+ * @param allToksIdxs
*/
private def mark(
ns: NCNlpSentence,
@@ -216,16 +224,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
syn: Option[Synonym],
metaOpt: Option[Map[String, Object]],
parts: Seq[TokType],
- toksIdxs: Seq[Int]
+ allToksIdxs: Seq[Int]
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
// For system elements.
params += "direct" → direct.asInstanceOf[AnyRef]
- val toksIdxsJava: JList[Int] = toksIdxs.asJava
-
- params += "allToksIndexes" → toksIdxsJava
+ // Internal usage.
+ params += "allToksIndexes" → allToksIdxs.asJava
syn match {
case Some(s) ⇒
@@ -334,6 +341,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
+ // Checks the element's tokens.
if (!alreadyMarked(matchedToks, elemId))
mark(
ns,
@@ -379,17 +387,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
* @param toks
* @param elemId
*/
- private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean =
- toks.forall(_.isTypeOf(elemId)) ||
- toks.flatten.exists(n ⇒
- n.noteType == elemId &&
- (
- n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
- case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
- case None ⇒ false
- }
- )
- )
+ private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = {
+ def hasIndex(n: NCNlpSentenceNote): Boolean =
+ n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
+ case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
+ case None ⇒ false
+ }
+
+ toks.flatten.exists(n ⇒ n.noteType == elemId && hasIndex(n))
+ }
/**
*
@@ -519,39 +525,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}).seq
}
- /**
- *
- * @param ns
- * @param mdlId
- * @param matches
- */
- private def processMatches(ns: NCNlpSentence, mdlId: String, matches: Seq[ElementMatch]): Unit = {
- // TODO:matchesNorm
- // Add notes for all remaining (non-intersecting) matches.
- for ((m, idx) ← matches.zipWithIndex) {
- if (DEEP_DEBUG)
- logger.trace(
- s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
- s"elementId=${m.element.getId}, " +
- s"synonym=${m.synonym}, " +
- s"tokens=${tokString(m.tokens)}" +
- s"]"
- )
-
- val tokIdxs = m.tokens.map(_.index)
- val direct = m.synonym.isDirect && (tokIdxs == tokIdxs.sorted)
-
- // TODO:
- if (!alreadyMarked(m.tokens, m.element.getId)) {
- mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.tokIdxs)
-
- println(s"SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
- }
- else
- println(s"NOT SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
- }
- }
-
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
@@ -561,7 +534,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
val req = NCRequestImpl(senMeta, srvReqId)
- val h = mkComplexes(mdl, ns)
+ lazy val h = mkComplexes(mdl, ns)
startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
@@ -571,9 +544,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
val combosToks = combos(ns)
def go(): Unit = {
- println
- println(s"GO $state")
-
val matches = mutable.ArrayBuffer.empty[ElementMatch]
val cacheSparse = mkCache(mdl)
@@ -582,22 +552,30 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
var found = false
- def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+ def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
var added = false
if (!matchExist(elm.getId, res)) {
- matches += ElementMatch(elm, res, s, parts, tokIdxs)
+ matches += ElementMatch(elm, res, s, parts, allToksIdxs)
added = true
}
- cache(elm.getId) += tokIdxs
+ cache(elm.getId) += allToksIdxs
found = true
- println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added")
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"Found element [" +
+ s"id=${elm.getId}, " +
+ s"type=$typ, " +
+ s"indexes=${res.map(_.index).mkString("|")}, " +
+ s"allTokensIndexes=${allToksIdxs.mkString("|")}, " +
+ s"added=$added" +
+ s"]"
+ )
}
- // TODO:
def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean =
matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet))
@@ -607,15 +585,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// Attempt to match each element.
- // TODO: alreadyMarked - the same match can be found again but with fewer tokens marked (how can we skip considering it right away?)
for (
elm ← mdl.elements.values;
elemId = elm.getId;
dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
- if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
+ if
+ (!dirProc || !sparseProc) &&
+ // Checks the whole token slice.
+ !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
) {
- //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)="+cacheSparse(elemId).mkString("|"))
// 1. SIMPLE.
found = false
@@ -662,9 +641,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}
// 2. DSL.
- val dslEnabled = state != SIMPLE
-
- if (dslEnabled && mdl.synonymsDsl.nonEmpty) {
+ if (state != SIMPLE && mdl.synonymsDsl.nonEmpty) {
found = false
// 2.1 Sparse.
@@ -691,9 +668,42 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}
}
- processMatches(ns, mdlId, matches)
+ for ((m, idx) ← matches.zipWithIndex) {
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
+ s"elementId=${m.element.getId}, " +
+ s"synonym=${m.synonym}, " +
+ s"tokens=${tokString(m.tokens)}" +
+ s"]"
+ )
+
+ val tokIdxs = m.tokens.map(_.index)
+ val direct = m.synonym.isDirect && !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y }
+
+ var added = false
+
+ // Checks the element's tokens.
+ if (!alreadyMarked(m.tokens, m.element.getId)) {
+ mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.allToksIdxs)
+
+ added = true
+ }
+
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"Element ${if (added) "added" else "skipped"} [" +
+ s"id=${m.element.getId}, " +
+ s"indexes=${m.tokens.map(_.index).mkString("|")}, " +
+ s"allTokensIndexes=${m.allToksIdxs.mkString("|")}, " +
+ s"]"
+ )
+ }
}
+ if (DEEP_DEBUG)
+ logger.trace(s"Exexucution started with state: $state")
+
go()
if (state == SIMPLE) {
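Two details in the enricher rework above are easy to miss. alreadyMarked now keys off the stored "allToksIndexes" note data via containsSlice, which is true only when one sequence occurs as a contiguous run inside another, and the inlined match loop replaces the old `tokIdxs == tokIdxs.sorted` test with a pairwise scan that stops at the first out-of-order pair. A self-contained sketch of both checks (the values are illustrative):

    val allToksIdxs = Seq(1, 2, 3, 4, 5)

    // containsSlice demands a contiguous run, so a note recorded for the
    // whole token slice subsumes any narrower contiguous re-match.
    allToksIdxs.containsSlice(Seq(2, 3, 4))   // true
    allToksIdxs.containsSlice(Seq(2, 4))      // false: 2 and 4 are not adjacent

    // Same result as `tokIdxs == tokIdxs.sorted` for non-empty input,
    // without allocating a sorted copy.
    def isNonDecreasing(tokIdxs: Seq[Int]): Boolean =
        !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y }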
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 2776677..541966a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -733,9 +733,22 @@ object NCSentenceManager extends NCService {
)
)
+ def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
+
+ // Drops similar sentences (with the same notes structure). Keeps the ones with the most notes found.
+ sens = sens.groupBy(notNlpNotes(_).groupBy(_.noteType).keys.toSeq.sorted.distinct).
+ flatMap(p ⇒ {
+ val m: Map[NCNlpSentence, Int] = p._2.map(p ⇒ p → notNlpNotes(p).size).toMap
+
+ val max = m.values.max
+
+ m.filter(_._2 == max).keys
+ }).
+ toSeq
+
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))).
+ sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
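The new pass above deduplicates sentence variants in two steps: it groups them by the sorted list of their non-NLP note types, then within each group keeps only the variants carrying the maximum number of such notes, leaving the existing minimal-free-words dedup to run on what remains. The same keep-the-max-per-group shape, reduced to plain values (the data and helper are illustrative):

    // Group words by length; within each group keep only the words whose
    // vowel count matches the group's maximum.
    val words = Seq("tree", "note", "stop", "word")

    def vowels(s: String): Int = s.count(c ⇒ "aeiou".contains(c))

    val kept: Seq[String] =
        words.groupBy(_.length).flatMap { case (_, group) ⇒
            val counts = group.map(w ⇒ w → vowels(w)).toMap
            val max = counts.values.max
            counts.filter(_._2 == max).keys
        }.toSeq
    // kept contains "tree" and "note" (2 vowels each); "stop" and "word" are dropped.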
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
index 37df085..8441532 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
@@ -36,17 +36,20 @@ class NCSparseModel extends NCAbstractTokensModel {
val variants = ctx.getVariants.asScala
def checkOneVariant(sparsity: Int): Unit = {
- require(variants.size == 1)
+ require(variants.size == 1, "There should be a single variant.")
val toks = variants.head.asScala.filter(_.getId == "xyz")
- require(toks.size == 3)
+ require(toks.size == 3, "There should be 3 `xyz` tokens.")
checkSparsity(sparsity, toks)
}
def checkSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Unit =
- require(toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity))
+ require(
+ toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity),
+ s"Sparsity of each tokens should be: $sparsity."
+ )
def checkExists(sparsity: Int): Unit =
require(
@@ -58,9 +61,11 @@ class NCSparseModel extends NCAbstractTokensModel {
checkSparsity(sparsity, toks)
true
- case _ ⇒ false
+ case _ ⇒
+ false
}
- })
+ }),
+ s"Variant with 3 `xyz` tokens should be exists."
)
ctx.getRequest.getNormalizedText match {