You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/18 09:05:15 UTC
[incubator-nlpcraft] 04/08: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1891c2c2d8e2441d0ad60a2a75f3fafee0dbaa04
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Sep 17 11:55:48 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 216 +++++++++++++++++++--
.../probe/mgrs/sentence/NCSentenceManager.scala | 13 +-
.../model/stop/NCStopWordsInsideSpec.scala | 11 +-
.../model/NCEnricherNestedModelSpec4.scala | 81 ++++++--
4 files changed, 280 insertions(+), 41 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 22af412..d83ab05 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -21,6 +21,7 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCTokenImpl
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
@@ -285,11 +286,7 @@ object NCModelEnricher extends NCProbeEnricher {
}
}
- /**
- *
- * @param toks
- */
- private def combosNlpTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+ private def combosTokens1(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
combos(toks).flatMap(combo => {
val stops = combo.filter(_.isStopWord)
@@ -303,6 +300,64 @@ object NCModelEnricher extends NCProbeEnricher {
map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
+
+ /**
+ * Builds the token sequences to try for synonym matching from the given tokens.
+ * 1. Prepares combinations of tokens (sliding), see `combos`.
+ * Example: 'A B C' -> {'A B C', 'A B', 'B C', 'A', 'B', 'C'}
+ * Here one sentence is converted into 6 pieces.
+ *
+ * 2. Additionally, each piece is expanded into variants with combinations of its inner stopwords removed
+ * (first and last tokens are always kept; runs of 3+ adjacent stopwords are removed only as a whole run).
+ * Example: 'x1, x2(stopword), x3(stopword), x4' -> {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+ *
+ * 3. Variants are de-duplicated; each variant keeps its largest source piece (ties: lowest start index).
+ * @param toks Tokens to combine.
+ * @return Pairs of (variant tokens, source piece), bigger pieces and bigger variants first.
+ */
+ private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+ combos(toks).flatMap(combo => {
+ val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last) // Only inner stopwords may be removed.
+
+ val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]] // Runs of index-adjacent stopwords.
+
+ for (stop <- stops)
+ if (slides.nonEmpty && slides.last.last.index + 1 == stop.index) // Directly follows the previous stopword.
+ slides.last += stop
+ else
+ slides += mutable.ArrayBuffer.empty :+ stop
+
+ val bigSlides = slides.filter(_.size > 2) // Runs of 3 or more adjacent stopwords.
+
+ var stops4Delete: Seq[Seq[NlpToken]] =
+
+ if (bigSlides.nonEmpty) {
+ val allBig = bigSlides.flatMap(p => p)
+ val stops4AllCombs = stops.filter(p => !allBig.contains(p)) // Stopwords outside of the big runs.
+
+ if (stops4AllCombs.nonEmpty)
+ for (
+ seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+ seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+ )
+ yield seq1 ++ seq2.flatMap(p => p) // Any subset of single stopwords plus any subset of whole big runs.
+ else
+ for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+ yield seq.flatMap(p => p)
+ }
+ else
+ Range.inclusive(1, stops.size).flatMap(stops.combinations) // No big runs: every non-empty subset of stopwords.
+
+ stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last)) // NOTE(review): looks redundant, 'stops' already excludes head and last - confirm.
+
+ (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct // Pairs (variant, source piece).
+
+ }).
+ filter(_._1.nonEmpty).
+ groupBy(_._1).
+ map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+ sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+
/**
*
* @param toks
@@ -315,9 +370,18 @@ object NCModelEnricher extends NCProbeEnricher {
* @param seq
* @param s
*/
- private def toParts(seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
+ private def toParts(mdl: NCProbeModel, stvReqId: String, seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] = // NOTE(review): 'stvReqId' looks like a typo of 'srvReqId' - confirm.
seq.zip(s.map(_.kind)).flatMap {
- case (complex, kind) => if (complex.isLeft) Some(complex.swap.toOption.get -> kind) else None
+ case (complex, kind) =>
+ if (complex.isLeft)
+ Some(complex.swap.toOption.get -> kind) // Left content is used as a part as is.
+ else {
+ val clone = complex.toOption.get.clone() // Right content: work on a copy.
+
+ clone.filter(!_.isNlp).foreach(clone.remove) // Removes every note whose 'isNlp' is false from the copy.
+
+ Some(NCTokenImpl(mdl, stvReqId, clone) -> kind) // Wraps the stripped copy, so right content also yields a part.
+ }
}
/**
@@ -457,8 +521,8 @@ object NCModelEnricher extends NCProbeEnricher {
*/
private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
require(matched.nonEmpty)
-
// Matched tokens should be already sorted.
+
val stopsInside = toks2Match.filter(t =>
t.isStopWord && !matched.contains(matched) && t.index > matched.head.index && t.index < matched.last.index
)
@@ -470,13 +534,38 @@ object NCModelEnricher extends NCProbeEnricher {
override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
+ //logger.info("ENRICH111")
+
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val combToks = combosNlpTokens(ns.toSeq)
+ val combToks = combosTokens(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
+// logger.info("combToks="+combToks.size)
+//
+// logger.info("ns.flatten.flatten.size="+ns.tokens.flatten.distinct.count(!_.isNlp))
+//
+// ns.tokens.flatten.filter(!_.isNlp).distinct.foreach(n => {
+// val parts =
+// n.get("parts") match {
+// case Some(v) =>
+// val parts = v.asInstanceOf[java.util.List[NCTokenPartKey]].asScala
+//
+// "all parts=" + parts.size + " " +
+// parts.map(p => {
+// val ref = ns.tokens.find(t => t.startCharIndex == p.from && t.endCharIndex == p.to).get
+//
+// "part=" + p.id + " (" + ref.index + "), text=" + ref.origText
+// }).mkString(" | ")
+// case None => "NO"
+// }
+// logger.info(s"${n.noteType} [${n.wordIndexes.mkString(",")}], parts=$parts")
+// })
+//
+// logger.info("---")
+
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
"execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
@@ -489,8 +578,7 @@ object NCModelEnricher extends NCProbeEnricher {
lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
for (
- // toksExt is part of sentence.
- // toks is toksExt or toksExt without some stopwords set. All stopwords combinations are taking into account.
+ // 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
(toks, toksExt) <- combToks;
idxs = toks.map(_.index);
e <- mdl.elements.values;
@@ -500,6 +588,11 @@ object NCModelEnricher extends NCProbeEnricher {
!greedy ||
!contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
) {
+// println("!!!toks="+toks.map(_.origText).mkString(" "))
+// println("!!!toksExt="+toksExt.map(_.origText).mkString(" "))
+// println()
+
+
// 1. SIMPLE.
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -542,9 +635,6 @@ object NCModelEnricher extends NCProbeEnricher {
for (s <- get(mdl.sparseSynonyms, eId))
s.sparseMatch(toks) match {
case Some(res) =>
-// println("!!!toks="+toks.map(_.origText))
-// println("!!!res="+res.map(_.origText))
-// println
add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
case None => // No-op.
}
@@ -566,7 +656,9 @@ object NCModelEnricher extends NCProbeEnricher {
data = comb.map(_.data)
)
if (s.isMatch(data, req)) {
- add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, toParts(data, s))
+ val parts = toParts(mdl, ns.srvReqId, data, s)
+
+ add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts)
idlCache += comb
@@ -583,7 +675,9 @@ object NCModelEnricher extends NCProbeEnricher {
case Some(res) =>
val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
- add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, toParts(res, s))
+ val parts = toParts(mdl, ns.srvReqId, res, s)
+
+ add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, parts)
idlCache += comb
case None => // No-op.
@@ -604,6 +698,8 @@ object NCModelEnricher extends NCProbeEnricher {
processParsers(mdl, ns, span, req)
}
+
+ //logger.info("ENRICH222")
}
// TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -633,4 +729,92 @@ object NCModelEnricher extends NCProbeEnricher {
)
))
}
+}
+
+object x extends App { // NOTE(review): ad-hoc experiment harness for the combinations algorithm; looks like scratch code - confirm it should live in this file.
+ case class T(index: Int, isStopWord: Boolean = false) { // Minimal stand-in for a token: index plus stopword flag.
+ override def toString: String = index.toString
+ }
+
+ private def combosTokens1(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = // Variant where every non-empty subset of stopwords may be removed.
+ combos(toks).flatMap(combo => {
+ val stops = combo.filter(_.isStopWord)
+
+ val stops4Delete = Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+ (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo)
+ }).
+ toMap.
+ filter(_._1.nonEmpty).
+ groupBy(_._1).
+ map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
+ sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, -p._1.head.index))
+
+ private def combos[T](toks: Seq[T]): Seq[Seq[T]] = { // All sliding windows of the input, longest windows first.
+ val x = (for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
+
+ println("size=" + x.size) // Debug output: number of windows.
+
+ x
+ }
+
+ private def combosTokens(toks: Seq[T]): Seq[(Seq[T], Seq[T])] = { // Variant that removes runs of 3+ adjacent stopwords only as a whole.
+ val value = combos(toks)
+
+ value.flatMap(combo => {
+ val stops = combo.filter(_.isStopWord) // All stopwords of the piece; head and last are excluded from deletions below.
+
+ val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[T]] // Runs of index-adjacent stopwords.
+
+ for (stop <- stops)
+ if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+ slides.last += stop
+ else
+ slides += mutable.ArrayBuffer.empty :+ stop
+
+ val bigSlides = slides.filter(_.size >= 3) // Runs of 3 or more adjacent stopwords.
+
+ var stops4Delete: Seq[Seq[T]] =
+ if (bigSlides.nonEmpty) {
+ val allBig = bigSlides.flatten
+ val stops4AllCombs = stops.filter(p => !allBig.contains(p)) // Stopwords outside of the big runs.
+
+ if (stops4AllCombs.nonEmpty)
+ for (
+ seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+ seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+ )
+ yield seq1 ++ seq2.flatMap(p => p)
+ else
+ for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+ yield seq.flatMap(p => p)
+ }
+ else
+ Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+ stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last)) // Never delete the first or last token of the piece.
+
+ (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+ }).
+ filter(_._1.nonEmpty).
+ groupBy(_._1).
+ map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+ sortBy { case (data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+ }
+
+ def go(): Unit = { // Runs the experiment and prints every (variant : source piece) pair.
+ val combs = combosTokens(
+// Seq(
+// T(0), T(2, true), T(3, true), T(4, true), T(5), T(6), T(7, true), T(8, true), T(9), T(10, true), T(11, true), T(12)
+// )
+
+ Range.inclusive(0, 12).map(T(_, true)) // 13 tokens with indexes 0..12, all of them stopwords.
+ )
+
+ println("All=" + combs.size)
+
+ combs.foreach { case (p1, p2) => println(p1.mkString("|") + " : " + p2.mkString("|")) }
+ }
+
+ go()
}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index f6855ea..41fc484 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -676,9 +676,18 @@ object NCSentenceManager extends NCService {
groupBy { case (idx, _) => idx }.
map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
toSeq.sortBy(-_.size)
+
+// println("!!!!!toksByIdx.size="+toksByIdx.size)
+// println("!!!!!toksByIdx.ALL-sizes-sum="+toksByIdx.map(_.size).sum)
+// println("!!!!!toksByIdx.all-sized="+toksByIdx.map(_.size))
- def findCombinations(): Seq[Seq[NCNlpSentenceNote]] =
- NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+ def findCombinations(): Seq[Seq[NCNlpSentenceNote]] = { // Delegates to NCSentenceHelper.findCombinations and converts the result to Scala collections.
+ val res = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
+
+// println("!!! combinations=" + res.size)
+
+ res
+ }
val seqSens =
combCache.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index 9e3e911..3cc26f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -32,9 +32,7 @@ class NCStopWordsInsideModel extends NCModelAdapter("nlpcraft.test", "Test Model
override def getElements: util.Set[NCElement] = Set(NCTestElement("complex", "a b"))
@NCIntent("intent=i term={# == 'complex'}")
- def onI(
- ctx: NCIntentMatch
- ): NCResult = {
+ def onI(ctx: NCIntentMatch): NCResult = {
require(ctx.getContext.getVariants.size() == 1)
require(ctx.getContext.getVariants.asScala.head.asScala.size == 1)
require(ctx.getContext.getVariants.asScala.head.asScala.head.getNormalizedText == ctx.getContext.getRequest.getNormalizedText)
@@ -68,12 +66,7 @@ class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec {
@Test
def test2(): Unit = {
- //checkIntent("a b", "i")
- checkIntent("a the b", "i")
-// checkIntent("a , b", "i")
-// checkIntent("a, b", "i")
-// checkIntent("a, the b", "i")
-// checkIntent("a, the, b", "i")
+ // TODO: the test body is empty - stop-word checks such as checkIntent("a the b", "i") still have to be added.
}
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..825e4a2 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -24,10 +24,8 @@ import org.junit.jupiter.api.Test
import java.util
import scala.jdk.CollectionConverters.SetHasAsJava
-/**
- * Nested Elements test model.
- */
-class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
+// It shouldn't be too slow.
+class NCNestedTestModel4Adapter extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
override def getElements: util.Set[NCElement] =
Set(
NCTestElement("e1", "//[a-zA-Z0-9]+//"),
@@ -36,34 +34,89 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
override def getAbstractTokens: util.Set[String] = Set("e1").asJava
override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+}
- @NCIntent("intent=onE2 term(t1)={# == 'e2'}[8, 100]")
- def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+/**
+ * Greedy model (a single 'e2' element is expected) with synonym permutation and sparse mode disabled.
+ */
+class NCNestedTestModel41 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}")
+ def onAB(): NCResult = NCResult.text("OK")
+ override def isGreedy: Boolean = true
override def isPermutateSynonyms: Boolean = false
override def isSparse: Boolean = false
}
/**
- * It shouldn't be too slow.
+ * Checks that "the a " repeated 11 times triggers intent 'onE2' of [[NCNestedTestModel41]]. NOTE(review): '@Test' is commented out.
*/
@NCTestEnvironment(model = classOf[NCNestedTestModel41], startClient = true)
class NCEnricherNestedModelSpec41 extends NCTestContext {
- @Test
+ // @Test
+ def test(): Unit = checkIntent("the a " * 11, "onE2")
+}
+
+/**
+ * Not greedy model (3 to 100 'e2' elements are expected) with synonym permutation and sparse mode disabled.
+ */
+class NCNestedTestModel42 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = false
+ override def isPermutateSynonyms: Boolean = false
+ override def isSparse: Boolean = false
+}
+
+/**
+ * Checks that "the a " repeated 11 times triggers intent 'onE2' of the not greedy [[NCNestedTestModel42]].
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
+class NCEnricherNestedModelSpec42 extends NCTestContext {
+ // @Test
def test(): Unit = checkIntent("the a " * 11, "onE2")
}
-class NCNestedTestModel42 extends NCNestedTestModel41 {
+/**
+ * Greedy model (1 to 100 'e2' elements are accepted) with synonym permutation and sparse mode enabled.
+ */
+class NCNestedTestModel43 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[1, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = true
override def isPermutateSynonyms: Boolean = true
override def isSparse: Boolean = true
}
/**
- * It shouldn't be too slow.
+ * Checks that "the a " repeated 4 times triggers intent 'onE2' of [[NCNestedTestModel43]]. NOTE(review): '@Test' is commented out.
*/
-@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
-class NCEnricherNestedModelSpec42 extends NCTestContext {
- @Test
- def test(): Unit = checkIntent("the a " * 8, "onE2")
+@NCTestEnvironment(model = classOf[NCNestedTestModel43], startClient = true)
+class NCEnricherNestedModelSpec43 extends NCTestContext {
+ // @Test
+ def test(): Unit = checkIntent("the a " * 4, "onE2")
+}
+
+/**
+ * Not greedy model (3 to 100 'e2' elements are expected) with synonym permutation and sparse mode enabled.
+ */
+class NCNestedTestModel44 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = false
+ override def isPermutateSynonyms: Boolean = true
+ override def isSparse: Boolean = true
+}
+
+/**
+ * Checks that "the a " repeated 2 times triggers intent 'onE2' of [[NCNestedTestModel44]]. NOTE(review): '@Test' is commented out.
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel44], startClient = true)
+class NCEnricherNestedModelSpec44 extends NCTestContext {
+ // @Test
def test(): Unit = checkIntent("the a " * 2, "onE2")
}