You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/08 19:48:40 UTC
[incubator-nlpcraft] 05/07: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1f47a982c25c0b82802b8881277bb51e1a6f3442
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Apr 8 18:57:21 2021 +0300
WIP.
---
.../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 9 +-
.../probe/mgrs/deploy/NCDeployManager.scala | 15 +-
.../nlpcraft/probe/mgrs/model/NCModelManager.scala | 24 ++-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 161 ++++++++++-----------
4 files changed, 103 insertions(+), 106 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 0e418b3..31fa627 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -30,8 +30,7 @@ import scala.collection.{Map, Seq}
* @param intents
* @param directSynonyms
* @param sparseSynonyms
- * @param directSynonymsDsl
- * @param addStopWordsStems
+ * @param synonymsDsl
* @param exclStopWordsStems
* @param suspWordsStems
* @param elements
@@ -42,13 +41,13 @@ case class NCProbeModel(
intents: Seq[NCIdlIntent],
directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
- directSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
- sparseSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
+ synonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
addStopWordsStems: Set[String],
exclStopWordsStems: Set[String],
suspWordsStems: Set[String],
elements: Map[String /*Element ID*/ , NCElement],
samples: Set[(String, Seq[Seq[String]])]
) {
- def hasDslSynonyms(elemId: String): Boolean = directSynonymsDsl.contains(elemId) || sparseSynonymsDsl.contains(elemId)
+ def hasDslSynonyms(elemId: String): Boolean = synonymsDsl.contains(elemId)
+ def hasDslSynonyms: Boolean = synonymsDsl.nonEmpty
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index aa3b99e..04ed091 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -197,8 +197,10 @@ object NCDeployManager extends NCService with DecorateAsScala {
// TODO: Sparse for nonDSL
def ok(b: Boolean, exp: Boolean): Boolean = if (exp) b else !b
- def filter(dsl: Boolean, sparse: Boolean): Set[SynonymHolder] =
- syns.toSet.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl) && ok(s.sparse && s.syn.size > 1, sparse))
+ def filterDsl(syns: Set[SynonymHolder], dsl: Boolean): Set[SynonymHolder] =
+ syns.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl))
+ def filterSparse(syns: Set[SynonymHolder], sparse: Boolean): Set[SynonymHolder] =
+ syns.filter(s ⇒ ok(s.sparse && s.syn.size > 1, sparse))
var cnt = 0
val maxCnt = mdl.getMaxTotalSynonyms
@@ -506,14 +508,15 @@ object NCDeployManager extends NCService with DecorateAsScala {
def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
set.groupBy(_.elmId).map(p ⇒ p._1 → p._2.map(_.syn).toSeq.sortBy(-_.size))
+ val notDsl = filterDsl(syns.toSet, dsl = false)
+
NCProbeModel(
model = mdl,
solver = solver,
intents = intents.map(_._1).toSeq,
- directSynonyms = mkFastAccessMap(filter(dsl = false, sparse = false), NCProbeSynonymsWrapper(_)),
- sparseSynonyms = toMap(filter(dsl = false, sparse = true)),
- directSynonymsDsl = toMap(filter(dsl = true, sparse = false)),
- sparseSynonymsDsl = toMap(filter(dsl = true, sparse = true)),
+ directSynonyms = mkFastAccessMap(filterSparse(notDsl, sparse = false), NCProbeSynonymsWrapper(_)),
+ sparseSynonyms = toMap(filterSparse(notDsl, sparse = true)),
+ synonymsDsl = toMap(filterDsl(syns.toSet, dsl = true)),
addStopWordsStems = addStopWords,
exclStopWordsStems = exclStopWords,
suspWordsStems = suspWords,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 457bf35..ff0cb78 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,26 +58,24 @@ object NCModelManager extends NCService with DecorateAsScala {
data.values.foreach(w ⇒ {
val mdl = w.model
- val synCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
- val synDslCnt = w.directSynonymsDsl.map(_._2.size).sum
+ val synDirectCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum
- val synSparseDslCnt = w.sparseSynonymsDsl.map(_._2.size).sum
+ val synDslCnt = w.synonymsDsl.map(_._2.size).sum
val elmCnt = w.elements.keySet.size
val intentCnt = w.intents.size
def withWarn(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString
tbl += Seq(
- s"Name: ${bo(c(mdl.getName))}",
- s"ID: ${bo(mdl.getId)}",
- s"Version: ${mdl.getVersion}",
- s"Origin: ${mdl.getOrigin}",
- s"Elements: ${withWarn(elmCnt)}",
- s"Synonyms(Direct) $synCnt",
- s"Synonyms(Direct, DSL): $synDslCnt",
- s"Synonyms(Sparse): $synSparseCnt",
- s"Synonyms(Sparse, DSL): $synSparseDslCnt",
- s"Intents: ${withWarn(intentCnt)}"
+ s"Name: ${bo(c(mdl.getName))}",
+ s"ID: ${bo(mdl.getId)}",
+ s"Version: ${mdl.getVersion}",
+ s"Origin: ${mdl.getOrigin}",
+ s"Elements: ${withWarn(elmCnt)}",
+ s"Synonyms(Direct) $synDirectCnt",
+ s"Synonyms(Sparse): $synSparseCnt",
+ s"Synonyms(DSL): $synDslCnt",
+ s"Intents: ${withWarn(intentCnt)}"
)
})
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0542174..5169afe 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -373,30 +373,31 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
/**
*
- * @param comb
- * @param syn
+ * @param seq
+ * @param s
*/
- private def getPartsComplex(comb: Seq[Complex], syn: Synonym): Seq[TokType] =
- comb.zip(syn.map(_.kind)).flatMap {
+ private def toPartsComplex(seq: Seq[Complex], s: Synonym): Seq[TokType] =
+ seq.zip(s.map(_.kind)).flatMap {
case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
else None
}
/**
*
- * @param comb
- * @param syn
+ * @param seq
+ * @param s
*/
- private def toParts(comb: Seq[NCDslContent], syn: Synonym): Seq[TokType] =
- comb.zip(syn.map(_.kind)).flatMap {
+ private def toParts(seq: Seq[NCDslContent], s: Synonym): Seq[TokType] =
+ seq.zip(s.map(_.kind)).flatMap {
case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None
}
/**
*
*/
- private def mkCache(): Cache =
- mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
+ private def mkCache(mdl: NCProbeModel): Cache =
+ mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]].empty ++
+ mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]])
/**
*
@@ -478,24 +479,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
* @param h
* @param toks
*/
- private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = {
- val idxsSeq = toks.flatMap(_.wordIndexes)
-// val idxsSorted = idxsSeq.sorted
- val idxs = idxsSeq.toSet
-// val idxMin = idxsSorted.head
-// val idxMax = idxsSorted.last
+ private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = {
+ val idxs = toks.flatMap(_.wordIndexes).toSet
h.complexes.par.
flatMap(complexSeq ⇒ {
//val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
- val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxsSeq.contains))
+ val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
// Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty)
- Some(
- rec ++
+ if (rec.nonEmpty) {
+ val data = rec ++
(complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
- )
+
+ if (!cache.contains(data)) Some(data) else None
+ }
else
None
}).seq
@@ -569,31 +567,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
val req = NCRequestImpl(senMeta, srvReqId)
- val matches = mutable.ArrayBuffer.empty[ElementMatch]
- val cacheSparse = mkCache()
- val cacheDirect = mkCache()
val h = mkComplexes(mdl, ns)
- var found = false
-
- def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType]): Unit = {
- val toksSet = res.toSet
-
- var added = false
-
- // TODO:
- if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet))) {
- matches += ElementMatch(elm, res, s, parts)
-
- added = true
- }
-
- cache(elm.getId) += tokIdxs
- found = true
-
- println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, tokIdxs=${tokIdxs.mkString("|")}, added=$added")
- }
-
startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
ns.firstProbePhase = false
@@ -603,9 +578,36 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
println
println(s"GO $state")
+ val matches = mutable.ArrayBuffer.empty[ElementMatch]
+
+ val cacheSparse = mkCache(mdl)
+ val cacheDirect = mkCache(mdl)
+ val dslCache = mutable.HashSet.empty[Seq[Complex]]
+
+ var found = false
+
+ def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+ var added = false
+
+ if (!matchExist(elm.getId, res)) {
+ matches += ElementMatch(elm, res, s, parts)
+
+ added = true
+ }
+
+ cache(elm.getId) += tokIdxs
+ found = true
+
+ println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added")
+ }
+
+ // TODO:
+ def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean =
+ matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet))
+
for (toks ← combosToks) {
val tokIdxs = toks.map(_.index)
- lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks)
+ lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks, dslCache.toSet)
lazy val tokStems = toks.map(_.stem).mkString(" ")
// Attempt to match each element.
@@ -613,12 +615,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
for (
elm ← mdl.elements.values;
elemId = elm.getId;
- if
- !alreadyMarked(toks, elm.getId)
+ dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
+ sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
+ if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
) {
- val directProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs))
- val sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
-
+ //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)="+cacheSparse(elemId).mkString("|"))
// 1. SIMPLE.
found = false
@@ -630,19 +631,19 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}
// 1.1 Direct.
- if (simpleEnabled && !directProc && !found)
+ if (simpleEnabled && !dirProc && !found)
fastAccess(mdl.directSynonyms, elemId, toks.length) match {
case Some(h) ⇒
def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit =
syns.get(tokStems) match {
- case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+ case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s)
case None ⇒ notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
for (s ← syns if !found)
if (s.isMatch(toks))
- add("direct simple2", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+ add("direct simple2", elm, cacheDirect, toks, tokIdxs, s)
tryMap(
h.txtDirectSynonyms,
@@ -660,34 +661,37 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
if (simpleEnabled && !sparseProc && !found)
for (s ← get(mdl.sparseSynonyms, elemId) if !found)
s.trySparseMatch(toks) match {
- case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s, Seq.empty)
+ case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s)
case None ⇒ // No-op.
}
// 2. DSL.
- found = false
val dslEnabled = state != SIMPLE
- // 2.1 Direct.
- if (dslEnabled && mdl.directSynonymsDsl.nonEmpty && !directProc && !found)
- for (s ← get(mdl.directSynonymsDsl, elemId); comb ← dslCombs if !found) {
- if (s.isMatch(comb.map(_.data), req)) {
- println(s"OK $elemId for s=$s for toks:${toks.map(_.origText)}")
-
- add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, getPartsComplex(comb, s))
- }
- println {
- println(s"NOT OK $elemId for s=$s for toks:${toks.map(_.origText)}")
- }
+ if (dslEnabled && mdl.synonymsDsl.nonEmpty) {
+ found = false
+
+ // 2.1 Sparse.
+ if (mdl.hasDslSynonyms) {
+ if (!sparseProc)
+ for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
+ s.trySparseMatch(comb.map(_.data), req) match {
+ case Some(res) ⇒
+ add("DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
+ dslCache += comb
+ case None ⇒ // No-op.
+ }
}
-
- // 2.2 Sparse.
- if (dslEnabled && mdl.sparseSynonymsDsl.nonEmpty && !sparseProc && !found)
- for (s ← get(mdl.sparseSynonymsDsl, elemId); comb ← dslCombs if !found)
- s.trySparseMatch(comb.map(_.data), req) match {
- case Some(res) ⇒ add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
- case None ⇒ // No-op.
- }
+ // 2.2 Direct.
+ else {
+ if (!dirProc)
+ for (s ← get(mdl.synonymsDsl, elemId); comb ← dslCombs if !found)
+ if (s.isMatch(comb.map(_.data), req)) {
+ add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, toPartsComplex(comb, s))
+ dslCache += comb
+ }
+ }
+ }
}
}
@@ -701,18 +705,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
go()
}
-
-
}
-
-
processParsers(mdl, ns, span, req)
}
}
- def isComplex(mdl: NCProbeModel): Boolean =
- mdl.directSynonymsDsl.nonEmpty ||
- mdl.sparseSynonymsDsl.nonEmpty ||
- !mdl.model.getParsers.isEmpty
+ def isComplex(mdl: NCProbeModel): Boolean = mdl.synonymsDsl.nonEmpty || !mdl.model.getParsers.isEmpty
}
\ No newline at end of file