You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/04/06 09:15:17 UTC
[incubator-nlpcraft] 03/05: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit c4d2d15cb4ff94c96105cedfb12a70e4845dd113
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Tue Apr 6 11:25:59 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 51 ++++++-----
.../nlpcraft/probe/mgrs/model/NCModelManager.scala | 22 ++---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 100 ++++++++++-----------
.../probe/mgrs/sentence/NCSentenceManager.scala | 22 +++--
4 files changed, 106 insertions(+), 89 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 5324304..95c526f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -92,37 +92,44 @@ class NCProbeSynonym(
require(toks != null)
require(toks.nonEmpty)
- lazy val res = mutable.ArrayBuffer.empty[T]
- lazy val all = mutable.HashSet.empty[T]
+ if (toks.size >= this.size) {
+ lazy val res = mutable.ArrayBuffer.empty[T]
+ lazy val all = mutable.HashSet.empty[T]
- var state = 0
+ var state = 0
- for (chunk ← this if state != -1) {
- val seq =
- if (state == 0) {
- state = 1
+ for (chunk ← this if state != -1) {
+ val seq =
+ if (state == 0) {
+ state = 1
- toks.filter(t ⇒ isMatch(t, chunk))
- }
- else
- toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk))
+ toks.filter(t ⇒ isMatch(t, chunk))
+ }
+ else
+ toks.filter(t ⇒ !res.contains(t) && isMatch(t, chunk))
- if (seq.nonEmpty) {
- val head = seq.head
+ if (seq.nonEmpty) {
+ val head = seq.head
- if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last))
- state = -1
- else {
- res += head
- all ++= seq
+ if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+ state = -1
+ else {
+ res += head
+ all ++= seq
+
+ if (all.size > res.size)
+ state = -1
+ }
}
+ else
+ state = -1
}
+
+ if (state != -1 && all.size == res.size)
+ Some(res)
else
- state = -1
+ None
}
-
- if (state != -1 && all.size == res.size)
- Some(res)
else
None
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index cdfdf89..03c59ff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -65,17 +65,19 @@ object NCModelManager extends NCService with DecorateAsScala {
val elmCnt = w.elements.keySet.size
val intentCnt = w.intents.size
+ def getWithWarning(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString
+
tbl += Seq(
- s"Name: ${bo(c(mdl.getName))}",
- s"ID: ${bo(mdl.getId)}",
- s"Version: ${mdl.getVersion}",
- s"Origin: ${mdl.getOrigin}",
- s"Elements: $elmCnt" + (if (elmCnt == 0) s" ${r("(!)")}" else ""),
- s"Synonyms: $synCnt" + (if (synCnt == 0) s" ${r("(!)")}" else ""),
- s"Synonyms(DSL): $synDslCnt" + (if (synDslCnt == 0) s" ${r("(!)")}" else ""),
- s"Synonyms(Sparse): $synSparseCnt" + (if (synSparseCnt == 0) s" ${r("(!)")}" else ""),
- s"Synonyms(Sparse, DSL): $synSparseDslCnt" + (if (synSparseDslCnt == 0) s" ${r("(!)")}" else ""),
- s"Intents: $intentCnt" + (if (intentCnt == 0) s" ${r("(!)")}" else "")
+ s"Name: ${bo(c(mdl.getName))}",
+ s"ID: ${bo(mdl.getId)}",
+ s"Version: ${mdl.getVersion}",
+ s"Origin: ${mdl.getOrigin}",
+ s"Elements: ${getWithWarning(elmCnt)}",
+ s"Synonyms(Continuous) $synCnt",
+ s"Synonyms(Continuous, DSL): $synDslCnt",
+ s"Synonyms(Sparse): $synSparseCnt",
+ s"Synonyms(Sparse, DSL): $synSparseDslCnt",
+ s"Intents: ${getWithWarning(intentCnt)}"
)
})
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 46506fd..f9acd95 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -296,11 +296,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
else None
}
- private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[NlpToken]]] =
+ private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[Int]]] =
mutable.HashMap.empty[
String,
- mutable.ArrayBuffer[Seq[NlpToken]]
- ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[NlpToken]])
+ mutable.ArrayBuffer[Seq[Int]]
+ ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]])
private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
(
@@ -388,15 +388,17 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
) {
_ ⇒
for (toks ← combos(ns)) {
- val idxsSeq = toks.flatMap(tokIdxs)
- val idxsSorted = idxsSeq.sorted
- val idxs = idxsSeq.toSet
- val idxMin = idxsSorted.head
- val idxMax = idxsSorted.last
+ val indexes = toks.map(_.index)
- lazy val sorted = idxsSorted.zipWithIndex.toMap
+ lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] = {
+ val idxsSeq = toks.flatMap(tokIdxs)
+ val idxsSorted = idxsSeq.sorted
+ val idxs = idxsSeq.toSet
+ val idxMin = idxsSorted.head
+ val idxMax = idxsSorted.last
+
+ lazy val sorted = idxsSorted.zipWithIndex.toMap
- lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
complexes.par.
flatMap(complexSeq ⇒ {
val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
@@ -412,54 +414,41 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
None
}).
map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+ }
lazy val tokStems = toks.map(_.stem).mkString(" ")
// Attempt to match each element.
for (elm ← mdl.elements.values) {
val elemId = elm.getId
- val sparseEnabled = !cacheSparse(elemId).exists(_.contains(toks))
- val notSparseEnabled = !cacheNotSparse(elemId).exists(_.contains(toks))
- var foundSparse = false
- var foundNotSparse = false
+ val sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes))
+ val notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes))
+ var found = false
def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
addMatch(elm, res, syn, parts)
- cacheSparse(elemId) += toks
- foundSparse = true
+ cacheSparse(elemId) += indexes
+ found = true
}
def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
addMatch(elm, toks, syn, parts)
- cacheNotSparse(elemId) += toks
- foundNotSparse = true
+ cacheNotSparse(elemId) += indexes
+ found = true
}
-
- // 1. Simple, sparse.
- if (firstPhase && sparseEnabled)
- for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !foundSparse)
- syn.trySparseMatch(toks) match {
- case Some(res) ⇒ addSparse(res, syn, Seq.empty)
- case None ⇒ // No-op.
- }
-
- // 2. Simple, not sparse.
- // Optimization - plain synonyms can be used only on first iteration
- if (firstPhase && notSparseEnabled)
+ // 1. Simple, not sparse.
+ if (firstPhase && notSparseEnabled && !found)
fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match {
case Some(h) ⇒
def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
synsMap.get(tokStems) match {
case Some(syn) ⇒ addNotSparse(syn, Seq.empty)
- // TODO:
- //if (!found)
- // notFound()
case None ⇒ notFound()
}
def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
- for (syn ← synsSeq if !foundNotSparse)
+ for (syn ← synsSeq if !found)
if (syn.isMatch(toks))
addNotSparse(syn, Seq.empty)
@@ -468,7 +457,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
() ⇒ {
tryScan(h.notTxtDirectSynonyms)
- if (!foundNotSparse)
+ if (!found)
tryMap(
h.txtNotDirectSynonyms,
() ⇒ tryScan(h.notTxtNotDirectSynonyms)
@@ -478,30 +467,38 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
case None ⇒ // No-op.
}
- // 3. DSL, sparse.
- if (sparseEnabled)
- for (
- (_, seq) ← dslCombs;
- syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty);
- comb ← seq if !foundSparse
- ) {
- syn.trySparseMatch(comb.map(_.data), req) match {
- case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn))
- case None ⇒ // No-op.
- }
- }
-
- // 4. DSL, non sparse.
- if (notSparseEnabled) {
+ // 2. DSL, non sparse.
+ if (notSparseEnabled && mdl.nonSparseSynonymsDsl.nonEmpty && !found) {
for (
(len, seq) ← dslCombs;
syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty);
- comb ← seq if !foundNotSparse
+ comb ← seq if !found
) {
if (syn.isMatch(comb.map(_.data), req))
addNotSparse(syn, getPartsComplex(comb, syn))
}
}
+
+ // 3. Simple, sparse.
+ if (firstPhase && sparseEnabled && !found)
+ for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !found)
+ syn.trySparseMatch(toks) match {
+ case Some(res) ⇒ addSparse(res, syn, Seq.empty)
+ case None ⇒ // No-op.
+ }
+
+ // 4. DSL, sparse.
+ if (sparseEnabled && mdl.sparseSynonymsDsl.nonEmpty && !found)
+ for (
+ syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty);
+ (_, seq) ← dslCombs;
+ comb ← seq if !found
+ ) {
+ syn.trySparseMatch(comb.map(_.data), req) match {
+ case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn))
+ case None ⇒ // No-op.
+ }
+ }
}
}
}
@@ -529,6 +526,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
val matchCnt = matchesNorm.size
+
// TODO:matchesNorm
// Add notes for all remaining (non-intersecting) matches.
for ((m, idx) ← matches.zipWithIndex) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index ad66b8f..a938f59 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -24,9 +24,9 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.common.{NCE, NCService, U}
import org.apache.nlpcraft.model.NCModel
-import java.io.{Serializable ⇒ JSerializable}
+import java.io.{Serializable => JSerializable}
import java.util
-import java.util.{List ⇒ JList}
+import java.util.{List => JList}
import scala.collection.JavaConverters.{asScalaBufferConverter, _}
import scala.collection.{Map, Seq, mutable}
import scala.language.implicitConversions
@@ -37,6 +37,8 @@ import scala.language.implicitConversions
object NCSentenceManager extends NCService {
@volatile private var pool: java.util.concurrent.ForkJoinPool = _
+ private val cache = U.mkLRUMap[Seq[Set[NCNlpSentenceNote]], util.List[util.List[NCNlpSentenceNote]]]("sentence-combinations-cache", 500)
+
case class PartKey(id: String, start: Int, end: Int) {
require(start <= end)
@@ -197,7 +199,7 @@ object NCSentenceManager extends NCService {
* @param noteField
* @param ns
*/
- private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = {
+ private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
ns.flatMap(_.getNotes(note)).foreach(rel ⇒
rel.dataOpt[JList[JList[Int]]](idxsField) match {
case Some(idxsList) ⇒
@@ -211,7 +213,6 @@ object NCSentenceManager extends NCService {
case None ⇒ // No-op.
}
)
- }
/**
* Copies token.
@@ -679,14 +680,23 @@ object NCSentenceManager extends NCService {
var sens =
if (delCombs.nonEmpty) {
- val toksByIdx =
+ val toksByIdx: Seq[Set[NCNlpSentenceNote]] =
delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
groupBy { case (idx, _) ⇒ idx }.
map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
toSeq.sortBy(-_.size)
+
+ var combs: JList[JList[NCNlpSentenceNote]] = cache.get(toksByIdx)
+
+ if (combs == null) {
+ combs = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool)
+
+ cache.put(toksByIdx, combs)
+ }
+
val seqSens =
- NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala).
+ combs.asScala.map(_.asScala).
par.
flatMap(delComb ⇒ {
val nsClone = sen.clone()