You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2020/09/05 16:55:14 UTC
[incubator-nlpcraft] branch NLPCRAFT-41 updated: Update
NCSuggestionInspection.scala
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-41 by this push:
new f4484ac Update NCSuggestionInspection.scala
f4484ac is described below
commit f4484ac2e96c4300a67f0dc6e14d4390f76db2d5
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Sat Sep 5 09:54:59 2020 -0700
Update NCSuggestionInspection.scala
---
.../inspection/impl/NCSuggestionInspection.scala | 169 +++++++++------------
1 file changed, 70 insertions(+), 99 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/inspection/impl/NCSuggestionInspection.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/inspection/impl/NCSuggestionInspection.scala
index 11f6f15..6cddf03 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/inspection/impl/NCSuggestionInspection.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/inspection/impl/NCSuggestionInspection.scala
@@ -41,7 +41,7 @@ import org.apache.nlpcraft.server.probe.NCProbeManager
import scala.collection.JavaConverters._
import scala.collection.{Seq, mutable}
-import scala.concurrent.{ExecutionContextExecutor, Future, Promise}
+import scala.concurrent.{Future, Promise}
/**
* Synonym suggestion inspection.
@@ -139,11 +139,11 @@ object NCSuggestionInspection extends NCInspectionService {
m.containsKey("macros") && m.containsKey("elementsSynonyms") && m.containsKey("intentsSamples")
)
- val macros = m.get("macros").
+ val mdlMacros = m.get("macros").
asInstanceOf[util.Map[String, String]].asScala
- val synonyms = m.get("synonyms").
+ val mdlSyns = m.get("synonyms").
asInstanceOf[util.Map[String, util.List[String]]].asScala.map(p ⇒ p._1 → p._2.asScala)
- val samples = m.get("samples").
+ val mdlExs = m.get("samples").
asInstanceOf[util.Map[String, util.List[String]]].asScala.map(p ⇒ p._1 → p._2.asScala)
val minScore =
@@ -186,42 +186,41 @@ object NCSuggestionInspection extends NCInspectionService {
)
)
- if (samples.isEmpty)
+ if (mdlExs.isEmpty)
onError(s"Missed intents samples for: '$mdlId'")
else {
val url = s"${Config.urlOpt.getOrElse(throw new NCE("Context word server is not configured."))}/suggestions"
- val allSamplesCnt = samples.map { case (_, samples) ⇒ samples.size }.sum
+ val allSamplesCnt = mdlExs.map { case (_, samples) ⇒ samples.size }.sum
val warns = mutable.ArrayBuffer.empty[String]
if (allSamplesCnt < MIN_CNT_MODEL)
warns +=
- s"Model: '$mdlId' has too small intents samples count: $allSamplesCnt. " +
- s"Potentially is can be not enough for suggestions service high quality work. " +
- s"Try to increase their count at least to $MIN_CNT_MODEL."
+ s"Model '$mdlId' has too few intents samples: $allSamplesCnt. " +
+ s"It will negatively affect the quality of suggestions. " +
+ s"Try to increase overall sample count to at least $MIN_CNT_MODEL."
else {
val ids =
- samples.
+ mdlExs.
filter { case (_, samples) ⇒ samples.size < MIN_CNT_INTENT }.
map { case (intentId, _) ⇒ intentId }
if (ids.nonEmpty)
warns +=
- s"Models '$mdlId' has intents: [${ids.mkString(", ")}] with too small intents samples count." +
- s"Potentially it can be not enough for suggestions service high quality work. " +
- s"Try to increase their count at least to $MIN_CNT_INTENT."
+ s"Following model intent have too few samples: ${ids.mkString(", ")}. " +
+ s"It will negatively affect the quality of suggestions. " +
+ s"Try to increase overall sample count to at least $MIN_CNT_INTENT."
}
val parser = new NCMacroParser()
- macros.foreach { case (name, str) ⇒ parser.addMacro(name, str) }
+ mdlMacros.foreach { case (name, str) ⇒ parser.addMacro(name, str) }
- // Note that we don't use system tokenizer, because ContextWordServer doesn't have this tokenizer.
- // We just split examples words with spaces. Also we divide SEPARATORS as separated words.
- val examples =
- samples.
+ // Note that we don't use the system tokenizer because the 'ctxword' module doesn't have this tokenizer.
+ // We split example words by spaces. We also treat separators as separate words.
+ val exs = mdlExs.
flatMap { case (_, samples) ⇒ samples }.
map(ex ⇒ SEPARATORS.foldLeft(ex)((s, ch) ⇒ s.replaceAll(s"\\$ch", s" $ch "))).
map(ex ⇒ {
@@ -232,7 +231,7 @@ object NCSuggestionInspection extends NCInspectionService {
toMap
val elemSyns =
- synonyms.map { case (elemId, syns) ⇒ elemId → syns.flatMap(parser.expand) }.
+ mdlSyns.map { case (elemId, syns) ⇒ elemId → syns.flatMap(parser.expand) }.
map { case (id, seq) ⇒ id → seq.map(txt ⇒ split(txt).map(p ⇒ Word(p, toStemWord(p)))) }
val allReqs =
@@ -243,29 +242,29 @@ object NCSuggestionInspection extends NCInspectionService {
val synsWords = normSyns.map(_.map(_.word))
val reqs =
- examples.flatMap { case (exampleWords, exampleStems) ⇒
- val exampleIdxs = synsStems.flatMap(synStems ⇒ getAllSlices(exampleStems, synStems))
+ exs.flatMap { case (exWords, exampleStems) ⇒
+ val exIdxs = synsStems.flatMap(synStems ⇒ getAllSlices(exampleStems, synStems))
def mkRequestData(idx: Int, synStems: Seq[String], synStemsIdx: Int): RequestData = {
val fromIncl = idx
val toExcl = idx + synStems.length
RequestData(
- sentence = exampleWords.zipWithIndex.flatMap {
- case (exampleWord, i) ⇒
+ sentence = exWords.zipWithIndex.flatMap {
+ case (exWord, i) ⇒
i match {
case x if x == fromIncl ⇒ synsWords(synStemsIdx)
case x if x > fromIncl && x < toExcl ⇒ Seq.empty
- case _ ⇒ Seq(exampleWord)
+ case _ ⇒ Seq(exWord)
}
}.mkString(" "),
- ex = exampleWords.mkString(" "),
+ ex = exWords.mkString(" "),
elmId = elemId,
index = idx
)
}
- (for (idx ← exampleIdxs; (synStems, i) ← synsStems.zipWithIndex)
+ (for (idx ← exIdxs; (synStems, i) ← synsStems.zipWithIndex)
yield mkRequestData(idx, synStems, i)).distinct
}
@@ -273,7 +272,7 @@ object NCSuggestionInspection extends NCInspectionService {
}.filter(_._2.nonEmpty)
val noExElems =
- synonyms.
+ mdlSyns.
filter { case (elemId, syns) ⇒ syns.nonEmpty && !allReqs.contains(elemId) }.
map { case (elemId, _) ⇒ elemId }
@@ -285,21 +284,21 @@ object NCSuggestionInspection extends NCInspectionService {
val allReqsCnt = allReqs.map(_._2.size).sum
val allSynsCnt = elemSyns.map(_._2.size).sum
- logger.info(s"Data prepared. Request is going to execute on ContextWord Server " +
- s"[examples=${examples.size}, " +
- s"synonyms=$allSynsCnt, " +
- s"requests=$allReqsCnt]"
- )
+ logger.trace(s"Request is going to execute on 'ctxword' server " +
+ s"[exs=${exs.size}, " +
+ s"syns=$allSynsCnt, " +
+ s"reqs=$allReqsCnt" +
+ s"]")
if (allReqsCnt == 0)
- onError(s"Suggestions cannot be prepared: '$mdlId'. Samples don't contain synonyms")
+ onError(s"Suggestions cannot be generated for model: '$mdlId'")
else {
- val allSuggs = new ConcurrentHashMap[String, util.List[Suggestion]]()
+ val allSgsts = new ConcurrentHashMap[String, util.List[Suggestion]]()
val cdl = new CountDownLatch(1)
val debugs = mutable.HashMap.empty[RequestData, Seq[Suggestion]]
val cnt = new AtomicInteger(0)
- val client = HttpClients.createDefault
+ val cli = HttpClients.createDefault
val err = new AtomicReference[Throwable]()
for ((elemId, reqs) ← allReqs; batch ← reqs.sliding(BATCH_SIZE, BATCH_SIZE).map(_.toSeq)) {
@@ -308,15 +307,13 @@ object NCSuggestionInspection extends NCInspectionService {
val post = new HttpPost(url)
post.setHeader("Content-Type", "application/json")
-
post.setEntity(
new StringEntity(
GSON.toJson(
RestRequest(
sentences = batch.map(p ⇒ RestRequestSentence(p.sentence, Seq(p.index).asJava)).asJava,
- // ContextWord server range is (0, 2), input range is (0, 1)
+ // 'ctxword' server range is (0, 2), input range is (0, 1)
minScore = minScore * 2,
- // We set big limit value and in fact only minimal score is taken into account.
limit = MAX_LIMIT
)
),
@@ -324,11 +321,10 @@ object NCSuggestionInspection extends NCInspectionService {
)
)
- val resps: Seq[Seq[Suggestion]] =
- try
- client.execute(post, HANDLER)
- finally
- post.releaseConnection()
+ val resps: Seq[Seq[Suggestion]] = try
+ cli.execute(post, HANDLER)
+ finally
+ post.releaseConnection()
require(batch.size == resps.size, s"Batch: ${batch.size}, responses: ${resps.size}")
@@ -338,7 +334,7 @@ object NCSuggestionInspection extends NCInspectionService {
logger.debug(s"Executed: $i requests...")
- allSuggs.
+ allSgsts.
computeIfAbsent(elemId, (_: String) ⇒ new CopyOnWriteArrayList[Suggestion]()).
addAll(resps.flatten.asJava)
@@ -361,66 +357,41 @@ object NCSuggestionInspection extends NCInspectionService {
val allSynsStems = elemSyns.flatMap(_._2).flatten.map(_.stem).toSet
- val nonEmptySuggs = allSuggs.asScala.map(p ⇒ p._1 → p._2.asScala).filter(_._2.nonEmpty)
+ val nonEmptySgsts = allSgsts.asScala.map(p ⇒ p._1 → p._2.asScala).filter(_._2.nonEmpty)
val res = mutable.HashMap.empty[String, mutable.ArrayBuffer[SuggestionResult]]
- nonEmptySuggs.
- foreach { case (elemId, elemSuggs) ⇒
- elemSuggs.
- map(sugg ⇒ (sugg, toStem(sugg.word))).
- groupBy { case (_, stem) ⇒ stem }.
- // Drops already defined.
- filter { case (stem, _) ⇒ !allSynsStems.contains(stem) }.
- map { case (_, group) ⇒
- val seq = group.map { case (sugg, _) ⇒ sugg }.sortBy(-_.score)
-
- // Drops repeated.
- (seq.head, seq.length)
- }.
- toSeq.
- map { case (sugg, cnt) ⇒ (sugg, cnt, sugg.score * cnt / elemSuggs.size) }.
- sortBy { case (_, _, sumFactor) ⇒ -sumFactor }.
- zipWithIndex.
- foreach { case ((sugg, cnt, _), _) ⇒
- val seq =
- res.get(elemId) match {
- case Some(seq) ⇒ seq
- case None ⇒
- val buf = mutable.ArrayBuffer.empty[SuggestionResult]
-
- res += elemId → buf
-
- buf
- }
-
- seq += SuggestionResult(sugg.word, sugg.score, cnt)
- }
- }
-
- logger.whenDebugEnabled({
- logger.debug("Request information:")
-
- var i = 1
-
- debugs.groupBy(_._1.ex).foreach { case (_, m) ⇒
- m.toSeq.sortBy(_._1.sentence).foreach { case (req, suggs) ⇒
- val s =
- split(req.sentence).
- zipWithIndex.map { case (w, i) ⇒ if (i == req.index) s"<<<$w>>>" else w }.
- mkString(" ")
-
- logger.debug(
- s"$i. " +
- s"Request=$s, " +
- s"suggestions=[${suggs.map(_.word).mkString(", ")}], " +
- s"element=${req.elmId}"
- )
+ nonEmptySgsts.foreach { case (elemId, elemSgsts) ⇒
+ elemSgsts.
+ map(sgst ⇒ (sgst, toStem(sgst.word))).
+ groupBy { case (_, stem) ⇒ stem }.
+ // Drops already defined.
+ filter { case (stem, _) ⇒ !allSynsStems.contains(stem) }.
+ map { case (_, group) ⇒
+ val seq = group.map { case (sgst, _) ⇒ sgst }.sortBy(-_.score)
+
+ // Drops repeated.
+ (seq.head, seq.length)
+ }.
+ toSeq.
+ map { case (sgst, cnt) ⇒ (sgst, cnt, sgst.score * cnt / elemSgsts.size) }.
+ sortBy { case (_, _, sumFactor) ⇒ -sumFactor }.
+ zipWithIndex.
+ foreach { case ((sgst, cnt, _), _) ⇒
+ val seq =
+ res.get(elemId) match {
+ case Some(seq) ⇒ seq
+ case None ⇒
+ val buf = mutable.ArrayBuffer.empty[SuggestionResult]
+
+ res += elemId → buf
+
+ buf
+ }
- i = i + 1
+ seq += SuggestionResult(sgst.word, sgst.score, cnt)
}
- }
- })
+ }
val resJ: util.Map[String, util.List[util.HashMap[String, Any]]] =
res.map { case (id, data) ⇒