You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/06/16 20:04:21 UTC
[incubator-nlpcraft] branch NLPCRAFT-70 updated: prepare.sh fixed
(generator source files excluded)
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70 by this push:
new db430c5 prepare.sh fixed (generator source files excluded)
db430c5 is described below
commit db430c5903afceccb634e065f32464d31d33ab6a
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Tue Jun 16 23:04:14 2020 +0300
prepare.sh fixed (generator source files excluded)
---
.../server/ctxword/NCContextWordManager.scala | 45 ++++++++++++++++------
.../enrichers/ctxword/NCContextWordEnricher.scala | 33 +++++++++-------
2 files changed, 53 insertions(+), 25 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/ctxword/NCContextWordManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/ctxword/NCContextWordManager.scala
index 3f43792..c58e49a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/ctxword/NCContextWordManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/ctxword/NCContextWordManager.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.server.ctxword
import java.net.ConnectException
import java.util.{List => JList}
+import com.google.common.util.concurrent.AtomicDouble
import com.google.gson.reflect.TypeToken
import com.google.gson.{Gson, GsonBuilder}
import io.opencensus.trace.Span
@@ -54,9 +55,9 @@ object NCContextWordManager extends NCService with NCOpenCensusServerStats with
private final val TYPE_RESP = new TypeToken[JList[JList[Suggestion]]]() {}.getType
private final val CLIENT = HttpClients.createDefault
- private final val CONF_LIMIT = 10
- private final val CONF_SUM_LIMIT = 5
- private final val CONF_MIN_SCORE = 0.8
+ private final val CONF_LIMIT = 1000
+ private final val CONF_MIN_SCORE = 1
+ private final val CONF_TOP_FACTOR = 0.2
@volatile private var url: Option[String] = _
@volatile private var parser: NCNlpParser = _
@@ -216,6 +217,26 @@ object NCContextWordManager extends NCService with NCOpenCensusServerStats with
words.zipWithIndex.map { case (w, i) ⇒ subst.getOrElse(i, w) }
}
+ private def getTop[T](seq: Seq[T], getWeight: T ⇒ Double, contrFactor: Double): Seq[T] = {
+ require(seq.nonEmpty)
+ require(contrFactor > 0 && contrFactor < 1)
+
+ val seqW = seq.map(p ⇒ p → getWeight(p)).sortBy(-_._2)
+ val sorted = seqW.map(_._1)
+
+ val limitSum = seqW.map(_._2).sum * contrFactor
+ val limitCnt = (1 / contrFactor).toInt
+
+ val v = new AtomicDouble(0)
+
+ val top = for (t ← sorted if v.getAndAdd(getWeight(t)) < limitSum) yield t
+
+ println("top="+top)
+ println("limitCnt="+limitCnt)
+
+ if (top.size < limitCnt) sorted.take(limitCnt) else top
+ }
+
@throws[NCE]
def makeConfig(
mdlId: String,
@@ -281,13 +302,15 @@ object NCContextWordManager extends NCService with NCOpenCensusServerStats with
throw new NCE(s"Context words cannot be prepared for element: '$elemId'")
elemId →
- suggs.
- groupBy(_.stem).
- map { case (_, group) ⇒ Group(group.minBy(-_.totalScore), group.size) }.
- toSeq.
- map(group ⇒ GroupFactor(group, group.word.totalScore * group.count / suggs.size)).
- sortBy(-_.factor).
- take(CONF_SUM_LIMIT)
+ getTop(
+ seq = suggs.
+ groupBy(_.stem).
+ map { case (_, group) ⇒ Group(group.minBy(-_.totalScore), group.size) }.
+ toSeq.
+ map(group ⇒ GroupFactor(group, group.word.totalScore * group.count / suggs.size)),
+ getWeight = (g: GroupFactor) ⇒ g.factor,
+ contrFactor = CONF_TOP_FACTOR
+ )
}
logger.whenInfoEnabled({
@@ -298,7 +321,7 @@ object NCContextWordManager extends NCService with NCOpenCensusServerStats with
groups.foreach { case (elemId, elemGroups) ⇒
tbl += (s"Element ID: '$elemId'", "", "", "")
- def f(d: Double): String = "%1.3f" format d
+ def f(d: Double): String = "%1.10f" format d
elemGroups.
sortBy(-_.factor).
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index dce9744..42d63e4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -19,9 +19,9 @@ package org.apache.nlpcraft.server.nlp.enrichers.ctxword
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.NCService
-import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken => Token, NCNlpSentenceNote ⇒ Note}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken ⇒ Token, NCNlpSentenceNote ⇒ Note}
import org.apache.nlpcraft.server.ctxword.{NCContextWordManager, NCContextWordParameter, NCContextWordRequest, NCContextWordResponse}
-import org.apache.nlpcraft.server.mdo.{NCContextWordConfigMdo => Config}
+import org.apache.nlpcraft.server.mdo.{NCContextWordConfigMdo ⇒ Config}
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
import scala.collection.Map
@@ -31,7 +31,8 @@ object NCContextWordEnricher extends NCServerEnricher {
private final val MIN_SENTENCE_FTEXT = 0.5
private final val MIN_EXAMPLE_SCORE = 1
- private final val MIN_EXAMPLE_FTEXT = 0.5
+ private final val MIN_EXAMPLE_ALL_FTEXT = 0.3
+ private final val MIN_EXAMPLE_BEST_FTEXT = 0.5
private final val LIMIT = 20
@@ -104,7 +105,7 @@ object NCContextWordEnricher extends NCServerEnricher {
case class V(elementId: String, example: String, token: Token)
case class VExt(value: V, requests: Seq[NCContextWordRequest])
- val allReqs =
+ val allReqs: Seq[VExt] =
examples.flatMap { case (elemId, exMap) ⇒
def make(exampleWords: Seq[String], idxs: Seq[Int], tok: Token): VExt = {
val words = substitute(exampleWords, idxs.map(_ → tok.normText).toMap)
@@ -118,14 +119,13 @@ object NCContextWordEnricher extends NCServerEnricher {
val allSuggs =
NCContextWordManager.suggest(
allReqs.flatMap(_.requests),
- NCContextWordParameter(limit = LIMIT, totalScore = MIN_EXAMPLE_SCORE, ftextScore = MIN_EXAMPLE_FTEXT)
+ NCContextWordParameter(limit = LIMIT, totalScore = MIN_EXAMPLE_SCORE, ftextScore = MIN_EXAMPLE_ALL_FTEXT)
)
- val groupReqs = allReqs.flatMap(p ⇒ p.requests.indices.map(_ ⇒ p.value))
+ require(allSuggs.size == allReqs.map(_.requests.size).sum)
- require(groupReqs.size == allSuggs.size)
-
- groupReqs.
+ allReqs.
+ flatMap(p ⇒ p.requests.indices.map(_ ⇒ p.value)).
zip(allSuggs).
groupBy { case (v, _) ⇒ (v.elementId, v.token) }.
flatMap { case ((elemId, tok), seq) ⇒
@@ -137,8 +137,14 @@ object NCContextWordEnricher extends NCServerEnricher {
find(p ⇒ cfg.contextWords(elemId).contains(p.stem))
}
- if (suggs.size == cfg.examples(elemId).size)
- Some(tok → makeHolder(elemId, tok, suggs.toSeq.minBy(p ⇒ (-p.ftextScore, -p.totalScore))))
+ if (suggs.size == cfg.examples(elemId).size) {
+ val best = suggs.toSeq.minBy(p ⇒ (-p.ftextScore, -p.totalScore))
+
+ if (best.ftextScore >= MIN_EXAMPLE_BEST_FTEXT)
+ Some(tok → makeHolder(elemId, tok, best))
+ else
+ None
+ }
else
None
}
@@ -158,9 +164,8 @@ object NCContextWordEnricher extends NCServerEnricher {
require(words.size >= subst.size)
require(subst.keys.forall(i ⇒ i >= 0 && i < words.length))
- words.zipWithIndex.map {
- case (w, i) ⇒ subst.getOrElse(i, w)
- }
+ words.zipWithIndex.map { case (w, i) ⇒ subst.getOrElse(i, w) }
}
override def enrich(ns: NCNlpSentence, parent: Span): Unit =