You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/06/02 14:28:15 UTC
[incubator-nlpcraft] branch NLPCRAFT-70 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70 by this push:
new 3f94a92 WIP.
3f94a92 is described below
commit 3f94a92d705e87f9b484f59d953a6bf199387d17
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Tue Jun 2 17:28:08 2020 +0300
WIP.
---
.../apache/nlpcraft/examples/cars/cars_model.yaml | 4 +-
.../probe/mgrs/conn/NCConnectionManager.scala | 17 ++-
.../nlpcraft/probe/mgrs/model/NCModelManager.scala | 43 +++++-
.../org/apache/nlpcraft/server/NCServer.scala | 6 +-
.../nlpcraft/server/mdo/NCModelMlConfigMdo.scala | 10 +-
.../apache/nlpcraft/server/ml/NCMlManager.scala | 151 ++++++++++++++++++++-
.../NCMlSuggestion.scala} | 14 +-
.../server/nlp/core/NCNlpServerManager.scala | 2 -
.../server/nlp/enrichers/ml/NCMlEnricher.scala | 51 ++++++-
.../nlpcraft/server/probe/NCProbeManager.scala | 7 +-
10 files changed, 266 insertions(+), 39 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/examples/cars/cars_model.yaml b/src/main/scala/org/apache/nlpcraft/examples/cars/cars_model.yaml
index 0ce66aa..8358eef 100644
--- a/src/main/scala/org/apache/nlpcraft/examples/cars/cars_model.yaml
+++ b/src/main/scala/org/apache/nlpcraft/examples/cars/cars_model.yaml
@@ -20,14 +20,14 @@ name: "Cars Example Model"
version: "1.0"
description: "Cars example model."
examples:
- - "I like drive my new BMW."
+ - "I like drive my new BMW ."
+ - "BMW has the best engine ."
enabledBuiltInTokens: [] # Don't use any built-in tokens.
elements:
- id: "cars:brand"
description: "Any car"
synonyms:
- "BMW"
- - "Mercedez"
mlSupport: true
intents:
- "intent=brand term(brand)={id == 'cars:brand'}"
\ No newline at end of file
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
index 860c572..c552a50 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
@@ -233,6 +233,7 @@ object NCConnectionManager extends NCService {
NCModelManager.getAllModels().map(m ⇒ {
val mdl = m.model
+
// util.HashSet created to avoid scala collections serialization error.
// Seems to be a Scala bug.
(
@@ -241,12 +242,16 @@ object NCConnectionManager extends NCService {
mdl.getVersion,
new util.HashSet[String](mdl.getEnabledBuiltInTokens),
new util.HashMap[String, util.Set[String]](
- mdl.getElements.asScala.filter(_.mlSupport()).
- map(p ⇒
- p.getId →
- new util.HashSet[String](
- p.getSynonyms.asScala.toSet.filter(!_.contains(" ")).asJava)
- ).toMap.asJava
+ mdl.getElements.asScala.filter(_.mlSupport()).map(e ⇒ {
+ // Gets the single-word text synonyms; their existence should already have been validated.
+ val syns = m.synonyms(e.getId)(1).filter(_.isTextOnly)
+
+ require(syns.nonEmpty)
+
+ val stems: util.Set[String] = new util.HashSet[String](syns.map(_.stems).asJava)
+
+ e.getId → stems
+ }).toMap.asJava
),
new util.HashSet[String](mdl.getExamples)
)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index f9ad86c..71eac66 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -324,7 +324,7 @@ object NCModelManager extends NCService with DecorateAsScala {
if (mdl.isPermutateSynonyms && !isElementId && chunks.forall(_.wordStem != null))
simplePermute(chunks).map(p ⇒ p.map(_.wordStem) → p).toMap.values.foreach(p ⇒ add(p, p == chunks))
else
- add(chunks, true)
+ add(chunks, isDirect = true)
}
/**
@@ -345,7 +345,7 @@ object NCModelManager extends NCService with DecorateAsScala {
// Add element ID as a synonyms (dups ignored).
val idChunks = Seq(chunkIdSplit(elmId))
- idChunks.distinct.foreach(ch ⇒ addSynonym(true, false, null, ch))
+ idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = true, isValueName = false, null, ch))
// Add straight element synonyms (dups printed as warnings).
val synsChunks = for (syn ← elm.getSynonyms.flatMap(parser.expand)) yield chunkSplit(syn)
@@ -358,7 +358,7 @@ object NCModelManager extends NCService with DecorateAsScala {
s"]"
)
- synsChunks.distinct.foreach(ch ⇒ addSynonym(false, false, null, ch))
+ synsChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, value = null, chunks = ch))
val vals =
(if (elm.getValues != null) elm.getValues.asScala else Seq.empty) ++
@@ -382,7 +382,7 @@ object NCModelManager extends NCService with DecorateAsScala {
val idChunks = Seq(chunkIdSplit(valId))
// Add value name as a synonyms (dups ignored)
- idChunks.distinct.foreach(ch ⇒ addSynonym(false, true, valId, ch))
+ idChunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = true, valId, ch))
// Add straight value synonyms (dups printed as warnings)
var skippedOneLikeName = false
@@ -409,7 +409,7 @@ object NCModelManager extends NCService with DecorateAsScala {
s"]"
)
- chunks.distinct.foreach(ch ⇒ addSynonym(false, false, valId, ch))
+ chunks.distinct.foreach(ch ⇒ addSynonym(isElementId = false, isValueName = false, valId, ch))
}
}
@@ -483,6 +483,8 @@ object NCModelManager extends NCService with DecorateAsScala {
logger.warn(s"Found duplicate synonyms - check trace logging for model: ${mdl.getId}")
logger.warn(s"Duplicates are allowed by '${mdl.getId}' model but large number may degrade the performance.")
}
+
+ checkMl(mdl, syns.toSet)
mdl.getMetadata.put(MDL_META_ALL_ALIASES_KEY, allAliases.toSet)
mdl.getMetadata.put(MDL_META_ALL_ELM_IDS_KEY,
@@ -625,6 +627,37 @@ object NCModelManager extends NCService with DecorateAsScala {
/**
*
* @param mdl Model.
+ * @param syns Synonyms.
+ */
+ @throws[NCE]
+ private def checkMl(mdl: NCModel, syns: Set[SynonymHolder]): Unit = {
+ val mlElements = mdl.getElements.asScala.filter(_.mlSupport())
+
+ if (mlElements.nonEmpty) {
+ val examples =
+ mdl.getExamples.asScala.map(s ⇒ NCNlpCoreManager.tokenize(s).map(t ⇒ NCNlpCoreManager.stemWord(t.token)))
+
+ println("examples="+examples)
+
+ mlElements.foreach(e ⇒ {
+ val elemSyns = syns.flatMap(p ⇒
+ if (p.elementId == e.getId && p.synonym.size == 1 && p.synonym.isTextOnly) Some(p.synonym) else None
+ )
+
+ println("elemSyns="+elemSyns)
+
+ if (elemSyns.isEmpty)
+ throw new NCE(s"Text single word synonyms not found for ML element '${e.getId}'")
+
+ if (!elemSyns.exists(s ⇒ examples.exists(_.contains(s.stems))))
+ throw new NCE(s"Examples not found for ML element '${e.getId}'")
+ })
+ }
+ }
+
+ /**
+ *
+ * @param mdl Model.
*/
@throws[NCE]
private def checkElementIdsDups(mdl: NCModel): Unit = {
diff --git a/src/main/scala/org/apache/nlpcraft/server/NCServer.scala b/src/main/scala/org/apache/nlpcraft/server/NCServer.scala
index 1067340..57cbaca 100644
--- a/src/main/scala/org/apache/nlpcraft/server/NCServer.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/NCServer.scala
@@ -125,10 +125,10 @@ object NCServer extends App with NCIgniteInstance with LazyLogging with NCOpenCe
() ⇒ NCProbeManager.start(span),
() ⇒ NCFeedbackManager.start(span)
)
-
+
+ NCMlManager.start(span)
NCQueryManager.start(span)
NCRestManager.start(span)
- NCMlManager.start(span)
// Lifecycle callback.
NCServerLifecycleManager.afterStart()
@@ -144,9 +144,9 @@ object NCServer extends App with NCIgniteInstance with LazyLogging with NCOpenCe
startScopedSpan("stopManagers") { span ⇒
Seq(
- NCMlManager,
NCRestManager,
NCQueryManager,
+ NCMlManager,
NCFeedbackManager,
NCCompanyManager,
NCUserManager,
diff --git a/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala b/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala
index fa5c7c5..8244734 100644
--- a/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala
@@ -19,11 +19,17 @@ package org.apache.nlpcraft.server.mdo
import org.apache.nlpcraft.server.mdo.impl._
+@NCMdoEntity(sql = false)
+case class NCElementSynonymMlMdo(
+ @NCMdoField word: String,
+ @NCMdoField score: Double
+)
+
/**
* Probe model ML config MDO.
*/
@NCMdoEntity(sql = false)
case class NCModelMlConfigMdo(
- @NCMdoField mlElements: Map[String, Set[String]],
- @NCMdoField examples: Set[String]
+ @NCMdoField mlElements: Map[String, Seq[NCElementSynonymMlMdo]],
+ @NCMdoField examples: Map[String, Map[Seq[String], Int]]
)
\ No newline at end of file
diff --git a/src/main/scala/org/apache/nlpcraft/server/ml/NCMlManager.scala b/src/main/scala/org/apache/nlpcraft/server/ml/NCMlManager.scala
index 37b7613..222d091 100644
--- a/src/main/scala/org/apache/nlpcraft/server/ml/NCMlManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/ml/NCMlManager.scala
@@ -17,26 +17,169 @@
package org.apache.nlpcraft.server.ml
+import java.util
+
+import com.google.gson.Gson
+import com.google.gson.reflect.TypeToken
import io.opencensus.trace.Span
+import org.apache.http.HttpResponse
+import org.apache.http.client.ResponseHandler
+import org.apache.http.client.methods.HttpPost
+import org.apache.http.entity.StringEntity
+import org.apache.http.impl.client.HttpClients
+import org.apache.http.util.EntityUtils
+import org.apache.nlpcraft.common.config.NCConfigurable
+import org.apache.nlpcraft.common.util.NCUtils
import org.apache.nlpcraft.common.{NCE, NCService}
-import org.apache.nlpcraft.server.mdo.NCModelMlConfigMdo
+import org.apache.nlpcraft.server.mdo.{NCElementSynonymMlMdo, NCModelMlConfigMdo}
+import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager, NCNlpWord}
import org.apache.nlpcraft.server.opencensus.NCOpenCensusServerStats
+import scala.collection.JavaConverters._
+
/**
* TODO:
*/
object NCMlManager extends NCService with NCOpenCensusServerStats {
+ private object Config extends NCConfigurable {
+ lazy val url: Option[String] = getStringOpt("nlpcraft.server.ml.url")
+ }
+
+ case class RestRequest(sentence: String, simple: Boolean, lower: Int, upper: Int, limit: Int = 10)
+ case class RestResponse(data: java.util.ArrayList[NCMlSuggestion])
+
+ private final val GSON = new Gson
+ private final val TYPE_RESP = new TypeToken[RestResponse]() {}.getType
+ private final val CLIENT = HttpClients.createDefault
+
+ @volatile private var url: Option[String] = _
+ @volatile private var parser: NCNlpParser = _
+
+ private case class Key(txt: String, idx: Int)
+
+ private final val CACHE: util.Map[Key, Seq[NCMlSuggestion]] = NCUtils.mkLRUMap[Key, Seq[NCMlSuggestion]]("ml-cache", 10000)
+
+ @throws[NCE]
+ private def mkHandler(req: String): ResponseHandler[Seq[NCMlSuggestion]] =
+ (resp: HttpResponse) ⇒ {
+ val code = resp.getStatusLine.getStatusCode
+ val e = resp.getEntity
+
+ val js = if (e != null) EntityUtils.toString(e) else null
+
+ if (js == null)
+ throw new NCE(s"Unexpected empty response [req=$req, code=$code]")
+
+ code match {
+ case 200 ⇒
+ val data: RestResponse = GSON.fromJson(js, TYPE_RESP)
+
+ data.data.asScala
+
+ case 400 ⇒ throw new NCE(js)
+ case _ ⇒ throw new NCE(s"Unexpected response [req=$req, code=$code, response=$js]")
+ }
+ }
+
+ @throws[NCE]
+ def ask(sen: String, idx: Int): Seq[NCMlSuggestion] = {
+ require(url.isDefined)
+
+ val key = Key(sen, idx)
+
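+ // TODO: the cache is cleared on every call, which effectively disables it; remove this once caching is wanted.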
+ CACHE.clear()
+
+ var res = CACHE.synchronized { CACHE.get(key) }
+
+ if (res != null)
+ res
+ else {
+ val post = new HttpPost(url.get + "/synonyms")
+
+ post.setHeader("Content-Type", "application/json")
+ post.setEntity(
+ new StringEntity(
+ GSON.toJson(
+ RestRequest(
+ sentence = sen,
+ simple = false,
+ lower = idx,
+ upper = idx
+ )
+ ),
+ "UTF-8"
+ )
+ )
+
+ res =
+ try
+ CLIENT.execute(post, mkHandler(sen))
+ finally
+ post.releaseConnection()
+
+ CACHE.synchronized { CACHE.put(key, res) }
+
+ res
+ }
+
+ }
+
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
+ parser = NCNlpServerManager.getParser
+ url = Config.url
+
super.start()
}
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
+ CACHE.clear()
+
super.stop()
}
@throws[NCE]
- def makeModelConfig(elems: Map[String, Set[String]], examples: Set[String]): NCModelMlConfigMdo = {
- // TODO:
- NCModelMlConfigMdo(elems, examples)
+ def makeModelConfig(mlElems: Map[String, Set[String]], examples: Set[String]): NCModelMlConfigMdo = {
+ val parsedExamples: Set[Seq[NCNlpWord]] = examples.map(parser.parse(_))
+
+ val examplesCfg = scala.collection.mutable.HashMap.empty[String, Map[Seq[String], Int]]
+
+ val mlElementsData =
+ mlElems.map { case (elemId, synsStems) ⇒
+ val elemExamples = parsedExamples.filter(_.exists(x ⇒ synsStems.contains(x.stem)))
+
+ if (elemExamples.isEmpty)
+ throw new NCE(s"Examples not found for element: $elemId")
+
+ case class Holder(synomym: NCElementSynonymMlMdo, words: Seq[String], index: Int)
+
+ val hs =
+ elemExamples.flatMap(elemExample ⇒ {
+ val words = elemExample.map(_.word)
+ val normTxt = elemExample.map(_.normalWord).mkString(" ")
+
+ elemExample.
+ filter(e ⇒ synsStems.contains(e.stem)).
+ flatMap(n ⇒ {
+ val i = elemExample.indexOf(n)
+ val suggs = ask(normTxt, i)
+
+ suggs.map(s ⇒ Holder(NCElementSynonymMlMdo(s.word, s.score),words, i))
+ })
+ }).
+ //filter(_.synomym.word.forall(_.isLower)). // TODO: nouns
+ toSeq.sortBy(-_.synomym.score).
+ take(5) // TODO: 5,
+
+ examplesCfg += elemId → hs.map(h ⇒ h.words → h.index).toMap
+
+ elemId → hs.map(_.synomym)
+ }
+
+ val cfg = NCModelMlConfigMdo(mlElementsData, examplesCfg.toMap)
+
+ logger.info(s"Config loaded: $cfg")
+
+ cfg
}
}
diff --git a/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala b/src/main/scala/org/apache/nlpcraft/server/ml/NCMlSuggestion.scala
similarity index 75%
copy from src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala
copy to src/main/scala/org/apache/nlpcraft/server/ml/NCMlSuggestion.scala
index fa5c7c5..46826fd 100644
--- a/src/main/scala/org/apache/nlpcraft/server/mdo/NCModelMlConfigMdo.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/ml/NCMlSuggestion.scala
@@ -15,15 +15,11 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.server.mdo
-
-import org.apache.nlpcraft.server.mdo.impl._
+package org.apache.nlpcraft.server.ml
/**
- * Probe model ML config MDO.
+ * Single word suggestion together with its score.
+ * @param word Suggested word.
+ * @param score Suggestion score.
*/
-@NCMdoEntity(sql = false)
-case class NCModelMlConfigMdo(
- @NCMdoField mlElements: Map[String, Set[String]],
- @NCMdoField examples: Set[String]
-)
\ No newline at end of file
+case class NCMlSuggestion(word: String, score: Double)
diff --git a/src/main/scala/org/apache/nlpcraft/server/nlp/core/NCNlpServerManager.scala b/src/main/scala/org/apache/nlpcraft/server/nlp/core/NCNlpServerManager.scala
index a8080cf..105f1d4 100644
--- a/src/main/scala/org/apache/nlpcraft/server/nlp/core/NCNlpServerManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/nlp/core/NCNlpServerManager.scala
@@ -47,8 +47,6 @@ object NCNlpServerManager extends NCService {
if (unsupported.nonEmpty)
abortWith(s"Configuration '$prop' contains unsupported providers: ${unsupported.mkString(",")}")
}
-
-
}
Config.check()
diff --git a/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ml/NCMlEnricher.scala b/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ml/NCMlEnricher.scala
index 35fda00..e420cfc 100644
--- a/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ml/NCMlEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ml/NCMlEnricher.scala
@@ -21,6 +21,7 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.config.NCConfigurable
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote}
+import org.apache.nlpcraft.server.ml.NCMlManager
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
object NCMlEnricher extends NCServerEnricher {
@@ -36,7 +37,7 @@ object NCMlEnricher extends NCServerEnricher {
if (url.last == '/')
url = url.dropRight(1)
- addTags(span, "mklUrl" → url)
+ addTags(span, "url" → url)
// Tries to access spaCy proxy server.
// TODO: add health check.
@@ -50,10 +51,52 @@ object NCMlEnricher extends NCServerEnricher {
super.stop()
}
+ private def substitute(words: Seq[String], idx: Int, repl: String): String =
+ words.zipWithIndex.map { case (w, i) ⇒ if (idx == i) repl else w }.mkString(" ")
+
override def enrich(ns: NCNlpSentence, parent: Span): Unit = {
- val elem = ns.mlCfg.get.mlElements.head._1
- val tok = ns(1)
+ ns.mlCfg match {
+ case Some(cfg) ⇒
+ val nn = ns.filter(_.pos.startsWith("N"))
+
+ if (nn.nonEmpty) {
+ val normTxt = ns.map(_.origText).mkString(" ")
+
+ nn.foreach(n ⇒ {
+ val idx = ns.indexOf(n)
+
+ val sugg = NCMlManager.ask(normTxt, idx).filter(_.score > 0.5) // TODO:
+
+ logger.info(s"Suggestions for main sentence [text=$normTxt, nn=${n.origText}, suggestions=${sugg.mkString(",")}]")
+
+ cfg.mlElements.find(e ⇒ e._2.exists(w ⇒ sugg.exists(s ⇒ s.word == w.word))) match {
+ case Some((elemId, _)) ⇒ Some((idx, elemId))
+ case None ⇒
+ cfg.examples.foreach { case (elemId, elemExamples) ⇒
+ println("elemId="+elemId)
+ println("elemExamples="+elemExamples)
+ val all =
+ elemExamples.forall { case (example, idx) ⇒
+ val subs = substitute(example, idx, n.origText)
+ val suggs = NCMlManager.ask(subs, idx).filter(_.score > 0.5) // TODO:
+ val el = cfg.mlElements(elemId)
+ val ok = suggs.exists(s ⇒ el.exists(e ⇒ e.word == s.word))
+
+ logger.info(s"Suggestions for examples [subs=$subs, i=$idx, nn=${n.origText}, suggestions=${suggs.mkString(",")}, ok=$ok]")
+
+ ok
+ }
+
+ if (all) {
+ val tok = ns(idx)
- tok.add(NCNlpSentenceNote(Seq(tok.index), elem))
+ tok.add(NCNlpSentenceNote(Seq(tok.index), elemId))
+ }
+ }
+ }
+ })
+ }
+ case None ⇒ // No-op.
+ }
}
}
diff --git a/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala b/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
index cf291c3..00de34f 100644
--- a/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
@@ -364,7 +364,9 @@ object NCProbeManager extends NCService {
case Failure(e: NCE) ⇒ logger.warn(e.getMessage, e)
case Failure(_: EOFException) ⇒ () // Just ignoring.
- case Failure(e: Throwable) ⇒ logger.warn(s"Ignoring socket error: ${e.getLocalizedMessage}")
+ case Failure(e: Throwable) ⇒
+ e.printStackTrace()
+ logger.warn(s"Ignoring socket error: ${e.getLocalizedMessage}")
}
}
}
@@ -673,7 +675,8 @@ object NCProbeManager extends NCService {
respond("S2P_PROBE_OK")
}
catch {
- case _: NCE ⇒
+ case e: NCE ⇒
+ logger.error("Errors during ML initialization for probe", e)
// TODO: reason ?
respond("S2P_PROBE_ML_ERROR")
}