You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/06/17 18:19:35 UTC
[incubator-nlpcraft] 01/03: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 022469eb9dc06dbd14a6166aefcf7ce091c06e7e
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Jun 16 20:39:47 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 4 ++
.../probe/mgrs/conn/NCConnectionManager.scala | 29 +++++++++++-
.../nlpcraft/server/mdo/NCProbeModelMdo.scala | 9 +++-
.../nlp/enrichers/NCServerEnrichmentManager.scala | 17 +++++---
.../enrichers/ctxword/ContextWordEnricher.scala | 51 ++++++++++++++++++++++
.../nlpcraft/server/probe/NCProbeManager.scala | 25 ++++++++---
.../nlpcraft/server/query/NCQueryManager.scala | 4 +-
7 files changed, 124 insertions(+), 15 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..eef05de 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -18,6 +18,7 @@
package org.apache.nlpcraft.common.nlp
import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
import java.io.{Serializable => JSerializable}
import java.util.{Collections, List => JList}
@@ -40,6 +41,7 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence._
* @param srvReqId Server request ID.
* @param text Normalized text.
* @param enabledBuiltInToks Enabled built-in tokens.
+ * @param mlConfig Machine learning configuration. Optional.
* @param tokens Initial buffer.
* @param firstProbePhase Processing phase flag.
* @param deletedNotes Deleted overridden notes with their tokens.
@@ -50,6 +52,7 @@ class NCNlpSentence(
val srvReqId: String,
val text: String,
val enabledBuiltInToks: Set[String],
+ val mlConfig: Option[NCModelMLConfigMdo],
override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
var firstProbePhase: Boolean = true,
private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
@@ -67,6 +70,7 @@ class NCNlpSentence(
srvReqId = srvReqId,
text = text,
enabledBuiltInToks = enabledBuiltInToks,
+ mlConfig = mlConfig,
tokens = tokens.map(_.clone()),
deletedNotes = deletedNotes.map(p => p._1.clone() -> p._2.map(_.clone())),
initNlpNotes = initNlpNotes,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
index 159ffd2..c911342 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
@@ -32,8 +32,9 @@ import java.io.{EOFException, IOException, InterruptedIOException}
import java.net.{InetAddress, NetworkInterface}
import java.util
import java.util.concurrent.CountDownLatch
-import java.util.{Properties, TimeZone}
+import java.util.{Collections, Properties, TimeZone}
import scala.collection.mutable
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsScala}
/**
* Probe down/up link connection manager.
@@ -213,6 +214,28 @@ object NCConnectionManager extends NCService {
NCModelManager.getAllModels().map(wrapper => {
val mdl = wrapper.model
+ val ctxWordElems = mdl.getElements.asScala.filter(_.isContextWordSupport)
+ // Collect element values (element ID -> value name -> synonyms) and samples only if some element supports context words.
+ // TODO: validate values and samples (too many of them, or missing ones).
+ val (
+ values,
+ samples
+ ): (
+ java.util.Map[String, java.util.Map[String, java.util.List[String]]],
+ java.util.Map[String, java.util.List[String]]
+ ) =
+ if (ctxWordElems.isEmpty)
+ (Collections.emptyMap(), Collections.emptyMap())
+ else { // NOTE(review): `.asJava` yields wrappers over Scala collections; confirm they serialize (cf. util.HashSet note below).
+ (
+ ctxWordElems.map(e =>
+ e.getId ->
+ e.getValues.asScala.map(p => p.getName -> p.getSynonyms).toMap.asJava
+ ).toMap.asJava,
+ wrapper.samples.map(p => p._1 -> p._2.flatMap(p => p).asJava).toMap.asJava
+ )
+ }
+
// Model already validated.
// util.HashSet created to avoid scala collections serialization error.
@@ -221,7 +244,9 @@ object NCConnectionManager extends NCService {
mdl.getId,
mdl.getName,
mdl.getVersion,
- new util.HashSet[String](mdl.getEnabledBuiltInTokens)
+ new util.HashSet[String](mdl.getEnabledBuiltInTokens),
+ values,
+ samples
)
})
), cryptoKey)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
index 16edd61..ad80245 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
@@ -19,6 +19,12 @@ package org.apache.nlpcraft.server.mdo
import org.apache.nlpcraft.server.mdo.impl._
+/** Model machine learning configuration MDO: element values with their synonyms, plus samples. */
+@NCMdoEntity(sql = false)
+case class NCModelMLConfigMdo(
+ @NCMdoField values: Map[String /*Element ID*/, Map[/*Value*/String, /*Synonyms*/Seq[String]]],
+ @NCMdoField samples: Map[String /*Element ID - TODO confirm against probe-side keys*/, Seq[String]/*Samples*/]
+)
/**
* Probe model MDO.
*/
@@ -27,7 +33,8 @@ case class NCProbeModelMdo(
@NCMdoField id: String,
@NCMdoField name: String,
@NCMdoField version: String,
- @NCMdoField enabledBuiltInTokens: Set[String]
+ @NCMdoField enabledBuiltInTokens: Set[String],
+ @NCMdoField mlConfig: Option[NCModelMLConfigMdo]
) extends NCAnnotatedMdo[NCProbeModelMdo] {
override def hashCode(): Int = s"$id$name".hashCode()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 4f91bc2..e420676 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.pool.NCThreadPoolManager
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.server.ignite.NCIgniteHelpers._
import org.apache.nlpcraft.server.ignite.NCIgniteInstance
+import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
import org.apache.nlpcraft.server.nlp.core.{NCNlpNerEnricher, NCNlpServerManager}
import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
import org.apache.nlpcraft.server.nlp.enrichers.coordinate.NCCoordinatesEnricher
@@ -90,6 +91,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
* @param srvReqId Server request ID.
* @param normTxt Normalized text.
* @param enabledBuiltInToks Enabled built-in tokens.
+ * @param mlConf Machine learning configuration.
* @param parent Optional parent span.
* @return
*/
@@ -97,9 +99,11 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
srvReqId: String,
normTxt: String,
enabledBuiltInToks: Set[String],
- parent: Span = null): NCNlpSentence =
+ mlConf: Option[NCModelMLConfigMdo],
+ parent: Span = null
+ ): NCNlpSentence =
startScopedSpan("process", parent, "srvReqId" -> srvReqId, "txt" -> normTxt) { span =>
- val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks)
+ val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks, mlConf)
// Server-side enrichment pipeline.
// NOTE: order of enrichers is IMPORTANT.
@@ -134,6 +138,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
* @param srvReqId Server request ID.
* @param txt Input text.
* @param enabledBuiltInToks Set of enabled built-in token IDs.
+ * @param mlConf Machine learning configuration.
* @param parent Optional parent span.
*/
@throws[NCE]
@@ -141,7 +146,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
srvReqId: String,
txt: String,
enabledBuiltInToks: Set[String],
- parent: Span = null): NCNlpSentence = {
+ mlConf: Option[NCModelMLConfigMdo],
+ parent: Span = null
+ ): NCNlpSentence = {
startScopedSpan("enrichPipeline", parent, "srvReqId" -> srvReqId, "txt" -> txt) { span =>
val normTxt = NCPreProcessManager.normalize(txt, spellCheck = true, span)
@@ -159,9 +166,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
h.sentence
}
else
- process(srvReqId, normTxt, enabledBuiltInToks, span)
+ process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span)
case None =>
- process(srvReqId, normTxt, enabledBuiltInToks, span)
+ process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span)
}
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala
new file mode 100644
index 0000000..c2dd843
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.server.nlp.enrichers.ctxword
+
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.NCService
+import org.apache.nlpcraft.common.nlp.NCNlpSentence
+import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+
+/**
+ * ContextWord enricher (WIP): server enricher that so far only filters sentence tokens and has no visible effect.
+ */
+object ContextWordEnricher extends NCServerEnricher {
+ override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
+ ackStarting()
+ ackStarted()
+ }
+ // Lifecycle counterpart of `start`: acknowledges stopping and stopped within a "stop" span.
+ override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
+ ackStopping()
+ ackStopped()
+ }
+ // Acts only when the sentence carries an ML configuration; `parent` is currently unused.
+ override def enrich(ns: NCNlpSentence, parent: Span): Unit = {
+ ns.mlConfig match {
+ case Some(cfg) =>
+ val nouns = ns.tokens.filter(_.pos.startsWith("N"))
+ // NOTE(review): `cfg` and `nouns` are not used yet; the `if` below evaluates `nouns` and discards it.
+ if (nouns.nonEmpty) {
+ nouns
+ }
+
+ case None => // No-op.
+ }
+ }
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
index f572b9f..67acba8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
@@ -31,7 +31,7 @@ import org.apache.nlpcraft.common.version.NCVersion
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.probe.mgrs.NCProbeMessage
import org.apache.nlpcraft.server.company.NCCompanyManager
-import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo}
+import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCModelMLConfigMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo}
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnrichmentManager
import org.apache.nlpcraft.server.proclog.NCProcessLogManager
import org.apache.nlpcraft.server.query.NCQueryManager
@@ -45,7 +45,7 @@ import java.util.Collections
import java.util.concurrent.ConcurrentHashMap
import scala.collection.mutable
import scala.concurrent.{ExecutionContext, Future, Promise}
-import scala.jdk.CollectionConverters.SetHasAsScala
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsScala, SetHasAsScala}
import scala.util.{Failure, Success}
/**
@@ -613,25 +613,40 @@ object NCProbeManager extends NCService {
String,
String,
String,
- java.util.Set[String]
+ java.util.Set[String],
+ java.util.Map[String, java.util.Map[String, java.util.List[String]]],
+ java.util.Map[String, java.util.List[String]]
)]]("PROBE_MODELS").
map {
case (
mdlId,
mdlName,
mdlVer,
- enabledBuiltInToks
+ enabledBuiltInToks,
+ values,
+ samples
) =>
require(mdlId != null)
require(mdlName != null)
require(mdlVer != null)
require(enabledBuiltInToks != null)
+ require(values != null && samples != null && values.isEmpty == samples.isEmpty) // ML values and samples: both empty or both set.
NCProbeModelMdo(
id = mdlId,
name = mdlName,
version = mdlVer,
- enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet
+ enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet,
+ mlConfig =
+ if (!values.isEmpty)
+ Some(
+ NCModelMLConfigMdo(
+ values = values.asScala.map(p => p._1 -> p._2.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap).toMap,
+ samples = samples.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap
+ )
+ )
+ else
+ None
)
}.toSet
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
index f4d2afe..32492b9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
@@ -272,7 +272,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS
logger.info(s"New request received:\n$tbl")
- val enabledBuiltInToks = NCProbeManager.getModel(mdlId, span).enabledBuiltInTokens
+ val mdl = NCProbeManager.getModel(mdlId, span)
@throws[NCE]
def unzipProperties(gzipOpt: Option[String]): Option[JavaMeta] =
@@ -288,7 +288,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS
company,
mdlId,
txt0,
- NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, enabledBuiltInToks),
+ NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, mdl.enabledBuiltInTokens, mdl.mlConfig),
usrAgent,
rmtAddr,
data,