You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/06/17 18:19:35 UTC

[incubator-nlpcraft] 01/03: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 022469eb9dc06dbd14a6166aefcf7ce091c06e7e
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Jun 16 20:39:47 2021 +0300

    WIP.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  4 ++
 .../probe/mgrs/conn/NCConnectionManager.scala      | 29 +++++++++++-
 .../nlpcraft/server/mdo/NCProbeModelMdo.scala      |  9 +++-
 .../nlp/enrichers/NCServerEnrichmentManager.scala  | 17 +++++---
 .../enrichers/ctxword/ContextWordEnricher.scala    | 51 ++++++++++++++++++++++
 .../nlpcraft/server/probe/NCProbeManager.scala     | 25 ++++++++---
 .../nlpcraft/server/query/NCQueryManager.scala     |  4 +-
 7 files changed, 124 insertions(+), 15 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..eef05de 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -18,6 +18,7 @@
 package org.apache.nlpcraft.common.nlp
 
 import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
 
 import java.io.{Serializable => JSerializable}
 import java.util.{Collections, List => JList}
@@ -40,6 +41,7 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence._
   * @param srvReqId Server request ID.
   * @param text Normalized text.
   * @param enabledBuiltInToks Enabled built-in tokens.
+  * @param mlConfig Machine learning configuration. Optional.
   * @param tokens Initial buffer.
   * @param firstProbePhase Processing phase flag.
   * @param deletedNotes Deleted overridden notes with their tokens.
@@ -50,6 +52,7 @@ class NCNlpSentence(
     val srvReqId: String,
     val text: String,
     val enabledBuiltInToks: Set[String],
+    val mlConfig: Option[NCModelMLConfigMdo],
     override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
     var firstProbePhase: Boolean = true,
     private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
@@ -67,6 +70,7 @@ class NCNlpSentence(
             srvReqId = srvReqId,
             text = text,
             enabledBuiltInToks = enabledBuiltInToks,
+            mlConfig = mlConfig,
             tokens = tokens.map(_.clone()),
             deletedNotes = deletedNotes.map(p => p._1.clone() -> p._2.map(_.clone())),
             initNlpNotes = initNlpNotes,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
index 159ffd2..c911342 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
@@ -32,8 +32,9 @@ import java.io.{EOFException, IOException, InterruptedIOException}
 import java.net.{InetAddress, NetworkInterface}
 import java.util
 import java.util.concurrent.CountDownLatch
-import java.util.{Properties, TimeZone}
+import java.util.{Collections, Properties, TimeZone}
 import scala.collection.mutable
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, SeqHasAsJava, SetHasAsScala}
 
 /**
   * Probe down/up link connection manager.
@@ -213,6 +214,28 @@ object NCConnectionManager extends NCService {
                         NCModelManager.getAllModels().map(wrapper => {
                             val mdl = wrapper.model
 
+                            val ctxWordElems = mdl.getElements.asScala.filter(_.isContextWordSupport)
+
+                            // TODO: validate values and samples (too many of them, or missing ones).
+                            val (
+                                values,
+                                samples
+                            ): (
+                                java.util.Map[String, java.util.Map[String, java.util.List[String]]],
+                                java.util.Map[String, java.util.List[String]]
+                            ) =
+                                if (ctxWordElems.nonEmpty) {
+                                    // Element ID -> (value name -> synonyms).
+                                    val elemValues = ctxWordElems.map(elem =>
+                                        elem.getId ->
+                                            elem.getValues.asScala.map(v => v.getName -> v.getSynonyms).toMap.asJava
+                                    ).toMap.asJava
+
+                                    (elemValues, wrapper.samples.map { case (id, groups) => id -> groups.flatten.asJava }.toMap.asJava)
+                                }
+                                else
+                                    (Collections.emptyMap(), Collections.emptyMap())
+
                             // Model already validated.
 
                             // util.HashSet created to avoid scala collections serialization error.
@@ -221,7 +244,9 @@ object NCConnectionManager extends NCService {
                                 mdl.getId,
                                 mdl.getName,
                                 mdl.getVersion,
-                                new util.HashSet[String](mdl.getEnabledBuiltInTokens)
+                                new util.HashSet[String](mdl.getEnabledBuiltInTokens),
+                                values,
+                                samples
                             )
                         })
                 ), cryptoKey)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
index 16edd61..ad80245 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
@@ -19,6 +19,12 @@ package org.apache.nlpcraft.server.mdo
 
 import org.apache.nlpcraft.server.mdo.impl._
 
+/** Machine learning configuration of a model: element values with their synonyms, and samples. */
+@NCMdoEntity(sql = false)
+case class NCModelMLConfigMdo(
+    @NCMdoField values: Map[String /*Element ID*/, Map[/*Value*/String, /*Synonyms*/Seq[String]]],
+    @NCMdoField samples: Map[String /*Element ID*/, Seq[String]/*Samples*/]
+)
 /**
   * Probe model MDO.
   */
@@ -27,7 +33,8 @@ case class NCProbeModelMdo(
     @NCMdoField id: String,
     @NCMdoField name: String,
     @NCMdoField version: String,
-    @NCMdoField enabledBuiltInTokens: Set[String]
+    @NCMdoField enabledBuiltInTokens: Set[String],
+    @NCMdoField mlConfig: Option[NCModelMLConfigMdo]
 ) extends NCAnnotatedMdo[NCProbeModelMdo] {
     override def hashCode(): Int = s"$id$name".hashCode()
     
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 4f91bc2..e420676 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.pool.NCThreadPoolManager
 import org.apache.nlpcraft.common.{NCService, _}
 import org.apache.nlpcraft.server.ignite.NCIgniteHelpers._
 import org.apache.nlpcraft.server.ignite.NCIgniteInstance
+import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
 import org.apache.nlpcraft.server.nlp.core.{NCNlpNerEnricher, NCNlpServerManager}
 import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
 import org.apache.nlpcraft.server.nlp.enrichers.coordinate.NCCoordinatesEnricher
@@ -90,6 +91,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
       * @param srvReqId Server request ID.
       * @param normTxt Normalized text.
       * @param enabledBuiltInToks Enabled built-in tokens.
+      * @param mlConf Machine learning configuration.
       * @param parent Optional parent span.
       * @return
       */
@@ -97,9 +99,11 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
         srvReqId: String,
         normTxt: String,
         enabledBuiltInToks: Set[String],
-        parent: Span = null): NCNlpSentence =
+        mlConf: Option[NCModelMLConfigMdo],
+        parent: Span = null
+    ): NCNlpSentence =
         startScopedSpan("process", parent, "srvReqId" -> srvReqId, "txt" -> normTxt) { span =>
-            val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks)
+            val s = new NCNlpSentence(srvReqId, normTxt, enabledBuiltInToks, mlConf)
 
             // Server-side enrichment pipeline.
             // NOTE: order of enrichers is IMPORTANT.
@@ -134,6 +138,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
       * @param srvReqId Server request ID.
       * @param txt Input text.
       * @param enabledBuiltInToks Set of enabled built-in token IDs.
+      * @param mlConf Machine learning configuration.
       * @param parent Optional parent span.
       */
     @throws[NCE]
@@ -141,7 +146,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
         srvReqId: String,
         txt: String,
         enabledBuiltInToks: Set[String],
-        parent: Span = null): NCNlpSentence = {
+        mlConf: Option[NCModelMLConfigMdo],
+        parent: Span = null
+    ): NCNlpSentence = {
         startScopedSpan("enrichPipeline", parent, "srvReqId" -> srvReqId, "txt" -> txt) { span =>
             val normTxt = NCPreProcessManager.normalize(txt, spellCheck = true, span)
 
@@ -159,9 +166,9 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
                             h.sentence
                         }
                         else
-                            process(srvReqId, normTxt, enabledBuiltInToks, span)
+                            process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span)
                     case None =>
-                        process(srvReqId, normTxt, enabledBuiltInToks, span)
+                        process(srvReqId, normTxt, enabledBuiltInToks, mlConf, span)
                 }
             }
         }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala
new file mode 100644
index 0000000..c2dd843
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/ContextWordEnricher.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.server.nlp.enrichers.ctxword
+
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.NCService
+import org.apache.nlpcraft.common.nlp.NCNlpSentence
+import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+
+/**
+  * ContextWord enricher.
+  */
+object ContextWordEnricher extends NCServerEnricher {
+    /** Acknowledges the starting and started lifecycle phases; nothing else is initialized. */
+    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ =>
+        ackStarting()
+        ackStarted()
+    }
+
+    /** Acknowledges the stopping and stopped lifecycle phases; nothing else is released. */
+    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ =>
+        ackStopping()
+        ackStopped()
+    }
+
+    /**
+      * WIP: for a sentence with an ML configuration, picks tokens whose POS starts with "N"; result unused yet.
+      * @param ns Sentence to enrich.
+      * @param parent Parent span (not used yet).
+      */
+    override def enrich(ns: NCNlpSentence, parent: Span): Unit =
+        ns.mlConfig.foreach { _ =>
+            // TODO: match these candidates against the configuration values and samples.
+            val nouns = ns.tokens.filter(_.pos.startsWith("N"))
+        }
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
index f572b9f..67acba8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
@@ -31,7 +31,7 @@ import org.apache.nlpcraft.common.version.NCVersion
 import org.apache.nlpcraft.common.{NCService, _}
 import org.apache.nlpcraft.probe.mgrs.NCProbeMessage
 import org.apache.nlpcraft.server.company.NCCompanyManager
-import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo}
+import org.apache.nlpcraft.server.mdo.{NCCompanyMdo, NCModelMLConfigMdo, NCProbeMdo, NCProbeModelMdo, NCUserMdo}
 import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnrichmentManager
 import org.apache.nlpcraft.server.proclog.NCProcessLogManager
 import org.apache.nlpcraft.server.query.NCQueryManager
@@ -45,7 +45,7 @@ import java.util.Collections
 import java.util.concurrent.ConcurrentHashMap
 import scala.collection.mutable
 import scala.concurrent.{ExecutionContext, Future, Promise}
-import scala.jdk.CollectionConverters.SetHasAsScala
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsScala, SetHasAsScala}
 import scala.util.{Failure, Success}
 
 /**
@@ -613,25 +613,40 @@ object NCProbeManager extends NCService {
                             String,
                             String,
                             String,
-                            java.util.Set[String]
+                            java.util.Set[String],
+                            java.util.Map[String, java.util.Map[String, java.util.List[String]]],
+                            java.util.Map[String, java.util.List[String]]
                         )]]("PROBE_MODELS").
                         map {
                             case (
                                 mdlId,
                                 mdlName,
                                 mdlVer,
-                                enabledBuiltInToks
+                                enabledBuiltInToks,
+                                values,
+                                samples
                             ) =>
                                 require(mdlId != null)
                                 require(mdlName != null)
                                 require(mdlVer != null)
                                 require(enabledBuiltInToks != null)
+                                require(values.isEmpty == samples.isEmpty) // Both are set or both are empty.
 
                                 NCProbeModelMdo(
                                     id = mdlId,
                                     name = mdlName,
                                     version = mdlVer,
-                                    enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet
+                                    enabledBuiltInTokens = enabledBuiltInToks.asScala.toSet,
+                                    mlConfig =
+                                        if (!values.isEmpty)
+                                            Some(
+                                                NCModelMLConfigMdo(
+                                                    values = values.asScala.map(p => p._1 -> p._2.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap).toMap,
+                                                    samples = samples.asScala.map(p => p._1 -> p._2.asScala.toSeq).toMap
+                                                )
+                                            )
+                                        else
+                                            None
                                 )
                         }.toSet
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
index f4d2afe..32492b9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/query/NCQueryManager.scala
@@ -272,7 +272,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS
 
                 logger.info(s"New request received:\n$tbl")
 
-                val enabledBuiltInToks = NCProbeManager.getModel(mdlId, span).enabledBuiltInTokens
+                val mdl = NCProbeManager.getModel(mdlId, span)
 
                 @throws[NCE]
                 def unzipProperties(gzipOpt: Option[String]): Option[JavaMeta] =
@@ -288,7 +288,7 @@ object NCQueryManager extends NCService with NCIgniteInstance with NCOpenCensusS
                     company,
                     mdlId,
                     txt0,
-                    NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, enabledBuiltInToks),
+                    NCServerEnrichmentManager.enrichPipeline(srvReqId, txt0, mdl.enabledBuiltInTokens, mdl.mlConfig),
                     usrAgent,
                     rmtAddr,
                     data,