You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/06/28 14:09:43 UTC

[incubator-nlpcraft] 01/02: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 544d8a40860393698ec80b5c992e890770469c16
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jun 28 14:33:32 2021 +0300

    WIP.
---
 nlpcraft/pom.xml                                   |  4 ++++
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  2 +-
 .../enrichers/ctxword/NCContextWordEnricher.scala  | 26 ++++++++++++++++++++--
 pom.xml                                            |  7 ++++++
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index 4d8c292..62d3683 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -232,6 +232,10 @@
             <groupId>org.jline</groupId>
             <artifactId>jline</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.jibx</groupId>
+            <artifactId>jibx-tools</artifactId>
+        </dependency>
 
         <!-- Test dependencies. -->
         <dependency>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index d237558..ed22935 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -52,7 +52,7 @@ class NCNlpSentence(
     val srvReqId: String,
     val text: String,
     val enabledBuiltInToks: Set[String],
-    val mlConfig: Option[NCModelMLConfigMdo],
+    val mlConfig: Option[NCModelMLConfigMdo] = None,
     var mlData: Map[Int, Map[String, Double]] = Map.empty,
     override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
     var firstProbePhase: Boolean = true,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 4982e29..79c970e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -18,12 +18,16 @@
 package org.apache.nlpcraft.server.nlp.enrichers.ctxword
 
 import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer.stem
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
 import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
 import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest, NCWordSuggestion}
+import org.jibx.schema.codegen.extend.DefaultNameConverter
 
 import scala.collection.mutable
 import scala.concurrent.Await
@@ -33,9 +37,14 @@ import scala.concurrent.duration.Duration
   * ContextWord enricher.
   */
 object NCContextWordEnricher extends NCServerEnricher {
+    private final val POS_PLURALS = Set("NNS", "NNPS")
+    private final val POS_SINGULAR = Set("NN", "NNP")
+
     private final val MAX_CTXWORD_SCORE = 2
     private final val EXCLUSIVE_MIN_SCORE = -1.0
 
+    private final val CONVERTER = new DefaultNameConverter
+
     private case class ModelProbeKey(probeId: String, modelId: String)
     private case class ElementScore(elementId: String, averageScore: Double, senScore: Double, sampleScore: Double)
 
@@ -171,6 +180,19 @@ object NCContextWordEnricher extends NCServerEnricher {
     @throws[NCE]
     private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = {
         val sampleWords = cfg.samples.map(spaceTokenize).toSeq
+
+
+        sampleWords.map(s => {
+            val sampleSen = new NCNlpSentence("sampleReqId", sampleWords.mkString(" "), Set.empty)
+
+            NCBaseNlpEnricher.enrich(sampleSen)
+
+            sampleSen.
+        })
+
+
+
+
         val sampleWordsStems = sampleWords.map(_.map(stem))
 
         val recs: Map[String, Seq[NCSuggestionRequest]] =
@@ -227,7 +249,7 @@ object NCContextWordEnricher extends NCServerEnricher {
                     }
                 }
 
-                val nounToks = ns.tokens.filter(_.pos.startsWith("N"))
+                val nounToks = ns.tokens.filter(t => NCPennTreebank.NOUNS_POS.contains(t.pos))
 
                 if (nounToks.nonEmpty) {
                     val key = ModelProbeKey(cfg.probeId, cfg.modelId)
diff --git a/pom.xml b/pom.xml
index fd0d687..59e871a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -154,6 +154,7 @@
         <lightstep.grpc.ver>0.15.8</lightstep.grpc.ver>
         <junit.ver>5.5.1</junit.ver>
         <jsonpath.ver>2.4.0</jsonpath.ver>
+        <jibx.tools.ver>1.3.3</jibx.tools.ver>
 
         <!-- Force specific encoding on text resources. -->
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -474,6 +475,12 @@
             </dependency>
 
             <dependency>
+                <groupId>org.jibx</groupId>
+                <artifactId>jibx-tools</artifactId>
+                <version>${jibx.tools.ver}</version>
+            </dependency>
+
+            <dependency>
                 <groupId>edu.stanford.nlp</groupId>
                 <artifactId>stanford-corenlp</artifactId>
                 <version>${stanford.corenlp.ver}</version>