You are viewing a plain text version of this content. The canonical link for it was provided as a hyperlink in the original message (omitted in this plain-text rendering).
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/03/01 13:26:31 UTC

[incubator-nlpcraft] 01/01: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-258
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 3ec98be5682a711bd72a4d4732c97c0c77dcce55
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Mar 1 16:26:17 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 31 +++++++-----
 .../abstract/NCAbstractTokensVariantsSpec.scala    | 56 +++++++++++++---------
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index c4a7936..8ecb39d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -17,21 +17,22 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
 
-import java.io.Serializable
-import java.util
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer, _}
 import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeSynonymsWrapper, NCProbeVariants}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
 
+import java.io.Serializable
+import java.util
 import scala.collection.JavaConverters._
-import scala.compat.java8.OptionConverters._
 import scala.collection.convert.DecorateAsScala
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.{Map, Seq, mutable}
+import scala.compat.java8.OptionConverters._
 
 /**
   * Model elements enricher.
@@ -63,7 +64,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         element: NCElement,
         tokens: Seq[NCNlpSentenceToken],
         synonym: NCProbeSynonym,
-        parts: Seq[NCToken]
+        parts: Seq[(NCToken, NCSynonymChunkKind)]
     ) extends Ordered[ElementMatch] {
         // Tokens sparsity.
         lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
@@ -198,7 +199,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         direct: Boolean,
         syn: Option[NCProbeSynonym],
         metaOpt: Option[Map[String, Object]],
-        parts: Seq[NCToken]
+        parts: Seq[(NCToken, NCSynonymChunkKind)]
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
@@ -219,16 +220,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
         if (parts.nonEmpty) {
             val partsData: Seq[util.HashMap[String, Any]] =
-                parts.map(part ⇒ {
+                parts.map { case (part, kind) ⇒
                     val m = new util.HashMap[String, Any]()
 
-                    m.put("id", part.getId)
+                    m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
                     m.put("startcharindex", part.getStartCharIndex)
                     m.put("endcharindex", part.getEndCharIndex)
                     m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
 
                     m
-                })
+                }
 
             params += "parts" → partsData.asJava
         }
@@ -375,7 +376,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             var found = false
 
                             def addMatch(
-                                elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[NCToken]
+                                elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[(NCToken, NCSynonymChunkKind)]
                             ): Unit =
                                 if (
                                     (elm.getJiggleFactor.isEmpty || elm.getJiggleFactor.get() >= sparsity) &&
@@ -438,8 +439,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                     syn ← fastAccess(mdl.synonymsDsl, elm.getId, comb.length).getOrElse(Seq.empty)
                                     if !found
                                 )
-                                    if (syn.isMatch(comb.map(_.data)))
-                                        addMatch(elm, toks, syn, comb.filter(_.isToken).map(_.token))
+                                    if (syn.isMatch(comb.map(_.data))) {
+                                        val parts = comb.zip(syn.map(_.kind)).flatMap {
+                                            case (complex, kind) ⇒
+                                                if (complex.isToken) Some(complex.token → kind) else None
+                                        }
+
+                                        addMatch(elm, toks, syn, parts)
+                                    }
                             }
                         }
 
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 8912bc8..35e8e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -30,6 +30,11 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
     private def checkText(t: NCToken, txt: String): Unit =
         require(t.getOriginalText == txt, s"Expected text: $txt, token: $t")
 
+    private def checkToken(t: NCToken, id: String, txt: String): Unit = {
+        checkId(t, id)
+        checkText(t, txt)
+    }
+
     override def onContext(ctx: NCContext): NCResult = {
         val variants = ctx.getVariants.asScala
 
@@ -49,6 +54,18 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
             )
         }
 
+        def checkWrapAnyWord(t: NCToken, any: String): Unit = {
+            val parts = t.getPartTokens.asScala
+
+            require(parts.size == 2)
+
+            checkToken(parts.head, "nlpcraft:nlp", "the")
+            checkToken(parts.last, "anyWord", any)
+
+            require(parts.last.isAbstract, s"Unexpected abstract token: ${parts.last}")
+
+        }
+
         ctx.getRequest.getNormalizedText match {
             case "word the word" ⇒
                 require(variants.size == 1)
@@ -57,20 +74,10 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 require(toks.size == 2)
 
-                checkId(toks.head, "nlpcraft:nlp")
-                checkText(toks.head, "word")
-
-                checkId(toks.last, "wrapAnyWord")
-                checkText(toks.last, "the word")
-
-                val t2Parts = toks.last.getPartTokens.asScala
+                checkToken(toks.head, "nlpcraft:nlp", "word")
+                checkToken(toks.last, "wrapAnyWord", "the word")
 
-                require(t2Parts.size == 2)
-
-                checkId(t2Parts.head,"anyWord")
-                checkId(t2Parts.last, "anyWord")
-
-                t2Parts.foreach(t ⇒ require(t.isAbstract, s"Unexpected abstract token: $t"))
+                checkWrapAnyWord(toks.last, "word")
 
             case "10 w1 10 w2" ⇒
                 require(variants.nonEmpty)
@@ -85,16 +92,16 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 require(toks.size == 2)
 
-                checkText(toks.head, "10")
-                checkText(toks.last,"w1 10 w2")
+                checkToken(toks.head, "nlpcraft:nlp", "10")
+                checkToken(toks.last,"wrapNum", "w1 10 w2")
 
                 val t2Parts = toks.last.getPartTokens.asScala
 
                 require(t2Parts.size == 3)
 
-                checkId(t2Parts.head,"nlpcraft:nlp")
-                checkId(t2Parts(1),"nlpcraft:num")
-                checkId(t2Parts.last,"nlpcraft:nlp")
+                checkToken(t2Parts.head,"nlpcraft:nlp", "w1")
+                checkToken(t2Parts(1),"nlpcraft:num", "10")
+                checkToken(t2Parts.last,"nlpcraft:nlp", "w2")
 
             case "before limit top 6 the any" ⇒
                 require(variants.nonEmpty)
@@ -109,8 +116,8 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 require(toks.size == 2)
 
-                checkText(toks.head, "before limit top 6")
-                checkText(toks.last,"the any")
+                checkToken(toks.head, "wrapLimit", "before limit top 6")
+                checkToken(toks.last, "wrapAnyWord", "the any")
 
                 val wrap = toks.head.getPartTokens.asScala
 
@@ -118,6 +125,7 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 checkLimit(wrap.last)
 
+                checkWrapAnyWord(toks.last, "any")
             case "a wrap before limit top 6 the any" ⇒
                 require(variants.nonEmpty)
 
@@ -131,9 +139,9 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
 
                 require(toks.size == 3)
 
-                checkText(toks.head, "a")
-                checkText(toks(1), "wrap before limit top 6")
-                checkText(toks.last,"the any")
+                checkToken(toks.head, "nlpcraft:nlp", "a")
+                checkToken(toks(1), "wrapWrapLimit", "wrap before limit top 6")
+                checkToken(toks.last, "wrapAnyWord", "the any")
 
                 val wrap = toks(1).getPartTokens.asScala
 
@@ -147,6 +155,8 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
                 require(wrapLimit.getPartTokens.size == 3, s"Parts count: ${wrapLimit.getPartTokens.size()}")
 
                 checkLimit(wrapLimit.getPartTokens.asScala.last)
+
+                checkWrapAnyWord(toks.last, "any")
             case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
         }