You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/03/01 13:26:31 UTC
[incubator-nlpcraft] 01/01: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-258
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 3ec98be5682a711bd72a4d4732c97c0c77dcce55
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Mar 1 16:26:17 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 31 +++++++-----
.../abstract/NCAbstractTokensVariantsSpec.scala | 56 +++++++++++++---------
2 files changed, 52 insertions(+), 35 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index c4a7936..8ecb39d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -17,21 +17,22 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
-import java.io.Serializable
-import java.util
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer, _}
import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeSynonymsWrapper, NCProbeVariants}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
+import java.io.Serializable
+import java.util
import scala.collection.JavaConverters._
-import scala.compat.java8.OptionConverters._
import scala.collection.convert.DecorateAsScala
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
+import scala.compat.java8.OptionConverters._
/**
* Model elements enricher.
@@ -63,7 +64,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
element: NCElement,
tokens: Seq[NCNlpSentenceToken],
synonym: NCProbeSynonym,
- parts: Seq[NCToken]
+ parts: Seq[(NCToken, NCSynonymChunkKind)]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
@@ -198,7 +199,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
direct: Boolean,
syn: Option[NCProbeSynonym],
metaOpt: Option[Map[String, Object]],
- parts: Seq[NCToken]
+ parts: Seq[(NCToken, NCSynonymChunkKind)]
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
@@ -219,16 +220,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
if (parts.nonEmpty) {
val partsData: Seq[util.HashMap[String, Any]] =
- parts.map(part ⇒ {
+ parts.map { case (part, kind) ⇒
val m = new util.HashMap[String, Any]()
- m.put("id", part.getId)
+ m.put("id", if (kind == TEXT) "nlpcraft:nlp" else part.getId)
m.put("startcharindex", part.getStartCharIndex)
m.put("endcharindex", part.getEndCharIndex)
m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
m
- })
+ }
params += "parts" → partsData.asJava
}
@@ -375,7 +376,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
var found = false
def addMatch(
- elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[NCToken]
+ elm: NCElement, toks: Seq[NCNlpSentenceToken], syn: NCProbeSynonym, parts: Seq[(NCToken, NCSynonymChunkKind)]
): Unit =
if (
(elm.getJiggleFactor.isEmpty || elm.getJiggleFactor.get() >= sparsity) &&
@@ -438,8 +439,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
syn ← fastAccess(mdl.synonymsDsl, elm.getId, comb.length).getOrElse(Seq.empty)
if !found
)
- if (syn.isMatch(comb.map(_.data)))
- addMatch(elm, toks, syn, comb.filter(_.isToken).map(_.token))
+ if (syn.isMatch(comb.map(_.data))) {
+ val parts = comb.zip(syn.map(_.kind)).flatMap {
+ case (complex, kind) ⇒
+ if (complex.isToken) Some(complex.token → kind) else None
+ }
+
+ addMatch(elm, toks, syn, parts)
+ }
}
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 8912bc8..35e8e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -30,6 +30,11 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
private def checkText(t: NCToken, txt: String): Unit =
require(t.getOriginalText == txt, s"Expected text: $txt, token: $t")
+ private def checkToken(t: NCToken, id: String, txt: String): Unit = {
+ checkId(t, id)
+ checkText(t, txt)
+ }
+
override def onContext(ctx: NCContext): NCResult = {
val variants = ctx.getVariants.asScala
@@ -49,6 +54,18 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
)
}
+ def checkWrapAnyWord(t: NCToken, any: String): Unit = {
+ val parts = t.getPartTokens.asScala
+
+ require(parts.size == 2)
+
+ checkToken(parts.head, "nlpcraft:nlp", "the")
+ checkToken(parts.last, "anyWord", any)
+
+ require(parts.last.isAbstract, s"Unexpected abstract token: ${parts.last}")
+
+ }
+
ctx.getRequest.getNormalizedText match {
case "word the word" ⇒
require(variants.size == 1)
@@ -57,20 +74,10 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(toks.size == 2)
- checkId(toks.head, "nlpcraft:nlp")
- checkText(toks.head, "word")
-
- checkId(toks.last, "wrapAnyWord")
- checkText(toks.last, "the word")
-
- val t2Parts = toks.last.getPartTokens.asScala
+ checkToken(toks.head, "nlpcraft:nlp", "word")
+ checkToken(toks.last, "wrapAnyWord", "the word")
- require(t2Parts.size == 2)
-
- checkId(t2Parts.head,"anyWord")
- checkId(t2Parts.last, "anyWord")
-
- t2Parts.foreach(t ⇒ require(t.isAbstract, s"Unexpected abstract token: $t"))
+ checkWrapAnyWord(toks.last, "word")
case "10 w1 10 w2" ⇒
require(variants.nonEmpty)
@@ -85,16 +92,16 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(toks.size == 2)
- checkText(toks.head, "10")
- checkText(toks.last,"w1 10 w2")
+ checkToken(toks.head, "nlpcraft:nlp", "10")
+ checkToken(toks.last,"wrapNum", "w1 10 w2")
val t2Parts = toks.last.getPartTokens.asScala
require(t2Parts.size == 3)
- checkId(t2Parts.head,"nlpcraft:nlp")
- checkId(t2Parts(1),"nlpcraft:num")
- checkId(t2Parts.last,"nlpcraft:nlp")
+ checkToken(t2Parts.head,"nlpcraft:nlp", "w1")
+ checkToken(t2Parts(1),"nlpcraft:num", "10")
+ checkToken(t2Parts.last,"nlpcraft:nlp", "w2")
case "before limit top 6 the any" ⇒
require(variants.nonEmpty)
@@ -109,8 +116,8 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(toks.size == 2)
- checkText(toks.head, "before limit top 6")
- checkText(toks.last,"the any")
+ checkToken(toks.head, "wrapLimit", "before limit top 6")
+ checkToken(toks.last, "wrapAnyWord", "the any")
val wrap = toks.head.getPartTokens.asScala
@@ -118,6 +125,7 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
checkLimit(wrap.last)
+ checkWrapAnyWord(toks.last, "any")
case "a wrap before limit top 6 the any" ⇒
require(variants.nonEmpty)
@@ -131,9 +139,9 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(toks.size == 3)
- checkText(toks.head, "a")
- checkText(toks(1), "wrap before limit top 6")
- checkText(toks.last,"the any")
+ checkToken(toks.head, "nlpcraft:nlp", "a")
+ checkToken(toks(1), "wrapWrapLimit", "wrap before limit top 6")
+ checkToken(toks.last, "wrapAnyWord", "the any")
val wrap = toks(1).getPartTokens.asScala
@@ -147,6 +155,8 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
require(wrapLimit.getPartTokens.size == 3, s"Parts count: ${wrapLimit.getPartTokens.size()}")
checkLimit(wrapLimit.getPartTokens.asScala.last)
+
+ checkWrapAnyWord(toks.last, "any")
case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
}