You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/02/12 20:23:52 UTC
[incubator-nlpcraft] branch NLPCRAFT-236 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-236
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-236 by this push:
new cf19be3 WIP.
cf19be3 is described below
commit cf19be35609cb2c51cebc7f276411c09d4562b7b
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Fri Feb 12 23:23:38 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 57 +++++++-----
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 103 +++++++++++++++------
.../abstract/NCAbstractTokensVariantsSpec.scala | 97 +++++++++++++++++++
.../nlp/enrichers/sort/NCEnricherSortSpec.scala | 5 -
4 files changed, 207 insertions(+), 55 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 0d65199..a446204 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -368,7 +368,6 @@ object NCNlpSentence {
t
}
-
/**
* Fixes notes with references list to other notes indexes.
*
@@ -495,7 +494,8 @@ class NCNlpSentence(
val srvReqId: String,
val text: String,
val enabledBuiltInToks: Set[String],
- override val tokens: ArrayBuffer[NCNlpSentenceToken] = new ArrayBuffer[NCNlpSentenceToken](32)
+ override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
+ val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty
) extends NCNlpSentenceTokenBuffer(tokens) with java.io.Serializable {
@transient
private var hash: java.lang.Integer = _
@@ -505,7 +505,13 @@ class NCNlpSentence(
// Deep copy.
override def clone(): NCNlpSentence =
- new NCNlpSentence(srvReqId, text, enabledBuiltInToks, tokens.map(_.clone()))
+ new NCNlpSentence(
+ srvReqId,
+ text,
+ enabledBuiltInToks,
+ tokens.map(_.clone()),
+ deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone()))
+ )
/**
* Utility method that gets set of notes for given note type collected from
@@ -548,7 +554,7 @@ class NCNlpSentence(
case class Key(id: String, start: Int, end: Int) {
private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = in(start) || in(end)
+ def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
}
val keys: Seq[Key] =
@@ -583,6 +589,13 @@ class NCNlpSentence(
*/
@throws[NCE]
def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+ def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
+ if (lastPhase)
+ dropAbstract(mdl, ns)
+
+ if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None
+ }
+
// Always deletes `similar` notes.
// Some words with same note type can be detected various ways.
// We keep only one variant - with `best` direct and sparsity parameters,
@@ -625,7 +638,7 @@ class NCNlpSentence(
val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
- val sens =
+ var sens =
if (delCombs.nonEmpty) {
val deleted = mutable.ArrayBuffer.empty[Seq[NCNlpSentenceNote]]
@@ -641,6 +654,19 @@ class NCNlpSentence(
if (!deleted.exists(_.forall(delComb.contains))) {
val nsClone = this.clone()
+ nsClone.deletedNotes ++= delComb.map(n ⇒ {
+ val savedDelNote = n.clone()
+ val savedDelToks = n.tokenIndexes.map(idx ⇒ nsClone(idx).clone())
+
+ val mainNotes =
+ savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
+
+ for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
+ savedDelTok.remove(mainNote)
+
+ savedDelNote → savedDelToks
+ })
+
delComb.foreach(nsClone.removeNote)
// Has overlapped notes for some tokens.
@@ -648,13 +674,7 @@ class NCNlpSentence(
deleted += delComb
- if (lastPhase)
- dropAbstract(mdl, nsClone)
-
- if (collapseSentence(nsClone, getNotNlpNotes(nsClone).map(_.noteType).distinct))
- Some(nsClone)
- else
- None
+ collapse0(nsClone)
}
else
None
@@ -697,15 +717,10 @@ class NCNlpSentence(
m.values.map(_.sentence).toSeq
}
- else {
- if (lastPhase)
- dropAbstract(mdl, this)
-
- if (collapseSentence(this, getNotNlpNotes(this).map(_.noteType).distinct))
- Seq(this)
- else
- Seq.empty
- }.distinct
+ else
+ collapse0(this).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
+
+ sens = sens.distinct
sens.foreach(sen ⇒
sen.foreach(tok ⇒
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index caa5a4a..f5bcfbf 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -17,14 +17,13 @@
package org.apache.nlpcraft.probe.mgrs
-import java.io.Serializable
-import java.util
-
-import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
-import org.apache.nlpcraft.common.nlp.NCNlpSentence
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
import org.apache.nlpcraft.model.NCVariant
import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCVariantImpl}
+import java.io.Serializable
+import java.util
import scala.collection.JavaConverters._
import scala.collection.{Seq, mutable}
@@ -49,6 +48,12 @@ object NCProbeVariants {
val keys2Toks = toks.flatten.map(t ⇒ Key(t.getId, t.getStartCharIndex, t.getEndCharIndex) → t).toMap
val partsKeys = mutable.HashSet.empty[Key]
+ val nlpTok2nlpSen: Map[NCNlpSentenceToken, Seq[NCNlpSentence]] =
+ sens.
+ flatMap(sen ⇒ sen.map(_ → sen)).
+ groupBy { case (tok, _) ⇒ tok }.
+ map { case (tok, seq) ⇒ tok → seq.map { case (_, sen) ⇒ sen } }
+
seq.flatten.foreach { case (tok, tokNlp) ⇒
if (tokNlp.isUser) {
val userNotes = tokNlp.filter(_.isUser)
@@ -72,14 +77,43 @@ object NCProbeVariants {
keys2Toks.get(key) match {
case Some(tok) ⇒ tok
case None ⇒
- val toks =
- keys2Toks.filter { case (k, _) ⇒ k.from == key.from && k.to == key.to }.values
-
- require(toks.size == 1, s"Unexpected state [key=$key, tokens=${toks.mkString(",")}]")
-
- val tok = toks.head
-
- tok
+ val deletedNotes = nlpTok2nlpSen(tokNlp).flatMap(_.deletedNotes).distinct
+
+ def find(noteTypeFilter: String ⇒ Boolean): Option[NCNlpSentenceToken] =
+ deletedNotes.toStream.
+ flatMap { case (delNote, delNoteToks) ⇒
+ if (noteTypeFilter(delNote.noteType)) {
+ val toks =
+ delNoteToks.
+ dropWhile(_.startCharIndex != key.from).
+ reverse.
+ dropWhile(_.endCharIndex != key.to).
+ reverse
+
+ toks.size match {
+ case 0 ⇒ None
+ case 1 ⇒
+ // TODO:
+ Some(toks.head)
+ case _ ⇒
+ // TODO:
+ Some(toks.last)
+ }
+ }
+ else
+ None
+ }.headOption
+
+ var nlpTokOpt = find(_ == key.id)
+
+ if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
+ nlpTokOpt = find(_ ⇒ true)
+
+ NCTokenImpl(
+ mdl,
+ srvReqId,
+ nlpTokOpt.getOrElse(throw new NCE(s"Part not found for: $key"))
+ )
}
})
@@ -103,27 +137,38 @@ object NCProbeVariants {
var vars = toks.filter(sen ⇒
!sen.exists(t ⇒
t.getId != "nlpcraft:nlp" &&
- partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex))
+ partsKeys.contains(Key(t.getId, t.getStartCharIndex, t.getEndCharIndex))
)
).map(p ⇒ new NCVariantImpl(p.asJava))
if (lastPhase) {
- if (vars.size > 1)
+ if (vars.size > 1) {
vars = vars.filter(v ⇒ !v.asScala.forall(_.getId == "nlpcraft:nlp"))
-//
-// // Assertions.
-// for (v ← vars;
-// toks = v.asScala;
-// (tok, idx) ← toks.filter(_.isUserDefined).zipWithIndex;
-// part ← tok.getPartTokens.asScala
-// ) {
-// require(part.getIndex < toks.size, s"Part has unexpected index [tokens=${toks.mkString(", ")}, token=$idx, part=$part]}")
-//
-// require(
-// toks(part.getIndex).getId == part.getId,
-// s"Part has unexpected ID [tokens=${toks.mkString(", ")}, token=$idx, part=$part]}"
-// )
-// }
+
+ val sorted = vars.sortBy(p ⇒ -p.asScala.count(_.getId != "nlpcraft:nlp"))
+
+ val saved = mutable.ArrayBuffer.empty :+ sorted.head
+
+ for (v ← sorted.tail) {
+ val toks = v.asScala
+
+ if (!saved.exists(s ⇒
+ s.size == toks.size &&
+ s.asScala.zip(toks).forall {
+ case (savedTok, tok) ⇒
+ savedTok.getStartCharIndex == tok.getStartCharIndex &&
+ savedTok.getEndCharIndex == tok.getEndCharIndex &&
+ (
+ savedTok.getId == tok.getId && savedTok.getMetadata == tok.getMetadata ||
+ tok.getId == "nlpcraft:nlp"
+ )
+ }
+ ))
+ saved += v
+ }
+
+ vars = saved.sortBy(sorted.indexOf)
+ }
}
vars
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
new file mode 100644
index 0000000..29abbaf
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.`abstract`
+
+import org.apache.nlpcraft.model.{NCContext, NCResult, NCToken}
+import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import scala.collection.JavaConverters._
+
+class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
+ private def checkId(t: NCToken, id: String): Unit =
+ require(t.getId == id, s"Expected ID: $id, token: $t")
+ private def checkText(t: NCToken, txt: String): Unit =
+ require(t.getOriginalText == txt, s"Expected text: $txt, token: $t")
+
+ override def onContext(ctx: NCContext): NCResult = {
+ val variants = ctx.getVariants.asScala
+
+ ctx.getRequest.getNormalizedText match {
+ case "word the word" ⇒
+ require(variants.size == 1)
+
+ val toks = variants.head.asScala
+
+ require(toks.size == 2)
+
+ val t1 = toks.head
+
+ checkId(t1, "nlpcraft:nlp")
+ checkText(t1, "word")
+
+ val t2 = toks.last
+
+ checkId(t2, "wrapAnyWord")
+ checkText(t2, "the word")
+
+ val t2Parts = t2.getPartTokens.asScala
+
+ require(t2Parts.size == 2)
+
+ checkId(t2Parts.head,"anyWord")
+ checkId(t2Parts.last, "anyWord")
+ case "10 w1 10 w2" ⇒
+ require(variants.size == 1)
+
+ val toks = variants.head.asScala
+
+ require(toks.size == 2)
+
+ val t1 = toks.head
+
+ checkId(t1, "nlpcraft:nlp")
+ checkText(t1, "10")
+
+ val t2 = toks.last
+
+ checkId(t2,"wrapNum")
+ checkText(t2,"w1 10 w2")
+
+ val t2Parts = t2.getPartTokens.asScala
+
+ require(t2Parts.size == 3)
+
+ checkId(t2Parts.head,"nlpcraft:nlp")
+ checkId(t2Parts(1),"nlpcraft:num")
+ checkId(t2Parts.last,"nlpcraft:nlp")
+ case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}")
+ }
+
+ NCResult.text("OK")
+ }
+}
+
+@NCTestEnvironment(model = classOf[NCAbstractTokensModelVariants], startClient = true)
+class NCAbstractTokensVariantsSpec extends NCTestContext {
+ @Test
+ def test(): Unit = {
+ checkResult("word the word", "OK")
+ checkResult("10 w1 10 w2", "OK")
+ }
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index 3317331..cc03066 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -153,11 +153,6 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
usr(text = "A", id = "A"),
usr(text = "B", id = "B"),
srt(text = "classify", subjNotes = Seq("B"), subjIndexes = Seq(1))
- ),
- Seq(
- usr(text = "A", id = "A"),
- usr(text = "B", id = "B"),
- nlp(text = "classify")
)
),
_ ⇒ checkAll(