You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/26 21:02:26 UTC
[incubator-nlpcraft] branch master updated: Stop words processing
related issues.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 3e4faad Stop words processing related issues.
3e4faad is described below
commit 3e4faad3b79103ebb065b41effd00f8993c35090
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Sep 27 00:02:16 2021 +0300
Stop words processing related issues.
---
.../cargps/src/main/resources/cargps_model.yaml | 2 +-
.../src/main/resources/stopwords/stop_words.txt | 1 +
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 25 +-
.../nlpcraft/common/nlp/NCNlpSentenceNote.scala | 140 ++++---
.../nlpcraft/common/nlp/NCNlpSentenceToken.scala | 36 +-
.../common/nlp/NCNlpSentenceTokenBuffer.scala | 9 +-
.../scala/org/apache/nlpcraft/model/NCElement.java | 2 +-
.../apache/nlpcraft/model/NCModelFileAdapter.java | 5 +
.../org/apache/nlpcraft/model/NCModelView.java | 11 +
.../apache/nlpcraft/model/impl/NCTokenImpl.scala | 31 +-
.../nlpcraft/model/impl/json/NCModelJson.java | 7 +
.../org/apache/nlpcraft/probe/NCProbeBoot.scala | 2 +
.../nlpcraft/probe/mgrs/NCProbeIdlToken.scala | 65 +++
.../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 8 +-
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 229 +---------
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 3 +
.../probe/mgrs/deploy/NCDeployManager.scala | 2 +-
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 6 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 461 ++++++++++++---------
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 187 ++++-----
.../enrichers/stopword/NCStopWordEnricher.scala | 23 +-
.../probe/mgrs/sentence/NCSentenceManager.scala | 118 ++++--
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 436 +++++++++++++++++++
.../nlp/enrichers/NCServerEnrichmentManager.scala | 5 +-
.../nlp/enrichers/numeric/NCNumericEnricher.scala | 158 ++++---
.../scala/org/apache/nlpcraft/NCTestElement.scala | 2 +
.../model/stm/indexes/NCSpecModelAdapter.scala | 25 +-
.../model/stop/NCStopWordsAllowedSpec.scala | 72 ++++
.../nlpcraft/model/stop/NCStopWordsBaseSpec.scala | 73 ++++
.../model/stop/NCStopWordsInsideSpec.scala | 72 ++++
.../nlp/enrichers/limit/NCEnricherLimitSpec.scala | 6 -
.../model/NCEnricherNestedModelSpec.scala | 3 +-
.../model/NCEnricherNestedModelSpec4.scala | 79 +++-
.../nlp/enrichers/sort/NCEnricherSortSpec.scala | 3 +-
34 files changed, 1516 insertions(+), 791 deletions(-)
diff --git a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
index cd5fb4e..62f45c8 100644
--- a/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
+++ b/nlpcraft-examples/cargps/src/main/resources/cargps_model.yaml
@@ -60,7 +60,7 @@ elements:
- id: "x:addr:st"
greedy: false
synonyms:
- - "{//[a-zA-Z0-9]+//}[1,3]"
+ - "{^^{is_alphanum(tok_txt) && tok_is_between_ids('x:addr:num', 'x:addr:kind') == true}^^}[1,3]"
- id: "x:addr"
synonyms:
diff --git a/nlpcraft/src/main/resources/stopwords/stop_words.txt b/nlpcraft/src/main/resources/stopwords/stop_words.txt
index dfaa83d..5644efd 100644
--- a/nlpcraft/src/main/resources/stopwords/stop_words.txt
+++ b/nlpcraft/src/main/resources/stopwords/stop_words.txt
@@ -63,6 +63,7 @@
# POSES list exceptions.
~may
+~no
# Postfixes list.
*ent | ~NN ~NNS ~NNP ~NNPS
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..9d9f4e3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -59,7 +59,7 @@ class NCNlpSentence(
@transient
private var hash: java.lang.Integer = _
- private def calcHash(): Int = U.mkJavaHash(srvReqId, text, enabledBuiltInToks, tokens)
+ private def calcHash(): Int = U.mkJavaHash(tokens)
// Deep copy.
override def clone(): NCNlpSentence =
@@ -74,6 +74,18 @@ class NCNlpSentence(
firstProbePhase = firstProbePhase
)
+ def copy(srvReqId: Option[String]): NCNlpSentence =
+ new NCNlpSentence(
+ srvReqId = srvReqId.getOrElse(this.srvReqId),
+ text = this.text,
+ enabledBuiltInToks = this.enabledBuiltInToks,
+ tokens = this.tokens,
+ deletedNotes = this.deletedNotes,
+ initNlpNotes = this.initNlpNotes,
+ nlpTokens = this.nlpTokens,
+ firstProbePhase = this.firstProbePhase
+ )
+
/**
* Utility method that gets set of notes for given note type collected from
* tokens in this sentence. Notes are sorted in the same order they appear
@@ -101,10 +113,11 @@ class NCNlpSentence(
override def equals(obj: Any): Boolean = obj match {
case x: NCNlpSentence =>
+ tokens.size == x.tokens.size &&
tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
+ srvReqId == x.srvReqId &&
+ text == x.text &&
+ enabledBuiltInToks == x.enabledBuiltInToks
case _ => false
}
@@ -139,8 +152,8 @@ class NCNlpSentence(
// One possible difference - stopwords indexes.
def wordsEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
- val set1 = n1.wordIndexes.toSet
- val set2 = n2.wordIndexes.toSet
+ val set1 = n1.wordIndexesSet
+ val set2 = n2.wordIndexesSet
set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains)
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index 7e306f4..255e086 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -33,6 +33,10 @@ import scala.jdk.CollectionConverters.{CollectionHasAsScala, SeqHasAsJava}
class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends JSerializable with NCAsciiLike {
import NCNlpSentenceNote._
+ private lazy val dataWithoutIndexes = this.filter(p => !SKIP_CLONE.contains(p._1))
+ private lazy val skipNlp = dataWithoutIndexes.filter { case (key, _) => key != "noteType" }
+
+
@transient
private lazy val hash = values.hashCode()
@@ -42,6 +46,7 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
lazy val tokenTo: Int = values("tokMaxIndex").asInstanceOf[Int] // Last index.
lazy val tokenIndexes: Seq[Int] = values("tokWordIndexes").asInstanceOf[JList[Int]].asScala.toSeq // Includes 1st and last indices too.
lazy val wordIndexes: Seq[Int] = values("wordIndexes").asInstanceOf[JList[Int]].asScala.toSeq // Includes 1st and last indices too.
+ lazy val wordIndexesSet: Set[Int] = wordIndexes.toSet
lazy val sparsity: Int = values("sparsity").asInstanceOf[Int]
lazy val isDirect: Boolean = values("direct").asInstanceOf[Boolean]
lazy val isUser: Boolean = {
@@ -67,36 +72,36 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
/**
* Clones this note.
*/
- def clone(indexes: Seq[Int], wordIndexes: Seq[Int], params: (String, Any)*): NCNlpSentenceNote =
- NCNlpSentenceNote(
+ def clone(indexes: Seq[Int], wordIndexes: Seq[Int], params: (String, JSerializable)*): NCNlpSentenceNote =
+ apply(
indexes,
Some(wordIndexes),
noteType,
- values.filter(p => !SKIP_CLONE.contains(p._1)).toSeq ++ params:_*
+ dataWithoutIndexes ++ params.toMap
)
- override def clone(): NCNlpSentenceNote = {
- val m = mutable.Map.empty[String, JSerializable] ++ values
-
- new NCNlpSentenceNote(m.toMap)
- }
+ override def clone(): NCNlpSentenceNote = new NCNlpSentenceNote(values)
/**
*
- * @return
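+ * @param n Note to compare with; must have the same type and data, with word indexes differing only by one constant shift.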
*/
- override def toAscii: String =
- values.iterator.toSeq.sortBy(_._1).foldLeft(NCAsciiTable("Key", "Value"))((t, p) => t += p).toString
+ def equalsWithoutIndexes(n: NCNlpSentenceNote): Boolean =
+ this.noteType == n.noteType &&
+ this.wordIndexes.size == n.wordIndexes.size &&
+ this.wordIndexes.zip(n.wordIndexes).map(p => p._1 - p._2).distinct.size == 1 &&
+ this.dataWithoutIndexes == n.dataWithoutIndexes
/**
*
* @return
*/
- def skipNlp(): Map[String, JSerializable] =
- values.filter { case (key, _) => !SKIP_CLONE.contains(key) && key != "noteType" }
+ override def toAscii: String =
+ values.iterator.toSeq.sortBy(_._1).foldLeft(NCAsciiTable("Key", "Value"))((t, p) => t += p).toString
/**
*
+ * @return
*/
def asMetadata(): Map[String, JSerializable] =
if (isUser)
@@ -107,7 +112,7 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
else {
val md = mutable.Map.empty[String, JSerializable]
- val m = if (noteType != "nlpcraft:nlp") skipNlp() else values
+ val m = if (noteType != "nlpcraft:nlp") skipNlp else values
m.foreach { case (name, value) => md += (name.toLowerCase() -> value)}
@@ -118,13 +123,8 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
*
* @param kvs
*/
- def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote = {
- val m = mutable.HashMap.empty[String, JSerializable] ++ values
-
- kvs.foreach(kv => m += kv._1 -> kv._2)
-
- new NCNlpSentenceNote(m.toMap)
- }
+ def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote =
+ new NCNlpSentenceNote(values ++ kvs)
/**
*
@@ -133,35 +133,11 @@ class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends
* @return
*/
def getKey(withIndexes: Boolean = true, withReferences: Boolean = true): Seq[Any] = {
- def addRefs(names: String*): Seq[String] = if (withReferences) names else Seq.empty
-
- val names: Seq[String] =
- if (isUser)
- Seq.empty
- else
- noteType match {
- case "nlpcraft:continent" => Seq("continent")
- case "nlpcraft:subcontinent" => Seq("continent", "subcontinent")
- case "nlpcraft:country" => Seq("continent", "subcontinent", "country")
- case "nlpcraft:region" => Seq("continent", "subcontinent", "country", "region")
- case "nlpcraft:city" => Seq("continent", "subcontinent", "country", "region", "city")
- case "nlpcraft:metro" => Seq("metro")
- case "nlpcraft:date" => Seq("from", "to")
- case "nlpcraft:relation" => Seq("type", "note") ++ addRefs("indexes")
- case "nlpcraft:sort" => Seq("asc", "subjnotes", "bynotes") ++ addRefs("subjindexes", "byindexes")
- case "nlpcraft:limit" => Seq("limit", "note") ++ addRefs("indexes", "asc") // Asc flag has sense only with references for limit.
- case "nlpcraft:coordinate" => Seq("latitude", "longitude")
- case "nlpcraft:num" => Seq("from", "to", "unit", "unitType")
- case x if x.startsWith("google:") => Seq("meta", "mentionsBeginOffsets", "mentionsContents", "mentionsTypes")
- case x if x.startsWith("stanford:") => Seq("nne")
- case x if x.startsWith("opennlp:") => Seq.empty
- case x if x.startsWith("spacy:") => Seq("vector")
-
- case _ => throw new AssertionError(s"Unexpected note type: $noteType")
- }
-
val seq1 = if (withIndexes) Seq(wordIndexes, noteType) else Seq(noteType)
- val seq2 = names.map(name => this.getOrElse(name, null))
+ val seq2 = if (isUser)
+ Seq.empty
+ else
+ getBuiltProperties(noteType, withReferences).map(name => this.getOrElse(name, null))
seq1 ++ seq2
}
@@ -219,7 +195,7 @@ object NCNlpSentenceNote {
indexes: Seq[Int],
wordIndexesOpt: Option[Seq[Int]],
typ: String,
- params: (String, Any)*
+ params: Map[String, Any]
): NCNlpSentenceNote = {
def calc(seq: Seq[Int]): (Int, Int, Int, JList[Int], Int) =
(U.calcSparsity(seq), seq.min, seq.max, seq.asJava, seq.length)
@@ -227,18 +203,18 @@ object NCNlpSentenceNote {
val (sparsity, tokMinIndex, tokMaxIndex, tokWordIndexes, len) = calc(wordIndexesOpt.getOrElse(indexes))
new NCNlpSentenceNote(
- mutable.HashMap[String, JSerializable]((
- params.filter(_._2 != null) :+
- ("noteType" -> typ) :+
- ("tokMinIndex" -> indexes.min) :+
- ("tokMaxIndex" -> indexes.max) :+
- ("tokWordIndexes" -> indexes.asJava) :+
- ("minIndex" -> tokMinIndex) :+
- ("maxIndex" -> tokMaxIndex) :+
- ("wordIndexes" -> tokWordIndexes) :+
- ("wordLength" -> len) :+
- ("sparsity" -> sparsity)
- ).map(p => p._1 -> p._2.asInstanceOf[JSerializable]): _*).toMap
+ params.filter(_._2 != null).map(p => p._1 -> p._2.asInstanceOf[JSerializable]) ++
+ Map[String, JSerializable](
+ "noteType" -> typ,
+ "tokMinIndex" -> indexes.min,
+ "tokMaxIndex" -> indexes.max,
+ "tokWordIndexes" -> indexes.asJava.asInstanceOf[JSerializable],
+ "minIndex" -> tokMinIndex,
+ "maxIndex" -> tokMaxIndex,
+ "wordIndexes" -> tokWordIndexes.asInstanceOf[JSerializable],
+ "wordLength" -> len,
+ "sparsity" -> sparsity
+ )
)
}
@@ -250,7 +226,7 @@ object NCNlpSentenceNote {
* @param params Parameters.
*/
def apply(indexes: Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
- apply(indexes, None, typ, params: _*)
+ apply(indexes, None, typ, params.toMap)
/**
* Creates new note with given parameters.
@@ -260,7 +236,7 @@ object NCNlpSentenceNote {
* @param params Parameters.
*/
def apply(indexes: mutable.Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
- apply(indexes.toSeq, None, typ, params: _*)
+ apply(indexes.toSeq, None, typ, params.toMap)
/**
* Creates new note with given parameters.
@@ -271,7 +247,7 @@ object NCNlpSentenceNote {
* @param params Parameters.
*/
def apply(indexes: Seq[Int], wordIndexes: Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
- apply(indexes, Some(wordIndexes), typ, params: _*)
+ apply(indexes, Some(wordIndexes), typ, params.toMap)
/**
* Creates new note with given parameters.
@@ -282,5 +258,37 @@ object NCNlpSentenceNote {
* @param params Parameters.
*/
def apply(indexes: mutable.Seq[Int], wordIndexes: mutable.Seq[Int], typ: String, params: (String, Any)*): NCNlpSentenceNote =
- apply(indexes.toSeq, Some(wordIndexes.toSeq), typ, params: _*)
+ apply(indexes.toSeq, Some(wordIndexes.toSeq), typ, params.toMap)
+
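+ /**
+ * Gets the names of the built-in properties used to build a note key; throws `AssertionError` for an unexpected note type.
+ * @param noteType Built-in note type, e.g. `nlpcraft:city` or any `google:`, `stanford:`, `opennlp:` or `spacy:` type.
+ * @param withReferences Whether to also include reference properties (such as `indexes`) for relation, sort and limit notes.
+ */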
+ def getBuiltProperties(noteType: String, withReferences: Boolean = true): Seq[String] = {
+ def addRefs(names: String*): Seq[String] = if (withReferences) names else Seq.empty
+
+ noteType match {
+ case "nlpcraft:nlp" => Seq.empty
+
+ case "nlpcraft:continent" => Seq("continent")
+ case "nlpcraft:subcontinent" => Seq("continent", "subcontinent")
+ case "nlpcraft:country" => Seq("continent", "subcontinent", "country")
+ case "nlpcraft:region" => Seq("continent", "subcontinent", "country", "region")
+ case "nlpcraft:city" => Seq("continent", "subcontinent", "country", "region", "city")
+ case "nlpcraft:metro" => Seq("metro")
+ case "nlpcraft:date" => Seq("from", "to")
+ case "nlpcraft:relation" => Seq("type", "note") ++ addRefs("indexes")
+ case "nlpcraft:sort" => Seq("asc", "subjnotes", "bynotes") ++ addRefs("subjindexes", "byindexes")
+ case "nlpcraft:limit" => Seq("limit", "note") ++ addRefs("indexes", "asc") // Asc flag has sense only with references for limit.
+ case "nlpcraft:coordinate" => Seq("latitude", "longitude")
+ case "nlpcraft:num" => Seq("from", "to", "unit", "unitType")
+ case x if x.startsWith("google:") => Seq("meta", "mentionsBeginOffsets", "mentionsContents", "mentionsTypes")
+ case x if x.startsWith("stanford:") => Seq("nne")
+ case x if x.startsWith("opennlp:") => Seq.empty
+ case x if x.startsWith("spacy:") => Seq("vector")
+
+ case _ => throw new AssertionError(s"Unexpected note type: $noteType")
+ }
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
index 4b94b98..1c66da1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
@@ -17,6 +17,7 @@
package org.apache.nlpcraft.common.nlp
+import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.pos._
import java.util.{List => JList}
@@ -56,6 +57,22 @@ case class NCNlpSentenceToken(
def isSwearWord: Boolean = getNlpValue[Boolean]("swear")
def isEnglish: Boolean = getNlpValue[Boolean]("english")
+ @transient
+ private var hash: java.lang.Integer = _
+
+ //noinspection HashCodeUsesVar
+ override def hashCode(): Int = {
+ if (hash == null)
+ hash = U.mkJavaHash(index, notes, stopsReasons)
+
+ hash
+ }
+
+ override def equals(obj: Any): Boolean = obj match {
+ case x: NCNlpSentenceToken => x.index == index && x.notes == notes && x.stopsReasons == stopsReasons
+ case _ => false
+ }
+
/**
*
* @param noteType Note type.
@@ -67,17 +84,7 @@ case class NCNlpSentenceToken(
* Shallow copy.
*/
def clone(index: Int): NCNlpSentenceToken =
- NCNlpSentenceToken(
- index,
- {
- val m = mutable.HashSet.empty[NCNlpSentenceNote]
-
- notes.foreach(n => m += n.clone())
-
- m
- },
- stopsReasons.clone()
- )
+ NCNlpSentenceToken(index, mutable.HashSet.empty[NCNlpSentenceNote] ++ notes.clone(), stopsReasons.clone())
/**
* Clones note.
@@ -90,7 +97,11 @@ case class NCNlpSentenceToken(
*
* @param note Note.
*/
- def remove(note: NCNlpSentenceNote): Unit = notes.remove(note)
+ def remove(note: NCNlpSentenceNote): Unit = {
+ notes.remove(note)
+
+ hash = null
+ }
/**
* Tests whether or not this token contains note.
@@ -172,6 +183,7 @@ case class NCNlpSentenceToken(
* @param note Element.
*/
def add(note: NCNlpSentenceNote): Unit = {
+ hash = null
val added = notes.add(note)
if (added && note.isNlp)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceTokenBuffer.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceTokenBuffer.scala
index a3d1156..3034a5e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceTokenBuffer.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceTokenBuffer.scala
@@ -26,12 +26,6 @@ import scala.language.implicitConversions
* @param tokens Initial buffer.
*/
class NCNlpSentenceTokenBuffer(val tokens: ArrayBuffer[NCNlpSentenceToken] = new ArrayBuffer[NCNlpSentenceToken](16)) extends java.io.Serializable {
- /** Stringified stems. */
- lazy val stems: String = tokens.map(_.stem).mkString(" ")
-
- /** Stem-based hashcode. */
- lazy val stemsHash: Int = stems.hashCode()
-
type SSOT = IndexedSeq[IndexedSeq[Option[NCNlpSentenceToken]]]
type SST = IndexedSeq[IndexedSeq[NCNlpSentenceToken]]
@@ -113,8 +107,7 @@ class NCNlpSentenceTokenBuffer(val tokens: ArrayBuffer[NCNlpSentenceToken] = new
object NCNlpSentenceTokenBuffer {
implicit def toTokens(x: NCNlpSentenceTokenBuffer): ArrayBuffer[NCNlpSentenceToken] = x.tokens
- implicit def toBuf( toks: Iterable[NCNlpSentenceToken]): NCNlpSentenceTokenBuffer = apply(toks)
- def apply(toks: Iterable[NCNlpSentenceToken]): NCNlpSentenceTokenBuffer =
+ def apply(toks: Seq[NCNlpSentenceToken]): NCNlpSentenceTokenBuffer =
new NCNlpSentenceTokenBuffer(new ArrayBuffer[NCNlpSentenceToken](toks.size) ++ toks)
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
index b5b6cbd..02f06ea 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCElement.java
@@ -383,7 +383,7 @@ public interface NCElement extends NCMetadata, Serializable {
return Optional.empty();
}
- // TODO:
+ // TODO: add javadoc
default Optional<Boolean> isGreedy() {
return Optional.empty();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
index efa2b68..61cb84d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelFileAdapter.java
@@ -559,6 +559,11 @@ abstract public class NCModelFileAdapter extends NCModelAdapter {
}
@Override
+ public boolean isStopWordsAllowed() {
+ return proxy.isStopWordsAllowed();
+ }
+
+ @Override
public Map<String, Set<String>> getRestrictedCombinations() {
return restrictedCombinations;
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
index 30a2b40..2d06412 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/NCModelView.java
@@ -278,6 +278,9 @@ public interface NCModelView extends NCMetadata {
*/
boolean DFLT_IS_NO_USER_TOKENS_ALLOWED = true;
+ // TODO: add javadoc
+ boolean DFLT_IS_STOPWORDS_ALLOWED = true;
+
/**
* Default set of enabled built-in tokens. The following built-in tokens are enabled by default:
* <ul>
@@ -1235,4 +1238,12 @@ public interface NCModelView extends NCMetadata {
default Map<String, Set<String>> getRestrictedCombinations() {
return Collections.emptyMap();
}
+
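+ /**
+ * Gets the stop-words-allowed flag of this model. TODO: describe the exact effect of this flag.
+ * @return Stop-words-allowed flag; {@link #DFLT_IS_STOPWORDS_ALLOWED} unless overridden.
+ */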
+ default boolean isStopWordsAllowed() {
+ return DFLT_IS_STOPWORDS_ALLOWED;
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
index 4b2f251..1bd9add 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
@@ -17,17 +17,16 @@
package org.apache.nlpcraft.model.impl
-import java.io.{Serializable => JSerializable}
-import java.util.Collections
-import java.util.{List => JList}
-
import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
+import java.io.{Serializable => JSerializable}
+import java.lang
+import java.util.{Collections, List => JList}
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{CollectionHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.jdk.CollectionConverters.{CollectionHasAsScala, SeqHasAsJava}
/**
*
@@ -49,9 +48,9 @@ private[nlpcraft] class NCTokenImpl(
value: String,
startCharIndex: Int,
endCharIndex: Int,
- meta: Map[String, Object],
+ meta: java.util.Map[String, Object],
isAbstractProp: Boolean
-) extends NCMetadataAdapter(new java.util.HashMap(mutable.HashMap(meta.toSeq:_ *).asJava)) with NCToken with JSerializable {
+) extends NCMetadataAdapter(meta) with NCToken with JSerializable {
require(mdl != null)
require(srvReqId != null)
require(id != null)
@@ -105,12 +104,12 @@ private[nlpcraft] object NCTokenImpl {
// nlpcraft:nlp and some optional (after collapsing).
require(tok.size <= 2, s"Unexpected token [size=${tok.size}, token=$tok]")
- val md = mutable.HashMap.empty[String, JSerializable]
+ val md = new java.util.HashMap[String, AnyRef]()
tok.foreach(n => {
val id = n.noteType.toLowerCase
- n.asMetadata().foreach { case (k, v) => md += s"$id:$k" -> v}
+ n.asMetadata().foreach { case (k, v) => md.put(s"$id:$k", v.asInstanceOf[AnyRef]) }
})
val usrNotes = tok.filter(_.isUser)
@@ -118,8 +117,6 @@ private[nlpcraft] object NCTokenImpl {
// No overlapping allowed at this point.
require(usrNotes.size <= 1, s"Unexpected elements notes: $usrNotes")
- def convertMeta(): ScalaMeta = md.toMap.map(p => p._1 -> p._2.asInstanceOf[AnyRef])
-
usrNotes.headOption match {
case Some(usrNote) =>
require(mdl.elements.contains(usrNote.noteType), s"Element is not found: ${usrNote.noteType}")
@@ -139,9 +136,9 @@ private[nlpcraft] object NCTokenImpl {
}
// Special synthetic meta data element.
- md.put("nlpcraft:nlp:freeword", false)
+ md.put("nlpcraft:nlp:freeword", java.lang.Boolean.FALSE)
- elm.getMetadata.asScala.foreach { case (k, v) => md.put(k, v.asInstanceOf[JSerializable]) }
+ md.putAll(elm.getMetadata)
new NCTokenImpl(
mdl.model,
@@ -153,7 +150,7 @@ private[nlpcraft] object NCTokenImpl {
value = usrNote.dataOpt("value").orNull,
startCharIndex = tok.startCharIndex,
endCharIndex = tok.endCharIndex,
- meta = convertMeta(),
+ meta = md,
isAbstractProp = mdl.model.getAbstractTokens.contains(elm.getId)
)
@@ -162,10 +159,10 @@ private[nlpcraft] object NCTokenImpl {
val note = tok.toSeq.minBy(n => if (n.isNlp) 1 else 0)
- val isStop: Boolean = md("nlpcraft:nlp:stopword").asInstanceOf[Boolean]
+ val isStop = md.get("nlpcraft:nlp:stopword").asInstanceOf[Boolean]
// Special synthetic meta data element.
- md.put("nlpcraft:nlp:freeword", !isStop && note.isNlp)
+ md.put("nlpcraft:nlp:freeword", lang.Boolean.valueOf(!isStop && note.isNlp))
new NCTokenImpl(
mdl.model,
@@ -177,7 +174,7 @@ private[nlpcraft] object NCTokenImpl {
value = null,
startCharIndex = tok.startCharIndex,
endCharIndex = tok.endCharIndex,
- meta = convertMeta(),
+ meta = md,
isAbstractProp = mdl.model.getAbstractTokens.contains(note.noteType)
)
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
index f332e08..043297c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/json/NCModelJson.java
@@ -63,6 +63,7 @@ public class NCModelJson {
private boolean maxSynonymsThresholdError = DFLT_MAX_SYNONYMS_THRESHOLD_ERROR;
private long conversationTimeout = DFLT_CONV_TIMEOUT_MS;
private int conversationDepth = DFLT_CONV_DEPTH;
+ private boolean isStopWordsAllowed = DFLT_IS_STOPWORDS_ALLOWED;
public String getId() {
return id;
@@ -278,4 +279,10 @@ public class NCModelJson {
return restrictedCombinations;
}
public void setRestrictedCombinations(Map<String, String[]> restrictedCombinations) { this.restrictedCombinations = restrictedCombinations;}
+ public boolean isStopWordsAllowed() {
+ return isStopWordsAllowed;
+ }
+ public void setStopWordsAllowed(boolean stopWordsAllowed) {
+ isStopWordsAllowed = stopWordsAllowed;
+ }
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
index ecf7a18..561860f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
@@ -50,6 +50,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.validate.NCValidateManager
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
+import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
import java.io._
import java.util.concurrent.CompletableFuture
@@ -527,6 +528,7 @@ private [probe] object NCProbeBoot extends LazyLogging with NCOpenCensusTrace {
startedMgrs += NCConnectionManager.start(span)
startedMgrs += NCDialogFlowManager.start(span)
startedMgrs += NCSentenceManager.start(span)
+ startedMgrs += NCSynonymsManager.start(span)
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
new file mode 100644
index 0000000..5da9808
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.nlp.NCNlpSentenceToken
+import org.apache.nlpcraft.model.{NCToken, _}
+
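+/**
+ * Wraps either a model token or a plain NLP sentence token (word); `token` takes precedence when it is not `null`.
+ * @param token Model token, or `null` when this instance wraps a plain word.
+ * @param word NLP sentence token, or `null` when this instance wraps a model token.
+ */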
+case class NCProbeIdlToken(token: NCToken, word: NCNlpSentenceToken) {
+ val (origText: String, wordIndexes: Set[Int], minIndex: Int, maxIndex: Int, isToken: Boolean, isWord: Boolean) =
+ if (token != null)
+ (token.origText, token.wordIndexes.toSet, token.wordIndexes.head, token.wordIndexes.last, true, false)
+ else
+ (word.origText, word.wordIndexes.toSet, word.wordIndexes.head, word.wordIndexes.last, false, true)
+
+ private lazy val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()
+
+ override def hashCode(): Int = hash
+
+ def isSubsetOf(minIndex: Int, maxIndex: Int, indexes: Set[Int]): Boolean =
+ if (this.minIndex > maxIndex || this.maxIndex < minIndex)
+ false
+ else
+ wordIndexes.subsetOf(indexes)
+
+ override def equals(obj: Any): Boolean = obj match {
+ case x: NCProbeIdlToken =>
+ hash == x.hash && (isToken && x.isToken && token == x.token || isWord && x.isWord && word == x.word)
+ case _ => false
+ }
+
+ // Added for debug reasons.
+ override def toString: String = {
+ val idxs = wordIndexes.mkString(",")
+
+ if (isToken && token.getId != "nlpcraft:nlp") s"'$origText' (${token.getId}) [$idxs]]" else s"'$origText' [$idxs]"
+ }
+}
+
+/**
+ * Factory methods creating an IDL token from either a model token or an NLP sentence token.
+ */
+object NCProbeIdlToken {
+ def apply(t: NCToken): NCProbeIdlToken = NCProbeIdlToken(token = t, word = null)
+ def apply(t: NCNlpSentenceToken): NCProbeIdlToken = NCProbeIdlToken(token = null, word = t)
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index 75ae18b..ea41793 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -45,9 +45,13 @@ case class NCProbeModel(
solver: NCIntentSolver,
intents: Seq[NCIdlIntent],
callbacks: Map[String /* Intent ID */, NCProbeModelCallback],
- continuousSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
+ continuousSynonyms:
+ Map[
+ String /*Element ID*/,
+ /*Fast access map.*/ Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]
+ ],
sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
- idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
+ idlSynonyms: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
addStopWordsStems: Set[String],
exclStopWordsStems: Set[String],
suspWordsStems: Set[String],
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index c370738..2b533b3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -17,11 +17,6 @@
package org.apache.nlpcraft.probe.mgrs
-import org.apache.nlpcraft.common.U
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
-import org.apache.nlpcraft.model._
-import org.apache.nlpcraft.model.intent.NCIdlContext
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._
import scala.collection.mutable
@@ -54,180 +49,18 @@ class NCProbeSynonym(
lazy val hasIdl: Boolean = idlChunks != 0
lazy val isValueSynonym: Boolean = value != null
lazy val stems: String = map(_.wordStem).mkString(" ")
- lazy val stemsHash: Int = stems.hashCode
- /**
- *
- * @param kind
- * @return
- */
- private def getSort(kind: NCSynonymChunkKind): Int =
- kind match {
- case TEXT => 0
- case IDL => 1
- case REGEX => 2
- case _ => throw new AssertionError(s"Unexpected kind: $kind")
- }
-
- /**
- *
- * @param tok
- * @param chunk
- */
- private def isMatch(tok: NCNlpSentenceToken, chunk: NCProbeSynonymChunk): Boolean =
- chunk.kind match {
- case TEXT => chunk.wordStem == tok.stem
- case REGEX =>
- val regex = chunk.regex
-
- regex.matcher(tok.origText).matches() || regex.matcher(tok.normText).matches()
- case IDL => throw new AssertionError()
- case _ => throw new AssertionError()
- }
-
- /**
- *
- * @param toks
- * @param isMatch
- * @param getIndex
- * @param shouldBeNeighbors
- * @tparam T
- * @return
- */
- private def sparseMatch0[T](
- toks: Seq[T],
- isMatch: (T, NCProbeSynonymChunk) => Boolean,
- getIndex: T => Int,
- shouldBeNeighbors: Boolean
- ): Option[Seq[T]] =
- if (toks.size >= this.size) {
- lazy val res = mutable.ArrayBuffer.empty[T]
- lazy val all = mutable.HashSet.empty[T]
-
- var state = 0
-
- for (chunk <- this if state != -1) {
- val seq =
- if (state == 0) {
- state = 1
-
- toks.filter(t => isMatch(t, chunk))
- }
- else
- toks.filter(t => !res.contains(t) && isMatch(t, chunk))
-
- if (seq.nonEmpty) {
- val head = seq.head
-
- if (!permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
- state = -1
- else {
- all ++= seq
-
- if (all.size > this.size)
- state = -1
- else
- res += head
- }
- }
- else
- state = -1
- }
-
- if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
- Some(res.toSeq)
- else
- None
- }
- else
- None
-
- /**
- *
- * @param tow
- * @param chunk
- * @param req
- */
- private def isMatch(tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest): Boolean = {
- def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken => T): T =
- if (tow.isLeft) fromToken(tow.swap.toOption.get) else fromWord(tow.toOption.get)
-
- chunk.kind match {
- case TEXT => chunk.wordStem == get0(_.stem, _.stem)
-
- case REGEX =>
- val r = chunk.regex
-
- r.matcher(get0(_.origText, _.origText)).matches() || r.matcher(get0(_.normText, _.normText)).matches()
-
- case IDL =>
- get0(t => chunk.idlPred.apply(t, NCIdlContext(req = req)).value.asInstanceOf[Boolean], _ => false)
-
- case _ => throw new AssertionError()
- }
- }
-
- /**
- *
- * @param toks
- */
- def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
- require(toks != null)
- require(!sparse && !hasIdl)
-
- if (toks.length == length) {
- if (isTextOnly)
- toks.stemsHash == stemsHash && toks.stems == stems
- else
- toks.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
- }
- else
- false
- }
-
- /**
- *
- * @param tows
- * @param req
- * @return
- */
- def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = {
- require(tows != null)
-
- if (tows.length == length && tows.count(_.isLeft) >= idlChunks)
- tows.zip(this).sortBy(p => getSort(p._2.kind)).forall { case (tow, chunk) => isMatch(tow, chunk, req) }
- else
- false
- }
-
- /**
- *
- * @param toks
- */
- def sparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = {
- require(toks != null)
- require(sparse && !hasIdl)
-
- sparseMatch0(toks.toSeq, isMatch, (t: NCNlpSentenceToken) => t.startCharIndex, shouldBeNeighbors = false)
- }
-
- /**
- *
- * @param tows
- * @param req
- */
- def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = {
- require(tows != null)
- require(req != null)
- require(hasIdl)
-
- sparseMatch0(
- tows,
- (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req),
- (t: NCIdlContent) => if (t.isLeft) t.swap.toOption.get.getStartCharIndex else t.toOption.get.startCharIndex,
- shouldBeNeighbors = !sparse
- )
- }
+ private lazy val hash = // Computed once on first access and returned by hashCode(). NOTE(review): folds in super.hashCode() - presumably must not be read before the chunks are added; confirm.
+ Seq(
+ super.hashCode(),
+ isTextOnly,
+ regexChunks,
+ idlChunks,
+ isValueSynonym,
+ isElementId,
+ isValueName,
+ value
+ ).map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
override def toString(): String = mkString(" ")
@@ -286,41 +119,23 @@ class NCProbeSynonym(
}
}
- override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]
-
override def equals(other: Any): Boolean = other match {
case that: NCProbeSynonym =>
- super.equals(that) &&
- (that canEqual this) &&
- isTextOnly == that.isTextOnly &&
- regexChunks == that.regexChunks &&
- idlChunks == that.idlChunks &&
- isValueSynonym == that.isValueSynonym &&
- isElementId == that.isElementId &&
- isValueName == that.isValueName &&
- value == that.value
+ isElementId == that.isElementId &&
+ isTextOnly == that.isTextOnly &&
+ regexChunks == that.regexChunks &&
+ idlChunks == that.idlChunks &&
+ isValueSynonym == that.isValueSynonym &&
+ isValueName == that.isValueName &&
+ value == that.value &&
+ super.equals(that)
case _ => false
}
- override def hashCode(): Int = {
- val state = Seq(
- super.hashCode(),
- isTextOnly,
- regexChunks,
- idlChunks,
- isValueSynonym,
- isElementId,
- isValueName,
- value
- )
-
- state.map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
- }
+ override def hashCode(): Int = hash
}
object NCProbeSynonym {
- type NCIdlContent = Either[NCToken, NCNlpSentenceToken]
-
/**
*
* @param isElementId
@@ -341,9 +156,9 @@ object NCProbeSynonym {
permute: Boolean
): NCProbeSynonym = {
val syn = new NCProbeSynonym(isElementId, isValueName, isDirect, value, sparse, permute)
-
+
syn ++= chunks
-
+
syn
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index bcf2c9c..2b91128 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -22,6 +22,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSenten
import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
import org.apache.nlpcraft.model.NCVariant
import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
+import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
import java.io.{Serializable => JSerializable}
import java.util
@@ -267,6 +268,8 @@ object NCProbeVariants {
for ((tok, tokNlp) <- toks.zip(nlpSen) if tokNlp.isUser)
process(tok, tokNlp)
+ ok = ok && NCSynonymsManager.isStillValidIdl(srvReqId, toks.toSeq)
+
if (ok) Some(new NCVariantImpl(toks.asJava)) else None
})
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index 5b40b69..332dd26 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -1665,7 +1665,7 @@ object NCDeployManager extends NCService {
(if (expectedJson) MAPPER_JSON else MAPPER_YAML).readValue(body, classOf[NCElementJson])
catch {
case e: Exception =>
- // TODO:
+ // TODO: fix text
throw new NCE(s"Error parsing element[" +
s"modelId=${mdl.getId}, " +
s"definitionClass=${claxx.getName}, " +
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 26359e2..b3fe3e1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -44,6 +44,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNouns
import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
+import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeMessage, NCProbeVariants}
import java.io.Serializable
@@ -294,6 +295,9 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
): Unit = {
require(errMsg.isDefined || (resType.isDefined && resBody.isDefined))
+ NCSentenceManager.clearRequestData(srvReqId)
+ NCSynonymsManager.clearRequestData(srvReqId)
+
val msg = NCProbeMessage(msgName)
msg.addData("srvReqId", srvReqId)
@@ -521,8 +525,6 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
)
})
- NCSentenceManager.clearCache(srvReqId)
-
// Final validation before execution.
try
sensSeq.foreach(NCValidateManager.postValidate(mdl, _, span))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 6e6f7d1..7196985 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,19 +19,20 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
+import org.apache.nlpcraft.model.impl.NCTokenImpl
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.synonyms.NCSynonymsManager
+import org.apache.nlpcraft.probe.mgrs.{NCProbeIdlToken => IdlToken, NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
@@ -40,78 +41,14 @@ import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsSca
*/
object NCModelEnricher extends NCProbeEnricher {
type TokType = (NCToken, NCSynonymChunkKind)
- type Cache = mutable.Map[String, ArrayBuffer[Seq[Int]]]
-
- object Complex {
- def apply(t: NCToken): Complex =
- Complex(
- data = Left(t),
- isToken = true,
- isWord = false,
- token = t,
- word = null,
- origText = t.origText,
- wordIndexes = t.wordIndexes.toSet,
- minIndex = t.wordIndexes.head,
- maxIndex = t.wordIndexes.last
- )
- def apply(t: NlpToken): Complex =
- Complex(
- data = Right(t),
- isToken = false,
- isWord = true,
- token = null,
- word = t,
- origText = t.origText,
- wordIndexes = t.wordIndexes.toSet,
- minIndex = t.wordIndexes.head,
- maxIndex = t.wordIndexes.last
- )
+ object IdlTokensSeq {
+ def apply(all: Seq[IdlToken]): IdlTokensSeq = IdlTokensSeq(all.filter(_.isToken), all.flatMap(_.wordIndexes).toSet)
}
- case class Complex(
- data: NCIdlContent,
- isToken: Boolean,
- isWord: Boolean,
- token: NCToken,
- word: NlpToken,
- origText: String,
- wordIndexes: Set[Int],
- minIndex: Int,
- maxIndex: Int
- ) {
- private final val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()
-
- override def hashCode(): Int = hash
-
- def isSubsetOf(minIndex: Int, maxIndex: Int, indexes: Set[Int]): Boolean =
- if (this.minIndex > maxIndex || this.maxIndex < minIndex)
- false
- else
- wordIndexes.subsetOf(indexes)
-
- override def equals(obj: Any): Boolean = obj match {
- case x: Complex =>
- hash == x.hash && (isToken && x.isToken && token == x.token || isWord && x.isWord && word == x.word)
- case _ => false
- }
-
- // Added for debug reasons.
- override def toString: String = {
- val idxs = wordIndexes.mkString(",")
-
- if (isToken && token.getId != "nlpcraft:nlp") s"'$origText' (${token.getId}) [$idxs]]" else s"'$origText' [$idxs]"
- }
- }
-
- object ComplexSeq {
- def apply(all: Seq[Complex]): ComplexSeq = ComplexSeq(all.filter(_.isToken), all.flatMap(_.wordIndexes).toSet)
- }
-
- case class ComplexSeq(tokensComplexes: Seq[Complex], wordsIndexes: Set[Int]) {
+ case class IdlTokensSeq(tokens: Seq[IdlToken], wordsIndexes: Set[Int]) {
private val (idxsSet: Set[Int], minIndex: Int, maxIndex: Int) = {
- val seq = tokensComplexes.flatMap(_.wordIndexes).distinct.sorted
+ val seq = tokens.flatMap(_.wordIndexes).distinct.sorted
(seq.toSet, seq.head, seq.last)
}
@@ -122,10 +59,10 @@ object NCModelEnricher extends NCProbeEnricher {
else
this.idxsSet.exists(idxsSet.contains)
- override def toString: String = tokensComplexes.mkString(" | ")
+ override def toString: String = tokens.mkString(" | ")
}
- case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
+ case class IdlTokensHolder(tokens: Seq[IdlToken], seqs: Seq[IdlTokensSeq])
/**
*
@@ -282,23 +219,65 @@ object NCModelEnricher extends NCProbeEnricher {
}
/**
- * Gets all sequential permutations of given tokens.
*
- * For example, if buffer contains "a b c d" tokens, then this function will return the
- * sequence of following token sequences in this order:
- * "a b c d"
- * "a b c"
- * "b c d"
- * "a b"
- * "b c"
- * "c d"
- * "a"
- * "b"
- * "c"
- * "d"
+ * 1. Prepares combinations of tokens (sliding).
+ * Example: 'A B C D' -> {'A B C D', 'A B C', 'B C D', 'A B', 'B C', 'C D', 'A', 'B', 'C', 'D'}
+ * One sentence converted to 10 pieces.
+ *
+ * 2. Additionally, each piece is converted into a set of elements with all possible permutations of its stopwords.
+ * Example: Piece: 'x1, x2(stopword), x3(stopword), x4' will be expanded into
+ * {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
+ *
+ * 3. All variants are collected, duplicates are deleted, etc.
+ *
+ * @param toks
+ */
+ private def combosTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], Seq[NlpToken])] =
+ combos(toks).flatMap(combo => {
+ val stops = combo.filter(s => s.isStopWord && s != combo.head && s != combo.last)
+
+ val slides = mutable.ArrayBuffer.empty[mutable.ArrayBuffer[NlpToken]]
+
+ for (stop <- stops)
+ if (slides.nonEmpty && slides.last.last.index + 1 == stop.index)
+ slides.last += stop
+ else
+ slides += mutable.ArrayBuffer.empty :+ stop
+
+ // Too many stopwords inside skipped.
+ val bigSlides = slides.filter(_.size > 2)
+
+ var stops4Delete: Seq[Seq[NlpToken]] =
+ if (bigSlides.nonEmpty) {
+ val allBig = bigSlides.flatten
+ val stops4AllCombs = stops.filter(p => !allBig.contains(p))
+
+ if (stops4AllCombs.nonEmpty)
+ for (
+ seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
+ seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+ )
+ yield seq1 ++ seq2.flatten
+ else
+ for (seq <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations))
+ yield seq.toSeq.flatten
+ }
+ else
+ Range.inclusive(1, stops.size).flatMap(stops.combinations)
+
+ stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
+
+ (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
+
+ }).
+ filter(_._1.nonEmpty).
+ groupBy(_._1).
+ map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+ sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
+
+ /**
*
* @param toks
- * @return
*/
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n <- toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p => p)
@@ -308,9 +287,12 @@ object NCModelEnricher extends NCProbeEnricher {
* @param seq
* @param s
*/
- private def toParts(seq: Seq[NCIdlContent], s: Synonym): Seq[TokType] =
+ private def toParts(mdl: NCProbeModel, stvReqId: String, seq: Seq[IdlToken], s: Synonym): Seq[TokType] =
seq.zip(s.map(_.kind)).flatMap {
- case (complex, kind) => if (complex.isLeft) Some(complex.swap.toOption.get -> kind) else None
+ case (idlTok, kind) =>
+ val t = if (idlTok.isToken) idlTok.token else mkNlpToken(mdl, stvReqId, idlTok.word)
+
+ Some(t -> kind)
}
/**
@@ -318,10 +300,10 @@ object NCModelEnricher extends NCProbeEnricher {
* @param tows
* @param ns
*/
- private def toTokens(tows: Seq[NCIdlContent], ns: Sentence): Seq[NlpToken] =
+ private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
(
- tows.filter(_.isRight).map(_.toOption.get) ++
- tows.filter(_.isLeft).map(_.swap.toOption.get).
+ tows.filter(_.isWord).map(_.word) ++
+ tows.filter(_.isToken).map(_.token).
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
@@ -329,7 +311,6 @@ object NCModelEnricher extends NCProbeEnricher {
*
* @param m
* @param id
- * @return
*/
private def get(m: Map[String , Seq[Synonym]], id: String): Seq[Synonym] = m.getOrElse(id, Seq.empty)
@@ -349,10 +330,10 @@ object NCModelEnricher extends NCProbeEnricher {
* @param mdl
* @param ns
*/
- private def mkComplexes(mdl: NCProbeModel, ns: Sentence): ComplexHolder = {
- val complexesWords = ns.map(Complex(_))
+ private def mkHolder(mdl: NCProbeModel, ns: Sentence): IdlTokensHolder = {
+ val toks = ns.map(IdlToken(_))
- val complexes =
+ val seqs =
NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
map(_.asScala).
par.
@@ -371,15 +352,29 @@ object NCModelEnricher extends NCProbeEnricher {
// Single word token is not split as words - token.
// Partly (not strict in) token - word.
if (t.wordIndexes.length == 1 || senPartComb.contains(t))
- Seq(Complex(t))
+ Seq(IdlToken(t))
else
- t.wordIndexes.map(complexesWords)
+ t.wordIndexes.map(toks)
)
// Drops without tokens (IDL part works with tokens).
- }).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
+ }).filter(_.exists(_.isToken)).map(IdlTokensSeq(_)).distinct
).seq
- ComplexHolder(complexesWords, complexes)
+ IdlTokensHolder(toks, seqs)
+ }
+
+ /**
+ * Builds an `NCToken` via `NCTokenImpl` from a new NLP token that keeps only the index, the NLP note (`getNlpNote`) and the stop reasons of the given one.
+ * @param mdl Probe model passed to `NCTokenImpl`.
+ * @param srvReqId Server request ID passed to `NCTokenImpl`.
+ * @param t Source NLP token.
+ */
+ private def mkNlpToken(mdl: NCProbeModel, srvReqId: String, t: NlpToken): NCToken = {
+ val notes = mutable.HashSet.empty[NlpNote]
+
+ notes += t.getNlpNote
+
+ NCTokenImpl(mdl, srvReqId, NlpToken(t.index, notes, t.stopsReasons))
}
/**
@@ -387,60 +382,37 @@ object NCModelEnricher extends NCProbeEnricher {
* @param h
* @param toks
*/
- private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = {
+ private def mkCombinations(h: IdlTokensHolder, toks: Seq[NlpToken]): Seq[Seq[IdlToken]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
- h.complexes.par.
- flatMap(complexSeq => {
- val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
+ h.seqs.par.
+ flatMap(seq => {
+ val rec = seq.tokens.filter(_.wordIndexes.exists(idxs.contains))
// Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty) {
- val data = rec ++
- (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
-
- if (!cache.contains(data)) Some(data) else None
- }
+ if (rec.nonEmpty)
+ Some(rec ++
+ (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
+ )
else
None
}).seq
}
- private def add(
- dbgType: String,
- ns: Sentence,
- contCache: Cache,
- elemId: String,
- greedy: Boolean,
- elemToks: Seq[NlpToken],
- sliceToksIdxs: Seq[Int],
- syn: Synonym,
- parts: Seq[TokType] = Seq.empty
- ): Unit = {
- val resIdxs = elemToks.map(_.index)
- val resIdxsSorted = resIdxs.sorted
-
- if (resIdxsSorted == sliceToksIdxs && U.isContinuous(resIdxsSorted))
- contCache(elemId) += sliceToksIdxs
-
- val ok =
- (!greedy || !alreadyMarked(ns, elemId, elemToks, sliceToksIdxs)) &&
- ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
-
- if (ok)
- mark(ns, elemId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
-
- if (DEEP_DEBUG)
- logger.trace(
- s"${if (ok) "Added" else "Skipped"} element [" +
- s"id=$elemId, " +
- s"type=$dbgType, " +
- s"text='${elemToks.map(_.origText).mkString(" ")}', " +
- s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
- s"allTokensIndexes=${sliceToksIdxs.mkString("[", ",", "]")}, " +
- s"synonym=$syn" +
- s"]"
- )
+ /**
+ * Extends the matched tokens with those stopwords of `toks2Match` that lie strictly between the first and the last matched token and are not matched themselves.
+ * @param matched Non-empty matched tokens; expected to be already sorted by index.
+ * @param toks2Match Tokens the match was attempted on; source of the candidate stopwords.
+ */
+ private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
+ require(matched.nonEmpty)
+
+ // Matched tokens should be already sorted. Already matched stopwords are skipped to avoid duplicates in the result.
+ val stopsInside = toks2Match.filter(t =>
+ t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
+ )
+
+ if (stopsInside.nonEmpty) (matched ++ stopsInside).sortBy(_.index) else matched
}
@throws[NCE]
@@ -451,8 +423,12 @@ object NCModelEnricher extends NCProbeEnricher {
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val combToks = combos(ns.toSeq)
- lazy val ch = mkComplexes(mdl, ns)
+
+ lazy val ch = mkHolder(mdl, ns)
+ lazy val variantsToks =
+ ch.seqs.map(
+ p => p.tokens.map(p => if (p.isToken) p.token else mkNlpToken(mdl, ns.srvReqId, p.word))
+ )
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
@@ -461,44 +437,80 @@ object NCModelEnricher extends NCProbeEnricher {
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
- val contCache = mutable.HashMap.empty ++
- mdl.elements.keys.map(k => k -> mutable.ArrayBuffer.empty[Seq[Int]])
- lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
-
for (
- toks <- combToks;
+ // 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
+ (toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
- eId = e.getId;
+ elemId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
- if
- !greedy ||
- !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
+ if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
) {
+ def add(
+ dbgType: String,
+ elemToks: Seq[NlpToken],
+ syn: Synonym,
+ parts: Seq[TokType] = Seq.empty
+ ): Unit = {
+ val resIdxs = elemToks.map(_.index)
+
+ val ok =
+ (!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
+ ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
+
+ if (ok)
+ mark(
+ ns,
+ elemId,
+ elemToks,
+ direct = syn.isDirect && U.isIncreased(resIdxs),
+ syn = Some(syn),
+ parts = parts
+ )
+
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"${if (ok) "Added" else "Skipped"} element [" +
+ s"id=$elemId, " +
+ s"type=$dbgType, " +
+ s"text='${elemToks.map(_.origText).mkString(" ")}', " +
+ s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
+ s"allTokensIndexes=${idxs.mkString("[", ",", "]")}, " +
+ s"synonym=$syn" +
+ s"]"
+ )
+ }
+
// 1. SIMPLE.
- if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
+ if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
// 1.1 Continuous.
var found = false
if (mdl.hasContinuousSynonyms)
- fastAccess(mdl.continuousSynonyms, eId, toks.length) match {
+ fastAccess(mdl.continuousSynonyms, elemId, toks.length) match {
case Some(h) =>
def tryMap(syns: Map[String, Synonym], notFound: () => Unit): Unit =
syns.get(tokStems) match {
case Some(s) =>
found = true
- add("simple continuous", ns, contCache, eId, greedy, toks, idxs, s)
+ add("simple continuous", toksExt, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
- for (s <- syns if !found)
- if (s.isMatch(toks)) {
- found = true
- add("simple continuous scan", ns, contCache, eId, greedy, toks, idxs, s)
- }
+ for (syn <- syns if !found)
+ NCSynonymsManager.onMatch(
+ ns.srvReqId,
+ elemId,
+ syn,
+ toks,
+ _ => {
+ found = true
+ add("simple continuous scan", toksExt, syn)
+ }
+ )
tryMap(
h.txtDirectSynonyms,
@@ -514,52 +526,60 @@ object NCModelEnricher extends NCProbeEnricher {
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
- for (s <- get(mdl.sparseSynonyms, eId))
- s.sparseMatch(toks) match {
- case Some(res) => add("simple sparse", ns, contCache, eId, greedy, res, idxs, s)
- case None => // No-op.
- }
+ for (syn <- get(mdl.sparseSynonyms, elemId))
+ NCSynonymsManager.onSparseMatch(
+ ns.srvReqId,
+ elemId,
+ syn,
+ toks,
+ res => add("simple sparse", getSparsedTokens(res, toks), syn)
+ )
}
// 2. IDL.
if (idlEnabled) {
- val allSyns = get(mdl.idlSynonyms, eId)
- lazy val allCombs = mkCombinations(ch, toks, idlCache.toSet)
+ val allSyns = get(mdl.idlSynonyms, elemId)
+ lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
-
if (!mdl.hasSparseSynonyms) {
var found = false
- for (
- s <- allSyns;
- comb <- allCombs
- if !found;
- data = comb.map(_.data)
- )
- if (s.isMatch(data, req)) {
- add("IDL continuous", ns, contCache, eId, greedy, toks, idxs, s, toParts(data, s))
-
- idlCache += comb
-
- found = true
- }
+ for (syn <- allSyns; comb <- allCombs; if !found)
+ NCSynonymsManager.onMatch(
+ ns.srvReqId,
+ elemId,
+ syn,
+ comb,
+ req,
+ variantsToks,
+ _ => {
+ val parts = toParts(mdl, ns.srvReqId, comb, syn)
+
+ add("IDL continuous", toksExt, syn, parts)
+
+ found = true
+ }
+ )
}
else
// 2.2 Sparse.
- for (
- s <- allSyns;
- comb <- allCombs
- )
- s.sparseMatch(comb.map(_.data), req) match {
- case Some(res) =>
- val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
-
- add(typ, ns, contCache, eId, greedy, toTokens(res, ns), idxs, s, toParts(res, s))
-
- idlCache += comb
- case None => // No-op.
- }
+ for (syn <- allSyns; comb <- allCombs)
+ NCSynonymsManager.onSparseMatch(
+ ns.srvReqId,
+ elemId,
+ syn,
+ comb,
+ req,
+ variantsToks,
+ res => {
+ val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
+ val parts = toParts(mdl, ns.srvReqId, res, syn)
+ val typ = if (syn.sparse) "IDL sparse"else "IDL continuous"
+
+ add(typ, toks, syn, parts)
+ }
+ )
}
}
}
@@ -576,6 +596,43 @@ object NCModelEnricher extends NCProbeEnricher {
processParsers(mdl, ns, span, req)
}
+
+ NCSynonymsManager.clearIteration(ns.srvReqId)
+
+ normalize(ns)
+ }
+
+ /**
+ * Removes every user note that has a 'better' counterpart: another user note of the same type and with the same `parts` data whose word indexes are a subset of this note's, where all extra word indexes belong to stopword tokens. Notes matched by the `NCSentenceManager.getLinks` / `getPartKeys` results are excluded from the comparison. NOTE(review): two such notes with identical word indexes satisfy the condition for each other and would both be removed - confirm this cannot happen.
+ * @param ns Sentence whose user notes are checked; modified in place via `removeNote`.
+ */
+ private def normalize(ns: Sentence): Unit = {
+ val usrNotes = ns.flatten.filter(_.isUser).distinct
+ val links = NCSentenceManager.getLinks(usrNotes)
+ val parts = NCSentenceManager.getPartKeys(usrNotes)
+
+ val usrNotesIdxs = usrNotes.
+ filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
+ filter(n => !parts.contains(NCTokenPartKey(n, ns))).
+ zipWithIndex
+
+ usrNotesIdxs.
+ foreach { case (n, idx) =>
+ usrNotesIdxs.find { case (candidate, candidateIdx) =>
+ candidateIdx != idx &&
+ candidate.noteType == n.noteType &&
+ candidate.dataOpt("parts") == n.dataOpt("parts") &&
+ candidate.wordIndexesSet.subsetOf(n.wordIndexesSet) &&
+ n.wordIndexes.filter(n => !candidate.wordIndexes.contains(n)).
+ forall(wordIdx => ns.tokens.exists(t => t.wordIndexes.contains(wordIdx) && t.isStopWord))
+ } match {
+ case Some(better) =>
+ ns.removeNote(n)
+
+ logger.trace(s"Element removed: $n, better: $better")
+ case None => // No-op.
+ }
+ }
}
// TODO: simplify, add tests, check model properties (sparse etc) for optimization.
@@ -598,11 +655,11 @@ object NCModelEnricher extends NCProbeEnricher {
||
(
n.tokenIndexes == toksIdxsSorted ||
- n.tokenIndexes.containsSlice(toksIdxsSorted) &&
- U.isContinuous(toksIdxsSorted) &&
- U.isContinuous(n.tokenIndexes)
+ n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+ U.isContinuous(toksIdxsSorted) &&
+ U.isContinuous(n.tokenIndexes)
)
)
))
}
-}
\ No newline at end of file
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 286c8b4..6e0780e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -17,7 +17,6 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort
-import java.io.Serializable
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.NCService
import org.apache.nlpcraft.common.makro.NCMacroParser
@@ -26,6 +25,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
import scala.jdk.CollectionConverters._
@@ -187,59 +187,50 @@ object NCSortEnricher extends NCProbeEnricher {
*
* @param toksNoteData
*/
- private def split(toks: Seq[NCNlpSentenceToken], othersRefs: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
- val res =
- if (toksNoteData.nonEmpty) {
- val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
- /**
- * Returns flag which indicates are token contiguous or not.
- *
- * @param tok1Idx First token index.
- * @param tok2Idx Second token index.
- */
- def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
- val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
-
- between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
- }
+ private def split(
+ toks: Seq[NCNlpSentenceToken],
+ othersRefs: Seq[NCNlpSentenceToken],
+ toksNoteData: Seq[NoteData]
+ ): Seq[Seq[NoteData]] =
+ if (toksNoteData.nonEmpty) {
+ val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+ /**
+ * Returns flag which indicates are token contiguous or not.
+ *
+ * @param tok1Idx First token index.
+ * @param tok2Idx Second token index.
+ */
+ def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+ val between = toks.filter(t => t.index > tok1Idx && t.index < tok2Idx)
+
+ between.isEmpty || between.forall(p => p.isStopWord || p.stem == stemAnd)
+ }
- val toks2 = toks.filter(othersRefs.contains)
+ val toks2 = toks.filter(othersRefs.contains)
- val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
- val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
+ val minIdx = toks2.dropWhile(t => !isUserNotValue(t)).head.index
+ val maxIdx = toks2.reverse.dropWhile(t => !isUserNotValue(t)).head.index
- require(minIdx <= maxIdx)
+ require(minIdx <= maxIdx)
- def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
- seq += nd
+ def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+ seq += nd
- toksNoteData.
- filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
- foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+ toksNoteData.
+ filter(p => nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+ foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
- if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
- res += seq
- }
+ if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+ res += seq
+ }
- toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
+ toksNoteData.filter(_.indexes.head == minIdx).foreach(p => fill(p))
- res
- }
- else
- Seq.empty
-
- if (res.isEmpty && !nullable)
- throw new AssertionError(s"Invalid empty result " +
- s"[tokensTexts=[${toks.map(_.origText).mkString("|")}]" +
- s", notes=[${toks.flatten.map(n => s"${n.noteType}:[${n.tokenIndexes.mkString(",")}]").mkString("|")}]" +
- s", tokensIndexes=[${toks.map(_.index).mkString("|")}]" +
- s", allData=[${toksNoteData.mkString("|")}]" +
- s"]"
- )
-
- res.toSeq
- }
+ res
+ }
+ else
+ Seq.empty
/**
*
@@ -346,71 +337,75 @@ object NCSortEnricher extends NCProbeEnricher {
if (data1.nonEmpty || data2.nonEmpty) {
val seq1 =
if (data1.nonEmpty)
- split(part1, othersRefs, data1, nullable = false)
+ split(part1, othersRefs, data1)
else
- split(part2, othersRefs, data2, nullable = false)
- val seq2 =
- if (data1.nonEmpty && data2.nonEmpty)
- split(part2, othersRefs, data2, nullable = true)
- else
- Seq.empty
- val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
-
- typ match {
- case TYPE_SUBJ =>
- require(seq1.nonEmpty)
- require(seq2.isEmpty)
- require(sortToks.nonEmpty)
-
- // Ignores invalid cases.
- if (byToks.isEmpty)
- res =
- Some(
+ split(part2, othersRefs, data2)
+
+ if (seq1.nonEmpty) {
+ val seq2 =
+ if (data1.nonEmpty && data2.nonEmpty)
+ split(part2, othersRefs, data2)
+ else
+ Seq.empty
+
+ val asc = orderOpt.flatMap(o => Some(order(o.synonymIndex)._2))
+
+ typ match {
+ case TYPE_SUBJ =>
+ require(seq1.nonEmpty)
+ require(seq2.isEmpty)
+ require(sortToks.nonEmpty)
+
+ // Ignores invalid cases.
+ if (byToks.isEmpty)
+ res =
+ Some(
+ Match(
+ asc = asc,
+ main = sortToks,
+ stop = orderToks,
+ subjSeq = seq1,
+ bySeq = Seq.empty
+ )
+ )
+
+ case TYPE_SUBJ_BY =>
+ require(seq1.nonEmpty)
+ require(sortToks.nonEmpty)
+ require(byToks.nonEmpty)
+
+ if (seq2.isEmpty)
+ res = None
+ else
+ res = Some(
Match(
asc = asc,
main = sortToks,
- stop = orderToks,
+ stop = byToks ++ orderToks,
subjSeq = seq1,
- bySeq = Seq.empty
+ bySeq = seq2
)
)
- case TYPE_SUBJ_BY =>
- require(seq1.nonEmpty)
- require(sortToks.nonEmpty)
- require(byToks.nonEmpty)
+ case TYPE_BY =>
+ require(seq1.nonEmpty)
+ require(seq2.isEmpty)
+ require(sortToks.nonEmpty)
+ require(byToks.nonEmpty)
- if (seq2.isEmpty)
- res = None
- else
+ // `Sort by` as one element, see validation.
res = Some(
Match(
asc = asc,
- main = sortToks,
- stop = byToks ++ orderToks,
- subjSeq = seq1,
- bySeq = seq2
+ main = sortToks ++ byToks,
+ stop = orderToks,
+ subjSeq = Seq.empty,
+ bySeq = seq1
)
)
- case TYPE_BY =>
- require(seq1.nonEmpty)
- require(seq2.isEmpty)
- require(sortToks.nonEmpty)
- require(byToks.nonEmpty)
-
- // `Sort by` as one element, see validation.
- res = Some(
- Match(
- asc = asc,
- main = sortToks ++ byToks,
- stop = orderToks,
- subjSeq = Seq.empty,
- bySeq = seq1
- )
- )
-
- case _ => throw new AssertionError(s"Unexpected type: $typ")
+ case _ => throw new AssertionError(s"Unexpected type: $typ")
+ }
}
}
case None => // No-op.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index fc904d2..c0abd73 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -17,8 +17,6 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword
-import java.io.Serializable
-
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
@@ -26,6 +24,7 @@ import org.apache.nlpcraft.common.{NCE, NCService, U}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import java.io.Serializable
import scala.annotation.tailrec
/**
@@ -225,12 +224,20 @@ object NCStopWordEnricher extends NCProbeEnricher {
startScopedSpan(
"enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
) { _ =>
- mark(mdl.exclStopWordsStems, f = false)
- mark(mdl.addStopWordsStems, f = true)
- processGeo(ns)
- processDate(ns)
- processNums(ns)
- processCommonStops(mdl, ns)
+ if (mdl.model.isStopWordsAllowed) {
+ mark(mdl.exclStopWordsStems, f = false)
+ mark(mdl.addStopWordsStems, f = true)
+
+ // If a stop word is swallowed by any built token (numeric, date, etc.), its stop word marking is dropped.
+ ns.filter(t => t.isStopWord && !t.isNlp).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
+
+ processGeo(ns)
+ processDate(ns)
+ processNums(ns)
+ processCommonStops(mdl, ns)
+ }
+ else
+ ns.filter(_.isStopWord).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
}
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index d5dfc1e..12c6810 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -43,12 +43,7 @@ object NCSentenceManager extends NCService {
type CacheValue = Seq[Seq[NCNlpSentenceNote]]
private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
-
- /**
- *
- * @param notes
- */
- private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
+ def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
for (n <- notes.filter(n => n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
@@ -77,16 +72,31 @@ object NCSentenceManager extends NCService {
/**
*
+ * @param n
+ */
+ private def getParts(n: NCNlpSentenceNote): Option[Seq[NCTokenPartKey]] = {
+ // Reads the optional "parts" data entry of the note as a Java list of part keys.
+ val res: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
+
+ // Exposes the Java list as a Scala collection when present; absence is propagated as `None`.
+ res match {
+ case Some(v) => Some(v.asScala)
+ case None => None
+ }
+ }
+
+ /**
+ *
* @param notes
*/
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
- notes.
- filter(_.isUser).
- flatMap(n => {
- val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
+ def getPartKeys(notes: Seq[NCNlpSentenceNote]): Seq[NCTokenPartKey] =
+ // Only user notes are considered; part keys of all of them are merged and de-duplicated.
+ notes.filter(_.isUser).flatMap(getParts).flatten.distinct
- optList
- }).flatMap(_.asScala).distinct
+ /**
+ * Gets the part keys of a single note.
+ *
+ * @param note Note to get the part keys for.
+ * @return Part keys of the note, or an empty sequence if the note is not a user note or has no "parts" data.
+ */
+ def getPartKeys(note: NCNlpSentenceNote): Seq[NCTokenPartKey] =
+ if (!note.isUser) Seq.empty else getParts(note).getOrElse(Seq.empty)
/**
*
@@ -213,7 +223,8 @@ object NCSentenceManager extends NCService {
private def simpleCopy(
ns: NCNlpSentence,
history: mutable.ArrayBuffer[(Int, Int)],
- toksCopy: NCNlpSentence, i: Int
+ toksCopy: NCNlpSentence,
+ i: Int
): Seq[NCNlpSentenceToken] = {
val tokCopy = toksCopy(i)
@@ -279,9 +290,9 @@ object NCSentenceManager extends NCService {
private def fixIndexes(ns: NCNlpSentence, userNoteTypes: Seq[String]): Unit = {
// Replaces other notes indexes.
for (t <- userNoteTypes :+ "nlpcraft:nlp"; note <- ns.getNotes(t)) {
- val toks = ns.filter(_.contains(note)).sortBy(_.index)
+ val toks = ns.filter(_.contains(note))
- val newNote = note.clone(toks.map(_.index).toSeq, toks.flatMap(_.wordIndexes).toSeq.sorted)
+ val newNote = note.clone(toks.map(_.index), toks.flatMap(_.wordIndexes).toSeq.sorted)
toks.foreach(t => {
t.remove(note)
@@ -486,8 +497,9 @@ object NCSentenceManager extends NCService {
*
* @param ns Sentence.
* @param notNlpTypes Token types.
+ * @param lastPhase Phase.
*/
- private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = {
+ private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String], lastPhase: Boolean): Boolean = {
ns.
filter(!_.isNlp).
filter(_.isStopWord).
@@ -522,7 +534,8 @@ object NCSentenceManager extends NCService {
fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, histSeq) &&
fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, histSeq)
- if (res) {
+ // Validation is performed on the last phase only, just for performance reasons.
+ if (res && lastPhase) {
// Validation (all indexes calculated well)
require(
!res ||
@@ -544,21 +557,23 @@ object NCSentenceManager extends NCService {
* @param mdl
* @param ns
*/
- private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit =
- if (!mdl.getAbstractTokens.isEmpty) {
- val notes = ns.flatten
+ private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit = {
+ val abstractToks = mdl.getAbstractTokens
- val keys = getPartKeys(notes: _*)
+ if (!abstractToks.isEmpty) {
+ val notes = ns.flatten.distinct.filter(n => abstractToks.contains(n.noteType))
+
+ val keys = getPartKeys(notes)
val noteLinks = getLinks(notes)
notes.filter(n => {
- val noteToks = ns.tokens.filter(_.contains(n))
+ lazy val noteToks = ns.tokens.filter(t => t.index >= n.tokenFrom && t.index <= n.tokenTo)
- mdl.getAbstractTokens.contains(n.noteType) &&
- !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
+ !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted)) &&
+ !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex))
}).foreach(ns.removeNote)
}
+ }
/**
*
@@ -602,7 +617,7 @@ object NCSentenceManager extends NCService {
if (lastPhase)
dropAbstract(mdl, ns)
- if (collapseSentence(ns, getNotNlpNotes(ns.toSeq).map(_.noteType).distinct)) Some(ns) else None
+ if (collapseSentence(ns, getNotNlpNotes(ns.tokens).map(_.noteType).distinct, lastPhase)) Some(ns) else None
}
// Always deletes `similar` notes.
@@ -635,8 +650,8 @@ object NCSentenceManager extends NCService {
redundant.foreach(sen.removeNote)
var delCombs: Seq[NCNlpSentenceNote] =
- getNotNlpNotes(sen.toSeq).
- flatMap(note => getNotNlpNotes(note.tokenIndexes.sorted.map(i => sen(i))).filter(_ != note)).
+ getNotNlpNotes(sen.tokens).
+ flatMap(note => getNotNlpNotes(note.tokenIndexes.map(sen(_))).filter(_ != note)).
distinct
// Optimization. Deletes all wholly swallowed notes.
@@ -647,9 +662,9 @@ object NCSentenceManager extends NCService {
// There aren't links on it.
filter(n => !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
// It doesn't have links.
- filter(getPartKeys(_).isEmpty).
+ filter(n => getPartKeys(n).isEmpty).
flatMap(note => {
- val noteWordsIdxs = note.wordIndexes.toSet
+ val noteWordsIdxs = note.wordIndexesSet
val key = NCTokenPartKey(note, sen)
val delCombOthers =
@@ -657,7 +672,7 @@ object NCSentenceManager extends NCService {
if (
delCombOthers.nonEmpty &&
- !delCombOthers.exists(o => noteWordsIdxs.subsetOf(o.wordIndexes.toSet))
+ !delCombOthers.exists(o => noteWordsIdxs.subsetOf(o.wordIndexesSet))
)
Some(note)
else
@@ -675,7 +690,7 @@ object NCSentenceManager extends NCService {
groupBy { case (idx, _) => idx }.
map { case (_, seq) => seq.map { case (_, note) => note }.toSet }.
toSeq.sortBy(-_.size)
-
+
def findCombinations(): Seq[Seq[NCNlpSentenceNote]] =
NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala.toSeq)
@@ -709,7 +724,7 @@ object NCSentenceManager extends NCService {
Holder(
// We have to delete some keys to have possibility to compare sentences.
- notes.map(_.clone().filter { case (name, _) => name != "direct" }).toSeq,
+ notes.map(_.clone().toMap.filter { case (name, _) => name != "direct" }).toSeq,
sen,
notes.filter(_.isNlp).map(p => if (p.isDirect) 0 else 1).sum
)
@@ -732,7 +747,6 @@ object NCSentenceManager extends NCService {
)
)
-
def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
// Drops similar sentences (with same notes structure). Keeps with more found.
@@ -752,20 +766,38 @@ object NCSentenceManager extends NCService {
}
}.toSeq
- sens =
- sens.filter(s => {
- def mkNotNlp(s: NCNlpSentence): Set[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp).toSet
+ var sensWithNotes = sens.map(s => s -> s.flatten.filter(!_.isNlp).toSet)
- val notNlpNotes = mkNotNlp(s)
+ var sensWithNotesIdxs = sensWithNotes.zipWithIndex
- !sens.filter(_ != s).map(mkNotNlp).exists(notNlpNotes.subsetOf)
- })
+ sens =
+ sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
+ !sensWithNotesIdxs.
+ filter { case (_, idx2) => idx2 != idx1 }.
+ exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
+ }.map { case ((sen, _), _) => sen }
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words count.
- sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
+ sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
+
+ // Drops sentences whose notes (compared without indexes) are just a subset of a bigger sentence's notes.
+ sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }
+
+ sensWithNotesIdxs = sensWithNotes.zipWithIndex
+
+ sens = sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
+ !sensWithNotesIdxs.exists { case ((_, notNlpNotes2), idx2) =>
+ idx1 != idx2 && {
+ notNlpNotes2.size > notNlpNotes1.size &&
+ notNlpNotes1.forall(t1 => notNlpNotes2.exists(_.equalsWithoutIndexes(t1)))
+ }
+ }
+ }.map { case ((sen, _), _) => sen }
+
+ sens
}
override def start(parent: Span): NCService = {
@@ -797,5 +829,5 @@ object NCSentenceManager extends NCService {
*
* @param srvReqId
*/
- def clearCache(srvReqId: String): Unit = combCache -= srvReqId
+ def clearRequestData(srvReqId: String): Unit = combCache -= srvReqId
}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
new file mode 100644
index 0000000..e2d59f6
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs.synonyms
+
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.nlp.{NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
+import org.apache.nlpcraft.common.{NCService, U}
+import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{IDL, NCSynonymChunkKind, REGEX, TEXT}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeIdlToken => IdlToken, NCProbeSynonymChunk, NCProbeSynonym => Synonym}
+
+import scala.collection.mutable
+import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.compat.java8.OptionConverters._
+import scala.jdk.CollectionConverters.ListHasAsScala
+
+/**
+ *
+ */
+object NCSynonymsManager extends NCService {
+ /**
+ * Registry of the synonyms already tried for an element ID and a token sequence.
+ *
+ * @tparam T Type of the items that identify a token sequence.
+ */
+ private class CacheHolder[T] {
+ // Element ID -> tokens count -> tokens -> synonyms already registered for them.
+ private lazy val cache =
+ mutable.HashMap.empty[String, mutable.HashMap[Int, mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]]
+
+ /**
+ * Registers the synonym for the given element ID and token sequence.
+ *
+ * @param elemId Element ID.
+ * @param s Synonym.
+ * @param tokens Token sequence used as the key.
+ * @return `true` if this combination is registered for the first time, `false` if it was already present.
+ */
+ def isUnprocessed(elemId: String, s: Synonym, tokens: Seq[T]): Boolean = {
+ val byLength = cache.getOrElseUpdate(elemId, mutable.HashMap.empty)
+ val byTokens = byLength.getOrElseUpdate(tokens.length, mutable.HashMap.empty)
+ val registered = byTokens.getOrElseUpdate(tokens, mutable.HashSet.empty)
+
+ registered.add(s)
+ }
+ }
+
+ // Key identifying a token in the saved IDL data: token ID, its character range and,
+ // for tokens that are not user defined, a map of additional property values.
+ private case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
+
+ private object SavedIdlKey {
+ // Builds the key for the given token.
+ def apply(t: NCToken): SavedIdlKey =
+ if (t.isUserDefined)
+ // User defined tokens are identified by ID and character range only.
+ SavedIdlKey(t.getId, t.getStartCharIndex, t.getEndCharIndex)
+ else
+ SavedIdlKey(
+ t.getId,
+ t.getStartCharIndex,
+ t.getEndCharIndex,
+ // Adds the values of the properties listed by `getBuiltProperties` for this token ID,
+ // skipping the properties that are absent in the token metadata.
+ NlpNote.getBuiltProperties(t.getId).flatMap(p => t.metaOpt(p).asScala match {
+ case Some(v) => Some(p -> v)
+ case None => None
+ }).toMap
+ )
+ }
+
+ // Saved successful IDL check: the request, the token variants and the predicate it was evaluated with.
+ private case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
+ // Only the variants are printed.
+ override def toString: String = variants.toString()
+ }
+
+ // Server request ID -> token key -> saved successful IDL checks; filled by `save`.
+ private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[Value]]]
+ // Server request ID -> (item, chunk) -> cached result of the chunk match.
+ private val idlChunksCache = mutable.HashMap.empty[String, mutable.HashMap[(IdlToken, NCProbeSynonymChunk), Boolean]]
+ // Server request ID -> registry of already tried combinations keyed by IDL items.
+ private val idlCaches = mutable.HashMap.empty[String, CacheHolder[IdlToken]]
+ // Server request ID -> registry of already tried combinations keyed by token indexes.
+ private val tokCaches = mutable.HashMap.empty[String, CacheHolder[Int]]
+
+ // Starts the service: nothing is initialized here, only the start acknowledgements are issued.
+ override def start(parent: Span): NCService = {
+ ackStarting()
+
+ ackStarted()
+ }
+
+ // Stops the service: nothing is released here, only the stop acknowledgements are issued.
+ override def stop(parent: Span): Unit = {
+ ackStopping()
+
+ ackStopped()
+ }
+
+ /**
+ * Checks whether the given token matches the given text or regex synonym chunk.
+ *
+ * @param tok Token to check.
+ * @param chunk Synonym chunk of `TEXT` or `REGEX` kind; any other kind raises an `AssertionError`.
+ */
+ private def isMatch(tok: NlpToken, chunk: NCProbeSynonymChunk): Boolean = {
+ val kind = chunk.kind
+
+ if (kind == TEXT)
+ chunk.wordStem == tok.stem
+ else if (kind == REGEX)
+ // Either the original or the normalized text has to match the whole pattern.
+ chunk.regex.matcher(tok.origText).matches() || chunk.regex.matcher(tok.normText).matches()
+ else
+ // IDL chunks and unknown kinds are not expected here.
+ throw new AssertionError()
+ }
+
+ /**
+ * Gets the order in which chunks of the given kind are checked: text first, then IDL, then regex.
+ *
+ * @param kind Synonym chunk kind; an unknown kind raises an `AssertionError`.
+ */
+ private def getSort(kind: NCSynonymChunkKind): Int =
+ if (kind == TEXT)
+ 0
+ else if (kind == IDL)
+ 1
+ else if (kind == REGEX)
+ 2
+ else
+ throw new AssertionError(s"Unexpected kind: $kind")
+
+ /**
+ * Tries to pick, for each chunk of the synonym, one item from the given items that matches it.
+ *
+ * @param s Synonym whose chunks are matched in their own order.
+ * @param toks Candidate items; nothing is matched if there are fewer items than synonym chunks.
+ * @param isMatch Predicate telling whether an item matches a chunk.
+ * @param getIndex Position of an item; used for the order check of non-permuted synonyms and for the neighbors check.
+ * @param shouldBeNeighbors Whether the sorted positions of the picked items must additionally pass `U.isIncreased`.
+ * @tparam T Type of the matched items.
+ */
+ private def sparseMatch0[T](
+ s: Synonym,
+ toks: Seq[T],
+ isMatch: (T, NCProbeSynonymChunk) => Boolean,
+ getIndex: T => Int,
+ shouldBeNeighbors: Boolean
+ ): Option[Seq[T]] =
+ if (toks.size >= s.size) {
+ // `res` holds the item picked for each processed chunk, `all` holds every item that matched any chunk.
+ lazy val res = mutable.ArrayBuffer.empty[T]
+ lazy val all = mutable.HashSet.empty[T]
+
+ // 0 - no chunk processed yet, 1 - matching in progress, -1 - matching failed.
+ var state = 0
+
+ for (chunk <- s if state != -1) {
+ val seq =
+ if (state == 0) {
+ state = 1
+
+ toks.filter(t => isMatch(t, chunk))
+ }
+ else
+ // Items already picked for previous chunks cannot be reused.
+ toks.filter(t => !res.contains(t) && isMatch(t, chunk))
+
+ if (seq.nonEmpty) {
+ val head = seq.head
+
+ // Without permutation, the picked items must follow the chunks order.
+ if (!s.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+ state = -1
+ else {
+ all ++= seq
+
+ // More matching items than synonym chunks - the match is rejected.
+ if (all.size > s.size)
+ state = -1
+ else
+ res += head
+ }
+ }
+ else
+ state = -1
+ }
+
+ // Succeeds only if every matching item was picked and, if requested, the neighbors check passes.
+ if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
+ Some(res.toSeq)
+ else
+ None
+ }
+ else
+ None
+
+ /**
+ * Remembers a successful IDL check of the given token for the request.
+ *
+ * @param req Request; its server request ID is the first level key.
+ * @param tok Token the predicate was satisfied for.
+ * @param pred IDL predicate.
+ * @param variantsToks Token variants the predicate was evaluated with.
+ */
+ private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction, variantsToks: Seq[Seq[NCToken]]): Unit = {
+ val byKey = savedIdl.getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty)
+ val values = byKey.getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty)
+
+ values += Value(req, variantsToks, pred)
+ }
+
+ /**
+ * Registers the combination in the request's token cache and tells whether it is new.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param s Synonym.
+ * @param tokens Token indexes.
+ */
+ private def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean = {
+ val holder = tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int])
+
+ holder.isUnprocessed(elemId, s, tokens)
+ }
+
+ /**
+ * Registers the combination in the request's IDL cache and tells whether it is new.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param s Synonym.
+ * @param tokens IDL items (tokens or words).
+ */
+ private def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[IdlToken]): Boolean = {
+ val holder = idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken])
+
+ holder.isUnprocessed(elemId, s, tokens)
+ }
+
+ /**
+ * Checks whether the given item (token or word) matches the given synonym chunk, caching the result per request.
+ *
+ * @param tow Item to check: a token or a word.
+ * @param chunk Synonym chunk of any kind.
+ * @param req Request; provides the cache key and is passed to the IDL predicate.
+ * @param variantsToks Token variants the IDL predicate is evaluated against.
+ */
+ private def isMatch(
+ tow: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
+ ): Boolean =
+ idlChunksCache.
+ getOrElseUpdate(req.getServerRequestId,
+ mutable.HashMap.empty[(IdlToken, NCProbeSynonymChunk), Boolean]
+ ).
+ getOrElseUpdate(
+ (tow, chunk),
+ // Evaluated only when the result for this item and chunk is not cached yet.
+ {
+ // Reads a value from the underlying token or word, whichever the item holds.
+ def get0[T](fromToken: NCToken => T, fromWord: NlpToken => T): T =
+ if (tow.isToken) fromToken(tow.token) else fromWord(tow.word)
+
+ chunk.kind match {
+ case TEXT => chunk.wordStem == get0(_.stem, _.stem)
+
+ case REGEX =>
+ chunk.regex.matcher(get0(_.origText, _.origText)).matches() ||
+ chunk.regex.matcher(get0(_.normText, _.normText)).matches()
+
+ case IDL =>
+ // Words never satisfy IDL chunks; tokens do if the predicate holds for at least one variant.
+ val ok =
+ variantsToks.par.exists(vrntToks =>
+ get0(t =>
+ chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)).
+ value.asInstanceOf[Boolean],
+ _ => false
+ )
+ )
+
+ // Successful checks are remembered so that they can be re-validated by `isStillValidIdl`.
+ if (ok)
+ save(req, tow.token, chunk.idlPred, variantsToks)
+
+ ok
+
+ case _ => throw new AssertionError()
+ }
+ }
+ )
+
+ /**
+ * Invokes the callback if the given synonym matches the given tokens chunk by chunk.
+ * A combination of element ID, synonym and token indexes that is already registered
+ * in the request's cache is skipped.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param syn Synonym; must be neither sparse nor contain IDL chunks.
+ * @param toks Tokens to match; must not be `null`.
+ * @param callback Called when the synonym matches.
+ */
+ def onMatch(srvReqId: String, elemId: String, syn: Synonym, toks: Seq[NlpToken], callback: Unit => Unit): Unit = {
+ // Must be checked before `toks` is dereferenced below, otherwise the check can never fire.
+ require(toks != null)
+
+ if (isUnprocessedTokens(srvReqId, elemId, syn, toks.map(_.index))) {
+ require(!syn.sparse && !syn.hasIdl)
+
+ if (
+ toks.length == syn.length && {
+ if (syn.isTextOnly)
+ toks.zip(syn).forall(p => p._1.stem == p._2.wordStem)
+ else
+ // Chunks are checked in the order defined by `getSort`.
+ toks.zip(syn).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
+ }
+ )
+ // Unit value is passed explicitly instead of relying on empty argument list adaptation.
+ callback(())
+ }
+ }
+
+ /**
+ * Invokes the callback if the given synonym matches the given items (tokens or words) chunk by chunk.
+ * A combination of element ID, synonym and items that is already registered in the request's cache is skipped.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param s Synonym.
+ * @param toks Items to match; must not be `null`.
+ * @param req Request passed to the IDL predicates.
+ * @param variantsToks Token variants passed to the IDL predicates.
+ * @param callback Called when the synonym matches.
+ */
+ def onMatch(
+ srvReqId: String,
+ elemId: String,
+ s: Synonym,
+ toks: Seq[IdlToken],
+ req: NCRequest,
+ variantsToks: Seq[Seq[NCToken]],
+ callback: Unit => Unit
+ ): Unit = {
+ // Must be checked before `toks` is dereferenced below, otherwise the check can never fire.
+ require(toks != null)
+
+ if (
+ isUnprocessedIdl(srvReqId, elemId, s, toks) &&
+ toks.length == s.length &&
+ // There must be at least as many tokens among the items as IDL chunks in the synonym.
+ toks.count(_.isToken) >= s.idlChunks && {
+ toks.zip(s).sortBy(p => getSort(p._2.kind)).forall {
+ case (tow, chunk) => isMatch(tow, chunk, req, variantsToks)
+ }
+ }
+ )
+ // Unit value is passed explicitly instead of relying on empty argument list adaptation.
+ callback(())
+ }
+
+ /**
+ * Invokes the callback with the matched tokens if the given sparse synonym matches the given tokens.
+ * A combination of element ID, synonym and token indexes that is already registered
+ * in the request's cache is skipped.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param syn Synonym; must be sparse and must not contain IDL chunks.
+ * @param toks Tokens to match; must not be `null`.
+ * @param callback Called with the tokens picked for the synonym chunks.
+ */
+ def onSparseMatch(
+ srvReqId: String, elemId: String, syn: Synonym, toks: Seq[NlpToken], callback: Seq[NlpToken] => Unit
+ ): Unit = {
+ // Must be checked before `toks` is dereferenced below, otherwise the check can never fire.
+ require(toks != null)
+
+ if (isUnprocessedTokens(srvReqId, elemId, syn, toks.map(_.index))) {
+ require(syn.sparse && !syn.hasIdl)
+
+ sparseMatch0(syn, toks, isMatch, (t: NlpToken) => t.startCharIndex, shouldBeNeighbors = false) match {
+ case Some(res) => callback(res)
+ case None => // No-op.
+ }
+ }
+ }
+
+ /**
+ * Invokes the callback with the matched items if the given IDL synonym matches the given items (tokens or words).
+ * A combination of element ID, synonym and items that is already registered in the request's cache is skipped.
+ *
+ * @param srvReqId Server request ID.
+ * @param elemId Element ID.
+ * @param syn Synonym; must contain IDL chunks.
+ * @param toks Items to match; must not be `null`.
+ * @param req Request passed to the IDL predicates; must not be `null`.
+ * @param variantsToks Token variants passed to the IDL predicates.
+ * @param callback Called with the items picked for the synonym chunks.
+ */
+ def onSparseMatch(
+ srvReqId: String,
+ elemId: String,
+ syn: Synonym,
+ toks: Seq[IdlToken],
+ req: NCRequest,
+ variantsToks: Seq[Seq[NCToken]],
+ callback: Seq[IdlToken] => Unit
+ ): Unit = {
+ // Must be checked before `toks` is dereferenced below, otherwise the check can never fire.
+ require(toks != null)
+
+ if (isUnprocessedIdl(srvReqId, elemId, syn, toks)) {
+ require(req != null)
+ require(syn.hasIdl)
+
+ sparseMatch0(
+ syn,
+ toks,
+ (t: IdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
+ (t: IdlToken) => if (t.isToken) t.token.getStartCharIndex else t.word.startCharIndex,
+ // Synonyms that are not sparse additionally have to pass the neighbors check.
+ shouldBeNeighbors = !syn.sparse
+ ) match {
+ case Some(res) => callback(res)
+ case None => // No-op.
+ }
+ }
+ }
+
+ /**
+ * Re-validates the IDL checks saved for the request against the given sentence tokens.
+ *
+ * @param srvReqId Server request ID; if nothing is saved for it, the result is `true`.
+ * @param senToks Sentence tokens to validate.
+ */
+ def isStillValidIdl(srvReqId: String, senToks: Seq[NCToken]): Boolean =
+ savedIdl.get(srvReqId) match {
+ case Some(m) =>
+ // Keys of all sentence tokens and, recursively, of their part tokens.
+ lazy val allCheckedSenToks = {
+ val set = mutable.HashSet.empty[SavedIdlKey]
+
+ def add(t: NCToken): Unit = {
+ set += SavedIdlKey(t)
+
+ t.getPartTokens.asScala.foreach(add)
+ }
+
+ senToks.foreach(add)
+
+ set
+ }
+
+ senToks.forall(tok =>
+ m.get(SavedIdlKey(tok)) match {
+ case Some(vals) =>
+ // At least one saved check must have a variant for which the predicate still holds
+ // and whose tokens are all either "nlpcraft:nlp" tokens or present in the sentence.
+ vals.exists(
+ v =>
+ v.variants.exists(winHistVariant =>
+ v.predicate.apply(
+ tok, NCIdlContext(toks = winHistVariant, req = v.request)
+ ).value.asInstanceOf[Boolean] &&
+ winHistVariant.map(SavedIdlKey(_)).forall(t =>
+ t.id == "nlpcraft:nlp" || allCheckedSenToks.contains(t)
+ )
+ )
+ )
+
+ // Nothing was saved for this token - nothing to validate.
+ case None => true
+ })
+
+ case None => true
+ }
+
+ /**
+ * Drops everything kept for the request: the iteration caches and the saved IDL checks.
+ *
+ * @param srvReqId Server request ID.
+ */
+ def clearRequestData(srvReqId: String): Unit = {
+ clearIteration(srvReqId)
+
+ savedIdl.remove(srvReqId)
+ }
+
+ /**
+ * Drops the request's match caches; the saved IDL checks are kept.
+ *
+ * @param srvReqId Server request ID.
+ */
+ def clearIteration(srvReqId: String): Unit = {
+ idlChunksCache.remove(srvReqId)
+ idlCaches.remove(srvReqId)
+ tokCaches.remove(srvReqId)
+ }
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
index 03b749f..df745a0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/NCServerEnrichmentManager.scala
@@ -156,7 +156,7 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
if (h.enabledBuiltInTokens == normEnabledBuiltInToks) {
prepareAsciiTable(h.sentence).info(logger, Some(s"Sentence enriched (from cache): '$normTxt'"))
- h.sentence
+ h.sentence.copy(Some(srvReqId))
}
else
process(srvReqId, normTxt, enabledBuiltInToks, span)
@@ -224,7 +224,8 @@ object NCServerEnrichmentManager extends NCService with NCIgniteInstance {
.getNotes(hdr.noteType)
.filter(_.contains(hdr.noteName))
.map(note => {
- val s = note(hdr.noteName).toString()
+ val s = note(hdr.noteName).toString
+
if (isStopWord) s"${r(s)}" else s
})
.toSeq
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
index 670a4dc..cf39575 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/numeric/NCNumericEnricher.scala
@@ -207,7 +207,7 @@ object NCNumericEnricher extends NCServerEnricher {
toIncl: Boolean,
toFractional: Boolean,
unitDataOpt: Option[NCNumericUnitData],
- ): Seq[NCNlpSentenceNote] = {
+ ): Unit= {
val params =
mutable.ArrayBuffer.empty[(String, Any)] ++
Seq(
@@ -223,7 +223,7 @@ object NCNumericEnricher extends NCServerEnricher {
"isToPositiveInfinity" -> (to == MAX_VALUE)
)
- def mkAndAssign(toks: Seq[NCNlpSentenceToken], typ: String, params: (String, Any)*):NCNlpSentenceNote = {
+ def mkAndAssign(toks: Seq[NCNlpSentenceToken], params: (String, Any)*):NCNlpSentenceNote = {
val note = NCNlpSentenceNote(toks.map(_.index), "nlpcraft:num", params:_*)
toks.foreach(_.add(note))
@@ -241,17 +241,17 @@ object NCNumericEnricher extends NCServerEnricher {
}
if (unitData.tokens == toks)
- Seq(mkAndAssign(toks, "nlpcraft:num", extend():_*))
+ Seq(mkAndAssign(toks, extend():_*))
else {
Seq(
mkAndAssign(
- toks.filter(t => !unitData.tokens.contains(t)), "nlpcraft:num", params.clone():_*
+ toks.filter(t => !unitData.tokens.contains(t)), params.clone():_*
),
- mkAndAssign(toks, "nlpcraft:num", extend():_*)
+ mkAndAssign(toks, extend():_*)
)
}
- case None => Seq(mkAndAssign(toks, "nlpcraft:num", params:_*))
+ case None => Seq(mkAndAssign(toks, params:_*))
}
}
@@ -316,7 +316,7 @@ object NCNumericEnricher extends NCServerEnricher {
Some(NCNumericUnitData(num1.unitData.get.unit, num1.tokens ++ num2.tokens))
}
- val notes = p._2 match {
+ p._2 match {
case BETWEEN_EXCLUSIVE =>
mkNotes(
prepToks,
@@ -364,79 +364,75 @@ object NCNumericEnricher extends NCServerEnricher {
processed ++= toks
- val notes =
- prep.prepositionType match {
- case MORE =>
- mkNotes(
- toks,
- num.value,
- fromIncl = false,
- fromFractional = num.isFractional,
- to = MAX_VALUE,
- toIncl = true,
- toFractional = num.isFractional,
- num.unitData
- )
- case MORE_OR_EQUAL =>
- mkNotes(
- toks,
- num.value,
- fromIncl = true,
- fromFractional = num.isFractional,
- to = MAX_VALUE,
- toIncl = true,
- toFractional = num.isFractional,
- num.unitData
- )
- case LESS =>
- mkNotes(
- toks,
- MIN_VALUE,
- fromIncl = true,
- fromFractional = num.isFractional,
- to = num.value,
- toIncl = false,
- toFractional = num.isFractional,
- num.unitData
- )
- case LESS_OR_EQUAL =>
- mkNotes(
- toks,
- MIN_VALUE,
- fromIncl = true,
- fromFractional = num.isFractional,
- to = num.value,
- toIncl = true,
- toFractional = num.isFractional,
- num.unitData
- )
- case EQUAL =>
- mkNotes(
- toks,
- num.value,
- fromIncl = true,
- fromFractional = num.isFractional,
- to = num.value,
- toIncl = true,
- toFractional = num.isFractional,
- num.unitData
- )
- case NOT_EQUAL =>
- mkNotes(
- toks,
- num.value,
- fromIncl = false,
- fromFractional = num.isFractional,
- to = num.value,
- toIncl = false,
- toFractional = num.isFractional,
- num.unitData
- )
- case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
- }
-
- for (note <- notes)
- toks.foreach(_.add(note))
+ prep.prepositionType match {
+ case MORE =>
+ mkNotes(
+ toks,
+ num.value,
+ fromIncl = false,
+ fromFractional = num.isFractional,
+ to = MAX_VALUE,
+ toIncl = true,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case MORE_OR_EQUAL =>
+ mkNotes(
+ toks,
+ num.value,
+ fromIncl = true,
+ fromFractional = num.isFractional,
+ to = MAX_VALUE,
+ toIncl = true,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case LESS =>
+ mkNotes(
+ toks,
+ MIN_VALUE,
+ fromIncl = true,
+ fromFractional = num.isFractional,
+ to = num.value,
+ toIncl = false,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case LESS_OR_EQUAL =>
+ mkNotes(
+ toks,
+ MIN_VALUE,
+ fromIncl = true,
+ fromFractional = num.isFractional,
+ to = num.value,
+ toIncl = true,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case EQUAL =>
+ mkNotes(
+ toks,
+ num.value,
+ fromIncl = true,
+ fromFractional = num.isFractional,
+ to = num.value,
+ toIncl = true,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case NOT_EQUAL =>
+ mkNotes(
+ toks,
+ num.value,
+ fromIncl = false,
+ fromFractional = num.isFractional,
+ to = num.value,
+ toIncl = false,
+ toFractional = num.isFractional,
+ num.unitData
+ )
+ case _ => throw new AssertionError(s"Illegal note type: ${prep.prepositionType}.")
+ }
}
}
@@ -448,7 +444,7 @@ object NCNumericEnricher extends NCServerEnricher {
// Numeric without conditions.
for (num <- nums if !processed.exists(num.tokens.contains)) {
- val notes = mkNotes(
+ mkNotes(
num.tokens,
num.value,
fromIncl = true,
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/NCTestElement.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/NCTestElement.scala
index daf1ab0..9d4c746 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/NCTestElement.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/NCTestElement.scala
@@ -36,10 +36,12 @@ case class NCTestElement(id: String, syns: String*) extends NCElement {
var permutateSynonyms: Optional[lang.Boolean] = super.isPermutateSynonyms
var sparse: Optional[lang.Boolean] = super.isSparse
var greedy: Optional[lang.Boolean] = super.isGreedy
+ var groups: Seq[String] = Seq(id)
override def getId: String = id
override def getSynonyms: util.List[String] = (syns :+ id).asJava
override def getValues: util.List[NCValue] = values
+ override def getGroups: util.List[String] = groups.asJava
override def getMetadata: util.Map[String, AnyRef] = metadata
override def getDescription: String = description
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
index c0a8ac4..f9911f6 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stm/indexes/NCSpecModelAdapter.scala
@@ -19,11 +19,12 @@ package org.apache.nlpcraft.model.stm.indexes
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
+import org.apache.nlpcraft.NCTestElement
import org.apache.nlpcraft.model.{NCElement, NCModelAdapter}
import java.util
-import java.util.Collections
-import scala.jdk.CollectionConverters.{SeqHasAsJava, SetHasAsJava}
+import java.util.Optional
+import scala.jdk.CollectionConverters.SetHasAsJava
object NCSpecModelAdapter {
val mapper = new ObjectMapper()
@@ -34,8 +35,8 @@ object NCSpecModelAdapter {
class NCSpecModelAdapter extends NCModelAdapter("nlpcraft.stm.idxs.test", "STM Indexes Test Model", "1.0") {
override def getElements: util.Set[NCElement] =
Set(
- mkElement("A2", "G1", "a a"),
- mkElement("B2", "G1", "b b"),
+ mkElement("A2", "G1", "a a", greedy = false),
+ mkElement("B2", "G1", "b b", greedy = false),
mkElement("X", "G2", "x"),
mkElement("Y", "G2", "y"),
@@ -43,14 +44,12 @@ class NCSpecModelAdapter extends NCModelAdapter("nlpcraft.stm.idxs.test", "STM I
mkElement("Z", "G3", "z")
).asJava
- private def mkElement(id: String, group: String, syns: String*): NCElement =
- new NCElement {
- override def getId: String = id
- override def getSynonyms: util.List[String] = {
- val seq: Seq[String] = syns
+ private def mkElement(id: String, group: String, syns: String, greedy: Boolean = true): NCElement = {
+ val e = NCTestElement(id, syns)
- seq.asJava
- }
- override def getGroups: util.List[String] = Collections.singletonList(group)
- }
+ e.greedy = Optional.of(greedy)
+ e.groups = Seq(group)
+
+ e
+ }
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
new file mode 100644
index 0000000..3f87f35
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsAllowedSpec.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCContext, NCElement, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+import scala.language.implicitConversions
+
+/**
+ * Base test model declaring one element "a" (with synonym "the test"); its onContext callback always answers "OK".
+ */
+class NCStopWordsAllowedModelAdapter extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+ override def getElements: util.Set[NCElement] = Set(NCTestElement("a", "the test"))
+
+ override def onContext(ctx: NCContext): NCResult = {
+ ctx.getVariants.asScala.forall(t => t.asScala.exists(_.isStopWord) == isStopWordsAllowed) // NOTE(review): the Boolean result is discarded, so nothing is asserted here - confirm whether require(...) was intended.
+
+ NCResult.text("OK")
+ }
+}
+/**
+ * Variant of NCStopWordsAllowedModelAdapter that overrides isStopWordsAllowed to return true.
+ */
+class NCStopWordsAllowedModel extends NCStopWordsAllowedModelAdapter {
+ override def isStopWordsAllowed: Boolean = true
+}
+
+/**
+ * Variant of NCStopWordsAllowedModelAdapter that overrides isStopWordsAllowed to return false.
+ */
+class NCStopWordsNotAllowedModel extends NCStopWordsAllowedModelAdapter {
+ override def isStopWordsAllowed: Boolean = false
+}
+
+/**
+ * Runs checkResult with expected value "OK" for several inputs built from "the" and "test", using NCStopWordsAllowedModel.
+ */
+@NCTestEnvironment(model = classOf[NCStopWordsAllowedModel], startClient = true)
+class NCStopWordsAllowedSpec extends NCTestContext {
+ @Test
+ def test(): Unit = {
+ checkResult("the", "OK")
+ checkResult("the test", "OK")
+ checkResult("the the test", "OK")
+ checkResult("test the the test", "OK")
+ }
+}
+
+/**
+ * Reuses the test inherited from NCStopWordsAllowedSpec, but with NCStopWordsNotAllowedModel as the environment model.
+ */
+@NCTestEnvironment(model = classOf[NCStopWordsNotAllowedModel], startClient = true)
+class NCStopWordsNotAllowedSpec extends NCStopWordsAllowedSpec
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala
new file mode 100644
index 0000000..07ca216
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsBaseSpec.scala
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.language.implicitConversions
+
+/**
+ * Test model with elements "a", "b" and "xy" (synonym "x y") and two intents: twoWords (terms on 'a' and 'b') and oneWord (term on 'xy').
+ */
+class NCStopWordsBaseModel extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+ override def getElements: util.Set[NCElement] = Set(
+ NCTestElement("a"),
+ NCTestElement("b"),
+ NCTestElement("xy", "x y"),
+ )
+
+ @NCIntent(
+ "intent=twoWords " +
+ " term(a)~{# == 'a'}" +
+ " term(b)~{# == 'b'}"
+ )
+ def onTwoWords(): NCResult = NCResult.text("OK")
+
+ @NCIntent(
+ "intent=oneWord " +
+ " term(xt)~{# == 'xy'}"
+ )
+ def onOneWord(): NCResult = NCResult.text("OK")
+}
+
+/**
+ * Calls checkIntent for inputs where "the" appears before, between and after the element words, expecting intents twoWords and oneWord.
+ */
+@NCTestEnvironment(model = classOf[NCStopWordsBaseModel], startClient = true)
+class NCStopWordsBaseSpec extends NCTestContext {
+ @Test
+ def testTwoWords(): Unit = {
+ checkIntent("a b", "twoWords")
+ checkIntent("a the b", "twoWords")
+ checkIntent("a the the b", "twoWords")
+ checkIntent("the a the b", "twoWords")
+ checkIntent("the a the b the the", "twoWords")
+ }
+
+ @Test
+ def testOneWord(): Unit = {
+ checkIntent("x y", "oneWord")
+ checkIntent("x the y", "oneWord")
+ checkIntent("x the the y", "oneWord")
+ checkIntent("the x the y", "oneWord")
+ checkIntent("the x the y the the", "oneWord")
+ }
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
new file mode 100644
index 0000000..b51207c
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.stop
+
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCIntentMatch, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.jdk.CollectionConverters.CollectionHasAsScala
+import scala.language.implicitConversions
+
+/**
+ * Test model with one element "complex" (synonym "a b"); the intent callback requires a single variant holding a single token whose normalized text equals the request's normalized text.
+ */
+class NCStopWordsInsideModel extends NCModelAdapter("nlpcraft.test", "Test Model", "1.0") {
+ override def getElements: util.Set[NCElement] = Set(NCTestElement("complex", "a b"))
+
+ @NCIntent("intent=i term={# == 'complex'}")
+ def onI(ctx: NCIntentMatch): NCResult = {
+ require(ctx.getContext.getVariants.size() == 1)
+ require(ctx.getContext.getVariants.asScala.head.asScala.size == 1)
+ require(ctx.getContext.getVariants.asScala.head.asScala.head.getNormalizedText == ctx.getContext.getRequest.getNormalizedText)
+
+ NCResult.text("OK")
+ }
+}
+
+/**
+ * Calls checkIntent expecting intent "i" for "a b" and for variations with "the" and/or commas placed between the two words.
+ */
+@NCTestEnvironment(model = classOf[NCStopWordsInsideModel], startClient = true)
+class NCStopWordsInsideSpec extends NCTestContext {
+ @Test
+ def test(): Unit = {
+ checkIntent("a b", "i")
+ checkIntent("a the b", "i")
+ checkIntent("a , b", "i")
+ checkIntent("a, b", "i")
+ checkIntent("a, the b", "i")
+ checkIntent("a, the, b", "i")
+ }
+}
+
+class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel { // Same model, with isPermutateSynonyms and isSparse overridden to true.
+ override def isPermutateSynonyms: Boolean = true
+ override def isSparse: Boolean = true
+}
+
+@NCTestEnvironment(model = classOf[NCStopWordsInsideSparseModel], startClient = true)
+class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec {
+ @Test
+ def test2(): Unit = {
+ // TODO: extend it - this body is empty, so no sparse-specific input is checked here yet.
+ }
+}
+
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
index 503e093..3aee776 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCEnricherLimitSpec.scala
@@ -68,12 +68,6 @@ class NCEnricherLimitSpec extends NCEnricherBaseSpec {
lim(text = "handful of", limit = 5, index = 1, note = "A", asc = false),
usr(text = "A", id = "A"),
usr(text = "B", id = "B")
- ),
- Seq(
- nlp("handful"),
- nlp("of"),
- usr(text = "A", id = "A"),
- usr(text = "B", id = "B")
)
)
)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index 4d5d991..8b25e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -94,8 +94,7 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
),
_ => checkExists(
"y the y",
- usr(text = "y y", id = "y3"),
- nlp(text = "the", isStop = true)
+ usr(text = "y the y", id = "y3")
),
_ => checkExists(
"y xxx y",
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index 27082f1..758171f 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -17,17 +17,15 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
-import org.apache.nlpcraft.model.{NCElement, NCIntent, NCIntentMatch, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCModelAdapter, NCResult}
import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
import org.junit.jupiter.api.Test
import java.util
import scala.jdk.CollectionConverters.SetHasAsJava
-/**
- * Nested Elements test model.
- */
-class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
+// It shouldn't be too slow.
+class NCNestedTestModel4Adapter extends NCModelAdapter("nlpcraft.nested4.test.mdl", "Nested Test Model", "1.0") {
override def getElements: util.Set[NCElement] =
Set(
NCTestElement("e1", "//[a-zA-Z0-9]+//"),
@@ -36,16 +34,22 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
override def getAbstractTokens: util.Set[String] = Set("e1").asJava
override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+}
- @NCIntent("intent=onE2 term(t1)={# == 'e2'}[8, 100]")
- def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+/**
+ * Greedy(one element expected) + not permuted.
+ */
+class NCNestedTestModel41 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}")
+ def onAB(): NCResult = NCResult.text("OK")
+ override def isGreedy: Boolean = true
override def isPermutateSynonyms: Boolean = false
override def isSparse: Boolean = false
}
/**
- * It shouldn't be too slow.
+ *
*/
@NCTestEnvironment(model = classOf[NCNestedTestModel41], startClient = true)
class NCEnricherNestedModelSpec41 extends NCTestContext {
@@ -53,17 +57,66 @@ class NCEnricherNestedModelSpec41 extends NCTestContext {
def test(): Unit = checkIntent("the a " * 11, "onE2")
}
-class NCNestedTestModel42 extends NCNestedTestModel41 {
+/**
+ * Not greedy (a few elements expected), synonyms not permuted, not sparse.
+ */
+class NCNestedTestModel42 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = false
+ override def isPermutateSynonyms: Boolean = false
+ override def isSparse: Boolean = false
+}
+
+/**
+ * Runs the onE2 intent check against NCNestedTestModel42 (previously bound to NCNestedTestModel41 by a copy-paste slip, leaving model 42 untested).
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
+class NCEnricherNestedModelSpec42 extends NCTestContext {
+ @Test
+ def test(): Unit = checkIntent("the a " * 11, "onE2")
+}
+
+/**
+ * Greedy(one element expected) + permuted.
+ */
+class NCNestedTestModel43 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[1, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = true
override def isPermutateSynonyms: Boolean = true
override def isSparse: Boolean = true
}
/**
- * It shouldn't be too slow.
+ *
*/
-@NCTestEnvironment(model = classOf[NCNestedTestModel42], startClient = true)
-class NCEnricherNestedModelSpec42 extends NCTestContext {
+@NCTestEnvironment(model = classOf[NCNestedTestModel43], startClient = true)
+class NCEnricherNestedModelSpec43 extends NCTestContext {
+ @Test
+ def test(): Unit = checkIntent("the a " * 4, "onE2")
+}
+
+/**
+ * Not greedy (a few elements expected), synonyms permuted, sparse.
+ */
+class NCNestedTestModel44 extends NCNestedTestModel4Adapter {
+ @NCIntent("intent=onE2 term(t1)={# == 'e2'}[3, 100]")
+ def onAB(): NCResult = NCResult.text("OK")
+
+ override def isGreedy: Boolean = false
+ override def isPermutateSynonyms: Boolean = true
+ override def isSparse: Boolean = true
+}
+
+/**
+ *
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel44], startClient = true)
+class NCEnricherNestedModelSpec44 extends NCTestContext {
@Test
- def test(): Unit = checkIntent("the a " * 8, "onE2")
+ def test(): Unit = checkIntent("the a " * 4, "onE2")
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index 228885d..7b8d858 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -224,8 +224,7 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
_ => checkExists(
"sort A the A the A",
srt(text = "sort", typ = SUBJ_ONLY, note = "wrapperA", index = 1),
- usr("A A A", "wrapperA"),
- nlp("the the", isStop = true)
+ usr("A the A the A", "wrapperA")
)
)
}