You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/18 09:05:14 UTC
[incubator-nlpcraft] 03/08: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 1ccf5c5cc5a489b2565500d31209e4af19aa8187
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Sep 16 12:10:22 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 25 ++++++++++++++++++----
.../model/NCEnricherNestedModelSpec.scala | 3 +--
.../nlp/enrichers/sort/NCEnricherSortSpec.scala | 3 +--
3 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 6908265..22af412 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -450,6 +450,22 @@ object NCModelEnricher extends NCProbeEnricher {
)
}
+ /**
+ * Extends matched tokens with the stop words lying strictly between the first and the last matched token.
+ * @param matched Matched tokens. Must be non-empty and are expected to be already sorted by index.
+ * @param toks2Match Tokens the match was searched in. Stop words to add are taken from this sequence.
+ */
+ private def getSparsedTokens(matched: Seq[NlpToken], toks2Match: Seq[NlpToken]): Seq[NlpToken] = {
+ require(matched.nonEmpty)
+
+ // Matched tokens should be already sorted, so head/last give the match bounds. Already matched tokens are skipped.
+ val stopsInside = toks2Match.filter(t =>
+ t.isStopWord && !matched.contains(t) && t.index > matched.head.index && t.index < matched.last.index
+ )
+
+ if (stopsInside.nonEmpty) (matched ++ stopsInside).sortBy(_.index) else matched
+ }
+
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: Sentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
@@ -526,9 +542,10 @@ object NCModelEnricher extends NCProbeEnricher {
for (s <- get(mdl.sparseSynonyms, eId))
s.sparseMatch(toks) match {
case Some(res) =>
- println("!!!toks="+toks.map(_.origText))
- println("!!!res="+res.map(_.origText))
- add("simple sparse", ns, contCache, eId, greedy, res, idxs, s)
+// println("!!!toks="+toks.map(_.origText))
+// println("!!!res="+res.map(_.origText))
+// println
+ add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
case None => // No-op.
}
}
@@ -566,7 +583,7 @@ object NCModelEnricher extends NCProbeEnricher {
case Some(res) =>
val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
- add(typ, ns, contCache, eId, greedy, toTokens(res, ns), idxs, s, toParts(res, s))
+ add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, toParts(res, s))
idlCache += comb
case None => // No-op.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index 4d5d991..8b25e87 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -94,8 +94,7 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
),
_ => checkExists(
"y the y",
- usr(text = "y y", id = "y3"),
- nlp(text = "the", isStop = true)
+ usr(text = "y the y", id = "y3")
),
_ => checkExists(
"y xxx y",
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
index 228885d..7b8d858 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCEnricherSortSpec.scala
@@ -224,8 +224,7 @@ class NCEnricherSortSpec extends NCEnricherBaseSpec {
_ => checkExists(
"sort A the A the A",
srt(text = "sort", typ = SUBJ_ONLY, note = "wrapperA", index = 1),
- usr("A A A", "wrapperA"),
- nlp("the the", isStop = true)
+ usr("A the A the A", "wrapperA")
)
)
}