You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/09/17 19:36:54 UTC
[incubator-nlpcraft] branch master updated: Sort enricher fixes.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new bed81f1 Sort enricher fixes.
bed81f1 is described below
commit bed81f116dcf15225ad6873dadf8e512a209692e
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Sep 17 22:36:48 2020 +0300
Sort enricher fixes.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 22 ++--
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 125 ++++++++++++---------
2 files changed, 81 insertions(+), 66 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 15af813..e1614c6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -401,12 +401,12 @@ object NCNlpSentence {
filter(_.isNlp).
foreach(n ⇒ ns.fixNote(n, "stopWord" → false))
- val nsNotes: Map[String, Seq[Int]] = ns.tokens.flatten.map(p ⇒ p.noteType → p.tokenIndexes).toMap
+ val all = ns.tokens.flatten
+ val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap
for (
- t ← ns.tokens;
- stopReason ← t.stopsReasons
- if nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
+ t ← ns.tokens; stopReason ← t.stopsReasons
+ if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
)
ns.fixNote(t.getNlpNote, "stopWord" → true)
@@ -422,16 +422,16 @@ object NCNlpSentence {
fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
if (res)
- // Validation (all indexes calculated well)
- require(
- !ns.flatten.
+ // Validation (all indexes calculated well)
+ require(
+ !ns.flatten.
exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
- s"Invalid sentence:\n" +
- ns.map(t ⇒
+ s"Invalid sentence:\n" +
+ ns.map(t ⇒
// Human readable invalid sentence for debugging.
s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
- ).mkString("\n")
- )
+ ).mkString("\n")
+ )
res
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 0c4f07d..2b2e11d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -71,6 +71,19 @@ object NCSortEnricher extends NCProbeEnricher {
"ORDER SORT BY x" → TYPE_BY
)
+ private final val SORT_WORDS = Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle")
+ private final val BY_WORDS = Seq("by", "on", "with")
+ private final val ASC_WORDS = Seq(
+ "top down" → false,
+ "bottom up" → true,
+ "ascending" → true,
+ "asc" → true,
+ "descending" → false,
+ "desc" → false,
+ "{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
+ "{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
+ )
+
case class NoteData(note: String, indexes: Seq[Int]) {
// Added for debug reasons.
override def toString: String = s"NoteData [note=$note, indexes=[${indexes.mkString(",")}]]"
@@ -92,6 +105,15 @@ object NCSortEnricher extends NCProbeEnricher {
require(main.nonEmpty)
require(subjSeq.nonEmpty || bySeq.nonEmpty)
+ // Special case. Same elements found without ASC flag. Should be skipped as already processed.
+ def isSubCase(m: Match): Boolean =
+ // Stops skipped.
+ asc.isDefined &&
+ m.asc.isEmpty &&
+ main == m.main &&
+ subjSeq == m.subjSeq &&
+ bySeq == m.bySeq
+
// Added for debug reasons.
override def toString: String = {
def s1[T](seq: Seq[NCNlpSentenceToken]): String = s"[${seq.map(_.origText).mkString(", ")}]"
@@ -421,58 +443,63 @@ object NCSortEnricher extends NCProbeEnricher {
"mdlId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
val notes = mutable.HashSet.empty[NCNlpSentenceNote]
+ val matches = mutable.ArrayBuffer.empty[Match]
for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
tryToMatch(toks) match {
- case Some(m) ⇒
- def addNotes(
- params: ArrayBuffer[(String, Any)],
- seq: Seq[NoteData],
- notesName: String,
- idxsName: String
- ): ArrayBuffer[(String, Any)] = {
- params += notesName → seq.map(_.note).asJava
- params += idxsName → seq.map(_.indexes.asJava).asJava
-
- params
- }
+ case Some(m) ⇒
+ if (!matches.exists(_.isSubCase(m))) {
+ def addNotes(
+ params: ArrayBuffer[(String, Any)],
+ seq: Seq[NoteData],
+ notesName: String,
+ idxsName: String
+ ): ArrayBuffer[(String, Any)] = {
+ params += notesName → seq.map(_.note).asJava
+ params += idxsName → seq.map(_.indexes.asJava).asJava
+
+ params
+ }
- def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
- val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params: _*)
+ def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
+ val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params: _*)
- if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
- notes += note
+ if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
+ notes += note
- m.main.foreach(_.add(note))
- m.stop.foreach(_.addStopReason(note))
+ m.main.foreach(_.add(note))
+ m.stop.foreach(_.addStopReason(note))
+
+ matches += m
+ }
}
- }
- def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
- val params = mutable.ArrayBuffer.empty[(String, Any)]
+ def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
+ val params = mutable.ArrayBuffer.empty[(String, Any)]
- if (m.asc.isDefined)
- params += "asc" → m.asc.get
+ if (m.asc.isDefined)
+ params += "asc" → m.asc.get
- params
- }
+ params
+ }
- if (m.subjSeq.nonEmpty)
- for (subj ← m.subjSeq) {
- def addSubj(): ArrayBuffer[(String, Any)] =
- addNotes(mkParams(), subj, "subjnotes", "subjindexes")
+ if (m.subjSeq.nonEmpty)
+ for (subj ← m.subjSeq) {
+ def addSubj(): ArrayBuffer[(String, Any)] =
+ addNotes(mkParams(), subj, "subjnotes", "subjindexes")
- if (m.bySeq.nonEmpty)
- for (by ← m.bySeq)
- mkNote(addNotes(addSubj(), by, "bynotes", "byindexes"))
- else
- mkNote(addSubj())
- }
- else {
- require(m.bySeq.nonEmpty)
+ if (m.bySeq.nonEmpty)
+ for (by ← m.bySeq)
+ mkNote(addNotes(addSubj(), by, "bynotes", "byindexes"))
+ else
+ mkNote(addSubj())
+ }
+ else {
+ require(m.bySeq.nonEmpty)
- for (by ← m.bySeq)
- mkNote(addNotes(mkParams(), by, "bynotes", "byindexes"))
+ for (by ← m.bySeq)
+ mkNote(addNotes(mkParams(), by, "bynotes", "byindexes"))
+ }
}
case None ⇒ // No-op.
@@ -487,12 +514,11 @@ object NCSortEnricher extends NCProbeEnricher {
*/
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
// Single words.
- sort = Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle")
- .map(NCNlpCoreManager.stem)
+ sort = SORT_WORDS.map(NCNlpCoreManager.stem)
// Single words.
// Cannot be same as in SORT.
- by = Seq("by", "on", "with").map(NCNlpCoreManager.stem)
+ by = BY_WORDS.map(NCNlpCoreManager.stem)
// Multiple words.
// Cannot be same as in SORT and BY.
@@ -500,22 +526,11 @@ object NCSortEnricher extends NCProbeEnricher {
order = {
val p = NCMacroParser()
- Seq(
- "top down" → false,
- "bottom up" → true,
- "ascending" → true,
- "asc" → true,
- "descending" → false,
- "desc" → false,
- "{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
- "{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
- ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc ) }
+ ASC_WORDS.flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc ) }
}
stemAnd = NCNlpCoreManager.stem("and")
-
- maskWords =
- (sort ++ by ++ order.map(_._1)).flatMap(_.split(" ")).map(_.trim).filter(_.nonEmpty).distinct
+ maskWords = (sort ++ by ++ order.map(_._1)).flatMap(_.split(" ")).map(_.trim).filter(_.nonEmpty).distinct
validate()