You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/09/17 19:36:54 UTC

[incubator-nlpcraft] branch master updated: Sort enricher fixes.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/master by this push:
     new bed81f1  Sort enricher fixes.
bed81f1 is described below

commit bed81f116dcf15225ad6873dadf8e512a209692e
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Sep 17 22:36:48 2020 +0300

    Sort enricher fixes.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |  22 ++--
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 125 ++++++++++++---------
 2 files changed, 81 insertions(+), 66 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 15af813..e1614c6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -401,12 +401,12 @@ object NCNlpSentence {
             filter(_.isNlp).
             foreach(n ⇒ ns.fixNote(n, "stopWord" → false))
 
-        val nsNotes: Map[String, Seq[Int]] = ns.tokens.flatten.map(p ⇒ p.noteType → p.tokenIndexes).toMap
+        val all = ns.tokens.flatten
+        val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap
 
         for (
-            t ← ns.tokens;
-            stopReason ← t.stopsReasons
-            if nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
+            t ← ns.tokens; stopReason ← t.stopsReasons
+                if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
         )
             ns.fixNote(t.getNlpNote, "stopWord" → true)
 
@@ -422,16 +422,16 @@ object NCNlpSentence {
                 fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
 
         if (res)
-        // Validation (all indexes calculated well)
-        require(
-            !ns.flatten.
+            // Validation: all indexes must have been calculated correctly.
+            require(
+                !ns.flatten.
                 exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
-            s"Invalid sentence:\n" +
-                ns.map(t ⇒
+                    s"Invalid sentence:\n" +
+                    ns.map(t ⇒
                     // Human readable invalid sentence for debugging.
                     s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
-                ).mkString("\n")
-        )
+                    ).mkString("\n")
+            )
 
         res
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 0c4f07d..2b2e11d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -71,6 +71,19 @@ object NCSortEnricher extends NCProbeEnricher {
             "ORDER SORT BY x" → TYPE_BY
         )
 
+    private final val SORT_WORDS = Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle")
+    private final val BY_WORDS = Seq("by", "on", "with")
+    private final val ASC_WORDS = Seq(
+        "top down" → false,
+        "bottom up" → true,
+        "ascending" → true,
+        "asc" → true,
+        "descending" → false,
+        "desc" → false,
+        "{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
+        "{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
+    )
+
     case class NoteData(note: String, indexes: Seq[Int]) {
         // Added for debug reasons.
         override def toString: String = s"NoteData [note=$note, indexes=[${indexes.mkString(",")}]]"
@@ -92,6 +105,15 @@ object NCSortEnricher extends NCProbeEnricher {
         require(main.nonEmpty)
         require(subjSeq.nonEmpty || bySeq.nonEmpty)
 
+        // Special case: `m` has the same elements as this match but no ASC flag. Such `m` should be skipped as already processed.
+        def isSubCase(m: Match): Boolean =
+            // Stop tokens are deliberately not compared.
+            asc.isDefined &&
+            m.asc.isEmpty &&
+            main == m.main &&
+            subjSeq == m.subjSeq &&
+            bySeq == m.bySeq
+
         // Added for debug reasons.
         override def toString: String = {
             def s1[T](seq: Seq[NCNlpSentenceToken]): String = s"[${seq.map(_.origText).mkString(", ")}]"
@@ -421,58 +443,63 @@ object NCSortEnricher extends NCProbeEnricher {
             "mdlId" → mdl.model.getId,
             "txt" → ns.text) { _ ⇒
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
+            val matches = mutable.ArrayBuffer.empty[Match]
 
             for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
                 tryToMatch(toks) match {
-                    case Some(m) ⇒
-                        def addNotes(
-                            params: ArrayBuffer[(String, Any)],
-                            seq: Seq[NoteData],
-                            notesName: String,
-                            idxsName: String
-                        ): ArrayBuffer[(String, Any)] = {
-                            params += notesName → seq.map(_.note).asJava
-                            params += idxsName → seq.map(_.indexes.asJava).asJava
-
-                            params
-                        }
+                    case Some(m)  ⇒
+                        if (!matches.exists(_.isSubCase(m))) {
+                            def addNotes(
+                                params: ArrayBuffer[(String, Any)],
+                                seq: Seq[NoteData],
+                                notesName: String,
+                                idxsName: String
+                            ): ArrayBuffer[(String, Any)] = {
+                                params += notesName → seq.map(_.note).asJava
+                                params += idxsName → seq.map(_.indexes.asJava).asJava
+
+                                params
+                            }
 
-                        def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
-                            val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params: _*)
+                            def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
+                                val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params: _*)
 
-                            if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
-                                notes += note
+                                if (!notes.exists(n ⇒ ns.notesEqualOrSimilar(n, note))) {
+                                    notes += note
 
-                                m.main.foreach(_.add(note))
-                                m.stop.foreach(_.addStopReason(note))
+                                    m.main.foreach(_.add(note))
+                                    m.stop.foreach(_.addStopReason(note))
+
+                                    matches += m
+                                }
                             }
-                        }
 
-                        def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
-                            val params = mutable.ArrayBuffer.empty[(String, Any)]
+                            def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
+                                val params = mutable.ArrayBuffer.empty[(String, Any)]
 
-                            if (m.asc.isDefined)
-                                params += "asc" → m.asc.get
+                                if (m.asc.isDefined)
+                                    params += "asc" → m.asc.get
 
-                            params
-                        }
+                                params
+                            }
 
-                        if (m.subjSeq.nonEmpty)
-                            for (subj ← m.subjSeq) {
-                                def addSubj(): ArrayBuffer[(String, Any)] =
-                                    addNotes(mkParams(), subj, "subjnotes", "subjindexes")
+                            if (m.subjSeq.nonEmpty)
+                                for (subj ← m.subjSeq) {
+                                    def addSubj(): ArrayBuffer[(String, Any)] =
+                                        addNotes(mkParams(), subj, "subjnotes", "subjindexes")
 
-                                if (m.bySeq.nonEmpty)
-                                    for (by ← m.bySeq)
-                                        mkNote(addNotes(addSubj(), by, "bynotes", "byindexes"))
-                                else
-                                    mkNote(addSubj())
-                            }
-                        else {
-                            require(m.bySeq.nonEmpty)
+                                    if (m.bySeq.nonEmpty)
+                                        for (by ← m.bySeq)
+                                            mkNote(addNotes(addSubj(), by, "bynotes", "byindexes"))
+                                    else
+                                        mkNote(addSubj())
+                                }
+                            else {
+                                require(m.bySeq.nonEmpty)
 
-                            for (by ← m.bySeq)
-                                mkNote(addNotes(mkParams(), by, "bynotes", "byindexes"))
+                                for (by ← m.bySeq)
+                                    mkNote(addNotes(mkParams(), by, "bynotes", "byindexes"))
+                            }
                         }
 
                     case None ⇒ // No-op.
@@ -487,12 +514,11 @@ object NCSortEnricher extends NCProbeEnricher {
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
         // Single words.
-        sort = Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle")
-            .map(NCNlpCoreManager.stem)
+        sort = SORT_WORDS.map(NCNlpCoreManager.stem)
 
         // Single words.
         // Cannot be same as in SORT.
-        by = Seq("by", "on", "with").map(NCNlpCoreManager.stem)
+        by = BY_WORDS.map(NCNlpCoreManager.stem)
 
         // Multiple words.
         // Cannot be same as in SORT and BY.
@@ -500,22 +526,11 @@ object NCSortEnricher extends NCProbeEnricher {
         order = {
             val p = NCMacroParser()
 
-            Seq(
-                "top down" → false,
-                "bottom up" → true,
-                "ascending" → true,
-                "asc" → true,
-                "descending" → false,
-                "desc" → false,
-                "{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
-                "{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
-            ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc ) }
+            ASC_WORDS.flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc ) }
         }
 
         stemAnd = NCNlpCoreManager.stem("and")
-
-        maskWords =
-            (sort ++ by ++ order.map(_._1)).flatMap(_.split(" ")).map(_.trim).filter(_.nonEmpty).distinct
+        maskWords = (sort ++ by ++ order.map(_._1)).flatMap(_.split(" ")).map(_.trim).filter(_.nonEmpty).distinct
 
         validate()