Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/04/05 10:10:50 UTC

[incubator-nlpcraft] branch NLPCRAFT-30 updated: SortEnricher bugfix.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
     new 5ee71a2  SortEnricher bugfix.
5ee71a2 is described below

commit 5ee71a2a7074500d79e820d99cc149863e0bb5e9
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Sun Apr 5 13:10:43 2020 +0300

    SortEnricher bugfix.
---
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 91 +++++++++++-----------
 1 file changed, 47 insertions(+), 44 deletions(-)
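
The gist of the fix: split() loses its nullable flag. Instead of throwing an
AssertionError when no note sequences can be built, it now returns an empty
result and the call site simply produces no match. A minimal sketch of the new
call-site pattern, assuming hypothetical stand-ins (NoteData, Match, toMatch,
and token lists reduced to Seq[Int]) rather than the real NCSortEnricher types:

    case class NoteData(noteType: String, indexes: Range)
    case class Match(subjSeq: Seq[Seq[NoteData]], bySeq: Seq[Seq[NoteData]])

    // 'split' is passed in as a function purely to keep this sketch self-contained.
    def toMatch(split: Seq[Int] ⇒ Seq[Seq[NoteData]], subj: Seq[Int], by: Seq[Int]): Option[Match] = {
        val subjSeq = split(subj)

        if (subjSeq.nonEmpty) {
            // 'by' is optional, so it is only split when present.
            val bySeq = if (by.isEmpty) Seq.empty else split(by)

            Some(Match(subjSeq, bySeq))
        }
        else
            None // Before this fix, split(subj, nullable = false) threw an AssertionError here.
    }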

diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index a3be897..3a9661b 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -150,60 +150,56 @@ object NCSortEnricher extends NCProbeEnricher {
       *
       * @param toks Tokens to split into sequences of contiguous notes.
       */
-    private def split(toks: Seq[NCNlpSentenceToken], nullable: Boolean): Seq[Seq[NoteData]] = {
+    private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
+        require(toks.nonEmpty)
+
+        val min = toks.head.index
+        val max = toks.last.index
+
         val all =
             toks.flatten.
-                filter(!_.isNlp).map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
+                filter(!_.isNlp).
+                filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
+                map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
                 sortBy(_.indexes.head).distinct
 
-        val res =
-            if (all.nonEmpty) {
-                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-                /**
-                  * Returns a flag indicating whether two tokens are contiguous, i.e. separated only by stop-words or "and".
-                  *
-                  * @param tok1Idx First token index.
-                  * @param tok2Idx Second token index.
-                  */
-                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                    val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+        if (all.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
 
-                    between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
-                }
-
-                val min = toks.dropWhile(_.isNlp).head.index
-                val max = toks.reverse.dropWhile(_.isNlp).head.index
+            /**
+              * Returns a flag indicating whether two tokens are contiguous, i.e. separated only by stop-words or "and".
+              *
+              * @param tok1Idx First token index.
+              * @param tok2Idx Second token index.
+              */
+            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
 
-                require(min <= max)
+                between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
+            }
 
-                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                    seq += nd
+            val minIdx = toks.dropWhile(_.isNlp).head.index
+            val maxIdx = toks.reverse.dropWhile(_.isNlp).head.index
 
-                    all.
-                        filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+            require(minIdx <= maxIdx)
 
-                    if (seq.nonEmpty && seq.head.indexes.head == min && seq.last.indexes.last == max)
-                        res += seq
-                }
+            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                all.filter(_.indexes.head == min).foreach(p ⇒ fill(p))
+                all.
+                    filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                    foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
 
-                res
+                if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                    res += seq
             }
-            else
-                Seq.empty
 
-        if (res.isEmpty && !nullable)
-            throw new AssertionError(s"Invalid null result " +
-                s"[tokens=[${toks.map(_.origText).mkString(", ")}]" +
-                s", tokensIndexes=[${toks.map(_.index).mkString(", ")}]" +
-                s", allData=[${all.mkString(", ")}]" +
-                s"]"
-            )
+            all.filter(_.indexes.head == minIdx).foreach(p ⇒ fill(p))
 
-        res
+            res
+        }
+        else
+            Seq.empty
     }
 
     /**
@@ -306,13 +302,20 @@ object NCSortEnricher extends NCProbeEnricher {
                             else
                                 (others.filter(_.index < sepIdxs.head), others.filter(_.index > sepIdxs.last))
 
+                        val notes = subj.flatten
+
                         require(subj.nonEmpty)
 
-                        val subjSeq = split(subj, nullable = false)
-                        val bySeq = split(by, nullable = true)
-                        val asc = h.order.flatMap(order ⇒ Some(ORDER(order.synonymIndex)._2))
+                        val subjSeq = split(subj)
+
+                        if (subjSeq.nonEmpty) {
+                            val bySeq = if (by.isEmpty) Seq.empty else split(by)
+                            val asc = h.order.flatMap(order ⇒ Some(ORDER(order.synonymIndex)._2))
 
-                        Some(Match(asc, main = h.sort.tokens, stop = h.byTokens ++ h.orderTokens, subjSeq, bySeq))
+                            Some(Match(asc, main = h.sort.tokens, stop = h.byTokens ++ h.orderTokens, subjSeq, bySeq))
+                        }
+                        else
+                            None
                     }
                     else
                         None
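
For reference, the reworked split() can be exercised in isolation. Below is a
self-contained sketch under hypothetical stand-ins (Tok, NoteData, SplitSketch)
for the real NLPCraft sentence and token types. It keeps only the notes that
lie fully inside the given token span (the boundary filter this commit adds)
and then collects every chain of notes covering the span end to end, allowing
only stop-words in the gaps (the real code also allows the stem "and"):

    import scala.collection.mutable

    object SplitSketch extends App {
        case class NoteData(noteType: String, indexes: Range)
        case class Tok(index: Int, isStop: Boolean, notes: Seq[NoteData])

        def split(toks: Seq[Tok]): Seq[Seq[NoteData]] = {
            require(toks.nonEmpty)

            val min = toks.head.index
            val max = toks.last.index

            // Notes attached to the tokens, dropping any note that sticks out of [min, max].
            val all = toks.flatMap(_.notes).
                filter(n ⇒ n.indexes.head >= min && n.indexes.last <= max).
                sortBy(_.indexes.head).distinct

            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]

            // True if only stop-words sit between the two token indexes.
            def contiguous(i1: Int, i2: Int): Boolean =
                toks.filter(t ⇒ t.index > i1 && t.index < i2).forall(_.isStop)

            // Grows 'seq' with every note that can follow 'nd' and records each
            // chain that spans the whole [min, max] range.
            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty): Unit = {
                seq += nd

                all.
                    filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
                    foreach(fill(_, seq.clone()))

                if (seq.head.indexes.head == min && seq.last.indexes.last == max)
                    res += seq.toSeq
            }

            all.filter(_.indexes.head == min).foreach(fill(_))

            res.toSeq
        }

        // Tokens 0..2 where token 1 is a stop-word; notes cover [0], [2] and [0..2].
        val x = NoteData("a:x", 0 to 0)
        val y = NoteData("a:y", 2 to 2)
        val z = NoteData("a:z", 0 to 2)

        val toks = Seq(Tok(0, isStop = false, Seq(x, z)), Tok(1, isStop = true, Nil), Tok(2, isStop = false, Seq(y)))

        // Prints two chains, the two ways to cover the span: List(x, y) and List(z).
        println(split(toks))
    }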