You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/03/14 07:00:36 UTC

[incubator-nlpcraft] 04/13: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit c73bf59544ded0fbf80f820d174bba52cb30c79f
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Mar 12 16:38:04 2020 +0300

    WIP.
---
 .../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala  |  24 ++++-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 108 ++++++++++++---------
 2 files changed, 83 insertions(+), 49 deletions(-)

diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index c632c2e..d2b1a4a 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -53,6 +53,8 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
             Set.empty
         else {
             def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
+                require(sortedToks.nonEmpty)
+
                 val h = sortedToks.head
                 val l = sortedToks.last
 
@@ -64,12 +66,16 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
                 notes.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom && l.index == n.tokenTo).map(_.noteType).toSet
             }
 
-            val sortedToks = toks.sortBy(_.index)
+            var sortedToks = toks.sortBy(_.index)
 
             var res = getCommon(sortedToks)
 
-            if (res.isEmpty)
-                res = getCommon(sortedToks.filter(!_.isStopWord))
+            if (res.isEmpty) {
+                sortedToks = sortedToks.filter(!_.isStopWord)
+
+                if (sortedToks.nonEmpty)
+                    res = getCommon(sortedToks)
+            }
 
             if (res.isEmpty) Set.empty else res
         }
@@ -83,7 +89,10 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
       */
     protected def hasReference(typ: String, refNoteName: String, refNoteVal: String, matched: Seq[NCNlpSentenceToken]): Boolean =
         matched.forall(t ⇒
-            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[String] == refNoteVal)
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n.get(refNoteName) match {
+                case Some(s) ⇒ s.asInstanceOf[String] == refNoteVal
+                case None ⇒ false
+            })
         )
 
     /**
@@ -95,7 +104,12 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
       */
     protected def hasReferences(typ: String, refNoteName: String, refNoteVals: Seq[String], matched: Seq[NCNlpSentenceToken]): Boolean =
         matched.forall(t ⇒
-            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[java.util.List[String]].asScala == refNoteVals)
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒
+                n.get(refNoteName) match {
+                    case Some(s) ⇒ s.asInstanceOf[java.util.List[String]].asScala == refNoteVals
+                    case None ⇒ false
+                }
+            )
         )
 
     /**
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 4351213..ad426b5 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -129,6 +129,19 @@ object NCSortEnricher extends NCProbeEnricher {
     }
 
     /**
+      * Returns a flag indicating whether two tokens are contiguous, i.e. whether every token strictly between the two indexes (if any) is a stop word.
+      *
+      * @param toks Tokens.
+      * @param tok1Idx First token index.
+      * @param tok2Idx Second token index.
+      */
+    private def contiguous(toks: Seq[NCNlpSentenceToken], tok1Idx: Int, tok2Idx: Int): Boolean = {
+        val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+
+        between.isEmpty || between.forall(_.isStopWord)
+    }
+
+    /**
       * [Token] -> [NoteData]
       * [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
       * [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
@@ -136,30 +149,23 @@ object NCSortEnricher extends NCProbeEnricher {
       * @param toks
       */
     private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
-        val all: Seq[NoteData] = toks.
-            flatten.
-            filter(!_.isNlp).
-            map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
-            sortBy(_.indexes.head)
+        val all =
+            toks.flatten.filter(!_.isNlp).map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).sortBy(_.indexes.head).distinct
 
         if (all.nonEmpty) {
-            val first = all.head.indexes.head
-            val last = all.last.indexes.last
-
             val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
 
             def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
                 seq += nd
 
                 all.
-                    filter(p ⇒ nd.indexes.last < p.indexes.head  && {
-                        val between = toks.slice(nd.indexes.last, p.indexes.head - 1)
-
-                        between.isEmpty || between.forall(_.isStopWord)
-                    }).
+                    filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(toks, nd.indexes.last, p.indexes.head)).
                     foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
 
-                if (seq.nonEmpty && seq.head.indexes.head == first && seq.last.indexes.last == last)
+                if (seq.nonEmpty &&
+                    seq.head.indexes.head == all.head.indexes.head &&
+                    seq.last.indexes.last == all.last.indexes.last
+                )
                     res += seq
             }
 
@@ -171,6 +177,10 @@ object NCSortEnricher extends NCProbeEnricher {
             Seq.empty
     }
 
+    /**
+      *
+      * @param toks
+      */
     private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
         case class KeyWord(tokens: Seq[NCNlpSentenceToken], synonymIndex: Int) {
             // Added for testing purposes.
@@ -219,37 +229,47 @@ object NCSortEnricher extends NCProbeEnricher {
         hOpt match {
             case Some(h) ⇒
                 val others = toks.filter(t ⇒ !h.all.contains(t))
-                val othersWithoutStops = others.filter(!_.isStopWord)
-
-                if (
-                    othersWithoutStops.nonEmpty &&
-                    othersWithoutStops.forall(t ⇒ t.exists(n ⇒ n.isUser || SORT_TYPES.contains(n.noteType))) &&
-                    SEQS.contains(
-                        // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
-                        toks.map(t ⇒
-                            h.getKeyWordType(t).getOrElse("x")).
-                            foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
-                    )
-                ) {
-                    val subj = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
-                    val by = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
-
-                    others.foreach(t ⇒ (if (subj.isEmpty || subj.last.index + 1 == t.index) subj else by) += t)
-
-                    require(subj.nonEmpty)
-
-                    Some(
-                        Match(
-                            asc = h.order match {
-                                case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
-                                case None ⇒ None
-                            },
-                            main = h.sort.tokens,
-                            stop = h.byTokens ++ h.orderTokens,
-                            subjSeq = split(subj),
-                            bySeq = split(by)
+
+                if (others.nonEmpty) {
+                    val othersRefs = others.filter(t ⇒ t.exists(n ⇒ n.isUser || SORT_TYPES.contains(n.noteType)))
+
+                    if (
+                        othersRefs.nonEmpty &&
+                        others.filter(p ⇒ !othersRefs.contains(p)).forall(_.isStopWord) &&
+                        SEQS.contains(
+                            // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
+                            toks.map(t ⇒
+                                h.getKeyWordType(t).getOrElse("x")).
+                                foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
+                        )
+                    ) {
+                        val subj = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
+                        val by = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
+
+                        others.foreach(t ⇒
+                            if (subj.isEmpty || by.isEmpty && contiguous(others, subj.last.index, t.index))
+                                subj += t
+                            else
+                                by += t
+                        )
+
+                        require(subj.nonEmpty)
+
+                        Some(
+                            Match(
+                                asc = h.order match {
+                                    case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
+                                    case None ⇒ None
+                                },
+                                main = h.sort.tokens,
+                                stop = h.byTokens ++ h.orderTokens,
+                                subjSeq = split(subj),
+                                bySeq = split(by)
+                            )
                         )
-                    )
+                    }
+                    else
+                        None
                 }
                 else
                     None