You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/03/14 07:00:33 UTC

[incubator-nlpcraft] 01/13: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 17776cd96fe6ace895839b7b38c7737c7f0c509c
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Wed Mar 11 18:34:15 2020 +0300

    WIP.
---
 .../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala  |  21 ++-
 .../aggregation/NCAggregationEnricher.scala        |   2 +-
 .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala |   2 +-
 .../enrichers/relation/NCRelationEnricher.scala    |   2 +-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 157 +++++++++++----------
 5 files changed, 103 insertions(+), 81 deletions(-)

diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index 0c6ca9e..c632c2e 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -27,6 +27,8 @@ import org.nlpcraft.probe.mgrs.NCModelDecorator
 
 import scala.collection.{Map, Seq}
 import scala.language.implicitConversions
+import scala.collection.JavaConverters._
+
 /**
  * Base class for NLP enricher.
  */
@@ -75,12 +77,25 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
     /**
       *
       * @param typ
-      * @param refNote
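+      * @param refNoteName Name of the note property whose value is compared with `refNoteVal`.
+      * @param refNoteVal Expected string value of the `refNoteName` property.
+      * @param matched Tokens to check; each must be of type `typ` and have a `typ` note with that property value.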
+      */
+    protected def hasReference(typ: String, refNoteName: String, refNoteVal: String, matched: Seq[NCNlpSentenceToken]): Boolean =
+        matched.forall(t ⇒
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[String] == refNoteVal)
+        )
+
+    /**
+      *
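+      * @param typ Note type that every token in `matched` must have.
+      * @param refNoteName Name of the note property read as a `java.util.List[String]`.
+      * @param refNoteVals Expected values; compared with the whole property list (same elements, same order).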
       * @param matched
       */
-    protected def isReference(typ: String, refNote: String, matched: Seq[NCNlpSentenceToken]): Boolean =
+    protected def hasReferences(typ: String, refNoteName: String, refNoteVals: Seq[String], matched: Seq[NCNlpSentenceToken]): Boolean =
         matched.forall(t ⇒
-            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n("note").asInstanceOf[String] == refNote)
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[java.util.List[String]].asScala == refNoteVals)
         )
 
     /**
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
index 2195363..6fa6a2a 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
@@ -99,7 +99,7 @@ object NCAggregationEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
                 tryToMatch(toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, m.matched)) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, m.matched)) {
                             val note = NCNlpSentenceNote(
                                 m.matched.map(_.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index b57fcf3..bcde957 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -210,7 +210,7 @@ object NCLimitEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords().sortBy(p ⇒ (-p.size, -p.head.index)) if areSuitableTokens(buf, toks))
                 tryToMatch(numsMap, groupsMap, toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, m.matched)) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, m.matched)) {
                             val note = NCNlpSentenceNote(
                                 m.matched.map(_.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index 0613a51..2284766 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -129,7 +129,7 @@ object NCRelationEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
                 tryToMatch(toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, Seq(m.matched.head))) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, Seq(m.matched.head))) {
                             val note = NCNlpSentenceNote(
                                 Seq(m.matchedHead.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 7e9de2c..8678c7d 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -28,6 +28,7 @@ import org.nlpcraft.probe.mgrs.NCModelDecorator
 import org.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
 import scala.collection.{Map, Seq, mutable}
 
 /**
@@ -103,10 +104,13 @@ object NCSortEnricher extends NCProbeEnricher {
         asc: Option[Boolean],
         main: Seq[NCNlpSentenceToken],
         stop: Seq[NCNlpSentenceToken],
-        subj: Seq[Seq[NoteData]],
-        by: Seq[Seq[NoteData]]
+        subjSeq: Seq[Seq[NoteData]],
+        bySeq: Seq[Seq[NoteData]]
     ) {
-        lazy val all = main ++ stop
+        require(main.nonEmpty)
+        require(subjSeq.nonEmpty)
+
+        lazy val all: Seq[NCNlpSentenceToken] = main ++ stop
     }
 
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
@@ -121,38 +125,32 @@ object NCSortEnricher extends NCProbeEnricher {
     // [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
     // [[A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4)]]
     private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
-        val all = toks.
+        val all: Seq[NoteData] = toks.
             flatten.
-            filter(n ⇒ !n.isNlp).
+            filter(!_.isNlp).
             map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
             sortBy(_.indexes.head)
 
-        val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-        val used = mutable.ArrayBuffer.empty[NoteData]
+        if (all.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
 
-        def go(seq: mutable.ArrayBuffer[NoteData], nd: NoteData): Boolean =
-            if (!used.contains(nd)) {
-                if (seq.isEmpty) {
-                    if (nd.indexes.head == 0) {
-                        seq += nd
-                        used += nd
+            def go(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                        all.find(nd ⇒ !used.contains(nd)) match {
-                            case Some(next) ⇒ go(seq, next)
-                            case None ⇒ false
-                        }
-                    }
-                    else
-                        false
-                }
-                else {
-                    false
-                }
+                all.
+                    filter(p ⇒ p.indexes.head == nd.indexes.last + 1).
+                    foreach(go(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+
+                if (seq.nonEmpty && seq.head.indexes.head == toks.head.index && seq.last.indexes.last == toks.last.index)
+                    res += seq
             }
-            else
-                false
 
-        res
+            go(all.head)
+
+            res
+        }
+        else
+            Seq.empty
     }
 
     private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
@@ -200,6 +198,7 @@ object NCSortEnricher extends NCProbeEnricher {
                 case None ⇒ None
             }
 
+
         hOpt match {
             case Some(h) ⇒
                 val others = toks.filter(t ⇒ !h.all.contains(t))
@@ -222,19 +221,16 @@ object NCSortEnricher extends NCProbeEnricher {
 
                     require(subj.nonEmpty)
 
-                    val asc =
-                        h.order match {
-                            case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
-                            case None ⇒ None
-                        }
-
                     Some(
                         Match(
-                            asc = asc,
+                            asc = h.order match {
+                                case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
+                                case None ⇒ None
+                            },
                             main = h.sort.tokens,
                             stop = h.byTokens ++ h.orderTokens,
-                            subj = split(subj),
-                            by = split(by)
+                            subjSeq = split(subj),
+                            bySeq = split(by)
                         )
                     )
                 }
@@ -244,9 +240,9 @@ object NCSortEnricher extends NCProbeEnricher {
         }
     }
 
-//    def suitable(m: Match, notes: Seq[String], refName: String): Boolean =
-//        notes.forall(note ⇒ !isReference(TOK_ID, refName, note, m.all))
-//
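+    // TODO: revisit. Currently false only when every `m.main` token already has a TOK_ID note whose `refName` list equals `notes`.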
+    private def suitable(m: Match, notes: Seq[String], refName: String): Boolean =
+        !hasReferences(TOK_ID, refName, notes, m.main)
 
     override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span): Boolean =
         startScopedSpan("enrich", parent,
@@ -256,42 +252,53 @@ object NCSortEnricher extends NCProbeEnricher {
             val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
             var changed: Boolean = false
 
-//            for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
-//                tryToMatch(toks) match {
-//                    case Some(m)
-////                        if suitable(m, m.subj.map(_.note), "subjNotes") &&
-////                        (m.by.isEmpty || suitable(m, m.by.map(_.note), "byNotes")) ⇒
-//                        ⇒
-//                        val params = mutable.ArrayBuffer.empty[(String, Any)]
-//
-//                        m.asc match {
-//                            case Some(asc) ⇒ params += "asc" → asc
-//                            case None ⇒ // No-op.
-//                        }
-//
-//                        def addNotes(seq: Seq[NoteData], notesName: String, idxsName: String): Unit = {
-//                            params += notesName → seq.map(_.note).asJava
-//                            params += idxsName → seq.map(_.indexes.asJava).asJava
-//                        }
-//
-//                        addNotes(m.subj, "subjNotes", "subjIndexes")
-//
-//                        if (m.by.nonEmpty)
-//                            addNotes(m.by, "byNotes", "byIndexes")
-//
-//                        val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params :_*)
-//
-//                        m.main.foreach(_.add(note))
-//                        m.stop.foreach(_.addStopReason(note))
-//
-//                        changed = true
-//
-//                    case None ⇒ // No-op.
-//
-//                if (changed)
-//                    buf += toks.toSet
-//            }
+            for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
+                tryToMatch(toks) match {
+                    case Some(m) ⇒
+                        for (subj ← m.subjSeq if suitable(m, subj.map(_.note), "subjNotes")) {
+                            def addNotes(
+                                params: ArrayBuffer[(String, Any)],
+                                seq: Seq[NoteData],
+                                notesName: String,
+                                idxsName: String
+                            ): ArrayBuffer[(String, Any)] = {
+                                params += notesName → seq.map(_.note).asJava
+                                params += idxsName → seq.map(_.indexes.asJava).asJava
+
+                                params
+                            }
+
+                            def mkParams(): ArrayBuffer[(String, Any)] = {
+                                val params = mutable.ArrayBuffer.empty[(String, Any)]
+
+                                if (m.asc.isDefined)
+                                    params += "asc" → m.asc.get
+
+                                addNotes(params, subj, "subjNotes", "subjIndexes")
+                            }
+
+                            def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
+                                val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params:_*)
+
+                                m.main.foreach(_.add(note))
+                                m.stop.foreach(_.addStopReason(note))
+
+                                changed = true
+                            }
+
+                            if (m.bySeq.nonEmpty)
+                                for (by ← m.bySeq if suitable(m, by.map(_.note), "byNotes"))
+                                    mkNote(addNotes(mkParams(), by, "byNotes", "byIndexes"))
+                            else
+                                mkNote(mkParams())
+                        }
+
+                    case None ⇒ // No-op.
+
+                if (changed)
+                    buf += toks.toSet
+            }
 
             changed
         }
-}
+}
\ No newline at end of file