You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/03/12 11:55:26 UTC

[incubator-nlpcraft] branch NLPCRAFT-2 created (now 468a82b)

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a change to branch NLPCRAFT-2
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git.


      at 468a82b  WIP.

This branch includes the following new commits:

     new 17776cd  WIP.
     new b53b17c  WIP.
     new 468a82b  WIP.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[incubator-nlpcraft] 01/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-2
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 17776cd96fe6ace895839b7b38c7737c7f0c509c
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Wed Mar 11 18:34:15 2020 +0300

    WIP.
---
 .../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala  |  21 ++-
 .../aggregation/NCAggregationEnricher.scala        |   2 +-
 .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala |   2 +-
 .../enrichers/relation/NCRelationEnricher.scala    |   2 +-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 157 +++++++++++----------
 5 files changed, 103 insertions(+), 81 deletions(-)

diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index 0c6ca9e..c632c2e 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -27,6 +27,8 @@ import org.nlpcraft.probe.mgrs.NCModelDecorator
 
 import scala.collection.{Map, Seq}
 import scala.language.implicitConversions
+import scala.collection.JavaConverters._
+
 /**
  * Base class for NLP enricher.
  */
@@ -75,12 +77,25 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
     /**
       *
       * @param typ
-      * @param refNote
+      * @param refNoteName
+      * @param refNoteVal
+      * @param matched
+      */
+    protected def hasReference(typ: String, refNoteName: String, refNoteVal: String, matched: Seq[NCNlpSentenceToken]): Boolean =
+        matched.forall(t ⇒
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[String] == refNoteVal)
+        )
+
+    /**
+      *
+      * @param typ
+      * @param refNoteName
+      * @param refNoteVals
       * @param matched
       */
-    protected def isReference(typ: String, refNote: String, matched: Seq[NCNlpSentenceToken]): Boolean =
+    protected def hasReferences(typ: String, refNoteName: String, refNoteVals: Seq[String], matched: Seq[NCNlpSentenceToken]): Boolean =
         matched.forall(t ⇒
-            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n("note").asInstanceOf[String] == refNote)
+            t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[java.util.List[String]].asScala == refNoteVals)
         )
 
     /**
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
index 2195363..6fa6a2a 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/aggregation/NCAggregationEnricher.scala
@@ -99,7 +99,7 @@ object NCAggregationEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
                 tryToMatch(toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, m.matched)) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, m.matched)) {
                             val note = NCNlpSentenceNote(
                                 m.matched.map(_.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index b57fcf3..bcde957 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -210,7 +210,7 @@ object NCLimitEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords().sortBy(p ⇒ (-p.size, -p.head.index)) if areSuitableTokens(buf, toks))
                 tryToMatch(numsMap, groupsMap, toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, m.matched)) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, m.matched)) {
                             val note = NCNlpSentenceNote(
                                 m.matched.map(_.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index 0613a51..2284766 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -129,7 +129,7 @@ object NCRelationEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
                 tryToMatch(toks) match {
                     case Some(m) ⇒
-                        for (refNote ← m.refNotes if !isReference(TOK_ID, refNote, Seq(m.matched.head))) {
+                        for (refNote ← m.refNotes if !hasReference(TOK_ID, "note", refNote, Seq(m.matched.head))) {
                             val note = NCNlpSentenceNote(
                                 Seq(m.matchedHead.index),
                                 TOK_ID,
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 7e9de2c..8678c7d 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -28,6 +28,7 @@ import org.nlpcraft.probe.mgrs.NCModelDecorator
 import org.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
 import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
 import scala.collection.{Map, Seq, mutable}
 
 /**
@@ -103,10 +104,13 @@ object NCSortEnricher extends NCProbeEnricher {
         asc: Option[Boolean],
         main: Seq[NCNlpSentenceToken],
         stop: Seq[NCNlpSentenceToken],
-        subj: Seq[Seq[NoteData]],
-        by: Seq[Seq[NoteData]]
+        subjSeq: Seq[Seq[NoteData]],
+        bySeq: Seq[Seq[NoteData]]
     ) {
-        lazy val all = main ++ stop
+        require(main.nonEmpty)
+        require(subjSeq.nonEmpty)
+
+        lazy val all: Seq[NCNlpSentenceToken] = main ++ stop
     }
 
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
@@ -121,38 +125,32 @@ object NCSortEnricher extends NCProbeEnricher {
     // [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
     // [[A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4)]]
     private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
-        val all = toks.
+        val all: Seq[NoteData] = toks.
             flatten.
-            filter(n ⇒ !n.isNlp).
+            filter(!_.isNlp).
             map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
             sortBy(_.indexes.head)
 
-        val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-        val used = mutable.ArrayBuffer.empty[NoteData]
+        if (all.nonEmpty) {
+            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
 
-        def go(seq: mutable.ArrayBuffer[NoteData], nd: NoteData): Boolean =
-            if (!used.contains(nd)) {
-                if (seq.isEmpty) {
-                    if (nd.indexes.head == 0) {
-                        seq += nd
-                        used += nd
+            def go(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                seq += nd
 
-                        all.find(nd ⇒ !used.contains(nd)) match {
-                            case Some(next) ⇒ go(seq, next)
-                            case None ⇒ false
-                        }
-                    }
-                    else
-                        false
-                }
-                else {
-                    false
-                }
+                all.
+                    filter(p ⇒ p.indexes.head == nd.indexes.last + 1).
+                    foreach(go(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+
+                if (seq.nonEmpty && seq.head.indexes.head == toks.head.index && seq.last.indexes.last == toks.last.index)
+                    res += seq
             }
-            else
-                false
 
-        res
+            go(all.head)
+
+            res
+        }
+        else
+            Seq.empty
     }
 
     private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
@@ -200,6 +198,7 @@ object NCSortEnricher extends NCProbeEnricher {
                 case None ⇒ None
             }
 
+
         hOpt match {
             case Some(h) ⇒
                 val others = toks.filter(t ⇒ !h.all.contains(t))
@@ -222,19 +221,16 @@ object NCSortEnricher extends NCProbeEnricher {
 
                     require(subj.nonEmpty)
 
-                    val asc =
-                        h.order match {
-                            case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
-                            case None ⇒ None
-                        }
-
                     Some(
                         Match(
-                            asc = asc,
+                            asc = h.order match {
+                                case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
+                                case None ⇒ None
+                            },
                             main = h.sort.tokens,
                             stop = h.byTokens ++ h.orderTokens,
-                            subj = split(subj),
-                            by = split(by)
+                            subjSeq = split(subj),
+                            bySeq = split(by)
                         )
                     )
                 }
@@ -244,9 +240,9 @@ object NCSortEnricher extends NCProbeEnricher {
         }
     }
 
-//    def suitable(m: Match, notes: Seq[String], refName: String): Boolean =
-//        notes.forall(note ⇒ !isReference(TOK_ID, refName, note, m.all))
-//
+    // TODO:
+    private def suitable(m: Match, notes: Seq[String], refName: String): Boolean =
+        !hasReferences(TOK_ID, refName, notes, m.main)
 
     override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span): Boolean =
         startScopedSpan("enrich", parent,
@@ -256,42 +252,53 @@ object NCSortEnricher extends NCProbeEnricher {
             val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
             var changed: Boolean = false
 
-//            for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
-//                tryToMatch(toks) match {
-//                    case Some(m)
-////                        if suitable(m, m.subj.map(_.note), "subjNotes") &&
-////                        (m.by.isEmpty || suitable(m, m.by.map(_.note), "byNotes")) ⇒
-//                        ⇒
-//                        val params = mutable.ArrayBuffer.empty[(String, Any)]
-//
-//                        m.asc match {
-//                            case Some(asc) ⇒ params += "asc" → asc
-//                            case None ⇒ // No-op.
-//                        }
-//
-//                        def addNotes(seq: Seq[NoteData], notesName: String, idxsName: String): Unit = {
-//                            params += notesName → seq.map(_.note).asJava
-//                            params += idxsName → seq.map(_.indexes.asJava).asJava
-//                        }
-//
-//                        addNotes(m.subj, "subjNotes", "subjIndexes")
-//
-//                        if (m.by.nonEmpty)
-//                            addNotes(m.by, "byNotes", "byIndexes")
-//
-//                        val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params :_*)
-//
-//                        m.main.foreach(_.add(note))
-//                        m.stop.foreach(_.addStopReason(note))
-//
-//                        changed = true
-//
-//                    case None ⇒ // No-op.
-//
-//                if (changed)
-//                    buf += toks.toSet
-//            }
+            for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
+                tryToMatch(toks) match {
+                    case Some(m) ⇒
+                        for (subj ← m.subjSeq if suitable(m, subj.map(_.note), "subjNotes")) {
+                            def addNotes(
+                                params: ArrayBuffer[(String, Any)],
+                                seq: Seq[NoteData],
+                                notesName: String,
+                                idxsName: String
+                            ): ArrayBuffer[(String, Any)] = {
+                                params += notesName → seq.map(_.note).asJava
+                                params += idxsName → seq.map(_.indexes.asJava).asJava
+
+                                params
+                            }
+
+                            def mkParams(): ArrayBuffer[(String, Any)] = {
+                                val params = mutable.ArrayBuffer.empty[(String, Any)]
+
+                                if (m.asc.isDefined)
+                                    params += "asc" → m.asc.get
+
+                                addNotes(params, subj, "subjNotes", "subjIndexes")
+                            }
+
+                            def mkNote(params: ArrayBuffer[(String, Any)]): Unit = {
+                                val note = NCNlpSentenceNote(m.main.map(_.index), TOK_ID, params:_*)
+
+                                m.main.foreach(_.add(note))
+                                m.stop.foreach(_.addStopReason(note))
+
+                                changed = true
+                            }
+
+                            if (m.bySeq.nonEmpty)
+                                for (by ← m.bySeq if suitable(m, by.map(_.note), "byNotes"))
+                                    mkNote(addNotes(mkParams(), by, "byNotes", "byIndexes"))
+                            else
+                                mkNote(mkParams())
+                        }
+
+                    case None ⇒ // No-op.
+
+                if (changed)
+                    buf += toks.toSet
+            }
 
             changed
         }
-}
+}
\ No newline at end of file


[incubator-nlpcraft] 02/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-2
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit b53b17cd9aa9adc2376f0f0b7a87173caa31893d
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Mar 12 12:36:33 2020 +0300

    WIP.
---
 .../org/nlpcraft/model/impl/NCTokenLogger.scala    |  24 +++-
 .../nlp/enrichers/post/NCPostEnrichProcessor.scala | 145 +++++++++++++++------
 2 files changed, 124 insertions(+), 45 deletions(-)

diff --git a/src/main/scala/org/nlpcraft/model/impl/NCTokenLogger.scala b/src/main/scala/org/nlpcraft/model/impl/NCTokenLogger.scala
index e6b603f..2968839 100644
--- a/src/main/scala/org/nlpcraft/model/impl/NCTokenLogger.scala
+++ b/src/main/scala/org/nlpcraft/model/impl/NCTokenLogger.scala
@@ -465,11 +465,29 @@ object NCTokenLogger extends LazyLogging {
                             s"type=$t, indexes=[$getIndexes], note=$note"
 
                         case "nlpcraft:sort" ⇒
-                            val asc: Boolean = get("asc")
-                            val note = mkString("note")
+                            def n2s(l: java.util.List[String]): String = l.asScala.mkString(", ")
+                            def i2s(l: java.util.List[java.util.List[Int]]): String =
+                                l.asScala.map(_.asScala).map(p ⇒ s"[${p.mkString(", ")}]").mkString(", ")
+
+                            val subjNotes: java.util.List[String] = get("subjnotes")
+                            val subjIndexes: java.util.List[java.util.List[Int]] = get("subjindexes")
+
+                            var s = s"subjNotes=${n2s(subjNotes)}, subjIndexes=[${i2s(subjIndexes)}]"
+
+                            if (has("asc"))
+                                s = s"$s, asc=${get("asc")}"
 
-                            s"asc=$asc, indexes=[$getIndexes], note=$note"
+                            val byNotesOpt: Option[java.util.List[String]] = getOpt("bynotes")
 
+                            byNotesOpt match {
+                                case Some(byNotes) ⇒
+                                    val byIndexes: java.util.List[java.util.List[Int]] = get("byindexes")
+
+                                    s = s"$s, byNotes=${n2s(byNotes)}, byIndexes=[${i2s(byIndexes)}]"
+                                case None ⇒ // No-op.
+                            }
+
+                            s
                         case "nlpcraft:limit" ⇒
                             val limit = mkDouble3("limit")
                             val note = mkString("note")
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
index 6d71975..b41ce7b 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
@@ -231,9 +231,10 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
             Seq(
                 "nlpcraft:aggregation",
                 "nlpcraft:relation",
-                "nlpcraft:sort",
                 "nlpcraft:limit"
-            ).forall(t ⇒ fixIndexesReferences(t, ns, history))
+            ).forall(t ⇒ fixIndexesReferences(t, ns, history) &&
+            fixIndexesReferencesList("nlpcraft:limit", "subjIndexes", "subjNotes", ns, history) &&
+            fixIndexesReferencesList("nlpcraft:limit", "byIndexes", "byNotes", ns, history))
 
         if (res)
             // Validation (all indexes calculated well)
@@ -251,7 +252,57 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
     }
 
     /**
+      *
+      * @param ns
+      * @param idxs
+      * @param notesType
+      * @param id
+      * @return
+      */
+    private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, id: String): Boolean = {
+        val types =
+            idxs.flatMap(idx ⇒ {
+                val types = ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)
+
+                types.size match {
+                    case 0 ⇒ None
+                    case 1 ⇒ Some(types.head)
+                    case _ ⇒ throw new AssertionError(s"Unexpected tokes: ${ns(idx)}")
+                }
+            }).distinct
+
+        /**
+        Example:
+             1. Sentence 'maximum x' (single element related function)
+              - maximum is aggregate function linked to date element.
+              - x defined as 2 elements: date and num.
+              So, the variant 'maximum x (as num)' should be excluded.
+
+              2. Sentence 'compare x and y' (multiple elements related function)
+              - compare is relation function linked to date element.
+              - x and y are defined as 2 elements: date and num.
+              So, variants 'x (as num) and x (as date)'  and 'x (as date) and x (as num)'
+              shouldn't be excluded, but the invalid relation should be deleted for these combinations.
+          */
+
+        types.size match {
+            case 0 ⇒ throw new AssertionError("Unexpected empty types")
+            case 1 ⇒ types.head == notesType
+            case _ ⇒
+                // Equal elements should be processed together with function element.
+                if (types.size == 1)
+                    false
+                else {
+                    ns.removeNote(id)
+
+                    true
+                }
+        }
+    }
+
+    /**
       * Fixes notes with references to other notes indexes.
+      * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kinds of references.
       *
       * @param noteType Note type.
       * @param ns Sentence.
@@ -262,7 +313,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
         ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
             tok.getNoteOpt(noteType, "indexes") match {
                 case Some(n) ⇒
-                    val idxs = n.data[java.util.List[Int]]("indexes").asScala
+                    val idxs: Seq[Int] = n.data[java.util.List[Int]]("indexes").asScala
                     var fixed = idxs
 
                     history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) }
@@ -280,48 +331,58 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
             }
         )
 
-        ns.flatMap(_.getNotes(noteType)).forall(p = rel ⇒ {
-            val idxs = rel.data[java.util.List[Int]]("indexes")
-            val notesType = rel.data[String]("note")
+        ns.flatMap(_.getNotes(noteType)).forall(
+            n ⇒ checkRelation(ns, n.data[java.util.List[Int]]("indexes").asScala, n.data[String]("note"), n.id)
+        )
+    }
+
+    /**
+      * Fixes notes with references list to other notes indexes.
+      *
+      * @param noteType Note type.
+      * @param idxsField Indexes field.
+      * @param noteField Note field.
+      * @param ns Sentence.
+      * @param history Indexes transformation history.
+      * @return Valid flag.
+      */
+    private def fixIndexesReferencesList(
+        noteType: String,
+        idxsField: String,
+        noteField: String,
+        ns: NCNlpSentence,
+        history: Seq[(Int, Int)]
+    ): Boolean = {
+        ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
+            tok.getNoteOpt(noteType, idxsField) match {
+                case Some(n) ⇒
+                    val idxs: Seq[Seq[Int]] = n.data[java.util.List[java.util.List[Int]]](idxsField).asScala.map(_.asScala)
+                    var fixed = idxs
+
+                    history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i)) }
 
-                    val types =
-                        idxs.asScala.flatMap(idx ⇒ {
-                            val types = ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)
+                    fixed = fixed.distinct
 
-                            types.size match {
-                                case 0 ⇒ None
-                                case 1 ⇒ Some(types.head)
-                                case _ ⇒ throw new AssertionError(s"Unexpected tokes: ${ns(idx)}")
-                            }
-                        }).distinct
-
-                    /**
-                    Example:
-                     1. Sentence 'maximum x' (single element related function)
-                      - maximum is aggregate function linked to date element.
-                      - x defined as 2 elements: date and num.
-                      So, the variant 'maximum x (as num)' should be excluded.
-
-                      2. Sentence 'compare x and y' (multiple elements related function)
-                      - compare is relation function linked to date element.
-                      - x an y defined as 2 elements: date and num.
-                      So, variants 'x (as num) and x (as date)'  and 'x (as date) and x (as num)'
-                      should't be excluded, but invalid relation should be deleted for these combinations.
-                      */
-
-                    types.size match {
-                        case 0 ⇒ throw new AssertionError("Unexpected empty types")
-                        case 1 ⇒ types.head == notesType
-                        case _ ⇒
-                            // Equal elements should be processed together with function element.
-                            if (types.size == 1)
-                                false
-                            else {
-                                ns.removeNote(rel.id)
-
-                                true
-                            }
+                    if (idxs != fixed) {
+                        n += idxsField → fixed.map(_.asJava).asJava.asInstanceOf[java.io.Serializable]
+
+                        def x(seq: Seq[Seq[Int]]): String = s"[${seq.map(p ⇒ s"[${p.mkString(",")}]").mkString(", ")}]"
+
+                        logger.trace(s"`$noteType` note `indexes` fixed [old=${x(idxs)}}, new=${x(fixed)}]")
                     }
+                case None ⇒ // No-op.
+            }
+        )
+
+        ns.flatMap(_.getNotes(noteType)).forall(rel ⇒ {
+            val idxsList: util.List[util.List[Int]] = rel.data[java.util.List[java.util.List[Int]]](idxsField)
+            val notesTypes = rel.data[util.List[String]](noteField)
+
+            require(idxsList.size() == notesTypes.size())
+
+            idxsList.asScala.zip(notesTypes.asScala).forall {
+                case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel.id)
+            }
         })
     }
 


[incubator-nlpcraft] 03/03: WIP.

Posted by se...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-2
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 468a82b0d845d13e4872be2f4b1b97baa24383f0
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Mar 12 14:55:06 2020 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   | 105 +++++++++++++--------
 1 file changed, 64 insertions(+), 41 deletions(-)

diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 8678c7d..4351213 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -35,13 +35,12 @@ import scala.collection.{Map, Seq, mutable}
   * Sort enricher.
   */
 object NCSortEnricher extends NCProbeEnricher {
-    private final val SORT: Seq[String] =
+    private final val SORT =
         Seq("sort", "rank", "classify", "order", "arrange", "organize", "segment", "shuffle").map(NCNlpCoreManager.stem)
 
-    private final val BY: Seq[String] =
-        Seq("by", "on", "with").map(NCNlpCoreManager.stem)
+    private final val BY: Seq[String] = Seq("by", "on", "with").map(NCNlpCoreManager.stem)
 
-    private final val ORDER: Seq[(String, Boolean)] = {
+    private final val ORDER = {
         val p = NCMacroParser()
 
         Seq(
@@ -56,11 +55,9 @@ object NCSortEnricher extends NCProbeEnricher {
         ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc) }
     }
 
-    require(SORT.size + BY.size + ORDER.size == (SORT ++ BY ++ ORDER.unzip._1).distinct.size)
+    private final val TOK_ID = "nlpcraft:sort"
 
-    private final val TOK_ID: String = "nlpcraft:sort"
-
-    private final val SORT_TYPES: Seq[String] = Seq(
+    private final val SORT_TYPES = Seq(
         "nlpcraft:continent",
         "nlpcraft:subcontinent",
         "nlpcraft:country",
@@ -86,18 +83,6 @@ object NCSortEnricher extends NCProbeEnricher {
             s"SORT x BY ORDER"
         )
 
-    // Validation.
-    SEQS.map(_.split(" ")).foreach(seq ⇒ {
-        require(seq.forall(p ⇒ p == "SORT" || p == "ORDER" || p == "BY" || p == "x"))
-
-        seq.groupBy(p ⇒ p).foreach { case (key, group) ⇒
-            key match {
-                case "x" ⇒ require(group.length <= 2)
-                case _ ⇒ require(group.length == 1)
-            }
-        }
-    })
-
     case class NoteData(note: String, indexes: Seq[Int])
 
     private case class Match(
@@ -113,17 +98,43 @@ object NCSortEnricher extends NCProbeEnricher {
         lazy val all: Seq[NCNlpSentenceToken] = main ++ stop
     }
 
-    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
-        super.start()
-    }
+    /**
+      *
+      */
+    private def validate() {
+        require(SORT.size + BY.size + ORDER.size == (SORT ++ BY ++ ORDER.unzip._1).distinct.size)
 
-    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
-        super.stop()
+        val seq1 = SORT.flatMap(_.split(" "))
+        val seq2 = BY.flatMap(_.split(" "))
+        val seq3 = ORDER.map(_._1).flatMap(_.split(" "))
+
+        require(seq1.size == seq1.distinct.size)
+        require(seq2.size == seq2.distinct.size)
+        require(seq3.size == seq3.distinct.size)
+
+        require(seq1.intersect(seq2).isEmpty)
+        require(seq1.intersect(seq3).isEmpty)
+        require(seq2.intersect(seq3).isEmpty)
+
+        SEQS.map(_.split(" ")).foreach(seq ⇒ {
+            require(seq.forall(p ⇒ p == "SORT" || p == "ORDER" || p == "BY" || p == "x"))
+
+            seq.groupBy(p ⇒ p).foreach { case (key, group) ⇒
+                key match {
+                    case "x" ⇒ require(group.length <= 2)
+                    case _ ⇒ require(group.length == 1)
+                }
+            }
+        })
     }
 
-    // [Token] -> [NoteData]
-    // [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
-    // [[A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4)]]
+    /**
+      * [Token] -> [NoteData]
+      * [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
+      * [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
+      *
+      * @param toks
+      */
     private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
         val all: Seq[NoteData] = toks.
             flatten.
@@ -132,20 +143,27 @@ object NCSortEnricher extends NCProbeEnricher {
             sortBy(_.indexes.head)
 
         if (all.nonEmpty) {
+            val first = all.head.indexes.head
+            val last = all.last.indexes.last
+
             val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
 
-            def go(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
                 seq += nd
 
                 all.
-                    filter(p ⇒ p.indexes.head == nd.indexes.last + 1).
-                    foreach(go(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                    filter(p ⇒ nd.indexes.last < p.indexes.head  && {
+                        val between = toks.slice(nd.indexes.last, p.indexes.head - 1)
 
-                if (seq.nonEmpty && seq.head.indexes.head == toks.head.index && seq.last.indexes.last == toks.last.index)
+                        between.isEmpty || between.forall(_.isStopWord)
+                    }).
+                    foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+
+                if (seq.nonEmpty && seq.head.indexes.head == first && seq.last.indexes.last == last)
                     res += seq
             }
 
-            go(all.head)
+            fill(all.head)
 
             res
         }
@@ -198,7 +216,6 @@ object NCSortEnricher extends NCProbeEnricher {
                 case None ⇒ None
             }
 
-
         hOpt match {
             case Some(h) ⇒
                 val others = toks.filter(t ⇒ !h.all.contains(t))
@@ -240,11 +257,7 @@ object NCSortEnricher extends NCProbeEnricher {
         }
     }
 
-    // TODO:
-    private def suitable(m: Match, notes: Seq[String], refName: String): Boolean =
-        !hasReferences(TOK_ID, refName, notes, m.main)
-
-    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span): Boolean =
+    override def enrich(mdl: NCModelDecorator, ns: NCNlpSentence, meta: Map[String, Serializable], parent: Span): Boolean =
         startScopedSpan("enrich", parent,
             "srvReqId" → ns.srvReqId,
             "modelId" → mdl.model.getId,
@@ -255,7 +268,7 @@ object NCSortEnricher extends NCProbeEnricher {
             for (toks ← ns.tokenMixWithStopWords() if areSuitableTokens(buf, toks))
                 tryToMatch(toks) match {
                     case Some(m) ⇒
-                        for (subj ← m.subjSeq if suitable(m, subj.map(_.note), "subjNotes")) {
+                        for (subj ← m.subjSeq if !hasReferences(TOK_ID, "subjNotes", subj.map(_.note), m.main)) {
                             def addNotes(
                                 params: ArrayBuffer[(String, Any)],
                                 seq: Seq[NoteData],
@@ -287,7 +300,7 @@ object NCSortEnricher extends NCProbeEnricher {
                             }
 
                             if (m.bySeq.nonEmpty)
-                                for (by ← m.bySeq if suitable(m, by.map(_.note), "byNotes"))
+                                for (by ← m.bySeq if !hasReferences(TOK_ID, "byNotes", by.map(_.note), m.main))
                                     mkNote(addNotes(mkParams(), by, "byNotes", "byIndexes"))
                             else
                                 mkNote(mkParams())
@@ -301,4 +314,14 @@ object NCSortEnricher extends NCProbeEnricher {
 
             changed
         }
+
+    override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
+        validate()
+
+        super.start()
+    }
+
+    override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
+        super.stop()
+    }
 }
\ No newline at end of file