You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/29 13:36:46 UTC
[incubator-nlpcraft] branch NLPCRAFT-456 updated: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-456
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-456 by this push:
     new a9dc535  WIP.
a9dc535 is described below

commit a9dc535b9eeb5ef6c3aecce521f12513184c6e12
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Sep 29 16:36:36 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala    | 10 +++++-----
 .../probe/mgrs/sentence/NCSentenceManager.scala       | 19 ++++---------------
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index a39edfd..26bda8b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -607,8 +607,8 @@ object NCModelEnricher extends NCProbeEnricher {
       */
     private def normalize(ns: Sentence): Unit = {
         // Find and removes user notes if sentence contains notes with similar structure but less count of swallowed stop-words.
-        // These stop-words can be used fro detection another user tokens and harmless if they are free words.
-        // Notes with links and with references on them - aren't touched.
+        // These stop-words can be used for detecting other user tokens, and it is harmless if they remain free words.
+        // Notes with links and notes with references to them are ignored.
         val usrNotes = ns.flatten.filter(_.isUser).distinct
         val links = NCSentenceManager.getLinks(usrNotes)
         val parts = NCSentenceManager.getPartKeys(usrNotes)
@@ -658,9 +658,9 @@ object NCModelEnricher extends NCProbeEnricher {
 
                     lazy val noteOk2 =
                         n.tokenIndexes == toksIdxsSorted ||
-                            n.tokenIndexes.containsSlice(toksIdxsSorted) &&
-                                U.isContinuous(toksIdxsSorted) &&
-                                U.isContinuous(n.tokenIndexes)
+                        n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+                        U.isContinuous(toksIdxsSorted) &&
+                        U.isContinuous(n.tokenIndexes)
 
                     noteOk1 || noteOk2
                 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 2643630..50137a2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -684,7 +684,7 @@ object NCSentenceManager extends NCService {
       * lengths - the winning note is chosen based on this priority.
       */
     @throws[NCE]
-    private def collapseSentence(sen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+    def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
         // Always deletes `similar` notes.
         // Some words with same note type can be detected various ways.
         // We keep only one variant -  with `best` direct and sparsity parameters,
@@ -762,18 +762,16 @@ object NCSentenceManager extends NCService {
 
         // There are optimizations below. Similar variants by some criteria deleted.
 
-        def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
-
         // Drops similar sentences with same notes structure based on greedy elements. Keeps with more notes found.
         val notGreedyElems =
             mdl.getElements.asScala.flatMap(e => if (!e.isGreedy.orElse(mdl.isGreedy)) Some(e.getId) else None).toSet
 
-        sens = sens.groupBy(notNlpNotes(_).groupBy(_.noteType).keys.toSeq.sorted.distinct).
+        sens = sens.groupBy(p => getNotNlpNotes(p.tokens).groupBy(_.noteType).keys.toSeq.sorted.distinct).
             flatMap { case (types, sensSeq) =>
                 if (types.exists(notGreedyElems.contains))
                     sensSeq
                 else {
-                    val m: Map[NCNlpSentence, Int] = sensSeq.map(p => p -> notNlpNotes(p).size).toMap
+                    val m: Map[NCNlpSentence, Int] = sensSeq.map(p => p -> getNotNlpNotes(p.tokens).size).toMap
 
                     val max = m.values.max
 
@@ -794,7 +792,7 @@ object NCSentenceManager extends NCService {
             }.map { case ((sen, _), _) => sen }
 
         // Drops similar sentences. Among similar sentences we prefer one with minimal free words count.
-        sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
+        sens = sens.groupBy(p => getNotNlpNotes(p.tokens).map(_.getKey(withIndexes = false))).
             map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq
 
@@ -833,15 +831,6 @@ object NCSentenceManager extends NCService {
 
     /**
       *
-      * @param mdl
-      * @param sen
-      * @param lastPhase
-      */
-    def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] =
-        collapseSentence(sen, mdl, lastPhase)
-
-    /**
-      *
       * @param srvReqId
       */
     def clearRequestData(srvReqId: String): Unit = combCache -= srvReqId