Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/03/10 00:37:53 UTC

[incubator-nlpcraft] 10/17: WIP.

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch NLPCRAFT-261
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 5b6f191effe9f52e7940ea28e237800ce98a81e3
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Tue Mar 9 12:29:57 2021 +0300

    WIP.
---
 .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala  |   2 +
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala |   1 +
 .../probe/mgrs/sentence/NCComboHelper.java         |   7 +-
 .../probe/mgrs/sentence/NCSentenceManager.scala    | 106 ++++++++-------------
 4 files changed, 48 insertions(+), 68 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index c328e57..d07f86b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -433,6 +433,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
             while (continue) {
                 step = step + 1
 
+                println(s"step=$step")
+
                 if (step >= MAX_NESTED_TOKENS)
                     throw new NCE(s"Stack overflow on nested tokens processing (> $MAX_NESTED_TOKENS).")
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 13b7d45..0a11314 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -21,6 +21,7 @@ import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer, _}
 import org.apache.nlpcraft.model._
+import org.apache.nlpcraft.model.impl.NCTokenLogger
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCComboHelper.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCComboHelper.java
index da2d3fd..6ed99b9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCComboHelper.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCComboHelper.java
@@ -28,7 +28,7 @@ import java.util.concurrent.RecursiveTask;
 import static java.util.stream.Collectors.toList;
 
 /**
- *
+ * It is not converted to Scala because of performance problems with the implicit conversions between Scala and Java Long values.
  */
 class NCComboHelper extends RecursiveTask<List<Long>> {
     private static final long THRESHOLD = (long)Math.pow(2, 20);
@@ -53,6 +53,9 @@ class NCComboHelper extends RecursiveTask<List<Long>> {
      * @return
      */
     static <T> List<List<T>> findCombinations(List<Set<T>> words, ForkJoinPool pool) {
+        assert words != null && !words.isEmpty();
+        assert pool != null;
+
         // Build dictionary of unique words.
         List<T> dict = words.stream().flatMap(Collection::stream).distinct().collect(toList());
 
@@ -70,7 +73,7 @@ class NCComboHelper extends RecursiveTask<List<Long>> {
         // Prepare Fork/Join task to iterate over the power set of all combinations.
         return pool.invoke(
             new NCComboHelper(
-                1,
+                words.stream().mapToInt(Set::size).max().orElseThrow() - 1,
                 (long)Math.pow(2, dict.size()),
                 wordBits,
                 wordCounts
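
For context, NCSentenceManager calls this helper through the Java/Scala collection converters (see the NCSentenceManager.scala diff below). The following is a minimal sketch of that call pattern, not code from the patch: the wordNotes value and the String element type are illustrative stand-ins for the per-word-index note sets, same-package (package-private) access to NCComboHelper is assumed, and the import assumes Scala 2.13's converters.

    import java.util.concurrent.ForkJoinPool
    import scala.jdk.CollectionConverters._

    // Illustrative input: one Set per word index, holding the notes that compete
    // for that index (String stands in for NCNlpSentenceNote here).
    val wordNotes: Seq[Set[String]] = Seq(Set("A", "B"), Set("B", "C"))
    val pool = new ForkJoinPool()

    // Same conversion pattern as in NCSentenceManager: Scala -> Java on the way in,
    // Java -> Scala on the way out. The long bit masks the fork/join task works with
    // never cross into Scala, which is the performance concern the new class
    // comment refers to.
    val combos: Seq[Seq[String]] =
        NCComboHelper.findCombinations(wordNotes.map(_.asJava).asJava, pool)
            .asScala.map(_.asScala.toSeq).toSeq
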
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index e0dd59c..af58782 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -585,7 +585,7 @@ object NCSentenceManager extends NCService {
       * lengths - the winning note is chosen based on this priority.
       */
     @throws[NCE]
-    private def collapseSentence(thisSen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+    private def collapseSentence(sen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
         def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
             if (lastPhase)
                 dropAbstract(mdl, ns)
@@ -598,37 +598,37 @@ object NCSentenceManager extends NCService {
         // We keep only one variant -  with `best` direct and sparsity parameters,
         // other variants for these words are redundant.
         val redundant: Seq[NCNlpSentenceNote] =
-        thisSen.flatten.filter(!_.isNlp).distinct.
-            groupBy(_.getKey()).
-            map(p ⇒ p._2.sortBy(p ⇒
-                (
-                    // System notes don't have such flags.
-                    if (p.isUser) {
-                        if (p.isDirect)
-                            0
+            sen.flatten.filter(!_.isNlp).distinct.
+                groupBy(_.getKey()).
+                map(p ⇒ p._2.sortBy(p ⇒
+                    (
+                        // System notes don't have such flags.
+                        if (p.isUser) {
+                            if (p.isDirect)
+                                0
+                            else
+                                1
+                        }
                         else
-                            1
-                    }
-                    else
-                        0,
-                    if (p.isUser)
-                        p.sparsity
-                    else
-                        0
-                )
-            )).
-            flatMap(_.drop(1)).
-            toSeq
+                            0,
+                        if (p.isUser)
+                            p.sparsity
+                        else
+                            0
+                    )
+                )).
+                flatMap(_.drop(1)).
+                toSeq
 
-        redundant.foreach(thisSen.removeNote)
+        redundant.foreach(sen.removeNote)
 
         var delCombs: Seq[NCNlpSentenceNote] =
-            getNotNlpNotes(thisSen).
-                flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)).
+            getNotNlpNotes(sen).
+                flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ sen(i))).filter(_ != note)).
                 distinct
 
         // Optimization. Deletes all wholly swallowed notes.
-        val links = getLinks(thisSen.flatten)
+        val links = getLinks(sen.flatten)
 
         val swallowed =
             delCombs.
@@ -638,7 +638,7 @@ object NCSentenceManager extends NCService {
                 filter(getPartKeys(_).isEmpty).
                 flatMap(note ⇒ {
                     val noteWordsIdxs = note.wordIndexes.toSet
-                    val key = PartKey(note, thisSen)
+                    val key = PartKey(note, sen)
 
                     val delCombOthers =
                         delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
@@ -646,9 +646,10 @@ object NCSentenceManager extends NCService {
                     if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
                 })
 
+
         delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
-        addDeleted(thisSen, thisSen, swallowed)
-        swallowed.foreach(thisSen.removeNote)
+        addDeleted(sen, sen, swallowed)
+        swallowed.foreach(sen.removeNote)
 
         val toksByIdx: Seq[Set[NCNlpSentenceNote]] =
             delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
@@ -656,49 +657,22 @@ object NCSentenceManager extends NCService {
                 map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
                 toSeq.sortBy(-_.size)
 
-//        val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
-
         var sens =
             if (delCombs.nonEmpty) {
-                val deleted = mutable.ArrayBuffer.empty[Set[NCNlpSentenceNote]]
-
-//                val combs = (minDelSize to delCombs.size).
-//                    flatMap(i ⇒
-//                        delCombs.combinations(i).
-//                            filter(delComb ⇒
-//                                !toksByIdx.exists(
-//                                    rec ⇒
-//                                        rec.size - delCombs.size <= 1 &&
-//                                            rec.count(note ⇒ !delComb.contains(note)) > 1
-//                                )
-//                            )
-//                    ).
-//                    sortBy(_.size).
-//                    map(_.toSet)
-//
-                val combs = NCComboHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala)
-
                 val sens =
-                    combs.
-                        flatMap(delComb ⇒
-                            // Already processed with less subset of same deleted tokens.
-                            if (!deleted.exists(_.subsetOf(delComb.toSet))) {
-                                val nsClone = thisSen.clone()
+                    NCComboHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala).
+                        flatMap(delComb ⇒ {
+                            val nsClone = sen.clone()
 
-                                // Saves deleted notes for sentence and their tokens.
-                                addDeleted(thisSen, nsClone, delComb)
-                                delComb.foreach(nsClone.removeNote)
+                            // Saves deleted notes for sentence and their tokens.
+                            addDeleted(sen, nsClone, delComb)
+                            delComb.foreach(nsClone.removeNote)
 
-                                // Has overlapped notes for some tokens.
-                                require(!nsClone.exists(_.count(!_.isNlp) > 1))
+                            // Has overlapped notes for some tokens.
+                            require(!nsClone.exists(_.count(!_.isNlp) > 1))
 
-                                deleted += delComb.toSet
-
-                                collapse0(nsClone)
-                            }
-                            else
-                                None
-                        )
+                            collapse0(nsClone)
+                        })
 
                 // It removes sentences which have only one difference - 'direct' flag of their user tokens.
                 // `Direct` sentences have higher priority.
@@ -735,7 +709,7 @@ object NCSentenceManager extends NCService {
                 m.values.map(_.sentence).toSeq
             }
             else
-                collapse0(thisSen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
+                collapse0(sen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
 
         sens = sens.distinct
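
The reindented block at the top of collapseSentence (the redundant computation) is easier to follow in isolation. Here is a self-contained sketch of the same selection rule, where Note is a hypothetical stand-in for NCNlpSentenceNote trimmed to the fields the sort key reads:

    // Hypothetical reduced form of NCNlpSentenceNote.
    final case class Note(key: String, isUser: Boolean, isDirect: Boolean, sparsity: Int)

    // For every note key keep only the "best" variant: direct user notes win over
    // non-direct ones, then the lower sparsity wins; system notes carry neither flag.
    // Everything after the first element per key is redundant.
    def redundant(notes: Seq[Note]): Seq[Note] =
        notes.distinct.
            groupBy(_.key).
            values.
            flatMap(_.sortBy(n ⇒ (
                if (n.isUser) { if (n.isDirect) 0 else 1 } else 0,
                if (n.isUser) n.sparsity else 0
            )).drop(1)).
            toSeq

The first (best) note per key stays on the sentence; everything returned here corresponds to what the patch feeds to sen.removeNote.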