You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/03/10 00:37:47 UTC
[incubator-nlpcraft] 04/17: WIP.
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-261
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit b7ab66f52702ddd00f2b88219963154b833401c8
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Thu Mar 4 16:05:48 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 290 +++++++++++----------
.../model/NCEnricherNestedModelSpec4.scala | 53 ++++
.../probe/mgrs/nlp/enrichers/model/Test1.java | 135 ++++++++++
.../probe/mgrs/nlp/enrichers/model/Test2.java | 114 ++++++++
4 files changed, 453 insertions(+), 139 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 113e088..f530327 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -22,10 +22,9 @@ import org.apache.nlpcraft.common.NCE
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.model.NCModel
-import java.util
-import java.util.{List ⇒ JList}
import java.io.{Serializable ⇒ JSerializable}
-import java.util.Collections
+import java.util
+import java.util.{Collections, List ⇒ JList}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, Set, mutable}
@@ -42,8 +41,10 @@ object NCNlpSentence extends LazyLogging {
require(start <= end)
private def in(i: Int): Boolean = i >= start && i <= end
+
def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
}
+
object PartKey {
def apply(m: util.HashMap[String, JSerializable]): PartKey = {
def get[T](name: String): T = m.get(name).asInstanceOf[T]
@@ -71,7 +72,7 @@ object NCNlpSentence extends LazyLogging {
noteLinks ++=
(for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
yield NoteLink(name, idxs.sorted)
- )
+ )
}
if (n.contains("subjnotes")) add("subjnotes", "subjindexes")
@@ -409,7 +410,8 @@ object NCNlpSentence extends LazyLogging {
"stopWord" → stop,
"bracketed" → false,
"direct" → direct,
- "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict") else false),
+ "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict")
+ else false),
"english" → nsCopyToks.forall(_.getNlpNote.data[Boolean]("english")),
"swear" → nsCopyToks.exists(_.getNlpNote.data[Boolean]("swear"))
)
@@ -456,7 +458,8 @@ object NCNlpSentence extends LazyLogging {
var fixed = idxs
history.foreach {
- case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct)
+ case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew
+ else i).distinct)
}
if (fixed.forall(_.size == 1))
@@ -467,6 +470,7 @@ object NCNlpSentence extends LazyLogging {
ok = false
case None ⇒ // No-op.
}
+
ok &&
ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
rel.dataOpt[JList[Int]](idxsField) match {
@@ -518,9 +522,9 @@ object NCNlpSentence extends LazyLogging {
val res =
fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
- fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
+ fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
if (res) {
// Validation (all indexes calculated well)
@@ -538,115 +542,40 @@ object NCNlpSentence extends LazyLogging {
res
}
-}
-
-import org.apache.nlpcraft.common.nlp.NCNlpSentence._
-
-/**
- * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and
- * each note is a collection of KV pairs.
- *
- * @param srvReqId Server request ID.
- * @param text Normalized text.
- * @param enabledBuiltInToks Enabled built-in tokens.
- * @param tokens Initial buffer.
- * @param deletedNotes Deleted overridden notes with their tokens.
- */
-class NCNlpSentence(
- val srvReqId: String,
- val text: String,
- val enabledBuiltInToks: Set[String],
- override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
- private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
- private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
- private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
-) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
- @transient
- private var hash: java.lang.Integer = _
-
- private def calcHash(): Int =
- Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
-
- private def addDeleted(sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
- sen.deletedNotes ++= dels.map(n ⇒ {
- val savedDelNote = n.clone()
- val savedDelToks = n.tokenIndexes.map(idx ⇒ this(idx).clone())
-
- val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
-
- // Deleted note's tokens should contains only nlp data and deleted notes.
- for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
- savedDelTok.remove(mainNote)
-
- savedDelNote → savedDelToks
- })
-
- // Deep copy.
- override def clone(): NCNlpSentence =
- new NCNlpSentence(
- srvReqId,
- text,
- enabledBuiltInToks,
- tokens.map(_.clone()),
- deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
- initNlpNotes = initNlpNotes
- )
-
- /**
- * Utility method that gets set of notes for given note type collected from
- * tokens in this sentence. Notes are sorted in the same order they appear
- * in this sentence.
- *
- * @param noteType Note type.
- */
- def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).distinct
-
- /**
- * Utility method that removes note with given ID from all tokens in this sentence.
- * No-op if such note wasn't found.
- *
- * @param note Note.
- */
- def removeNote(note: NCNlpSentenceNote): Unit = this.foreach(_.remove(note))
-
- //noinspection HashCodeUsesVar
- override def hashCode(): Int = {
- if (hash == null)
- hash = calcHash()
-
- hash
- }
-
- def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
- val fixed = note.clone(kvs: _*)
-
- this.filter(t ⇒ t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t ⇒ {
- t.remove(note)
- t.add(fixed)
- })
-
- hash = null
- }
private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit =
if (!mdl.getAbstractTokens.isEmpty) {
val notes = ns.flatten
- val keys = getPartKeys(notes :_*)
+ val keys = getPartKeys(notes: _*)
val noteLinks = getLinks(notes)
notes.filter(n ⇒ {
val noteToks = ns.tokens.filter(_.contains(n))
mdl.getAbstractTokens.contains(n.noteType) &&
- !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
+ !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
+ !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
}).foreach(ns.removeNote)
}
private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] =
toks.flatten.filter(!_.isNlp).distinct
+ private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
+ sen.deletedNotes ++= dels.map(n ⇒ {
+ val savedDelNote = n.clone()
+ val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone())
+
+ val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
+
+ // Deleted note's tokens should contain only nlp data and deleted notes.
+ for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
+ savedDelTok.remove(mainNote)
+
+ savedDelNote → savedDelToks
+ })
+
/**
* This collapser handles several tasks:
* - "overall" collapsing after all other individual collapsers had their turn.
@@ -656,12 +585,13 @@ class NCNlpSentence(
* lengths - the winning note is chosen based on this priority.
*/
@throws[NCE]
- def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+ private def collapseSentence(thisSen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
if (lastPhase)
dropAbstract(mdl, ns)
- if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None
+ if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns)
+ else None
}
// Always deletes `similar` notes.
@@ -669,37 +599,37 @@ class NCNlpSentence(
// We keep only one variant - with `best` direct and sparsity parameters,
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
- this.flatten.filter(!_.isNlp).distinct.
- groupBy(_.getKey()).
- map(p ⇒ p._2.sortBy(p ⇒
- (
- // System notes don't have such flags.
- if (p.isUser) {
- if (p.isDirect)
- 0
- else
- 1
- }
- else
- 0,
- if (p.isUser)
- p.sparsity
- else
+ thisSen.flatten.filter(!_.isNlp).distinct.
+ groupBy(_.getKey()).
+ map(p ⇒ p._2.sortBy(p ⇒
+ (
+ // System notes don't have such flags.
+ if (p.isUser) {
+ if (p.isDirect)
0
- )
- )).
- flatMap(_.drop(1)).
- toSeq
+ else
+ 1
+ }
+ else
+ 0,
+ if (p.isUser)
+ p.sparsity
+ else
+ 0
+ )
+ )).
+ flatMap(_.drop(1)).
+ toSeq
- redundant.foreach(this.removeNote)
+ redundant.foreach(thisSen.removeNote)
var delCombs: Seq[NCNlpSentenceNote] =
- getNotNlpNotes(this).
- flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ this(i))).filter(_ != note)).
+ getNotNlpNotes(thisSen).
+ flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)).
distinct
// Optimization. Deletes all wholly swallowed notes.
- val links = getLinks(this.flatten)
+ val links = getLinks(thisSen.flatten)
val swallowed =
delCombs.
@@ -709,22 +639,24 @@ class NCNlpSentence(
filter(getPartKeys(_).isEmpty).
flatMap(note ⇒ {
val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, this)
+ val key = PartKey(note, thisSen)
val delCombOthers =
- delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
+ delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n)
+ else None)
- if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
+ if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note)
+ else None
})
delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
- addDeleted(this, swallowed)
- swallowed.foreach(this.removeNote)
+ addDeleted(thisSen, thisSen, swallowed)
+ swallowed.foreach(thisSen.removeNote)
- val toksByIdx: Seq[Seq[NCNlpSentenceNote]] =
+ val toksByIdx: Seq[Set[NCNlpSentenceNote]] =
delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
groupBy { case (idx, _) ⇒ idx }.
- map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }.
+ map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
toSeq.sortBy(-_.size)
val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
@@ -741,7 +673,7 @@ class NCNlpSentence(
!toksByIdx.exists(
rec ⇒
rec.size - delCombs.size <= 1 &&
- rec.count(note ⇒ !delComb.contains(note)) > 1
+ rec.count(note ⇒ !delComb.contains(note)) > 1
)
)
).
@@ -750,10 +682,10 @@ class NCNlpSentence(
flatMap(delComb ⇒
// Already processed with less subset of same deleted tokens.
if (!deleted.exists(_.subsetOf(delComb))) {
- val nsClone = this.clone()
+ val nsClone = thisSen.clone()
// Saves deleted notes for sentence and their tokens.
- addDeleted(nsClone, delComb)
+ addDeleted(thisSen, nsClone, delComb)
delComb.foreach(nsClone.removeNote)
// Has overlapped notes for some tokens.
@@ -787,7 +719,8 @@ class NCNlpSentence(
p.clone().filter(_._1 != "direct")
)
- (Key(get(sysNotes), get(userNotes)), sen, nlpNotes.map(p ⇒ if (p.isDirect) 0 else 1).sum)
+ (Key(get(sysNotes), get(userNotes)), sen, nlpNotes.map(p ⇒ if (p.isDirect) 0
+ else 1).sum)
}).
foreach { case (key, sen, directCnt) ⇒
m.get(key) match {
@@ -802,7 +735,7 @@ class NCNlpSentence(
m.values.map(_.sentence).toSeq
}
else
- collapse0(this).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
+ collapse0(thisSen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
sens = sens.distinct
@@ -822,6 +755,81 @@ class NCNlpSentence(
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
+}
+
+import org.apache.nlpcraft.common.nlp.NCNlpSentence._
+
+/**
+ * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and
+ * each note is a collection of KV pairs.
+ *
+ * @param srvReqId Server request ID.
+ * @param text Normalized text.
+ * @param enabledBuiltInToks Enabled built-in tokens.
+ * @param tokens Initial buffer.
+ * @param deletedNotes Deleted overridden notes with their tokens.
+ */
+class NCNlpSentence(
+ val srvReqId: String,
+ val text: String,
+ val enabledBuiltInToks: Set[String],
+ override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
+ private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
+ private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
+ private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
+) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
+ @transient
+ private var hash: java.lang.Integer = _
+
+ private def calcHash(): Int =
+ Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
+
+ // Deep copy.
+ override def clone(): NCNlpSentence =
+ new NCNlpSentence(
+ srvReqId,
+ text,
+ enabledBuiltInToks,
+ tokens.map(_.clone()),
+ deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
+ initNlpNotes = initNlpNotes
+ )
+
+ /**
+ * Utility method that gets set of notes for given note type collected from
+ * tokens in this sentence. Notes are sorted in the same order they appear
+ * in this sentence.
+ *
+ * @param noteType Note type.
+ */
+ def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).distinct
+
+ /**
+ * Utility method that removes note with given ID from all tokens in this sentence.
+ * No-op if such note wasn't found.
+ *
+ * @param note Note.
+ */
+ def removeNote(note: NCNlpSentenceNote): Unit = this.foreach(_.remove(note))
+
+ //noinspection HashCodeUsesVar
+ override def hashCode(): Int = {
+ if (hash == null)
+ hash = calcHash()
+
+ hash
+ }
+
+ def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
+ val fixed = note.clone(kvs: _*)
+
+ this.filter(t ⇒ t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t ⇒ {
+ t.remove(note)
+ t.add(fixed)
+ })
+
+ hash = null
+ }
/**
* Returns flag are note notes equal (or similar) or not. Reason of ignored difference can be stopwords tokens.
@@ -931,4 +939,8 @@ class NCNlpSentence(
*
*/
def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
+
+ def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+ collapseSentence(this, mdl, lastPhase)
+ }
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
new file mode 100644
index 0000000..afdeaab
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
+
+import org.apache.nlpcraft.model.{NCElement, NCIntent, NCIntentMatch, NCModelAdapter, NCResult}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.collection.JavaConverters._
+
+/**
+ * Nested Elements test model.
+ */
+class NCNestedTestModel4 extends NCModelAdapter(
+ "nlpcraft.nested3.test.mdl", "Nested Data Test Model", "1.0"
+) {
+ override def getElements: util.Set[NCElement] =
+ Set(
+ NCTestElement("e1", "//[a-zA-Z0-9]+//"),
+ NCTestElement("e2", "the ^^(id == 'e1')^^")
+ )
+
+ override def getAbstractTokens: util.Set[String] = Set("e1").asJava
+ override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
+
+ @NCIntent("intent=onE2 term(t1)={id == 'e2'}[8, 100]")
+ def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
+}
+
+/**
+ * Checks that the input `the a ` repeated 8 times matches intent `onE2`; it shouldn't be too slow.
+ */
+@NCTestEnvironment(model = classOf[NCNestedTestModel4], startClient = true)
+class NCEnricherNestedModelSpec4 extends NCTestContext {
+    @Test
+    def test(): Unit = checkIntent("the a " * 8, "onE2")
+}
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java
new file mode 100644
index 0000000..398e858
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java
@@ -0,0 +1,135 @@
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model;
+
+import com.google.common.collect.ImmutableSet;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+public class Test1 {
+ private static List<Set<String>> ROWS =
+ Arrays.asList(
+ ImmutableSet.of("A", "B", "C"),
+ ImmutableSet.of("B", "C", "D"),
+ ImmutableSet.of("B", "D")
+ );
+
+// // Uncomment to try the large data set. Takes a very long time to run. Normalized result is 256.
+// private static List<Set<String>> ROWS = Arrays.asList(
+// ImmutableSet.of("A", "B"),
+// ImmutableSet.of("C", "B"),
+// ImmutableSet.of("D", "E"),
+// ImmutableSet.of("D", "F"),
+// ImmutableSet.of("G", "H"),
+// ImmutableSet.of("I", "H"),
+// ImmutableSet.of("J", "K"),
+// ImmutableSet.of("L", "K"),
+// ImmutableSet.of("M", "N"),
+// ImmutableSet.of("M", "O"),
+// ImmutableSet.of("P", "Q"),
+// ImmutableSet.of("P", "R"),
+// ImmutableSet.of("S", "T"),
+// ImmutableSet.of("S", "U"),
+// ImmutableSet.of("V", "W"),
+// ImmutableSet.of("X", "W")
+// );
+
+ private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet());
+
+ // Goal: find the minimal set of combinations with the following property:
+ // after removing a combination's values from each row, every row has size <= 1.
+
+ // Expected solution: [C, B], [A, C, D], [A, B, D]
+ // Example:
+ // list - [C, B] = {{A}, {D}, {D}}
+ // list - [A, C, D] = {{B}, {B}, {B}}
+ // list - [A, B, D] = {{C}, {C}, {null}}
+
+
+ // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc
+
+ // Easiest.
+ public static void main(String[] args) {
+ long t = System.currentTimeMillis();
+
+ System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']');
+
+ // 1. Extends.
+ List<Set<String>> extRows = extendNulls();
+
+ // 2. All valid rows (permutation)
+ // Or manually permute, like https://stackoverflow.com/questions/17192796/generate-all-combinations-from-multiple-lists
+ Set<List<String>> allSingleOrNullRows = com.google.common.collect.Sets.cartesianProduct(extRows);
+
+ System.out.println("2. permuted [size=" + allSingleOrNullRows.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
+
+ // 3. Collects all suitable combinations.
+ Set<Set<String>> combs =
+ allSingleOrNullRows.
+ stream().
+ // Computes the combination whose removal leaves this single-or-empty row (the complement of the row in ALL).
+ map(row -> {
+ Set<String> copy = new HashSet<>(ALL);
+
+ copy.removeAll(row);
+
+ return copy;
+ }).
+ distinct().
+ filter(Test1::isSuitable).
+ collect(Collectors.toSet());
+
+ System.out.println("3. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
+
+ // 4. Normalize variants (keeps only minimal valid subsets, see task description)
+ Set<Set<String>> normCombs = squeeze(combs);
+
+ System.out.println("4. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
+ System.out.println("Norm results:" + normCombs);
+ }
+
+ /**
+ * Removes `candidate` from each row of ROWS.
+ * Return true if result list doesn't contain any row with size > 1.
+ * <p>
+ * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no.
+ */
+ private static boolean isSuitable(Set<String> candidate) {
+ for (Set<String> row : ROWS) {
+ Set<String> copy = new HashSet<>(row);
+
+ copy.removeAll(candidate);
+
+ if (copy.size() > 1) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ private static Set<Set<String>> squeeze(Set<Set<String>> combs) {
+ Set<Set<String>> normCombs = new HashSet<>();
+
+ for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) {
+ // Skips already added shorter variants.
+ if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) {
+ normCombs.add(comb);
+ }
+ }
+ return normCombs;
+ }
+
+ // Adds "" which denotes an empty row. For example, for the small ROWS it returns {{A, B, C, ""}, {B, C, D, ""}, {B, D, ""}}
+ private static List<Set<String>> extendNulls() {
+ return ROWS.stream().map(
+ p -> Stream.concat(p.stream(), Stream.of("")).collect(Collectors.toSet())
+ ).collect(Collectors.toList());
+ }
+}
+
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java
new file mode 100644
index 0000000..b8e5e42
--- /dev/null
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java
@@ -0,0 +1,114 @@
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class Test2 {
+// private static List<Set<String>> ROWS =
+// Arrays.asList(
+// ImmutableSet.of("A", "B", "C"),
+// ImmutableSet.of("B", "C", "D"),
+// ImmutableSet.of("B", "D")
+// );
+
+ // Large data set (currently enabled). Takes a very long time to run. Normalized result is 256.
+ private static List<Set<String>> ROWS = Arrays.asList(
+ ImmutableSet.of("A", "B"),
+ ImmutableSet.of("C", "B"),
+ ImmutableSet.of("D", "E"),
+ ImmutableSet.of("D", "F"),
+ ImmutableSet.of("G", "H"),
+ ImmutableSet.of("I", "H"),
+ ImmutableSet.of("J", "K"),
+ ImmutableSet.of("L", "K"),
+ ImmutableSet.of("M", "N"),
+ ImmutableSet.of("M", "O"),
+ ImmutableSet.of("P", "Q"),
+ ImmutableSet.of("P", "R"),
+ ImmutableSet.of("S", "T"),
+ ImmutableSet.of("S", "U"),
+ ImmutableSet.of("V", "W"),
+ ImmutableSet.of("X", "W")
+ );
+
+ private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet());
+
+ // Goal: find the minimal set of combinations with the following property:
+ // after removing a combination's values from each row, every row has size <= 1.
+
+ // Expected solution (for the small, commented-out ROWS above): [C, B], [A, C, D], [A, B, D]
+ // Example:
+ // list - [C, B] = {{A}, {D}, {D}}
+ // list - [A, C, D] = {{B}, {B}, {B}}
+ // list - [A, B, D] = {{C}, {C}, {null}}
+
+
+ // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc
+
+ // Easiest.
+ public static void main(String[] args) {
+ long t = System.currentTimeMillis();
+
+ System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']');
+
+ Set<Set<String>> combs = new HashSet<>();
+
+ for (int i = 1; i < ALL.size(); i++) {
+ combs.addAll(
+ Sets.combinations(ALL, i).
+ stream().
+ filter(Test2::isSuitable).
+ collect(Collectors.toSet())
+ );
+ }
+
+ System.out.println("2. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
+
+ // Normalize variants (keeps only minimal valid subsets, see task description)
+ Set<Set<String>> normCombs = squeeze(combs);
+
+ System.out.println("3. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
+ System.out.println("Norm results:" + normCombs);
+ }
+
+ private static Set<Set<String>> squeeze(Set<Set<String>> combs) {
+ Set<Set<String>> normCombs = new HashSet<>();
+
+ for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) {
+ // Skips already added shorter variants.
+ if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) {
+ normCombs.add(comb);
+ }
+ }
+ return normCombs;
+ }
+
+ /**
+ * Removes `candidate` from each row of ROWS.
+ * Return true if result list doesn't contain any row with size > 1.
+ * <p>
+ * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no.
+ */
+ private static boolean isSuitable(Set<String> candidate) {
+ for (Set<String> row : ROWS) {
+ Set<String> copy = new HashSet<>(row);
+
+ copy.removeAll(candidate);
+
+ if (copy.size() > 1) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+}
+