You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/03/08 12:26:24 UTC
[incubator-nlpcraft] 03/05: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-261
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 7815090de3aa042a4839793cdb65cd1677afbb86
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Mar 8 15:23:17 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 776 +--------------------
.../org/apache/nlpcraft/common/nlp/SyTest.java | 241 -------
.../org/apache/nlpcraft/probe/NCProbeBoot.scala | 2 +
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 3 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 7 +-
.../mgrs/sentence/NCSentenceManager.scala} | 380 +++-------
.../nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java | 278 --------
.../model/NCEnricherNestedModelSpec4.scala | 2 +-
.../probe/mgrs/nlp/enrichers/model/Test1.java | 135 ----
.../probe/mgrs/nlp/enrichers/model/Test2.java | 114 ---
10 files changed, 108 insertions(+), 1830 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 95a98a3..11d515a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -18,778 +18,17 @@
package org.apache.nlpcraft.common.nlp
import com.typesafe.scalalogging.LazyLogging
-import org.apache.nlpcraft.common.NCE
-import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
-import org.apache.nlpcraft.common.util.NCComboRecursiveTask
-import org.apache.nlpcraft.model.NCModel
-
-import java.io.{Serializable => JSerializable}
-import java.util
-import java.util.concurrent.ForkJoinPool
-import java.util.{Collections, Comparator, List => JList}
+
+import java.io.{Serializable ⇒ JSerializable}
+import java.util.{Collections, List ⇒ JList}
import scala.collection.JavaConverters._
-import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, Set, mutable}
import scala.language.implicitConversions
object NCNlpSentence extends LazyLogging {
- implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
-
case class NoteKey(start: Int, end: Int)
case class TokenKey(id: String, start: Int, end: Int)
case class NoteLink(note: String, indexes: Seq[Int])
-
- case class PartKey(id: String, start: Int, end: Int) {
- require(start <= end)
-
- private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
- }
-
- object PartKey {
- def apply(m: util.HashMap[String, JSerializable]): PartKey = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- PartKey(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
- PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
- }
-
- private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = {
- val noteLinks = mutable.ArrayBuffer.empty[NoteLink]
-
- for (n ← notes.filter(n ⇒ n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references"))
- noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala.sorted)
-
- for (n ← notes.filter(_.noteType == "nlpcraft:sort")) {
- def add(noteName: String, idxsName: String): Unit = {
- val names = n(noteName).asInstanceOf[JList[String]]
- val idxsSeq = n(idxsName).asInstanceOf[JList[JList[Int]]]
-
- require(names.size() == idxsSeq.size())
-
- noteLinks ++=
- (for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
- yield NoteLink(name, idxs.sorted)
- )
- }
-
- if (n.contains("subjnotes")) add("subjnotes", "subjindexes")
- if (n.contains("bynotes")) add("bynotes", "byindexes")
- }
-
- noteLinks
- }
-
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
- notes.
- filter(_.isUser).
- flatMap(n ⇒ {
- val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
-
- optList
- }).flatMap(_.asScala).map(m ⇒ PartKey(m)).distinct
-
- /**
- *
- * @param ns
- * @param idxs
- * @param notesType
- * @param note
- * @return
- */
- private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = {
- val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct
-
- /**
- * Example:
- * 1. Sentence 'maximum x' (single element related function)
- * - maximum is aggregate function linked to date element.
- * - x defined as 2 elements: date and num.
- * So, the variant 'maximum x (as num)' should be excluded.
- * *
- * 2. Sentence 'compare x and y' (multiple elements related function)
- * - compare is relation function linked to date element.
- * - x an y defined as 2 elements: date and num.
- * So, variants 'x (as num) and x (as date)' and 'x (as date) and x (as num)'
- * should not be excluded, but invalid relation should be deleted for these combinations.
- */
- types.size match {
- case 0 ⇒ false
- case 1 ⇒ types.head == notesType
- case _ ⇒
- // Equal elements should be processed together with function element.
- if (types.size == 1)
- false
- else {
- ns.removeNote(note)
-
- logger.trace(s"Removed note: $note")
-
- true
- }
- }
- }
-
- /**
- * Fixes notes with references to other notes indexes.
- * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references.
- *
- * @param noteType Note type.
- * @param idxsField Indexes field.
- * @param noteField Note field.
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @return Valid flag.
- */
- private def fixIndexesReferences(
- noteType: String,
- idxsField: String,
- noteField: String,
- ns: NCNlpSentence,
- history: Seq[(Int, Int)]
- ): Boolean = {
- ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒
- tok.getNoteOpt(noteType, idxsField) match {
- case Some(n) ⇒
- val idxs: Seq[Int] = n.data[JList[Int]](idxsField).asScala
- var fixed = idxs
-
- history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) }
-
- fixed = fixed.distinct
-
- if (idxs != fixed)
- ns.fixNote(n, "indexes" → fixed.asJava.asInstanceOf[JSerializable])
- case None ⇒ // No-op.
- }
- )
-
- ns.flatMap(_.getNotes(noteType)).forall(
- n ⇒ checkRelation(ns, n.data[JList[Int]]("indexes").asScala, n.data[String](noteField), n)
- )
- }
-
- /**
- *
- * @param note
- * @param idxsField
- * @param noteField
- * @param ns
- */
- private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit =
- ns.flatMap(_.getNotes(note)).foreach(
- n ⇒ checkRelation(ns, n.data[JList[Int]](idxsField).asScala, n.data[String](noteField), n)
- )
-
- /**
- *
- * @param note
- * @param idxsField
- * @param noteField
- * @param ns
- */
- private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = {
- ns.flatMap(_.getNotes(note)).foreach(rel ⇒
- rel.dataOpt[JList[JList[Int]]](idxsField) match {
- case Some(idxsList) ⇒
- val notesTypes = rel.data[JList[String]](noteField)
-
- require(idxsList.size() == notesTypes.size())
-
- idxsList.asScala.zip(notesTypes.asScala).foreach {
- case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel)
- }
- case None ⇒ // No-op.
- }
- )
- }
-
- /**
- * Copies token.
- *
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @param toksCopy Copied tokens.
- * @param i Index.
- */
- private def simpleCopy(
- ns: NCNlpSentence,
- history: mutable.ArrayBuffer[(Int, Int)],
- toksCopy: NCNlpSentence, i: Int
- ): Seq[NCNlpSentenceToken] = {
- val tokCopy = toksCopy(i)
-
- history += tokCopy.index → ns.size
-
- ns += tokCopy.clone(ns.size)
- }
-
- /**
- * Glues stop words.
- *
- * @param ns Sentence.
- * @param userNoteTypes Notes types.
- * @param history Indexes transformation history.
- */
- private def unionStops(
- ns: NCNlpSentence,
- userNoteTypes: Seq[String],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): Unit = {
- // Java collection used because using scala collections (mutable.Buffer.empty[mutable.Buffer[Token]]) is reason
- // Of compilation errors which seems as scala compiler internal error.
- val bufs = new util.ArrayList[mutable.Buffer[NCNlpSentenceToken]]()
-
- def last[T](l: JList[T]): T = l.get(l.size() - 1)
-
- ns.filter(t ⇒ t.isStopWord && !t.isBracketed).foreach(t ⇒
- if (!bufs.isEmpty && last(bufs).last.index + 1 == t.index)
- last(bufs) += t
- else
- bufs.add(mutable.Buffer.empty[NCNlpSentenceToken] :+ t)
- )
-
- val idxsSeq = bufs.asScala.filter(_.lengthCompare(1) > 0).map(_.map(_.index))
-
- if (idxsSeq.nonEmpty) {
- val nsCopyToks = ns.clone()
- ns.clear()
-
- val buf = mutable.Buffer.empty[Int]
-
- for (i ← nsCopyToks.indices)
- idxsSeq.find(_.contains(i)) match {
- case Some(idxs) ⇒
- if (!buf.contains(idxs.head)) {
- buf += idxs.head
-
- ns += mkCompound(ns, nsCopyToks, idxs, stop = true, ns.size, None, history)
- }
- case None ⇒ simpleCopy(ns, history, nsCopyToks, i)
- }
-
- fixIndexes(ns, userNoteTypes)
- }
- }
-
- /**
- * Fixes indexes for all notes after recreating tokens.
- *
- * @param ns Sentence.
- * @param userNoteTypes Notes types.
- */
- private def fixIndexes(ns: NCNlpSentence, userNoteTypes: Seq[String]) {
- // Replaces other notes indexes.
- for (t ← userNoteTypes :+ "nlpcraft:nlp"; note ← ns.getNotes(t)) {
- val toks = ns.filter(_.contains(note)).sortBy(_.index)
-
- val newNote = note.clone(toks.map(_.index), toks.flatMap(_.wordIndexes).sorted)
-
- toks.foreach(t ⇒ {
- t.remove(note)
- t.add(newNote)
- })
- }
-
- // Special case - field index of core NLP note.
- ns.zipWithIndex.foreach { case (tok, idx) ⇒ ns.fixNote(tok.getNlpNote, "index" → idx) }
- }
-
- /**
- * Zip notes with same type.
- *
- * @param ns Sentence.
- * @param nType Notes type.
- * @param userNotesTypes Notes types.
- * @param history Indexes transformation history.
- */
- private def zipNotes(
- ns: NCNlpSentence,
- nType: String,
- userNotesTypes: Seq[String],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): Unit = {
- val nts = ns.getNotes(nType).filter(n ⇒ n.tokenFrom != n.tokenTo).sortBy(_.tokenFrom)
-
- val overlapped =
- nts.flatMap(n ⇒ n.tokenFrom to n.tokenTo).map(ns(_)).exists(
- t ⇒ userNotesTypes.map(pt ⇒ t.getNotes(pt).size).sum > 1
- )
-
- if (nts.nonEmpty && !overlapped) {
- val nsCopyToks = ns.clone()
- ns.clear()
-
- val buf = mutable.ArrayBuffer.empty[Int]
-
- for (i ← nsCopyToks.indices)
- nts.find(_.tokenIndexes.contains(i)) match {
- case Some(n) ⇒
- if (!buf.contains(n.tokenFrom)) {
- buf += n.tokenFrom
-
- ns += mkCompound(ns, nsCopyToks, n.tokenIndexes, stop = false, ns.size, Some(n), history)
- }
- case None ⇒ simpleCopy(ns, history, nsCopyToks, i)
- }
-
- fixIndexes(ns, userNotesTypes)
- }
- }
-
- /**
- * Makes compound note.
- *
- * @param ns Sentence.
- * @param nsCopyToks Tokens.
- * @param indexes Indexes.
- * @param stop Flag.
- * @param idx Index.
- * @param commonNote Common note.
- * @param history Indexes transformation history.
- */
- private def mkCompound(
- ns: NCNlpSentence,
- nsCopyToks: Seq[NCNlpSentenceToken],
- indexes: Seq[Int],
- stop: Boolean,
- idx: Int,
- commonNote: Option[NCNlpSentenceNote],
- history: mutable.ArrayBuffer[(Int, Int)]
- ): NCNlpSentenceToken = {
- val t = NCNlpSentenceToken(idx)
-
- // Note, it adds stop-words too.
- val content = nsCopyToks.zipWithIndex.filter(p ⇒ indexes.contains(p._2)).map(_._1)
-
- content.foreach(t ⇒ history += t.index → idx)
-
- def mkValue(get: NCNlpSentenceToken ⇒ String): String = {
- val buf = mutable.Buffer.empty[String]
-
- val n = content.size - 1
-
- content.zipWithIndex.foreach(p ⇒ {
- val t = p._1
- val idx = p._2
-
- buf += get(t)
-
- if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex)
- buf += " "
- })
-
- buf.mkString
- }
-
- val origText = mkValue((t: NCNlpSentenceToken) ⇒ t.origText)
-
- val idxs = Seq(idx)
- val wordIdxs = content.flatMap(_.wordIndexes).sorted
-
- val direct =
- commonNote match {
- case Some(n) if n.isUser ⇒ n.isDirect
- case _ ⇒ content.forall(_.isDirect)
- }
-
- val params = Seq(
- "index" → idx,
- "pos" → NCPennTreebank.SYNTH_POS,
- "posDesc" → NCPennTreebank.SYNTH_POS_DESC,
- "lemma" → mkValue((t: NCNlpSentenceToken) ⇒ t.lemma),
- "origText" → origText,
- "normText" → mkValue((t: NCNlpSentenceToken) ⇒ t.normText),
- "stem" → mkValue((t: NCNlpSentenceToken) ⇒ t.stem),
- "start" → content.head.startCharIndex,
- "end" → content.last.endCharIndex,
- "charLength" → origText.length,
- "quoted" → false,
- "stopWord" → stop,
- "bracketed" → false,
- "direct" → direct,
- "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict") else false),
- "english" → nsCopyToks.forall(_.getNlpNote.data[Boolean]("english")),
- "swear" → nsCopyToks.exists(_.getNlpNote.data[Boolean]("swear"))
- )
-
- val nlpNote = NCNlpSentenceNote(idxs, wordIdxs, "nlpcraft:nlp", params: _*)
-
- t.add(nlpNote)
-
- // Adds processed note with fixed indexes.
- commonNote match {
- case Some(n) ⇒
- ns.removeNote(n)
- t.add(n.clone(idxs, wordIdxs))
- case None ⇒ // No-op.
- }
-
- t
- }
-
- /**
- * Fixes notes with references list to other notes indexes.
- *
- * @param noteType Note type.
- * @param idxsField Indexes field.
- * @param noteField Note field.
- * @param ns Sentence.
- * @param history Indexes transformation history.
- * @return Valid flag.
- */
- private def fixIndexesReferencesList(
- noteType: String,
- idxsField: String,
- noteField: String,
- ns: NCNlpSentence,
- history: Seq[(Int, Int)]
- ): Boolean = {
- var ok = true
-
- for (tok ← ns.filter(_.isTypeOf(noteType)) if ok)
- tok.getNoteOpt(noteType, idxsField) match {
- case Some(n) ⇒
- val idxs: Seq[Seq[Int]] =
- n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala)
- var fixed = idxs
-
- history.foreach {
- case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct)
- }
-
- if (fixed.forall(_.size == 1))
- // Fix double dimension array to one dimension,
- // so it should be called always in spite of 'fixIndexesReferences' method.
- ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable])
- else
- ok = false
- case None ⇒ // No-op.
- }
-
- ok &&
- ns.flatMap(_.getNotes(noteType)).forall(rel ⇒
- rel.dataOpt[JList[Int]](idxsField) match {
- case Some(idxsList) ⇒
- val notesTypes = rel.data[JList[String]](noteField)
-
- require(idxsList.size() == notesTypes.size())
-
- idxsList.asScala.zip(notesTypes.asScala).forall {
- case (idxs, notesType) ⇒ checkRelation(ns, Seq(idxs), notesType, rel)
- }
- case None ⇒ true
- }
- )
- }
-
- /**
- * Fixes tokens positions.
- *
- * @param ns Sentence.
- * @param notNlpTypes Token types.
- */
- private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = {
- ns.
- filter(!_.isNlp).
- filter(_.isStopWord).
- flatten.
- filter(_.isNlp).
- foreach(n ⇒ ns.fixNote(n, "stopWord" → false))
-
- val all = ns.tokens.flatten
- val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap
-
- for (
- t ← ns.tokens; stopReason ← t.stopsReasons
- if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes
- )
- ns.fixNote(t.getNlpNote, "stopWord" → true)
-
- val history = mutable.ArrayBuffer.empty[(Int, Int)]
-
- fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns)
- fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns)
- fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns)
- fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns)
-
- notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history))
- unionStops(ns, notNlpTypes, history)
-
- val res =
- fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
- fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
-
- if (res) {
- // Validation (all indexes calculated well)
- require(
- !res ||
- !ns.flatten.
- exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))),
- s"Invalid sentence:\n" +
- ns.map(t ⇒
- // Human readable invalid sentence for debugging.
- s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]"
- ).mkString("\n")
- )
- }
-
- res
- }
-
- private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit =
- if (!mdl.getAbstractTokens.isEmpty) {
- val notes = ns.flatten
-
- val keys = getPartKeys(notes: _*)
- val noteLinks = getLinks(notes)
-
- notes.filter(n ⇒ {
- val noteToks = ns.tokens.filter(_.contains(n))
-
- mdl.getAbstractTokens.contains(n.noteType) &&
- !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) &&
- !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))
- }).foreach(ns.removeNote)
- }
-
- private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] =
- toks.flatten.filter(!_.isNlp).distinct
-
- private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
- sen.deletedNotes ++= dels.map(n ⇒ {
- val savedDelNote = n.clone()
- val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone())
-
- val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote)
-
- // Deleted note's tokens should contains only nlp data and deleted notes.
- for (savedDelTok ← savedDelToks; mainNote ← mainNotes)
- savedDelTok.remove(mainNote)
-
- savedDelNote → savedDelToks
- })
-
- /**
- * This collapser handles several tasks:
- * - "overall" collapsing after all other individual collapsers had their turn.
- * - Special further enrichment of tokens like linking, etc.
- *
- * In all cases of overlap (full or partial) - the "longest" note wins. In case of overlap and equal
- * lengths - the winning note is chosen based on this priority.
- */
- @throws[NCE]
- private def collapseSentence(thisSen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
- def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = {
- if (lastPhase)
- dropAbstract(mdl, ns)
-
- if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None
- }
-
- // Always deletes `similar` notes.
- // Some words with same note type can be detected various ways.
- // We keep only one variant - with `best` direct and sparsity parameters,
- // other variants for these words are redundant.
- val redundant: Seq[NCNlpSentenceNote] =
- thisSen.flatten.filter(!_.isNlp).distinct.
- groupBy(_.getKey()).
- map(p ⇒ p._2.sortBy(p ⇒
- (
- // System notes don't have such flags.
- if (p.isUser) {
- if (p.isDirect)
- 0
- else
- 1
- }
- else
- 0,
- if (p.isUser)
- p.sparsity
- else
- 0
- )
- )).
- flatMap(_.drop(1)).
- toSeq
-
- redundant.foreach(thisSen.removeNote)
-
- var delCombs: Seq[NCNlpSentenceNote] =
- getNotNlpNotes(thisSen).
- flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)).
- distinct
-
- println("delCombs="+delCombs.mkString("\n"))
-
- // Optimization. Deletes all wholly swallowed notes.
- val links = getLinks(thisSen.flatten)
-
- val swallowed =
- delCombs.
- // There aren't links on it.
- filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))).
- // It doesn't have links.
- filter(getPartKeys(_).isEmpty).
- flatMap(note ⇒ {
- val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, thisSen)
-
- val delCombOthers =
- delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None)
-
- if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None
- })
-
- delCombs = delCombs.filter(p ⇒ !swallowed.contains(p))
- addDeleted(thisSen, thisSen, swallowed)
- swallowed.foreach(thisSen.removeNote)
-
- val toksByIdx: Seq[Seq[NCNlpSentenceNote]] =
- delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
- groupBy { case (idx, _) ⇒ idx }.
- map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }.
- toSeq.sortBy(-_.size)
-
-// val toksByIdx1 =
-// delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
-// groupBy { case (idx, _) ⇒ idx }.
-// map { case (idx, seq) ⇒ idx → seq.map { case (_, note) ⇒ note } }.
-// toSeq.sortBy(_._2.size)
-//
-// toksByIdx.foreach{ case (seq) ⇒
-// println(s"toksByIdx seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}")
-// }
-
-// toksByIdx1.sortBy(_._1).foreach{ case (i, seq) ⇒
-// println(s"toksByIdx1 ${i} seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}")
-// }
-
- val dict = mutable.HashMap.empty[String, NCNlpSentenceNote]
- val dictBack = mutable.HashMap.empty[NCNlpSentenceNote, String]
-
- var i = 'A'
-
- val converted: Seq[Seq[String]] =
- toksByIdx.map(seq ⇒ {
- seq.map(
- n ⇒ {
- val s =
- dictBack.get(n) match {
- case Some(s) ⇒ s
- case None ⇒ {
- val s = s"$i"
-
- i = (i.toInt + 1).toChar
-
- dict += s → n
- dictBack += n → s
-
- s
- }
- }
-
- s
- }
- )
- })
-
- //val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
-
- var sens =
- if (delCombs.nonEmpty) {
- val p = new ForkJoinPool()
-
- val tmp = NCComboRecursiveTask.findCombinations(
- toksByIdx.map(_.asJava).asJava,
- p
- )
-
- p.shutdown()
-
- val seq1 = tmp.asScala.map(_.asScala)
-
- val sens =
- seq1.
- flatMap(p ⇒ {
- val delComb: Seq[NCNlpSentenceNote] = p
-
- val nsClone = thisSen.clone()
-
- // Saves deleted notes for sentence and their tokens.
- addDeleted(thisSen, nsClone, delComb)
- delComb.foreach(nsClone.removeNote)
-
- // Has overlapped notes for some tokens.
- require(
- !nsClone.exists(_.count(!_.isNlp) > 1),
- s"Invalid notes: ${nsClone.filter(_.count(!_.isNlp) > 1).mkString("|")}"
- )
-
- collapse0(nsClone)
- })
-
- // It removes sentences which have only one difference - 'direct' flag of their user tokens.
- // `Direct` sentences have higher priority.
- case class Key(sysNotes: Seq[Map[String, JSerializable]], userNotes: Seq[Map[String, JSerializable]])
- case class Value(sentence: NCNlpSentence, directCount: Int)
-
- val m = mutable.HashMap.empty[Key, Value]
-
- sens.map(sen ⇒ {
- val notes = sen.flatten
-
- val sysNotes = notes.filter(_.isSystem)
- val nlpNotes = notes.filter(_.isNlp)
- val userNotes = notes.filter(_.isUser)
-
- def get(seq: Seq[NCNlpSentenceNote]): Seq[Map[String, JSerializable]] =
- seq.map(p ⇒
- // We have to delete some keys to have possibility to compare sentences.
- p.clone().filter(_._1 != "direct")
- )
-
- (Key(get(sysNotes), get(userNotes)), sen, nlpNotes.map(p ⇒ if (p.isDirect) 0 else 1).sum)
- }).
- foreach { case (key, sen, directCnt) ⇒
- m.get(key) match {
- case Some(v) ⇒
- // Best sentence is sentence with `direct` synonyms.
- if (v.directCount > directCnt)
- m += key → Value(sen, directCnt)
- case None ⇒ m += key → Value(sen, directCnt)
- }
- }
-
- m.values.map(_.sentence).toSeq
- }
- else
- collapse0(thisSen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty)
-
- sens = sens.distinct
-
- sens.foreach(sen ⇒
- sen.foreach(tok ⇒
- tok.size match {
- case 1 ⇒ require(tok.head.isNlp, s"Unexpected non-'nlpcraft:nlp' token: $tok")
- case 2 ⇒ require(tok.head.isNlp ^ tok.last.isNlp, s"Unexpected token notes: $tok")
- case _ ⇒ require(requirement = false, s"Unexpected token notes count: $tok")
- }
- )
- )
-
- // Drops similar sentences (with same tokens structure).
- // Among similar sentences we prefer one with minimal free words count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))).
- map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
- toSeq
- }
}
import org.apache.nlpcraft.common.nlp.NCNlpSentence._
@@ -975,7 +214,10 @@ class NCNlpSentence(
*/
def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
- def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
- collapseSentence(this, mdl, lastPhase)
- }
+ /***
+ *
+ * @param deletedNotes
+ */
+ def addDeletedNotes(deletedNotes: Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]]): Unit =
+ this.deletedNotes ++= deletedNotes
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java
deleted file mode 100644
index db5f657..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java
+++ /dev/null
@@ -1,241 +0,0 @@
-package org.apache.nlpcraft.common.nlp;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.ForkJoinPool;
-import java.util.concurrent.RecursiveTask;
-import java.util.stream.IntStream;
-
-import static java.util.stream.Collectors.toList;
-import static java.util.Arrays.asList;
-import static java.util.stream.Collectors.toUnmodifiableList;
-
-public class SyTest {
- public static class ComboSearch extends RecursiveTask<List<Long>> {
- private static final long THRESHOLD = (long)Math.pow(2, 20);
-
- private final long lo;
-
- private final long hi;
-
- private final long[] wordBits;
-
- private final int[] wordCounts;
-
- public ComboSearch(
- long lo,
- long hi,
- long[] words,
- int[] wordCounts
- ) {
- this.lo = lo;
- this.hi = hi;
- this.wordBits = words;
- this.wordCounts = wordCounts;
- }
-
- public static <T> List<List<T>> findCombos(List<List<T>> inp, ForkJoinPool pool) {
- List<List<T>> uniqueInp = inp.stream()
- .filter(row -> inp.stream().noneMatch(it -> it != row && it.containsAll(row)))
- .map(i -> i.stream().distinct().sorted().collect(toList()))
- .collect(toList());
-
- // Build dictionary of unique words.
- List<T> dict = uniqueInp.stream()
- .flatMap(Collection::stream)
- .distinct()
- .sorted()
- .collect(toList());
-
- System.out.println("uniqueInp="+uniqueInp.size());
- System.out.println("dict="+dict.size());
-
- if (dict.size() > Long.SIZE) {
- // Note: Power set of 64 words results in 9223372036854775807 combinations.
- throw new IllegalArgumentException("Can handle more than " + Long.SIZE + " unique words in the dictionary.");
- }
-
- // Convert words to bitmasks (each bit corresponds to an index in the dictionary).
- long[] wordBits = uniqueInp.stream()
- .sorted(Comparator.comparingInt(List::size))
- .mapToLong(row -> wordsToBits(row, dict))
- .toArray();
-
- // Cache words count per row.
- int[] wordCounts = uniqueInp.stream()
- .sorted(Comparator.comparingInt(List::size))
- .mapToInt(List::size)
- .toArray();
-
- // Prepare Fork/Join task to iterate over the power set of all combinations.
- int lo = 1;
- long hi = (long)Math.pow(2, dict.size());
-
- ComboSearch task = new ComboSearch(
- lo,
- hi,
- wordBits,
- wordCounts
- );
-
- return pool.invoke(task).stream()
- .map(bits -> bitsToWords(bits, dict))
- .collect(toList());
- }
-
- @Override
- protected List<Long> compute() {
- if (hi - lo <= THRESHOLD) {
- return computeLocal();
- } else {
- return forkJoin();
- }
- }
-
- private List<Long> computeLocal() {
- List<Long> result = new ArrayList<>();
-
- for (long comboBits = lo; comboBits < hi; comboBits++) {
- boolean match = true;
-
- // For each input row we check if subtracting the current combination of words
- // from the input row would give us the expected result.
- for (int j = 0; j < wordBits.length; j++) {
- // Get bitmask of how many words can be subtracted from the row.
- long commonBits = wordBits[j] & comboBits;
-
- int wordsToRemove = Long.bitCount(commonBits);
-
- // Check if there is more than 1 word remaining after subtraction.
- if (wordCounts[j] - wordsToRemove > 1) {
- // Skip this combination.
- match = false;
-
- break;
- }
- }
-
- if (match && !includes(comboBits, result)) {
- result.add(comboBits);
- }
- }
-
- return result;
- }
-
- private List<Long> forkJoin() {
- long mid = lo + hi >>> 1L;
-
- ComboSearch t1 = new ComboSearch(lo, mid, wordBits, wordCounts);
- ComboSearch t2 = new ComboSearch(mid, hi, wordBits, wordCounts);
-
- t2.fork();
-
- return merge(t1.compute(), t2.join());
- }
-
- private List<Long> merge(List<Long> l1, List<Long> l2) {
- if (l1.isEmpty()) {
- return l2;
- } else if (l2.isEmpty()) {
- return l1;
- }
-
- int size1 = l1.size();
- int size2 = l2.size();
-
- if (size1 == 1 && size2 > 1 || size2 == 1 && size1 > 1) {
- // Minor optimization in case if one of the lists has only one element.
- List<Long> list = size1 == 1 ? l2 : l1;
- Long val = size1 == 1 ? l1.get(0) : l2.get(0);
-
- if (!includes(val, list)) {
- list.add(val);
- }
-
- return list;
- } else {
- List<Long> result = new ArrayList<>(size1 + size2);
-
- for (int i = 0, max = Math.max(size1, size2); i < max; i++) {
- Long v1 = i < size1 ? l1.get(i) : null;
- Long v2 = i < size2 ? l2.get(i) : null;
-
- if (v1 != null && v2 != null) {
- if (containsAllBits(v1, v2)) {
- v1 = null;
- } else if (containsAllBits(v2, v1)) {
- v2 = null;
- }
- }
-
- if (v1 != null && !includes(v1, result)) {
- result.add(v1);
- }
-
- if (v2 != null && !includes(v2, result)) {
- result.add(v2);
- }
- }
-
- return result;
- }
- }
-
- private static boolean includes(long bits, List<Long> allBits) {
- for (long existing : allBits) {
- if (containsAllBits(bits, existing)) {
- return true;
- }
- }
-
- return false;
- }
-
- private static boolean containsAllBits(long bitSet1, long bitSet2) {
- return (bitSet1 & bitSet2) == bitSet2;
- }
-
- private static <T> long wordsToBits(List<T> words, List<T> dict) {
- long bits = 0;
-
- for (int i = 0; i < dict.size(); i++) {
- if (words.contains(dict.get(i))) {
- bits |= 1L << i;
- }
- }
-
- return bits;
- }
-
- private static <T> List<T> bitsToWords(long bits, List<T> dict) {
- List<T> words = new ArrayList<>(Long.bitCount(bits));
-
- for (int i = 0; i < dict.size(); i++) {
- if ((bits & 1L << i) != 0) {
- words.add(dict.get(i));
- }
- }
-
- return words;
- }
- }
-
- public static void main(String[] args) throws InterruptedException {
- List<List<String>> words = IntStream.range(0, 35)
- .mapToObj(i -> IntStream.range(0, i + 1).mapToObj(String::valueOf).collect(toUnmodifiableList()))
- .collect(toUnmodifiableList());
-
- long t = System.currentTimeMillis();
-
- ForkJoinPool forkJoinPool = new ForkJoinPool();
- final List<List<String>> combos = ComboSearch.findCombos(words, forkJoinPool);
-
- System.out.println("size=" + combos.size());
- System.out.println("time=" + (System.currentTimeMillis() - t));
- }
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
index 19308e3..da13b07 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
@@ -49,6 +49,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.validate.NCValidateManager
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import resource.managed
import java.io._
@@ -512,6 +513,7 @@ private [probe] object NCProbeBoot extends LazyLogging with NCOpenCensusTrace {
startedMgrs += NCProbeEnrichmentManager.start(span)
startedMgrs += NCConnectionManager.start(span)
startedMgrs += NCDialogFlowManager.start(span)
+ startedMgrs += NCSentenceManager.start(span)
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 368f0c4..c328e57 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -43,6 +43,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeMessage, NCProbeVariants}
import java.io.Serializable
@@ -500,7 +501,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
s"]")
}
- nlpSen.clone().collapse(mdl.model, lastPhase = true).
+ NCSentenceManager.collapse(mdl.model, nlpSen.clone(), lastPhase = true).
// Sorted to support deterministic logs.
sortBy(p ⇒
p.map(p ⇒ {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 2a9dec0..82edf69 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -24,6 +24,7 @@ import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
+import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants}
import java.io.Serializable
@@ -355,7 +356,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
var permCnt = 0
- lazy val collapsedSens = NCProbeVariants.convert(ns.srvReqId, mdl, ns.clone().collapse(mdl.model)).map(_.asScala)
+ lazy val collapsedSens = NCProbeVariants.convert(
+ ns.srvReqId,
+ mdl,
+ NCSentenceManager.collapse(mdl.model, ns.clone())
+ ).map(_.asScala)
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
similarity index 68%
copy from nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
copy to nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 95a98a3..e0dd59c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -15,29 +15,30 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.common.nlp
+package org.apache.nlpcraft.probe.mgrs.sentence
-import com.typesafe.scalalogging.LazyLogging
-import org.apache.nlpcraft.common.NCE
+import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
-import org.apache.nlpcraft.common.util.NCComboRecursiveTask
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.common.{NCE, NCService, U}
import org.apache.nlpcraft.model.NCModel
-import java.io.{Serializable => JSerializable}
+import java.io.{Serializable ⇒ JSerializable}
import java.util
-import java.util.concurrent.ForkJoinPool
-import java.util.{Collections, Comparator, List => JList}
-import scala.collection.JavaConverters._
+import java.util.{List ⇒ JList}
+import scala.collection.JavaConverters.{asScalaBufferConverter, _}
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, Set, mutable}
import scala.language.implicitConversions
-object NCNlpSentence extends LazyLogging {
- implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
+/**
+ * Sentences processing manager.
+ */
+object NCSentenceManager extends NCService {
+ @volatile private var pool: java.util.concurrent.ForkJoinPool = _
- case class NoteKey(start: Int, end: Int)
- case class TokenKey(id: String, start: Int, end: Int)
- case class NoteLink(note: String, indexes: Seq[Int])
+ implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens
case class PartKey(id: String, start: Int, end: Int) {
require(start <= end)
@@ -73,7 +74,7 @@ object NCNlpSentence extends LazyLogging {
noteLinks ++=
(for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala)))
yield NoteLink(name, idxs.sorted)
- )
+ )
}
if (n.contains("subjnotes")) add("subjnotes", "subjindexes")
@@ -462,8 +463,8 @@ object NCNlpSentence extends LazyLogging {
}
if (fixed.forall(_.size == 1))
- // Fix double dimension array to one dimension,
- // so it should be called always in spite of 'fixIndexesReferences' method.
+ // Fix double dimension array to one dimension,
+ // so it should be called always in spite of 'fixIndexesReferences' method.
ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable])
else
ok = false
@@ -521,9 +522,9 @@ object NCNlpSentence extends LazyLogging {
val res =
fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) &&
- fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
- fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
+ fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) &&
+ fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history)
if (res) {
// Validation (all indexes calculated well)
@@ -562,7 +563,7 @@ object NCNlpSentence extends LazyLogging {
toks.flatten.filter(!_.isNlp).distinct
private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit =
- sen.deletedNotes ++= dels.map(n ⇒ {
+ sen.addDeletedNotes(dels.map(n ⇒ {
val savedDelNote = n.clone()
val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone())
@@ -573,7 +574,7 @@ object NCNlpSentence extends LazyLogging {
savedDelTok.remove(mainNote)
savedDelNote → savedDelToks
- })
+ }).toMap)
/**
* This collapser handles several tasks:
@@ -597,27 +598,27 @@ object NCNlpSentence extends LazyLogging {
// We keep only one variant - with `best` direct and sparsity parameters,
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
- thisSen.flatten.filter(!_.isNlp).distinct.
- groupBy(_.getKey()).
- map(p ⇒ p._2.sortBy(p ⇒
- (
- // System notes don't have such flags.
- if (p.isUser) {
- if (p.isDirect)
- 0
- else
- 1
- }
- else
- 0,
- if (p.isUser)
- p.sparsity
- else
+ thisSen.flatten.filter(!_.isNlp).distinct.
+ groupBy(_.getKey()).
+ map(p ⇒ p._2.sortBy(p ⇒
+ (
+ // System notes don't have such flags.
+ if (p.isUser) {
+ if (p.isDirect)
0
- )
- )).
- flatMap(_.drop(1)).
- toSeq
+ else
+ 1
+ }
+ else
+ 0,
+ if (p.isUser)
+ p.sparsity
+ else
+ 0
+ )
+ )).
+ flatMap(_.drop(1)).
+ toSeq
redundant.foreach(thisSen.removeNote)
@@ -626,8 +627,6 @@ object NCNlpSentence extends LazyLogging {
flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)).
distinct
- println("delCombs="+delCombs.mkString("\n"))
-
// Optimization. Deletes all wholly swallowed notes.
val links = getLinks(thisSen.flatten)
@@ -651,89 +650,55 @@ object NCNlpSentence extends LazyLogging {
addDeleted(thisSen, thisSen, swallowed)
swallowed.foreach(thisSen.removeNote)
- val toksByIdx: Seq[Seq[NCNlpSentenceNote]] =
+ val toksByIdx: Seq[Set[NCNlpSentenceNote]] =
delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
groupBy { case (idx, _) ⇒ idx }.
- map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }.
+ map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }.
toSeq.sortBy(-_.size)
-// val toksByIdx1 =
-// delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)).
-// groupBy { case (idx, _) ⇒ idx }.
-// map { case (idx, seq) ⇒ idx → seq.map { case (_, note) ⇒ note } }.
-// toSeq.sortBy(_._2.size)
-//
-// toksByIdx.foreach{ case (seq) ⇒
-// println(s"toksByIdx seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}")
-// }
-
-// toksByIdx1.sortBy(_._1).foreach{ case (i, seq) ⇒
-// println(s"toksByIdx1 ${i} seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}")
-// }
-
- val dict = mutable.HashMap.empty[String, NCNlpSentenceNote]
- val dictBack = mutable.HashMap.empty[NCNlpSentenceNote, String]
-
- var i = 'A'
-
- val converted: Seq[Seq[String]] =
- toksByIdx.map(seq ⇒ {
- seq.map(
- n ⇒ {
- val s =
- dictBack.get(n) match {
- case Some(s) ⇒ s
- case None ⇒ {
- val s = s"$i"
-
- i = (i.toInt + 1).toChar
-
- dict += s → n
- dictBack += n → s
-
- s
- }
- }
-
- s
- }
- )
- })
-
- //val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
+// val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1
var sens =
if (delCombs.nonEmpty) {
- val p = new ForkJoinPool()
-
- val tmp = NCComboRecursiveTask.findCombinations(
- toksByIdx.map(_.asJava).asJava,
- p
- )
-
- p.shutdown()
-
- val seq1 = tmp.asScala.map(_.asScala)
+ val deleted = mutable.ArrayBuffer.empty[Set[NCNlpSentenceNote]]
+
+// val combs = (minDelSize to delCombs.size).
+// flatMap(i ⇒
+// delCombs.combinations(i).
+// filter(delComb ⇒
+// !toksByIdx.exists(
+// rec ⇒
+// rec.size - delCombs.size <= 1 &&
+// rec.count(note ⇒ !delComb.contains(note)) > 1
+// )
+// )
+// ).
+// sortBy(_.size).
+// map(_.toSet)
+//
+ val combs = NCComboHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala)
val sens =
- seq1.
- flatMap(p ⇒ {
- val delComb: Seq[NCNlpSentenceNote] = p
+ combs.
+ flatMap(delComb ⇒
+ // Already processed with less subset of same deleted tokens.
+ if (!deleted.exists(_.subsetOf(delComb.toSet))) {
+ val nsClone = thisSen.clone()
- val nsClone = thisSen.clone()
+ // Saves deleted notes for sentence and their tokens.
+ addDeleted(thisSen, nsClone, delComb)
+ delComb.foreach(nsClone.removeNote)
- // Saves deleted notes for sentence and their tokens.
- addDeleted(thisSen, nsClone, delComb)
- delComb.foreach(nsClone.removeNote)
+ // Has overlapped notes for some tokens.
+ require(!nsClone.exists(_.count(!_.isNlp) > 1))
- // Has overlapped notes for some tokens.
- require(
- !nsClone.exists(_.count(!_.isNlp) > 1),
- s"Invalid notes: ${nsClone.filter(_.count(!_.isNlp) > 1).mkString("|")}"
- )
+ deleted += delComb.toSet
- collapse0(nsClone)
- })
+ collapse0(nsClone)
+ }
+ else
+ None
+ )
// It removes sentences which have only one difference - 'direct' flag of their user tokens.
// `Direct` sentences have higher priority.
@@ -790,192 +755,23 @@ object NCNlpSentence extends LazyLogging {
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
-}
-
-import org.apache.nlpcraft.common.nlp.NCNlpSentence._
-/**
- * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and
- * each note is a collection of KV pairs.
- *
- * @param srvReqId Server request ID.
- * @param text Normalized text.
- * @param enabledBuiltInToks Enabled built-in tokens.
- * @param tokens Initial buffer.
- * @param deletedNotes Deleted overridden notes with their tokens.
- */
-class NCNlpSentence(
- val srvReqId: String,
- val text: String,
- val enabledBuiltInToks: Set[String],
- override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
- private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
- private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
- private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
-) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
- @transient
- private var hash: java.lang.Integer = _
-
- private def calcHash(): Int =
- Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b)
-
- // Deep copy.
- override def clone(): NCNlpSentence =
- new NCNlpSentence(
- srvReqId,
- text,
- enabledBuiltInToks,
- tokens.map(_.clone()),
- deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
- initNlpNotes = initNlpNotes
- )
+ override def start(parent: Span): NCService = {
+ ackStarting()
- /**
- * Utility method that gets set of notes for given note type collected from
- * tokens in this sentence. Notes are sorted in the same order they appear
- * in this sentence.
- *
- * @param noteType Note type.
- */
- def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).distinct
+ pool = new java.util.concurrent.ForkJoinPool()
- /**
- * Utility method that removes note with given ID from all tokens in this sentence.
- * No-op if such note wasn't found.
- *
- * @param note Note.
- */
- def removeNote(note: NCNlpSentenceNote): Unit = this.foreach(_.remove(note))
-
- //noinspection HashCodeUsesVar
- override def hashCode(): Int = {
- if (hash == null)
- hash = calcHash()
-
- hash
+ ackStarted()
}
- def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = {
- val fixed = note.clone(kvs: _*)
+ override def stop(parent: Span): Unit = {
+ ackStopping()
- this.filter(t ⇒ t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t ⇒ {
- t.remove(note)
- t.add(fixed)
- })
+ U.shutdownPool(pool)
- hash = null
+ ackStopped()
}
- /**
- * Returns flag are note notes equal (or similar) or not. Reason of ignored difference can be stopwords tokens.
- *
- * @param n1 First note.
- * @param n2 Second note.
- */
- def notesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- if (n1.noteType != n2.noteType)
- false
- else {
- val stopIdxs = this.filter(_.isStopWord).map(_.index)
-
- // One possible difference - stopwords indexes.
- def wordsEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
- val set1 = n1.wordIndexes.toSet
- val set2 = n2.wordIndexes.toSet
-
- set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains)
- }
-
- def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- wordsEqualOrSimilar0(n1, n2) || wordsEqualOrSimilar0(n2, n1)
-
- def tokensEqualOrSimilar0(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
- set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(_.isStopWord)
-
- def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean =
- tokensEqualOrSimilar0(set1, set2) || tokensEqualOrSimilar0(set2, set1)
-
- def getList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
- n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[Int]].asScala.
- map(this (_)).toSet
-
- def getListList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] =
- n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[JList[Int]]].asScala.
- flatMap(_.asScala.map(this (_))).toSet
-
- def referencesEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
- require(n1.noteType == n2.noteType)
-
- n1.noteType match {
- case "nlpcraft:sort" ⇒
- tokensEqualOrSimilar(getListList(n1, "subjindexes"), getListList(n2, "subjindexes")) &&
- tokensEqualOrSimilar(getListList(n1, "byindexes"), getListList(n2, "byindexes"))
- case "nlpcraft:limit" ⇒
- tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes"))
- case "nlpcraft:reference" ⇒
- tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes"))
-
- case _ ⇒ true
- }
- }
-
- def referencesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean =
- referencesEqualOrSimilar0(n1, n2) || referencesEqualOrSimilar0(n2, n1)
-
- def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = n.getKey(withIndexes = false, withReferences = false)
-
- getUniqueKey0(n1) == getUniqueKey0(n2) && wordsEqualOrSimilar(n1, n2) && referencesEqualOrSimilar(n1, n2)
- }
-
- override def equals(obj: Any): Boolean = obj match {
- case x: NCNlpSentence ⇒
- tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
-
- case _ ⇒ false
- }
-
- /**
- *
- */
- def saveNlpNotes(): Unit =
- initNlpNotes = this.map(t ⇒ NoteKey(t.startCharIndex, t.endCharIndex) → t.getNlpNote).toMap
-
- /**
- *
- * @return
- */
- def findInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] =
- initNlpNotes.get(NoteKey(startCharIndex, endCharIndex))
-
- /**
- *
- * @param nlp
- */
- def addNlpToken(nlp: NCNlpSentenceToken): Unit = {
- require(nlp.size <= 2)
-
- nlp.foreach(n ⇒ nlpTokens += TokenKey(n.noteType, nlp.startCharIndex, nlp.endCharIndex) → nlp)
- }
-
- /**
- *
- * @param noteType
- * @param startCharIndex
- * @param endCharIndex
- * @return
- */
- def findNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] =
- nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex))
-
- /**
- *
- */
- def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap
-
- def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
- collapseSentence(this, mdl, lastPhase)
- }
+ def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] =
+ collapseSentence(sen, mdl, lastPhase)
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java
deleted file mode 100644
index 69d9ab4..0000000
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java
+++ /dev/null
@@ -1,278 +0,0 @@
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.NavigableSet;
-import java.util.Optional;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.stream.Collectors;
-
-import static java.util.Arrays.asList;
-import static java.util.stream.Collectors.toList;
-import static java.util.stream.Collectors.toSet;
-
-public class SyTest {
- public static void main(String[] args) {
-// List<List<String>> words = asList(
-// asList("A", "B", "C"),
-// asList("B", "C", "D"),
-// asList("B", "D")
-// );
- List<List<String>> words = asList(
- asList("A", "B"),
- asList("C", "B"),
- asList("D", "E"),
- asList("D", "F"),
- asList("G", "H"),
- asList("I", "H"),
- asList("J", "K"),
- asList("L", "K"),
- asList("M", "N"),
- asList("M", "O"),
- asList("P", "Q"),
- asList("P", "R"),
- asList("S", "T"),
- asList("S", "U"),
- asList("V", "W"),
- asList("X", "W")
- ,
- asList("Y", "Z"),
- asList("A1", "A2"),
- asList("A3", "A3"),
- asList("A4", "A5", "A6")
- );
-
- System.out.println(
- "Dictionary size:"
- + words.stream()
- .flatMap(Collection::stream)
- .distinct()
- .count()
- );
-
- System.out.println("===== Performance =====");
-
- for (int i = 0; i < 1; i++) {
- long t = System.currentTimeMillis();
-
- Set<Set<String>> combos = findCombos(words);
-
-
-
- System.out.println("Iteration " + i + " Time: " + (System.currentTimeMillis() - t) + ", resCnt=" + combos.size());
- }
-
- if (true) {
- return;
- }
-
- Set<Set<String>> combos = findCombos(words);
-
- System.out.println();
- System.out.println("===== Result =====");
- System.out.println("Total combos: " + combos.size());
- System.out.println();
-// combos.stream()
-// .sorted(Comparator.comparing(Collection::size))
-// .forEach(combo ->
-// print(words, combo)
-// );
- }
-
- public static <T extends Comparable<T>> Set<Set<T>> findCombos(List<List<T>> inp) {
-
-
- List<List<T>> uniqueInp = inp.stream()
- .filter(row -> inp.stream().noneMatch(it -> it != row && it.containsAll(row)))
- .map(i -> i.stream().distinct().sorted().collect(toList()))
- .collect(toList());
-
- // Build dictionary of unique words.
- List<T> dict = uniqueInp.stream()
- .flatMap(Collection::stream)
- .distinct()
- .sorted()
- .collect(toList());
-
- if (dict.size() > Integer.SIZE) {
- // Note: Power set of 32 words results in 4294967296 combinations.
- throw new IllegalArgumentException("Can handle more than " + Integer.SIZE + " unique words in the dictionary.");
- }
-
- // Convert words to bitmasks (each bit corresponds to an index in the dictionary).
- int[] wordBits = uniqueInp.stream()
- .sorted(Comparator.comparingInt(List::size))
- .mapToInt(row -> wordsToBits(row, dict))
- .toArray();
-
- // Cache words count per row.
- int[] wordCounts = uniqueInp.stream()
- .sorted(Comparator.comparingInt(List::size))
- .mapToInt(List::size)
- .toArray();
-
- int min = 1;
- int max = (int)Math.pow(2, dict.size()) - 1;
-
- int batchFactor = 100;
- int threads = 13;
-
- ExecutorService pool = Executors.newFixedThreadPool(threads);
- CountDownLatch cdl = new CountDownLatch(batchFactor);
-
- int divRes = max / batchFactor;
- int divRest = max % batchFactor;
-
- int to = 0;
-
- List<Integer> result = new CopyOnWriteArrayList<>();
-
- for (int k = 0; k < batchFactor; k++) {
- to += divRes;
-
- if (k == divRes - 1) {
- to += divRest;
- }
-
- int toFinal = to;
- int fromFinal = min + k * divRes;
-
- pool.execute(
- () -> {
- List<Integer> locRes = new ArrayList<>();
-
- for (int comboBits = fromFinal; comboBits < toFinal; comboBits++) {
- boolean match = true;
-
- // For each input row we check if subtracting the current combination of words
- // from the input row would give us the expected result.
- for (int j = 0; j < wordBits.length; j++) {
- // Get bitmask of how many words can be subtracted from the row.
- int commonBits = wordBits[j] & comboBits;
-
- int wordsToRemove = Integer.bitCount(commonBits);
-
- // Check if there are more than 1 word remaining after subtraction.
- if (wordCounts[j] - wordsToRemove > 1) {
- // Skip this combination.
- match = false;
-
- break;
- }
- }
-
- if (match && !includes(comboBits, locRes)) {
- locRes.add(comboBits);
- }
- }
-
- result.addAll(locRes);
-
- cdl.countDown();
- }
- );
- }
-
-// Iterate over the power set.
-
- //pool.shutdown();
- try {
- cdl.await(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
- //pool.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
-
- // Convert found results from bitmasks back to words.
- TreeSet<Set<T>> treeSet = new TreeSet<>(Comparator.comparingInt(Set::size));
-
- treeSet.addAll(result.stream().map(bits -> bitsToWords(bits, dict)).collect(toSet()));
-
- Set<Set<T>> normCombs = new HashSet<>();
-
- for (Set<T> set : treeSet) {
- boolean b = true;
-
- for (Set<T> added : normCombs) {
- if (added.containsAll(set)) {
- b = false;
-
- break;
- }
- }
-
- if (b) {
- normCombs.add(set);
- }
- }
-
- return normCombs;
-
- }
-
- private static <T> Set<Set<T>> squeeze(Set<Set<T>> combs) {
- Set<Set<T>> normCombs = new HashSet<>();
-
- combs.stream().sorted(Comparator.comparingInt(Set::size)).forEach(comb -> {
- // Skips already added shorter variants.
- if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) {
- normCombs.add(comb);
- }
- });
-
- return normCombs;
- }
-
-
- private static boolean includes(int bits, List<Integer> allBits) {
- for (int existing : allBits) {
- if ((bits & existing) == existing) {
- return true;
- }
- }
-
- return false;
- }
-
- private static <T> int wordsToBits(List<T> words, List<T> dict) {
- int bits = 0;
-
- for (int i = 0; i < dict.size(); i++) {
- if (words.contains(dict.get(i))) {
- bits |= 1 << i;
- }
- }
-
- return bits;
- }
-
- private static <T> Set<T> bitsToWords(int bits, List<T> dict) {
- Set<T> words = new HashSet<>(Integer.bitCount(bits));
-
- for (int i = 0; i < dict.size(); i++) {
- if ((bits & 1 << i) != 0) {
- words.add(dict.get(i));
- }
- }
-
- return words;
- }
-
- private static void print(List<List<String>> inp, List<String> combo) {
- System.out.println("==== " + combo + "(" + combo.size() + ')');
- inp.stream().forEach(row -> {
- Set<String> s = new TreeSet<>(row);
- s.removeAll(combo);
- System.out.println(s);
- });
- }
-}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index afdeaab..b354533 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -48,6 +48,6 @@ class NCNestedTestModel4 extends NCModelAdapter(
*/
@NCTestEnvironment(model = classOf[NCNestedTestModel4], startClient = true)
class NCEnricherNestedModelSpec4 extends NCTestContext {
- @Test1
+ @Test
def test(): Unit = checkIntent("the a " * 8, "onE2")
}
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java
deleted file mode 100644
index 398e858..0000000
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model;
-
-import com.google.common.collect.ImmutableSet;
-
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
-
-public class Test1 {
- private static List<Set<String>> ROWS =
- Arrays.asList(
- ImmutableSet.of("A", "B", "C"),
- ImmutableSet.of("B", "C", "D"),
- ImmutableSet.of("B", "D")
- );
-
-// // Uncomment it. Works too long time. Normalized result is 256.
-// private static List<Set<String>> ROWS = Arrays.asList(
-// ImmutableSet.of("A", "B"),
-// ImmutableSet.of("C", "B"),
-// ImmutableSet.of("D", "E"),
-// ImmutableSet.of("D", "F"),
-// ImmutableSet.of("G", "H"),
-// ImmutableSet.of("I", "H"),
-// ImmutableSet.of("J", "K"),
-// ImmutableSet.of("L", "K"),
-// ImmutableSet.of("M", "N"),
-// ImmutableSet.of("M", "O"),
-// ImmutableSet.of("P", "Q"),
-// ImmutableSet.of("P", "R"),
-// ImmutableSet.of("S", "T"),
-// ImmutableSet.of("S", "U"),
-// ImmutableSet.of("V", "W"),
-// ImmutableSet.of("X", "W")
-// );
-
- private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet());
-
- // Goal: Find minimal set of combinations with following feature.
- // After removing combination values from each row - list should contain rows with size <= 1.
-
- // Expected solution: [C, B], [A, C, D], [A, B, D]
- // Example:
- // list - [C, B] = {{A}, {D}, {D}}
- // list - [A, C, D] = {{B}, {B}, {B}}
- // list - [A, B, D] = {{C}, {C}, {null}}
-
-
- // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc
-
- // Easiest.
- public static void main(String[] args) {
- long t = System.currentTimeMillis();
-
- System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']');
-
- // 1. Extends.
- List<Set<String>> extRows = extendNulls();
-
- // 2. All valid rows (permutation)
- // Or manually permute, like https://stackoverflow.com/questions/17192796/generate-all-combinations-from-multiple-lists
- Set<List<String>> allSingleOrNullRows = com.google.common.collect.Sets.cartesianProduct(extRows);
-
- System.out.println("2. permuted [size=" + allSingleOrNullRows.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
-
- // 3. Collects all suitable combinations.
- Set<Set<String>> combs =
- allSingleOrNullRows.
- stream().
- // Calculates how that single or empty lines can be constructed (it is required combination).
- map(row -> {
- Set<String> copy = new HashSet<>(ALL);
-
- copy.removeAll(row);
-
- return copy;
- }).
- distinct().
- filter(Test1::isSuitable).
- collect(Collectors.toSet());
-
- System.out.println("3. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
-
- // 3. Normalize variants (keeps only minimal valid subsets, see task description)
- Set<Set<String>> normCombs = squeeze(combs);
-
- System.out.println("4. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
- System.out.println("Norm results:" + normCombs);
- }
-
- /**
- * Removes `candidate` from each row of ROWS.
- * Return true if result list doesn't contain any row with size > 1.
- * <p>
- * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no.
- */
- private static boolean isSuitable(Set<String> candidate) {
- for (Set<String> row : ROWS) {
- Set<String> copy = new HashSet<>(row);
-
- copy.removeAll(candidate);
-
- if (copy.size() > 1) {
- return false;
- }
- }
-
- return true;
- }
-
- private static Set<Set<String>> squeeze(Set<Set<String>> combs) {
- Set<Set<String>> normCombs = new HashSet<>();
-
- for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) {
- // Skips already added shorter variants.
- if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) {
- normCombs.add(comb);
- }
- }
- return normCombs;
- }
-
- // Adds "" which means empty row. For example for small row it returns {{A, B, C, ""}, {B, C, D, ""}, {B, D, ""} }
- private static List<Set<String>> extendNulls() {
- return ROWS.stream().map(
- p -> Stream.concat(p.stream(), Stream.of("")).collect(Collectors.toSet())
- ).collect(Collectors.toList());
- }
-}
-
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java
deleted file mode 100644
index b8e5e42..0000000
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java
+++ /dev/null
@@ -1,114 +0,0 @@
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model;
-
-import com.google.common.collect.ImmutableSet;
-import com.google.common.collect.Sets;
-
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.stream.Collectors;
-
-public class Test2 {
-// private static List<Set<String>> ROWS =
-// Arrays.asList(
-// ImmutableSet.of("A", "B", "C"),
-// ImmutableSet.of("B", "C", "D"),
-// ImmutableSet.of("B", "D")
-// );
-
- // Uncomment it. Works too long time. Normalized result is 256.
- private static List<Set<String>> ROWS = Arrays.asList(
- ImmutableSet.of("A", "B"),
- ImmutableSet.of("C", "B"),
- ImmutableSet.of("D", "E"),
- ImmutableSet.of("D", "F"),
- ImmutableSet.of("G", "H"),
- ImmutableSet.of("I", "H"),
- ImmutableSet.of("J", "K"),
- ImmutableSet.of("L", "K"),
- ImmutableSet.of("M", "N"),
- ImmutableSet.of("M", "O"),
- ImmutableSet.of("P", "Q"),
- ImmutableSet.of("P", "R"),
- ImmutableSet.of("S", "T"),
- ImmutableSet.of("S", "U"),
- ImmutableSet.of("V", "W"),
- ImmutableSet.of("X", "W")
- );
-
- private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet());
-
- // Goal: Find minimal set of combinations with following feature.
- // After removing combination values from each row - list should contain rows with size <= 1.
-
- // Expected solution: [C, B], [A, C, D], [A, B, D]
- // Example:
- // list - [C, B] = {{A}, {D}, {D}}
- // list - [A, C, D] = {{B}, {B}, {B}}
- // list - [A, B, D] = {{C}, {C}, {null}}
-
-
- // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc
-
- // Easiest.
- public static void main(String[] args) {
- long t = System.currentTimeMillis();
-
- System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']');
-
- Set<Set<String>> combs = new HashSet<>();
-
- for (int i = 1; i < ALL.size(); i++) {
- combs.addAll(
- Sets.combinations(ALL, i).
- stream().
- filter(Test2::isSuitable).
- collect(Collectors.toSet())
- );
- }
-
- System.out.println("2. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
-
- // Normalize variants (keeps only minimal valid subsets, see task description)
- Set<Set<String>> normCombs = squeeze(combs);
-
- System.out.println("3. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']');
- System.out.println("Norm results:" + normCombs);
- }
-
- private static Set<Set<String>> squeeze(Set<Set<String>> combs) {
- Set<Set<String>> normCombs = new HashSet<>();
-
- for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) {
- // Skips already added shorter variants.
- if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) {
- normCombs.add(comb);
- }
- }
- return normCombs;
- }
-
- /**
- * Removes `candidate` from each row of ROWS.
- * Return true if result list doesn't contain any row with size > 1.
- * <p>
- * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no.
- */
- private static boolean isSuitable(Set<String> candidate) {
- for (Set<String> row : ROWS) {
- Set<String> copy = new HashSet<>(row);
-
- copy.removeAll(candidate);
-
- if (copy.size() > 1) {
- return false;
- }
- }
-
- return true;
- }
-}
-