You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/28 09:55:32 UTC
[incubator-nlpcraft] 01/01: Code cleanup.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-456
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 300042735860e626d90002b30b4d5b072892b73a
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Tue Sep 28 12:55:21 2021 +0300
Code cleanup.
---
.../nlpcraft/probe/mgrs/NCProbeIdlToken.scala | 41 +++++-
.../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 3 +
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 2 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 104 +++++++-------
.../probe/mgrs/sentence/NCSentenceManager.scala | 25 ++--
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 150 ++++++++++++---------
6 files changed, 196 insertions(+), 129 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
index 5da9808..d4fc27c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
@@ -26,11 +26,46 @@ import org.apache.nlpcraft.model.{NCToken, _}
* @param word
*/
case class NCProbeIdlToken(token: NCToken, word: NCNlpSentenceToken) {
- val (origText: String, wordIndexes: Set[Int], minIndex: Int, maxIndex: Int, isToken: Boolean, isWord: Boolean) =
+ require(token != null ^ word != null)
+
+ val (
+ origText: String,
+ normText: String,
+ stem: String,
+ wordIndexes: Set[Int],
+ minIndex: Int,
+ maxIndex: Int,
+ startCharIndex: Int,
+ endCharIndex: Int,
+ isToken: Boolean,
+ isWord: Boolean
+ ) =
if (token != null)
- (token.origText, token.wordIndexes.toSet, token.wordIndexes.head, token.wordIndexes.last, true, false)
+ (
+ token.origText,
+ token.normText,
+ token.stem,
+ token.wordIndexes.toSet,
+ token.wordIndexes.head,
+ token.wordIndexes.last,
+ token.getStartCharIndex,
+ token.getEndCharIndex,
+ true,
+ false
+ )
else
- (word.origText, word.wordIndexes.toSet, word.wordIndexes.head, word.wordIndexes.last, false, true)
+ (
+ word.origText,
+ word.normText,
+ word.stem,
+ word.wordIndexes.toSet,
+ word.wordIndexes.head,
+ word.wordIndexes.last,
+ word.startCharIndex,
+ word.endCharIndex,
+ false,
+ true
+ )
private lazy val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index ea41793..6b6a8e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -62,6 +62,9 @@ case class NCProbeModel(
lazy val hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty
lazy val hasSparseSynonyms: Boolean = sparseSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(_.sparse))
lazy val hasContinuousSynonyms: Boolean = continuousSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(!_.sparse))
+ lazy val isComplex: Boolean = hasIdlSynonyms || !model.getParsers.isEmpty
def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId)
+
+
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index fde865f..560ddff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -492,7 +492,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
}).toMap
// Loop has sense if model is complex (has user defined parsers or IDL based synonyms)
- continue = NCModelEnricher.isComplex(mdl) && res.exists { case (_, same) => !same }
+ continue = mdl.isComplex && res.exists { case (_, same) => !same }
if (DEEP_DEBUG)
if (continue) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 7196985..a39edfd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -83,8 +83,6 @@ object NCModelEnricher extends NCProbeEnricher {
ackStopped()
}
- def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
-
/**
*
* @param ns
@@ -180,7 +178,8 @@ object NCModelEnricher extends NCProbeEnricher {
new NCCustomElement() {
override def getElementId: String = noteId
override def getWords: JList[NCCustomWord] = words
- override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
+ override def getMetadata: JavaMeta =
+ md.map { case (k, v) => k -> v.asInstanceOf[AnyRef] }.asJava
}
}).asJava
)
@@ -228,7 +227,7 @@ object NCModelEnricher extends NCProbeEnricher {
* Example: Piece: 'x1, x2(stopword), x3(stopword), x4' will be expanded into
* {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
*
- * 3. All variants collected, duplicated deleted, etc.
+ * 3. All variants collected, duplicated sets deleted, etc.
*
* @param toks
*/
@@ -244,7 +243,7 @@ object NCModelEnricher extends NCProbeEnricher {
else
slides += mutable.ArrayBuffer.empty :+ stop
- // Too many stopords inside skipped.
+ // Too many stopwords inside skipped.
val bigSlides = slides.filter(_.size > 2)
var stops4Delete: Seq[Seq[NlpToken]] =
@@ -255,7 +254,7 @@ object NCModelEnricher extends NCProbeEnricher {
if (stops4AllCombs.nonEmpty)
for (
seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
- seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+ seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
)
yield seq1 ++ seq2.flatten
else
@@ -268,11 +267,10 @@ object NCModelEnricher extends NCProbeEnricher {
stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))
(Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-
}).
- filter(_._1.nonEmpty).
- groupBy(_._1).
- map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+ filter { case (seq, _) => seq.nonEmpty }.
+ groupBy { case (seq, _) => seq }.
+ map { case (toksKey, seq) => toksKey -> seq.map(_._2).minBy(p => (-p.size, p.head.index)) }.
sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }
/**
@@ -297,15 +295,17 @@ object NCModelEnricher extends NCProbeEnricher {
/**
*
- * @param tows
+ * @param idlToks
* @param ns
*/
- private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
- (
- tows.filter(_.isWord).map(_.word) ++
- tows.filter(_.isToken).map(_.token).
- flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
- ).sortBy(_.startCharIndex)
+ private def toNlpTokens(idlToks: Seq[IdlToken], ns: Sentence): Seq[NlpToken] = {
+ val words = idlToks.filter(_.isWord).map(_.word)
+ val suitableToks =
+ idlToks.filter(_.isToken).map(_.token).
+ flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
+
+ (words ++ suitableToks).sortBy(_.startCharIndex)
+ }
/**
*
@@ -378,6 +378,7 @@ object NCModelEnricher extends NCProbeEnricher {
}
/**
+ * Prepares IDL tokens based on NLP tokens.
*
* @param h
* @param toks
@@ -391,9 +392,7 @@ object NCModelEnricher extends NCProbeEnricher {
// Drops without tokens (IDL part works with tokens).
if (rec.nonEmpty)
- Some(rec ++
- (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
- )
+ Some(rec ++ (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens))
else
None
}).seq
@@ -440,11 +439,11 @@ object NCModelEnricher extends NCProbeEnricher {
for (
// 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
(toks, toksExt) <- combosTokens(ns.toSeq);
- idxs = toks.map(_.index);
- e <- mdl.elements.values;
- elemId = e.getId;
- greedy = e.isGreedy.orElse(mdl.model.isGreedy)
- if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
+ idxs = toks.map(_.index);
+ e <- mdl.elements.values;
+ elemId = e.getId;
+ greedy = e.isGreedy.orElse(mdl.model.isGreedy)
+ if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
) {
def add(
dbgType: String,
@@ -456,7 +455,7 @@ object NCModelEnricher extends NCProbeEnricher {
val ok =
(!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
- ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
+ ( parts.isEmpty || !parts.exists { case (tok, _) => tok.getId == elemId })
if (ok)
mark(
@@ -563,7 +562,7 @@ object NCModelEnricher extends NCProbeEnricher {
)
}
else
- // 2.2 Sparse.
+ // 2.2 Sparse.
for (syn <- allSyns; comb <- allCombs)
NCSynonymsManager.onSparseMatch(
ns.srvReqId,
@@ -573,7 +572,7 @@ object NCModelEnricher extends NCProbeEnricher {
req,
variantsToks,
res => {
- val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
+ val toks = getSparsedTokens(toNlpTokens(res, ns), toNlpTokens(comb, ns))
val parts = toParts(mdl, ns.srvReqId, res, syn)
val typ = if (syn.sparse) "IDL sparse"else "IDL continuous"
@@ -607,6 +606,9 @@ object NCModelEnricher extends NCProbeEnricher {
* @param ns
*/
private def normalize(ns: Sentence): Unit = {
+ // Finds and removes user notes if the sentence contains notes with a similar structure but fewer swallowed stop-words.
+ // These stop-words can be used for detecting other user tokens and are harmless if they remain free words.
+ // Notes with links and notes that have references to them aren't touched.
val usrNotes = ns.flatten.filter(_.isUser).distinct
val links = NCSentenceManager.getLinks(usrNotes)
val parts = NCSentenceManager.getPartKeys(usrNotes)
@@ -638,28 +640,34 @@ object NCModelEnricher extends NCProbeEnricher {
// TODO: simplify, add tests, check model properties (sparse etc) for optimization.
/**
*
- * @param elmId
- * @param toks
- * @param sliceToksIdxsSorted
+ * @param elmId Element ID.
+ * @param toks Tokens.
+ * @param idxs Indexes; note that these may not exactly match the token indexes (sparse case).
*/
- private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
+ private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], idxs: Seq[Int]): Boolean = {
lazy val toksIdxsSorted = toks.map(_.index).sorted
- sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
- toks.exists(_.exists(n =>
- n.noteType == elmId &&
- (
- (n.sparsity == 0 &&
- (sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
- )
- ||
- (
- n.tokenIndexes == toksIdxsSorted ||
- n.tokenIndexes.containsSlice(toksIdxsSorted) &&
- U.isContinuous(toksIdxsSorted) &&
- U.isContinuous(n.tokenIndexes)
- )
- )
- ))
+ // All tokens with given indexes found with zero sparsity.
+ val ok1 = idxs.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0))
+
+ lazy val ok2 =
+ toks.exists(_.exists(n =>
+ if (n.noteType == elmId) {
+ val noteOk1 = n.sparsity == 0 &&
+ (idxs.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
+
+ lazy val noteOk2 =
+ n.tokenIndexes == toksIdxsSorted ||
+ n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+ U.isContinuous(toksIdxsSorted) &&
+ U.isContinuous(n.tokenIndexes)
+
+ noteOk1 || noteOk2
+ }
+ else
+ false
+ ))
+
+ ok1 || ok2
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 00d6bdf..f9f7a01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -369,7 +369,7 @@ object NCSentenceManager extends NCService {
val t = NCNlpSentenceToken(idx)
// Note, it adds stop-words too.
- val content = nsCopyToks.zipWithIndex.filter(p => indexes.contains(p._2)).map(_._1)
+ val content = nsCopyToks.zipWithIndex.filter { case (_, idx) => indexes.contains(idx) }.map { case (tok, _) => tok}
content.foreach(t => history += t.index -> idx)
@@ -378,15 +378,12 @@ object NCSentenceManager extends NCService {
val n = content.size - 1
- content.zipWithIndex.foreach(p => {
- val t = p._1
- val idx = p._2
-
+ content.zipWithIndex.foreach { case (t, idx) =>
buf += get(t)
if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex)
buf += " "
- })
+ }
buf.mkString
}
@@ -459,8 +456,7 @@ object NCSentenceManager extends NCService {
for (tok <- ns.filter(_.isTypeOf(noteType)) if ok)
tok.getNoteOpt(noteType, idxsField) match {
case Some(n) =>
- val idxs: Seq[Seq[Int]] =
- n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
+ val idxs: Seq[Seq[Int]] = n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
var fixed = idxs
history.foreach {
@@ -539,8 +535,7 @@ object NCSentenceManager extends NCService {
// Validation (all indexes calculated well)
require(
!res ||
- !ns.flatten.
- exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
+ !ns.flatten.exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
s"Invalid sentence:\n" +
ns.map(t =>
// Human readable invalid sentence for debugging.
@@ -745,9 +740,11 @@ object NCSentenceManager extends NCService {
)
)
+ // Optimizations follow: variants that are similar by certain criteria are dropped.
+
def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
- // Drops similar sentences (with same notes structure). Keeps with more found.
+ // Drops similar sentences with same notes structure based on greedy elements. Keeps with more notes found.
val notGreedyElems =
mdl.getElements.asScala.flatMap(e => if (!e.isGreedy.orElse(mdl.isGreedy)) Some(e.getId) else None).toSet
@@ -768,6 +765,7 @@ object NCSentenceManager extends NCService {
var sensWithNotesIdxs = sensWithNotes.zipWithIndex
+ // Drops similar sentences if there are other sentences with superset of notes.
sens =
sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
!sensWithNotesIdxs.
@@ -775,13 +773,12 @@ object NCSentenceManager extends NCService {
exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
}.map { case ((sen, _), _) => sen }
- // Drops similar sentences (with same tokens structure).
- // Among similar sentences we prefer one with minimal free words count.
+ // Drops similar sentences. Among similar sentences we prefer one with minimal free words count.
sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
- // Drops sentences if they are just subset of another.
+ // Drops sentences if they are just subset of another (indexes ignored here)
sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }
sensWithNotesIdxs = sensWithNotes.zipWithIndex
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
index 80f9c19..fa31f26 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -38,7 +38,7 @@ object NCSynonymsManager extends NCService {
private lazy val cache =
mutable.HashMap.empty[String, mutable.HashMap[Int, mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]]
- def isUnprocessed(elemId: String, s: Synonym, tokens: Seq[T]): Boolean =
+ def isUnprocessed(elemId: String, syn: Synonym, tokens: Seq[T]): Boolean =
cache.
getOrElseUpdate(
elemId,
@@ -51,7 +51,7 @@ object NCSynonymsManager extends NCService {
getOrElseUpdate(
tokens,
mutable.HashSet.empty[Synonym]
- ).add(s)
+ ).add(syn)
}
private case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
@@ -72,13 +72,11 @@ object NCSynonymsManager extends NCService {
)
}
- private case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
- override def toString: String = variants.toString()
- }
+ private case class SavedIdlValue(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction)
private case class IdlChunkKey(token: IdlToken, chunk: NCProbeSynonymChunk)
- private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[Value]]]
+ private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[SavedIdlValue]]]
private val idlChunksCache = mutable.HashMap.empty[String, mutable.HashMap[IdlChunkKey, Boolean]]
private val idlCaches = mutable.HashMap.empty[String, CacheHolder[IdlToken]]
private val tokCaches = mutable.HashMap.empty[String, CacheHolder[Int]]
@@ -120,7 +118,7 @@ object NCSynonymsManager extends NCService {
/**
*
- * @param s
+ * @param syn
* @param toks
* @param isMatch
* @param getIndex
@@ -128,19 +126,23 @@ object NCSynonymsManager extends NCService {
* @tparam T
*/
private def sparseMatch0[T](
- s: Synonym,
+ syn: Synonym,
toks: Seq[T],
isMatch: (T, NCProbeSynonymChunk) => Boolean,
getIndex: T => Int,
shouldBeNeighbors: Boolean
): Option[Seq[T]] =
- if (toks.size >= s.size) {
+ if (toks.size >= syn.size) {
lazy val res = mutable.ArrayBuffer.empty[T]
lazy val all = mutable.HashSet.empty[T]
+ // There are 3 states:
+ // 0 - initial working state, first step.
+ // 1 - working state, not first step.
+ // -1 - stop state.
var state = 0
- for (chunk <- s if state != -1) {
+ for (chunk <- syn if state != -1) {
val seq =
if (state == 0) {
state = 1
@@ -153,12 +155,12 @@ object NCSynonymsManager extends NCService {
if (seq.nonEmpty) {
val head = seq.head
- if (!s.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+ if (!syn.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
state = -1
else {
all ++= seq
- if (all.size > s.size)
+ if (all.size > syn.size)
state = -1
else
res += head
@@ -168,7 +170,12 @@ object NCSynonymsManager extends NCService {
state = -1
}
- if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
+ if (
+ state != -1 && // State ok.
+ all.size == res.size && // There aren't excess processed tokens.
+ // `neighbors` condition, important for simple non-sparse synonyms.
+ (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted))
+ )
Some(res.toSeq)
else
None
@@ -186,69 +193,75 @@ object NCSynonymsManager extends NCService {
private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction, variantsToks: Seq[Seq[NCToken]]): Unit = {
savedIdl.
getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty).
- getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
- Value(req, variantsToks, pred)
+ getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
+ SavedIdlValue(req, variantsToks, pred)
}
/**
+ * Checks that the given synonym has not yet been checked against the given NLP token indexes.
*
* @param srvReqId
* @param elemId
- * @param s
+ * @param syn
* @param tokens
*/
- private def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean =
- tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, s, tokens)
+ private def isUnprocessedTokens(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[Int]): Boolean =
+ tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, syn, tokens)
/**
+ * Checks that the given synonym has not yet been checked against the given IDL tokens.
*
* @param srvReqId
* @param elemId
- * @param s
+ * @param syn
* @param tokens
*/
- private def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[IdlToken]): Boolean =
- idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, s, tokens)
+ private def isUnprocessedIdl(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[IdlToken]): Boolean =
+ idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, syn, tokens)
/**
+ * Checks matching IDL token with synonym's chunk.
*
- * @param tow
- * @param chunk
- * @param req
- * @param variantsToks
+ * @param t IDL token.
+ * @param chunk Synonym's chunk.
+ * @param req Request.
+ * @param variantsToks All possible request's variants.
*/
private def isMatch(
- tow: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
+ t: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
): Boolean =
idlChunksCache.
- getOrElseUpdate(req.getServerRequestId,
+ getOrElseUpdate(
+ req.getServerRequestId,
mutable.HashMap.empty[IdlChunkKey, Boolean]
).
getOrElseUpdate(
- IdlChunkKey(tow, chunk),
+ IdlChunkKey(t, chunk),
{
- def get0[T](fromToken: NCToken => T, fromWord: NlpToken => T): T =
- if (tow.isToken) fromToken(tow.token) else fromWord(tow.word)
-
chunk.kind match {
- case TEXT => chunk.wordStem == get0(_.stem, _.stem)
+ case TEXT => chunk.wordStem == t.stem
case REGEX =>
- chunk.regex.matcher(get0(_.origText, _.origText)).matches() ||
- chunk.regex.matcher(get0(_.normText, _.normText)).matches()
+ chunk.regex.matcher(t.origText).matches() || chunk.regex.matcher(t.normText).matches()
case IDL =>
- val ok =
+ val ok = {
+ // IDL condition just for tokens.
+ t.isToken &&
+ // At least one suitable variant (valid NCIdlContext) must be found for the given token.
+ // This variant will be checked again during the last processing phase.
variantsToks.par.exists(vrntToks =>
- get0(t =>
- chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)).
- value.asInstanceOf[Boolean],
- _ => false
+ chunk.idlPred.apply(
+ t.token,
+ NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean]
)
- )
+ }
+ // Saves all variants for later validation.
+ // All suitable variants may be deleted later, so this positive result can be revoked
+ // during the last processing phase.
if (ok)
- save(req, tow.token, chunk.idlPred, variantsToks)
+ save(req, t.token, chunk.idlPred, variantsToks)
ok
@@ -270,22 +283,29 @@ object NCSynonymsManager extends NCService {
require(toks != null)
require(!syn.sparse && !syn.hasIdl)
- if (
- toks.length == syn.length && {
+ if (toks.length == syn.length) { // Same length.
+ val ok =
if (syn.isTextOnly)
- toks.zip(syn).forall(p => p._1.stem == p._2.wordStem)
+ toks.zip(syn).
+ // Checks all synonym chunks with all tokens.
+ forall { case (tok, chunk) => tok.stem == chunk.wordStem }
else
- toks.zip(syn).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
- }
- )
- callback(())
+ toks.zip(syn).
+ // Pre-sort by chunk kind for performance reasons, easier to compare should be first.
+ sortBy { case (_, chunk) => getSort(chunk.kind) }.
+ // Checks all synonym chunks with all tokens.
+ forall { case (tok, chunk) => isMatch(tok, chunk) }
+
+ if (ok)
+ callback(())
+ }
}
/**
*
* @param srvReqId
* @param elemId
- * @param s
+ * @param syn
* @param toks
* @param req
* @param variantsToks
@@ -294,22 +314,22 @@ object NCSynonymsManager extends NCService {
def onMatch(
srvReqId: String,
elemId: String,
- s: Synonym,
+ syn: Synonym,
toks: Seq[IdlToken],
req: NCRequest,
variantsToks: Seq[Seq[NCToken]],
callback: Unit => Unit
): Unit =
- if (isUnprocessedIdl(srvReqId, elemId, s, toks)) {
+ if (isUnprocessedIdl(srvReqId, elemId, syn, toks)) {
require(toks != null)
if (
- toks.length == s.length && // Same length.
- toks.count(_.isToken) >= s.idlChunks && // Enough tokens.
- toks.zip(s).sortBy { // Pre-sort by chunk kind.
+ toks.length == syn.length && // Same length.
+ toks.count(_.isToken) >= syn.idlChunks && // Enough tokens.
+ toks.zip(syn).sortBy { // Pre-sort by chunk kind for performance reasons, easier to compare should be first.
case (_, chunk) => getSort(chunk.kind)
- }
- .forall { // TODO?
+ }.
+ forall { // Checks all synonym chunks with all tokens.
case (idlTok, chunk) => isMatch(idlTok, chunk, req, variantsToks)
}
)
@@ -365,7 +385,7 @@ object NCSynonymsManager extends NCService {
syn,
toks,
(t: IdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
- (t: IdlToken) => if (t.isToken) t.token.getStartCharIndex else t.word.startCharIndex,
+ (t: IdlToken) => t.startCharIndex,
shouldBeNeighbors = !syn.sparse
) match {
case Some(res) => callback(res)
@@ -374,13 +394,15 @@ object NCSynonymsManager extends NCService {
}
/**
+ * Checks that a suitable variant wasn't deleted and that the IDL condition for the token is still valid.
+ * This check is needed because the NCIdlContext used in the predicate is based on a variant.
*
* @param srvReqId
- * @param senToks
+ * @param toks
*/
- def isStillValidIdl(srvReqId: String, senToks: Seq[NCToken]): Boolean =
+ def isStillValidIdl(srvReqId: String, toks: Seq[NCToken]): Boolean =
savedIdl.get(srvReqId) match {
- case Some(m) =>
+ case Some(map) =>
lazy val allCheckedSenToks = {
val set = mutable.HashSet.empty[SavedIdlKey]
@@ -390,13 +412,13 @@ object NCSynonymsManager extends NCService {
t.getPartTokens.asScala.foreach(add)
}
- senToks.foreach(add)
+ toks.foreach(add)
set
}
- senToks.forall(tok =>
- m.get(SavedIdlKey(tok)) match {
+ toks.forall(tok =>
+ map.get(SavedIdlKey(tok)) match {
case Some(vals) =>
vals.exists(
v =>
@@ -417,6 +439,7 @@ object NCSynonymsManager extends NCService {
}
/**
+ * Called when request processing finished.
*
* @param srvReqId
*/
@@ -427,6 +450,7 @@ object NCSynonymsManager extends NCService {
}
/**
+ * Called on each request enrichment iteration.
*
* @param srvReqId
*/