You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/09/29 13:36:46 UTC
[incubator-nlpcraft] branch NLPCRAFT-456 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-456
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-456 by this push:
new a9dc535 WIP.
a9dc535 is described below
commit a9dc535b9eeb5ef6c3aecce521f12513184c6e12
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Sep 29 16:36:36 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 10 +++++-----
.../probe/mgrs/sentence/NCSentenceManager.scala | 19 ++++---------------
2 files changed, 9 insertions(+), 20 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index a39edfd..26bda8b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -607,8 +607,8 @@ object NCModelEnricher extends NCProbeEnricher {
*/
private def normalize(ns: Sentence): Unit = {
// Find and removes user notes if sentence contains notes with similar structure but less count of swallowed stop-words.
- // These stop-words can be used fro detection another user tokens and harmless if they are free words.
- // Notes with links and with references on them - aren't touched.
+ // These stop-words can be used to detect other user tokens, and if they remain free words that is harmless too.
+ // Notes with links and notes with references on them are ignored.
val usrNotes = ns.flatten.filter(_.isUser).distinct
val links = NCSentenceManager.getLinks(usrNotes)
val parts = NCSentenceManager.getPartKeys(usrNotes)
@@ -658,9 +658,9 @@ object NCModelEnricher extends NCProbeEnricher {
lazy val noteOk2 =
n.tokenIndexes == toksIdxsSorted ||
- n.tokenIndexes.containsSlice(toksIdxsSorted) &&
- U.isContinuous(toksIdxsSorted) &&
- U.isContinuous(n.tokenIndexes)
+ n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+ U.isContinuous(toksIdxsSorted) &&
+ U.isContinuous(n.tokenIndexes)
noteOk1 || noteOk2
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 2643630..50137a2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -684,7 +684,7 @@ object NCSentenceManager extends NCService {
* lengths - the winning note is chosen based on this priority.
*/
@throws[NCE]
- private def collapseSentence(sen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
+ def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] = {
// Always deletes `similar` notes.
// Some words with same note type can be detected various ways.
// We keep only one variant - with `best` direct and sparsity parameters,
@@ -762,18 +762,16 @@ object NCSentenceManager extends NCService {
// There are optimizations below. Similar variants by some criteria deleted.
- def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
-
// Drops similar sentences with same notes structure based on greedy elements. Keeps with more notes found.
val notGreedyElems =
mdl.getElements.asScala.flatMap(e => if (!e.isGreedy.orElse(mdl.isGreedy)) Some(e.getId) else None).toSet
- sens = sens.groupBy(notNlpNotes(_).groupBy(_.noteType).keys.toSeq.sorted.distinct).
+ sens = sens.groupBy(p => getNotNlpNotes(p.tokens).groupBy(_.noteType).keys.toSeq.sorted.distinct).
flatMap { case (types, sensSeq) =>
if (types.exists(notGreedyElems.contains))
sensSeq
else {
- val m: Map[NCNlpSentence, Int] = sensSeq.map(p => p -> notNlpNotes(p).size).toMap
+ val m: Map[NCNlpSentence, Int] = sensSeq.map(p => p -> getNotNlpNotes(p.tokens).size).toMap
val max = m.values.max
@@ -794,7 +792,7 @@ object NCSentenceManager extends NCService {
}.map { case ((sen, _), _) => sen }
// Drops similar sentences. Among similar sentences we prefer one with minimal free words count.
- sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
+ sens = sens.groupBy(p => getNotNlpNotes(p.tokens).map(_.getKey(withIndexes = false))).
map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
@@ -833,15 +831,6 @@ object NCSentenceManager extends NCService {
/**
*
- * @param mdl
- * @param sen
- * @param lastPhase
- */
- def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] =
- collapseSentence(sen, mdl, lastPhase)
-
- /**
- *
* @param srvReqId
*/
def clearRequestData(srvReqId: String): Unit = combCache -= srvReqId