You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/03/14 07:00:39 UTC
[incubator-nlpcraft] 07/13: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 177669c062f9d1d26d6c9444fdea2367b2c69c43
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Fri Mar 13 17:39:10 2020 +0300
WIP.
---
.../common/nlp/core/NCNlpCoreManager.scala | 4 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 68 +++++++++++++---------
2 files changed, 41 insertions(+), 31 deletions(-)
diff --git a/src/main/scala/org/nlpcraft/common/nlp/core/NCNlpCoreManager.scala b/src/main/scala/org/nlpcraft/common/nlp/core/NCNlpCoreManager.scala
index b9dfa1f..87e5203 100644
--- a/src/main/scala/org/nlpcraft/common/nlp/core/NCNlpCoreManager.scala
+++ b/src/main/scala/org/nlpcraft/common/nlp/core/NCNlpCoreManager.scala
@@ -72,7 +72,7 @@ object NCNlpCoreManager extends NCService {
*/
def stem(words: String): String = {
val seq = tokenizer.tokenize(words).map(p ⇒ p → NCNlpPorterStemmer.stem(p.token))
-
+
seq.zipWithIndex.map { case ((tok, stem), idx) ⇒
idx match {
case 0 ⇒ stem
@@ -80,7 +80,7 @@ object NCNlpCoreManager extends NCService {
case _ ⇒ if (seq(idx - 1)._1.to + 1 < tok.from) s" $stem"
else stem
}
- }.mkString("")
+ }.mkString(" ")
}
/**
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 6ea5fb0..282736b 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -52,7 +52,7 @@ object NCSortEnricher extends NCProbeEnricher {
"desc" → false,
"{in|by|from} {top down|descending} {order|way|fashion|*}" → false,
"{in|by|from} {bottom up|ascending} {order|way|fashion|*}" → true
- ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc) }
+ ).flatMap { case (txt, asc) ⇒ p.expand(txt).map(p ⇒ NCNlpCoreManager.stem(p) → asc ) }
}
private final val TOK_ID = "nlpcraft:sort"
@@ -112,20 +112,24 @@ object NCSortEnricher extends NCProbeEnricher {
*
*/
private def validate() {
+ // Not duplicated.
require(SORT.size + BY.size + ORDER.size == (SORT ++ BY ++ ORDER.unzip._1).distinct.size)
- val seq1 = SORT.flatMap(_.split(" "))
- val seq2 = BY.flatMap(_.split(" "))
- val seq3 = ORDER.map(_._1).flatMap(_.split(" "))
+ // Single words.
+ require(!SORT.exists(_.contains(" ")))
+ require(!BY.exists(_.contains(" ")))
- require(seq1.size == seq1.distinct.size)
- require(seq2.size == seq2.distinct.size)
- require(seq3.size == seq3.distinct.size)
+ // Different words.
+ require(SORT.intersect(BY).isEmpty)
+ require(SORT.intersect(ORDER.unzip._1).isEmpty)
+ require(BY.intersect(ORDER.unzip._1).isEmpty)
- require(seq1.intersect(seq2).isEmpty)
- require(seq1.intersect(seq3).isEmpty)
- require(seq2.intersect(seq3).isEmpty)
+ val ordersSeq: Seq[Seq[String]] = ORDER.unzip._1.map(_.split(" ").toSeq)
+ // ORDER doesn't contain words from BY (it can contain words from SORT)
+ require(!BY.exists(p ⇒ ordersSeq.contains(p)))
+
+ // Right order of keywords and references.
SEQS.map(_.split(" ")).foreach(seq ⇒ {
require(seq.forall(p ⇒ p == "SORT" || p == "ORDER" || p == "BY" || p == "x"))
@@ -139,23 +143,9 @@ object NCSortEnricher extends NCProbeEnricher {
}
/**
- * Return flag which indicates are token contiguous or not.
- *
- * @param toks Tokens.
- * @param tok1Idx First token index.
- * @param tok2Idx Second token index.
- */
- private def contiguous(toks: Seq[NCNlpSentenceToken], tok1Idx: Int, tok2Idx: Int): Boolean = {
- val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
-
- between.isEmpty || between.forall(_.isStopWord)
- }
-
- /**
* [Token] -> [NoteData]
* [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
* [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
- *
* @param toks
*/
private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
@@ -165,11 +155,23 @@ object NCSortEnricher extends NCProbeEnricher {
if (all.nonEmpty) {
val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+ /**
+ * Returns a flag which indicates whether the tokens are contiguous or not.
+ *
+ * @param tok1Idx First token index.
+ * @param tok2Idx Second token index.
+ */
+ def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+ val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+
+ between.isEmpty || between.forall(_.isStopWord)
+ }
+
def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
seq += nd
all.
- filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(toks, nd.indexes.last, p.indexes.head)).
+ filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
if (seq.nonEmpty &&
@@ -193,7 +195,7 @@ object NCSortEnricher extends NCProbeEnricher {
*/
private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
case class KeyWord(tokens: Seq[NCNlpSentenceToken], synonymIndex: Int) {
- // Added for tests reasons.
+ // Added for debug reasons.
override def toString = tokens.map(_.origText).mkString(" ")
}
@@ -226,13 +228,21 @@ object NCSortEnricher extends NCProbeEnricher {
else
None
- // Added for tests reasons.
- override def toString = s"Sort: $sort, by: ${by.toSeq.mkString(",")}, order: ${order.toSeq.mkString(",")}"
+ // Added for debug reasons.
+ override def toString = s"Sort: [$sort], by: [${by.toSeq.mkString(",")}], order: [${order.toSeq.mkString(",")}]"
}
val hOpt: Option[KeyWordsHolder] =
get0(SORT, toks) match {
- case Some(sort) ⇒ Some(KeyWordsHolder(sort, get0(BY, toks), get0(ORDER.unzip._1, toks)))
+ case Some(sort) ⇒
+ val orderOpt = get0(ORDER.unzip._1, toks)
+
+ def mkHolder: Option[KeyWordsHolder] = Some(KeyWordsHolder(sort, get0(BY, toks), orderOpt))
+
+ orderOpt match {
+ case Some(order) ⇒ if (order.tokens.intersect(sort.tokens).isEmpty) mkHolder else None
+ case None ⇒ mkHolder
+ }
case None ⇒ None
}