You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/03/14 07:00:36 UTC
[incubator-nlpcraft] 04/13: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit c73bf59544ded0fbf80f820d174bba52cb30c79f
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Thu Mar 12 16:38:04 2020 +0300
WIP.
---
.../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala | 24 ++++-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 108 ++++++++++++---------
2 files changed, 83 insertions(+), 49 deletions(-)
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index c632c2e..d2b1a4a 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -53,6 +53,8 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
Set.empty
else {
def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
+ require(sortedToks.nonEmpty)
+
val h = sortedToks.head
val l = sortedToks.last
@@ -64,12 +66,16 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
notes.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom && l.index == n.tokenTo).map(_.noteType).toSet
}
- val sortedToks = toks.sortBy(_.index)
+ var sortedToks = toks.sortBy(_.index)
var res = getCommon(sortedToks)
- if (res.isEmpty)
- res = getCommon(sortedToks.filter(!_.isStopWord))
+ if (res.isEmpty) {
+ sortedToks = sortedToks.filter(!_.isStopWord)
+
+ if (sortedToks.nonEmpty)
+ res = getCommon(sortedToks)
+ }
if (res.isEmpty) Set.empty else res
}
@@ -83,7 +89,10 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
*/
protected def hasReference(typ: String, refNoteName: String, refNoteVal: String, matched: Seq[NCNlpSentenceToken]): Boolean =
matched.forall(t ⇒
- t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[String] == refNoteVal)
+ t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n.get(refNoteName) match {
+ case Some(s) ⇒ s.asInstanceOf[String] == refNoteVal
+ case None ⇒ false
+ })
)
/**
@@ -95,7 +104,12 @@ abstract class NCProbeEnricher extends NCService with LazyLogging {
*/
protected def hasReferences(typ: String, refNoteName: String, refNoteVals: Seq[String], matched: Seq[NCNlpSentenceToken]): Boolean =
matched.forall(t ⇒
- t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒ n(refNoteName).asInstanceOf[java.util.List[String]].asScala == refNoteVals)
+ t.isTypeOf(typ) && t.getNotes(typ).exists(n ⇒
+ n.get(refNoteName) match {
+ case Some(s) ⇒ s.asInstanceOf[java.util.List[String]].asScala == refNoteVals
+ case None ⇒ false
+ }
+ )
)
/**
diff --git a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 4351213..ad426b5 100644
--- a/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -129,6 +129,19 @@ object NCSortEnricher extends NCProbeEnricher {
}
/**
+ * Returns a flag indicating whether two tokens are contiguous, i.e. every token between them (if any) is a stop word.
+ *
+ * @param toks Tokens.
+ * @param tok1Idx First token index.
+ * @param tok2Idx Second token index.
+ */
+ private def contiguous(toks: Seq[NCNlpSentenceToken], tok1Idx: Int, tok2Idx: Int): Boolean = {
+ val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+
+ between.isEmpty || between.forall(_.isStopWord)
+ }
+
+ /**
* [Token] -> [NoteData]
* [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
* [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
@@ -136,30 +149,23 @@ object NCSortEnricher extends NCProbeEnricher {
* @param toks
*/
private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
- val all: Seq[NoteData] = toks.
- flatten.
- filter(!_.isNlp).
- map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
- sortBy(_.indexes.head)
+ val all =
+ toks.flatten.filter(!_.isNlp).map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).sortBy(_.indexes.head).distinct
if (all.nonEmpty) {
- val first = all.head.indexes.head
- val last = all.last.indexes.last
-
val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
seq += nd
all.
- filter(p ⇒ nd.indexes.last < p.indexes.head && {
- val between = toks.slice(nd.indexes.last, p.indexes.head - 1)
-
- between.isEmpty || between.forall(_.isStopWord)
- }).
+ filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(toks, nd.indexes.last, p.indexes.head)).
foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
- if (seq.nonEmpty && seq.head.indexes.head == first && seq.last.indexes.last == last)
+ if (seq.nonEmpty &&
+ seq.head.indexes.head == all.head.indexes.head &&
+ seq.last.indexes.last == all.last.indexes.last
+ )
res += seq
}
@@ -171,6 +177,10 @@ object NCSortEnricher extends NCProbeEnricher {
Seq.empty
}
+ /**
+ * Tries to build a `Match` from the given tokens; yields `None` when they do not match.
+ * @param toks
+ */
private def tryToMatch(toks: Seq[NCNlpSentenceToken]): Option[Match] = {
case class KeyWord(tokens: Seq[NCNlpSentenceToken], synonymIndex: Int) {
// Added for tests reasons.
@@ -219,37 +229,47 @@ object NCSortEnricher extends NCProbeEnricher {
hOpt match {
case Some(h) ⇒
val others = toks.filter(t ⇒ !h.all.contains(t))
- val othersWithoutStops = others.filter(!_.isStopWord)
-
- if (
- othersWithoutStops.nonEmpty &&
- othersWithoutStops.forall(t ⇒ t.exists(n ⇒ n.isUser || SORT_TYPES.contains(n.noteType))) &&
- SEQS.contains(
- // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
- toks.map(t ⇒
- h.getKeyWordType(t).getOrElse("x")).
- foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
- )
- ) {
- val subj = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
- val by = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
-
- others.foreach(t ⇒ (if (subj.isEmpty || subj.last.index + 1 == t.index) subj else by) += t)
-
- require(subj.nonEmpty)
-
- Some(
- Match(
- asc = h.order match {
- case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
- case None ⇒ None
- },
- main = h.sort.tokens,
- stop = h.byTokens ++ h.orderTokens,
- subjSeq = split(subj),
- bySeq = split(by)
+
+ if (others.nonEmpty) {
+ val othersRefs = others.filter(t ⇒ t.exists(n ⇒ n.isUser || SORT_TYPES.contains(n.noteType)))
+
+ if (
+ othersRefs.nonEmpty &&
+ others.filter(p ⇒ !othersRefs.contains(p)).forall(_.isStopWord) &&
+ SEQS.contains(
+ // It removes duplicates (`SORT x x ORDER x x x` converts to `SORT x ORDER x`)
+ toks.map(t ⇒
+ h.getKeyWordType(t).getOrElse("x")).
+ foldLeft("")((x, y) ⇒ if (x.endsWith(y)) x else s"$x $y").trim
+ )
+ ) {
+ val subj = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
+ val by = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
+
+ others.foreach(t ⇒
+ if (subj.isEmpty || by.isEmpty && contiguous(others, subj.last.index, t.index))
+ subj += t
+ else
+ by += t
+ )
+
+ require(subj.nonEmpty)
+
+ Some(
+ Match(
+ asc = h.order match {
+ case Some(order) ⇒ Some(ORDER(order.synonymIndex)._2)
+ case None ⇒ None
+ },
+ main = h.sort.tokens,
+ stop = h.byTokens ++ h.orderTokens,
+ subjSeq = split(subj),
+ bySeq = split(by)
+ )
)
- )
+ }
+ else
+ None
}
else
None