Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/12/23 22:14:59 UTC
[incubator-nlpcraft] branch NLPCRAFT-469 updated: CR.
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new e91a46d CR.
e91a46d is described below
commit e91a46d929f7f86681704216697d7cee87e979c1
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Thu Dec 23 14:14:54 2021 -0800
CR.
---
bindist/LICENSE | 37 ----
bindist/NOTICE | 7 -
.../scala/org/apache/nlpcraft/NCResultType.java | 4 +-
.../enricher/NCEnDictionaryTokenEnricher.java | 3 +-
.../token/enricher/impl/NCEnDictionaryImpl.scala | 6 +-
.../token/parser/opennlp/NCOpenNlpTokenParser.java | 12 +-
.../opennlp/impl/NCEnStopWordGenerator.scala | 17 +-
.../parser/opennlp/impl/NCEnStopWordsFinder.scala | 200 ++++++++-------------
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 45 +++--
.../apache/nlpcraft/internal/util/NCUtils.scala | 29 +--
10 files changed, 125 insertions(+), 235 deletions(-)
diff --git a/bindist/LICENSE b/bindist/LICENSE
index 16bebe4..09418e7 100644
--- a/bindist/LICENSE
+++ b/bindist/LICENSE
@@ -203,40 +203,3 @@
Files under 'nlpcraft/src/main/resources/moby' folder are public domain. See
https://en.wikipedia.org/wiki/Moby_Project for more information.
-
-File org.apache.nlpcraft.internal.util.NCIdGenerator.java is based on https://github.com/peet/hashids.java
-and licensed under MIT license:
-
- Copyright (C) 2012 Ivan Akimov
-
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
- documentation files (the "Software"), to deal in the Software without restriction, including without limitation
- the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
- and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all copies or substantial
- portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
- TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
- CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- DEALINGS IN THE SOFTWARE.
-
-File org.apache.nlpcraft.internal.blowfish.NCBlowfishHasher.java is based on
-https://github.com/jeremyh/jBCrypt/blob/master/src/main/java/org/mindrot/BCrypt.java
-and licensed as follows:
-
- Copyright (C) 2006 Damien Miller <dj...@mindrot.org>
-
- Permission to use, copy, modify, and distribute this software for any
- purpose with or without fee is hereby granted, provided that the above
- copyright notice and this permission notice appear in all copies.
-
- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
\ No newline at end of file
diff --git a/bindist/NOTICE b/bindist/NOTICE
index add790b..6d75bcb 100644
--- a/bindist/NOTICE
+++ b/bindist/NOTICE
@@ -4,10 +4,3 @@ Copyright (C) 2021 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
-File org.apache.nlpcraft.internal.util.NCIdGenerator.java.
-Based on https://github.com/peet/hashids.java
-Copyright (C) 2012 Ivan Akimov
-
-File org.apache.nlpcraft.internal.blowfish.NCBlowfishHasher.java.
-Based on https://github.com/jeremyh/jBCrypt/blob/master/src/main/java/org/mindrot/BCrypt.java
-Copyright (C) 2006 Damien Miller <dj...@mindrot.org>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResultType.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResultType.java
index e2579ed..12edb50 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResultType.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResultType.java
@@ -22,12 +22,12 @@ package org.apache.nlpcraft;
*/
public enum NCResultType {
/**
- * Final result is produced.
+ * Final result is ready.
*/
ASK_RESULT,
/**
- * Curation is requires.
+ * Curation is required.
*/
ASK_CURATE,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
index a2a1ff1..85f0163 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/NCEnDictionaryTokenEnricher.java
@@ -22,7 +22,7 @@ import org.apache.nlpcraft.internal.nlp.token.enricher.impl.NCEnDictionaryImpl;
import java.util.List;
/**
- *
+ * TODO: enriches with <code>dict:en</code> property.
*/
public class NCEnDictionaryTokenEnricher implements NCTokenEnricher {
private final NCEnDictionaryImpl impl = new NCEnDictionaryImpl();
@@ -40,7 +40,6 @@ public class NCEnDictionaryTokenEnricher implements NCTokenEnricher {
@Override
public void enrich(NCRequest req, NCModelConfig cfg, List<NCToken> toks) {
assert impl != null;
-
impl.enrich(req, cfg, toks);
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
index 405cff3..4c0b53d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/enricher/impl/NCEnDictionaryImpl.scala
@@ -19,8 +19,9 @@ package org.apache.nlpcraft.internal.nlp.token.enricher.impl
import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils
+
/**
- *
+ * TODO: enriches with <code>dict:en</code> property.
*/
class NCEnDictionaryImpl extends NCLifecycle:
@volatile private var dict: Set[String] = _
@@ -29,10 +30,11 @@ class NCEnDictionaryImpl extends NCLifecycle:
override def stop(): Unit = dict = null
/**
+ * TODO: enriches with <code>dict:en</code> property.
*
* @param req
* @param cfg
* @param toks
*/
def enrich(req: NCRequest, cfg: NCModelConfig, toks: java.util.List[NCToken]): Unit =
- toks.forEach(t => t.put("english", dict.contains(t.getLemma)))
+ toks.forEach(t => t.put("dict:en", dict.contains(t.getLemma)))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
index a20131b..c0e921f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParser.java
@@ -58,9 +58,9 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @throws NCException
*/
public NCOpenNlpTokenParser(File tokMdl, File posMdl, File lemmaDic) {
- Objects.requireNonNull(tokMdl, "Tonenizer model cannot be null");
- Objects.requireNonNull(posMdl, "POS model cannot be null");
- Objects.requireNonNull(lemmaDic, "Lemmatizer model cannot be null");
+ Objects.requireNonNull(tokMdl, "Tonenizer model file cannot be null.");
+ Objects.requireNonNull(posMdl, "POS model file cannot be null.");
+ Objects.requireNonNull(lemmaDic, "Lemmatizer model file cannot be null.");
try {
impl = NCOpenNlpImpl.apply(tokMdl, posMdl, lemmaDic);
@@ -78,9 +78,9 @@ public class NCOpenNlpTokenParser implements NCTokenParser {
* @throws NCException
*/
public NCOpenNlpTokenParser(String tokMdlSrc, String posMdlSrc, String lemmaDicSrc) {
- Objects.requireNonNull(tokMdlSrc, "Tonenizer model cannot be null");
- Objects.requireNonNull(posMdlSrc, "POS model cannot be null");
- Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model cannot be null");
+ Objects.requireNonNull(tokMdlSrc, "Tonenizer model path cannot be null.");
+ Objects.requireNonNull(posMdlSrc, "POS model path cannot be null.");
+ Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model path cannot be null.");
try {
impl = NCOpenNlpImpl.apply(tokMdlSrc, posMdlSrc, lemmaDicSrc);
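(Not part of this commit.) The two constructors above only tighten their null-check messages; for orientation, here is a minimal sketch of invoking the string-based constructor from Scala. The model file names are the standard Apache OpenNLP English artifacts and are an assumption for illustration, not paths taken from this repository.

    import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.NCOpenNlpTokenParser

    @main def tokenParserSketch(): Unit =
        // Hypothetical model locations -- standard OpenNLP English models,
        // assumed here purely for illustration.
        val parser = new NCOpenNlpTokenParser(
            "opennlp/en-token.bin",      // tokenizer model
            "opennlp/en-pos-maxent.bin", // POS tagger model
            "opennlp/en-lemmatizer.dict" // lemmatizer dictionary
        )
        // Any null argument now fails fast with the clarified messages above.
        println(parser)
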
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
index c470d51..0751715 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
@@ -25,7 +25,7 @@ import scala.collection.mutable
/**
* Generates first word sequences.
*/
-object NCEnStopWordGenerator extends App:
+object NCEnStopWordGenerator:
private final lazy val stemmer = new PorterStemmer
// Output files.
@@ -193,7 +193,6 @@ object NCEnStopWordGenerator extends App:
private def mkGzip(path: String, lines: Iterable[Any]): Unit =
val p = NCUtils.mkPath(s"nlpcraft/src/main/resources/stopwords/$path")
-
NCUtils.mkTextFile(p, lines)
NCUtils.gzipPath(p)
@@ -344,9 +343,13 @@ object NCEnStopWordGenerator extends App:
mkGzip(FIRST_WORDS_FILE, stem(buf.toSeq))
- mkFirstWords()
- mkNounWords()
-
- mkGzip(POS_WORDS_FILE, stem(mkPossessiveStopWords))
+ /**
+ *
+ * @param args
+ */
+ def main(args: Array[String]): Unit =
+ mkFirstWords()
+ mkNounWords()
+ mkGzip(POS_WORDS_FILE, stem(mkPossessiveStopWords))
- sys.exit()
+ sys.exit()
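(Not part of this commit.) The hunk above drops `extends App` in favor of an explicit entry point. A minimal standalone sketch of that migration pattern follows; the object and method names are illustrative, not taken from the generator itself.

    // Scala 2 style: `object NCEnStopWordGenerator extends App` ran the object
    // body via DelayedInit, which Scala 3 no longer supports.
    // Scala 3 style: keep the object, move the work into an explicit main.
    object GeneratorSketch:
        private def generate(): Unit =
            println("generating stop-word files...")

        def main(args: Array[String]): Unit =
            generate()
            sys.exit()
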
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index a90cce7..167dd89 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -19,9 +19,10 @@ package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
import com.typesafe.scalalogging.LazyLogging
import opennlp.tools.stemmer.PorterStemmer
+
import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinder.*
import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.{NCException, NCParameterizedAdapter, NCToken}
+import org.apache.nlpcraft.*
import java.util
import java.util.{List as JList, Set as JSet}
@@ -85,7 +86,6 @@ private[impl] object NCEnStopWordsFinder:
)
private val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT", "WP", "WP$", "WRB")
-
private val Q_POS = Set("``", "''")
/**
@@ -132,7 +132,7 @@ private[impl] object NCEnStopWordsFinder:
set.exists { case (b, e) => (b.isEmpty || s.startsWith(b)) && (e.isEmpty || s.endsWith(e)) }
def matches(s: String, posOpt: Option[String]): Boolean =
- if (s.contains(' '))
+ if s.contains(' ') then
false
else
posOpt match
@@ -163,32 +163,33 @@ private[impl] object NCEnStopWordsFinder:
wildcardsOrigins: ScanHolder
):
def matches(toks: Seq[NCToken]): Boolean =
- val posOpt =
- toks.size match
- case 0 => throw new AssertionError(s"Unexpected empty tokens.")
- case 1 => Some(toks.head.getPos)
- case _ => None
+ val posOpt = toks.size match
+ case 0 => throw new AssertionError(s"Unexpected empty tokens.")
+ case 1 => Some(toks.head.getPos)
+ case _ => None
// Hash access.
stems.matches(toStemKey(toks), posOpt) ||
- lemmas.matches(toLemmaKey(toks), posOpt) ||
- origins.matches(toOriginalKey(toks), posOpt) ||
- // Scan access.
- wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
- wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
-
- private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
+ lemmas.matches(toLemmaKey(toks), posOpt) ||
+ origins.matches(toOriginalKey(toks), posOpt) ||
+ // Scan access.
+ wildcardsLemmas.matches(toLemmaKey(toks), posOpt) ||
+ wildcardsOrigins.matches(toOriginalKey(toks), posOpt)
+ private def isQuote(t: NCToken): Boolean = Q_POS.contains(t.getPos)
private def toStemKey(toks: Seq[NCToken]): String = toks.map(_.getStem).mkString(" ")
private def toLemmaKey(toks: Seq[NCToken]): String = toks.map(_.getLemma).mkString(" ")
private def toValueKey(toks: Seq[NCToken]): String = toks.map(_.getOriginalText.toLowerCase).mkString(" ")
private def toOriginalKey(toks: Seq[NCToken]): String = toks.map(_.getOriginalText).mkString(" ")
-import org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl.NCEnStopWordsFinder.*
-
-private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStopWordsStems: Set[String]) extends LazyLogging:
- require(addStopWordsStems != null)
- require(exclStopWordsStems != null)
+/**
+ *
+ * @param addStems
+ * @param exclStems
+ */
+private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[String]) extends LazyLogging:
+ require(addStems != null)
+ require(exclStems != null)
private val stemmer = new PorterStemmer
@@ -203,13 +204,13 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
"percent"
).map(stemmer.stem)
- @volatile private var possessiveWords: Set[String] = _
+ @volatile private var posWords: Set[String] = _ // Possessive words.
@volatile private var firstWords: Set[String] = _
@volatile private var nounWords: Set[String] = _
// Stemmatization is done already by generator.
NCUtils.executeParallel(
- () => possessiveWords = read("stopwords/possessive_words.txt.gz"),
+ () => posWords = read("stopwords/possessive_words.txt.gz"),
() => firstWords = read("stopwords/first_words.txt.gz"),
() => nounWords = read("stopwords/noun_words.txt.gz")
)(ExecutionContext.Implicits.global)
@@ -233,10 +234,8 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
*/
private def readStopWords(lines: Seq[String]): Map[Boolean, StopWordHolder] =
// 1. Prepares accumulation data structure.
- object WordForm extends Enumeration:
- type WordForm = Value
-
- val STEM, LEM, ORIG = Value
+ enum WordForm:
+ case STEM, LEM, ORIG
import WordForm.*
@@ -246,7 +245,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
val excludes = mutable.HashMap.empty[String, mutable.HashSet[T]]
def addCondition(cond: T, poses: Map[String, Boolean]): Any =
- if (poses.isEmpty)
+ if poses.isEmpty then
any += cond
else
def add(m: mutable.HashMap[String, mutable.HashSet[T]], incl: Boolean): Unit =
@@ -255,10 +254,8 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
case Some(set) => set.add(cond)
case _ =>
val set = mutable.HashSet.empty[T]
-
- set += cond
-
- m += pos -> set
+ set += cond
+ m += pos -> set
)
add(includes, incl = true)
@@ -267,17 +264,13 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
type Key = (Boolean, WordForm)
def mkMap[T](mkT: Unit => T): Map[Key, T] =
val m = mutable.Map.empty[Key, T]
-
def add(f: WordForm, mkT: Unit => T, isExc: Boolean): Unit =
val tuple: (Key, T) = (isExc, f) -> mkT(())
-
m += tuple._1 -> tuple._2
-
WordForm.values.foreach(f =>
add(f, mkT, isExc = true)
- add(f, mkT, isExc = false)
+ add(f, mkT, isExc = false)
)
-
m.toMap
// Prepares collections.
@@ -292,10 +285,9 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
var s = line.trim
// Word with size 1 word should contains letter only.
- if (s.length == 1 && !s.head.isLetter)
- throwError("Invalid stop word")
+ if s.length == 1 && !s.head.isLetter then throwError("Invalid stop word")
- def checkSingle(ch: Char): Unit = if (s.count(_ == ch) > 1) throwError(s"Unexpected symbols count: $ch")
+ def checkSingle(ch: Char): Unit = if s.count(_ == ch) > 1 then throwError(s"Unexpected symbols count: $ch")
// Confusing special symbols.
checkSingle('@')
@@ -303,71 +295,47 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
checkSingle('*')
val isExc = line.head == '~'
-
- if (isExc)
- s = line.drop(1)
-
+ if isExc then s = line.drop(1)
val idxPos = s.indexOf("|")
-
val poses: Map[String, Boolean] =
- if (idxPos > 0)
+ if idxPos > 0 then
s.
drop(idxPos + 1).
trim.split(" ").
map(_.trim.toUpperCase).
filter(_.nonEmpty).
toSeq.
- map(p => if (p.head == '~') p.drop(1).strip -> false else p -> true).
+ map(p => if p.head == '~' then p.drop(1).strip -> false else p -> true).
toMap
else
Map.empty
- if (!poses.keys.forall(POSES.contains))
- throwError(s"Invalid POSes: ${poses.keys.mkString(", ")}")
-
+ if !poses.keys.forall(POSES.contains) then throwError(s"Invalid POSes: ${poses.keys.mkString(", ")}")
val hasPoses = poses.nonEmpty
-
- if (hasPoses)
- s = s.take(idxPos).trim
-
+ if hasPoses then s = s.take(idxPos).trim
val isMultiWord = s.contains(' ')
// Confusing POSes.
- if (poses.nonEmpty && isMultiWord)
- throwError("POSes cannot be defined for multiple stop words.")
-
+ if poses.nonEmpty && isMultiWord then throwError("POSes cannot be defined for multiple stop words.")
var isCase = false
-
- if (s.head == '@')
+ if s.head == '@' then
s = s.drop(1)
-
// Empty word.
- if (s.isEmpty)
- throwError("Empty word")
-
+ if s.isEmpty then throwError("Empty word.")
isCase = true
-
val idxWild = s.indexOf("*")
-
- if (idxWild >= 0 && isMultiWord)
- throwError("Wildcard cannot be defined for multiple stop words.")
-
- if (idxWild < 0)
+ if idxWild >= 0 && isMultiWord then throwError("Wildcard cannot be defined for multiple stop words.")
+ if idxWild < 0 then
val (word, form) =
- if (isCase)
- (s, ORIG)
- else {
- if (!hasPoses) (stemmer.stem(s), STEM) else (stemmer.stem(s), LEM)
- }
-
+ if isCase then (s, ORIG)
+ else
+ if !hasPoses then (stemmer.stem(s), STEM) else (stemmer.stem(s), LEM)
mHash((isExc, form)).addCondition(word, poses)
else
val b = s.take(idxWild)
val e = s.drop(idxWild + 1)
- if (b.isEmpty && e.isEmpty && !hasPoses)
- throwError("Too general wildcard definition.")
-
+ if b.isEmpty && e.isEmpty && !hasPoses then throwError("Too general wildcard definition.")
mScan((isExc, if (isCase) ORIG else LEM)).addCondition((b, e), poses)
// 3. Converts data to service format.
@@ -383,12 +351,12 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
val incl = toImmutable(m((isExc, form)).includes)
val excl = toImmutable(m((isExc, form)).excludes)
- mkInstance(any ++ excl.values.flatten, incl, excl)
-
+ mkInstance(any ++ excl.values.flatten, incl, excl)
+ end mkHolder
def mkHash(form: WordForm): HashHolder = mkHolder(mHash, form, HashHolder.apply)
def mkScan(form: WordForm): ScanHolder = mkHolder(mScan, form, ScanHolder.apply)
- isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
+ isExc -> StopWordHolder(mkHash(STEM), mkHash(LEM), mkHash(ORIG), mkScan(LEM), mkScan(ORIG))
).toMap
private def isVerb(pos: String): Boolean = pos.head == 'V'
@@ -412,18 +380,12 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
): Boolean =
var stop = true
- for (
- (tok, idx) <- ns.zipWithIndex
- if idx != lastIdx &&
- !tok.isStopWord &&
- !isException(Seq(tok)) &&
- stopPoses.contains(tok.getPos) &&
- ns(idx + 1).isStopWord)
+ for ((tok, idx) <- ns.zipWithIndex if idx != lastIdx && !tok.isStopWord && !isException(Seq(tok)) &&
+ stopPoses.contains(tok.getPos) && ns(idx + 1).isStopWord)
stops += tok
-
stop = false
- if (stop) true else markBefore(ns, stopPoses, lastIdx, isException, stops)
+ if stop then true else markBefore(ns, stopPoses, lastIdx, isException, stops)
/**
* Checks value cached or not.
@@ -437,9 +399,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
case Some(b) => b
case None =>
val b = get(toks)
-
cache += toks -> b
-
b
/**
@@ -454,20 +414,12 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
val max = ns.size - 1
var stop = true
- for (
- (tok, idx) <- ns.zipWithIndex
- if idx != max &&
- !tok.isStopWord &&
- !exclStopWordsStems.contains(tok.getStem) &&
- POSES.contains(tok.getPos) &&
- ns(idx + 1).isStopWord
- )
+ for ((tok, idx) <- ns.zipWithIndex if idx != max && !tok.isStopWord && !exclStems.contains(tok.getStem) &&
+ POSES.contains(tok.getPos) && ns(idx + 1).isStopWord)
stops += tok
-
stop = false
- if (!stop)
- processCommonStops0(ns)
+ if !stop then processCommonStops0(ns)
processCommonStops0(ns)
@@ -506,8 +458,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// | Pass #1. |
// | POS tags and manual resolution. |
// +---------------------------------+
- val stop =
- !isException(Seq(tok)) &&
+ val stop = !isException(Seq(tok)) &&
(// Percents after numbers.
// 1. Word from 'percentage' list.
percents.contains(stem) &&
@@ -515,13 +466,12 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
!isFirst && prev().getPos == "CD" &&
// 3. It's last word or any words after except numbers.
(isLast || next().getPos != "CD")
- ) ||
+ ) ||
// be, was, is etc. or has been etc.
isCommonVerbs("have", "be") ||
// be, was, is etc. or have done etc.
isCommonVerbs("have", "do")
- if (stop)
- stops += tok
+ if stop then stops += tok
// +--------------------------------------+
// | Pass #2. |
@@ -542,7 +492,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// | Pass #3. |
// | Check external possessive stop-word file. |
// +--------------------------------------------+
- for (tup <- origToks; key = tup._2 if possessiveWords.contains(key) && !isException(tup._1))
+ for (tup <- origToks; key = tup._2 if posWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
// +--------------------------------------------------+
@@ -554,8 +504,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// All sentence first stop words + first non stop word.
val startToks = toks.takeWhile(_.isStopWord) ++ toks.find(!_.isStopWord).map(p => p)
- for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2
- if firstWords.contains(key) && !isException(tup._1))
+ for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
tup._1.foreach(tok => stops += tok)
foundKeys += key
@@ -565,9 +514,7 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// +-------------------------------------------------+
for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
foundKeys.find(key.startsWith) match
- case Some(s) =>
- if (nounWords.contains(key.substring(s.length).strip))
- tup._1.foreach(tok => stops += tok)
+ case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
case None => ()
// +-------------------------------------------------+
@@ -580,10 +527,10 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// | Pass #7. |
// | Processing additional and excluded stop words. |
// +-------------------------------------------------+
- for (t <- toks if addStopWordsStems.contains(t.getStem))
+ for (t <- toks if addStems.contains(t.getStem))
stops += t
- for (t <- stops.filter(t => exclStopWordsStems.contains(t.getStem)))
+ for (t <- stops.filter(t => exclStems.contains(t.getStem)))
stops -= t
// +-------------------------------------------------+
@@ -600,23 +547,18 @@ private[impl] class NCEnStopWordsFinder(addStopWordsStems: Set[String], exclStop
// +-------------------------------------------------+
var quotes = toks.filter(isQuote)
- if (quotes.size % 2 != 0)
- // Just ignore last odd quote.
- quotes = quotes.reverse.drop(1).reverse
+ // Just ignore last odd quote.
+ if quotes.size % 2 != 0 then quotes = quotes.reverse.drop(1).reverse
- if (quotes.nonEmpty)
+ if quotes.nonEmpty then
val m = toks.zipWithIndex.toMap
-
- val pairs =
- quotes.zipWithIndex.
- drop(1).
- flatMap { case (t, idx) => if (idx % 2 != 0) Some(m(t) -> m(quotes(idx - 1))) else None }
-
- stops --=
- stops.filter(t => pairs.exists { case (from, to) =>
+ val pairs = quotes.zipWithIndex.drop(1). flatMap {
+ case (t, idx) => if idx % 2 != 0 then Some(m(t) -> m(quotes(idx - 1))) else None
+ }
+ stops --= stops.filter(t => pairs.exists {
+ case (from, to) =>
val idx = m(t)
-
from > idx && to < idx
- })
+ })
stops.toSeq.sortBy(_.getStartCharIndex)
\ No newline at end of file
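(Not part of this commit.) One change buried in the diff above replaces a Scala 2 `Enumeration` with a native Scala 3 `enum` for WordForm. A minimal standalone sketch of the two forms side by side:

    // Scala 2 style (what the hunk removes):
    //   object WordForm extends Enumeration:
    //       type WordForm = Value
    //       val STEM, LEM, ORIG = Value
    //
    // Scala 3 style (what the hunk adds):
    enum WordForm:
        case STEM, LEM, ORIG

    @main def wordFormSketch(): Unit =
        // `values`, `ordinal` and exhaustive matching come for free with the enum.
        WordForm.values.foreach(f => println(s"$f -> ${f.ordinal}"))
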
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index dfc3c69..867c393 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -70,7 +70,7 @@ class NCOpenNlpImpl(
@volatile var tokenizer: TokenizerME = _
@volatile var tagger: POSTaggerME = _
@volatile var lemmatizer: DictionaryLemmatizer = _
- @volatile var sw: NCEnStopWordsFinder = _
+ @volatile var swFinder: NCEnStopWordsFinder = _
private var addStopWords: JSet[String] = _
private var exclStopWords: JSet[String] = _
@@ -80,10 +80,14 @@ class NCOpenNlpImpl(
() => tokenizer = new TokenizerME(new TokenizerModel(tokMdlIn)),
() => tagger = new POSTaggerME(new POSModel(posMdlIn)),
() => lemmatizer = new DictionaryLemmatizer(lemmaDicIn),
- () => sw = new NCEnStopWordsFinder(stem(addStopWords), stem(exclStopWords))
+ () => swFinder = new NCEnStopWordsFinder(stem(addStopWords), stem(exclStopWords))
)(ExecutionContext.Implicits.global)
- override def stop(): Unit = sw = null; lemmatizer = null; tagger = null; lemmatizer = null
+ override def stop(): Unit =
+ swFinder = null
+ lemmatizer = null
+ tagger = null
+ lemmatizer = null
/**
*
@@ -114,7 +118,7 @@ class NCOpenNlpImpl(
* @param set
*/
private def stem(set: JSet[String]): Set[String] =
- if (set == null) Set.empty else set.asScala.toSet.map(stemmer.stem)
+ if set == null then Set.empty else set.asScala.toSet.map(stemmer.stem)
/**
*
@@ -122,10 +126,6 @@ class NCOpenNlpImpl(
* @return
*/
override def parse(req: NCRequest): JList[NCToken] =
- // TODO: check started?
- if (tokenizer == null)
- throw new IllegalStateException(s"${this.getClass.getName} is not started.")
-
// OpenNLP classes are not thread-safe.
this.synchronized {
val sen = req.getNormalizedText
@@ -161,24 +161,23 @@ class NCOpenNlpImpl(
case (lemma, idx) => fixes.getOrElse(idx, lemma)
}
- val res: Seq[NCToken] =
- holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
- new NCParameterizedAdapter with NCToken:
- override def getOriginalText: String = h.origin
- override def getNormalizedText: String = h.normalized
- override def getLemma: String = lemma
- override def getStem: String = stemmer.stem(h.normalized)
- override def getPos: String = pos
- override def isStopWord: Boolean = false
- override def getStartCharIndex: Int = h.start
- override def getEndCharIndex: Int = h.end
- override def getLength: Int = h.length
- }
+ val res: Seq[NCToken] = holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
+ new NCParameterizedAdapter with NCToken:
+ override def getOriginalText: String = h.origin
+ override def getNormalizedText: String = h.normalized
+ override def getLemma: String = lemma
+ override def getStem: String = stemmer.stem(h.normalized)
+ override def getPos: String = pos
+ override def isStopWord: Boolean = false
+ override def getStartCharIndex: Int = h.start
+ override def getEndCharIndex: Int = h.end
+ override def getLength: Int = h.length
+ }
- val stops = sw.find(res)
+ val stops = swFinder.find(res)
res.map(tok =>
- if (stops.contains(tok))
+ if stops.contains(tok) then
new NCParameterizedAdapter with NCToken:
override def getOriginalText: String = tok.getOriginalText
override def getNormalizedText: String = tok.getNormalizedText
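(Not part of this commit.) The start() hunk above wires the renamed stop-word finder into the same parallel initialization as the OpenNLP models via NCUtils.executeParallel. Below is a library-free sketch of that initialization style using plain Futures; it illustrates the pattern only and is not the NCUtils implementation.

    import scala.concurrent.{Await, ExecutionContext, Future}
    import scala.concurrent.duration.Duration

    // Run independent, side-effecting initialization tasks in parallel and
    // block until all of them finish (illustration of the executeParallel style).
    def runParallel(tasks: (() => Unit)*)(using ExecutionContext): Unit =
        Await.result(Future.sequence(tasks.map(t => Future(t()))), Duration.Inf)

    @main def initSketch(): Unit =
        given ExecutionContext = ExecutionContext.global
        runParallel(
            () => println("load tokenizer model"),
            () => println("load POS tagger model"),
            () => println("load lemmatizer dictionary"),
            () => println("build stop-words finder")
        )
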
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index aa05e79..9b4c7d9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -313,7 +313,7 @@ object NCUtils extends LazyLogging:
* @tparam T
* @return
*/
- def notNull[T <: AnyRef](v: T, dflt: T): T = if (v == null) dflt else v
+ def notNull[T <: AnyRef](v: T, dflt: T): T = if v == null then dflt else v
/**
* Strips ANSI escape sequences from the given string.
@@ -486,7 +486,7 @@ object NCUtils extends LazyLogging:
s"$exClsName$errMsg $ansiCyanFg->$ansiReset ($fileName:$lineNum)"
msg.split("\n").foreach(line => {
- val s = s"${" " * indent}${if (first) ansiBlue("+-+ ") else " "}${bo(y(line))}"
+ val s = s"${" " * indent}${if first then ansiBlue("+-+ ") else " "}${bo(y(line))}"
logger.log(s)
first = false
})
@@ -742,11 +742,7 @@ object NCUtils extends LazyLogging:
import java.util.*
// Could be long for large sequences...
- val seq =
- if (sort)
- lines.map(_.toString).toSeq.sorted
- else
- lines
+ val seq = if sort then lines.map(_.toString).toSeq.sorted else lines
ps.println(s"#")
ps.println(s"# Licensed to the Apache Software Foundation (ASF) under one or more")
@@ -847,14 +843,12 @@ object NCUtils extends LazyLogging:
try
Using.resource(new GZIPOutputStream(new FileOutputStream(gz))) { stream =>
stream.write(readFileBytes(f))
-
stream.flush()
}
catch
case e: IOException => throw new NCException(s"Error gzip file: $f", e)
- if (!f.delete())
- throw new NCException(s"Error while deleting file: $f")
+ if !f.delete() then throw new NCException(s"Error while deleting file: $f")
logger.trace(s"File gzipped [source=$f, destination=$gz]")
@@ -983,18 +977,13 @@ object NCUtils extends LazyLogging:
*/
def permutations(toks: Seq[NCToken]): Seq[Seq[NCToken]] =
def multiple(seq: Seq[Seq[Option[NCToken]]], t: NCToken): Seq[Seq[Option[NCToken]]] =
- if (seq.isEmpty)
- if (t.isStopWord) IndexedSeq(IndexedSeq(Some(t)), IndexedSeq(None)) else IndexedSeq(IndexedSeq(Some(t)))
- else {
- (for (subSeq <- seq) yield subSeq :+ Some(t)) ++
- (if (t.isStopWord) for (subSeq <- seq) yield subSeq :+ None else Seq.empty)
- }
+ if seq.isEmpty then
+ if t.isStopWord then IndexedSeq(IndexedSeq(Some(t)), IndexedSeq(None)) else IndexedSeq(IndexedSeq(Some(t)))
+ else
+ (for (subSeq <- seq) yield subSeq :+ Some(t)) ++ (if t.isStopWord then for (subSeq <- seq) yield subSeq :+ None else Seq.empty)
var res: Seq[Seq[Option[NCToken]]] = Seq.empty
-
- for (t <- toks)
- res = multiple(res, t)
-
+ for (t <- toks) res = multiple(res, t)
res.map(_.flatten).filter(_.nonEmpty)
tokenMix(tokens, stopWords = true, maxLen).