You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/12/24 00:31:09 UTC
[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
new d096149 WIP
d096149 is described below
commit d096149270455b9d51f3c4e487fba4c83fa3d0b6
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Thu Dec 23 16:31:04 2021 -0800
WIP
---
.../resources/stopwords/possessive_words.txt.gz | Bin 990 -> 0 bytes
.../opennlp/impl/NCEnStopWordGenerator.scala | 35 ---------------------
.../parser/opennlp/impl/NCEnStopWordsFinder.scala | 21 ++++---------
3 files changed, 6 insertions(+), 50 deletions(-)
diff --git a/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz b/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz
deleted file mode 100644
index 20ed420..0000000
Binary files a/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
index 0751715..959d149 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
@@ -31,40 +31,6 @@ object NCEnStopWordGenerator:
// Output files.
private val FIRST_WORDS_FILE = "first_words.txt"
private val NOUN_WORDS_FILE = "noun_words.txt"
- private val POS_WORDS_FILE = "possessive_words.txt"
-
- private final val POS1 = Seq(
- "for",
- "in",
- "on",
- "within"
- )
-
- private final val POS2 = Seq(
- "our",
- "my"
- )
-
- private final val POS3 = Seq(
- "website",
- "web-site",
- "web site",
- "company website",
- "personal website",
- "site",
- "team",
- "organization",
- "group",
- "company",
- "page",
- "property",
- "online property",
- "company online property"
- )
-
- private[impl] def mkPossessiveStopWords: Seq[String] =
- (for (w1 <- POS1; w2 <- POS2; w3 <- POS3) yield s"$w1 $w2 $w3") ++
- (for (w2 <- POS2; w3 <- POS3) yield s"$w2 $w3")
private final val QWORDS = Seq(
"what",
@@ -350,6 +316,5 @@ object NCEnStopWordGenerator:
def main(args: Array[String]): Unit =
mkFirstWords()
mkNounWords()
- mkGzip(POS_WORDS_FILE, stem(mkPossessiveStopWords))
sys.exit()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 167dd89..0a49adf 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -204,13 +204,11 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
"percent"
).map(stemmer.stem)
- @volatile private var posWords: Set[String] = _ // Possessive words.
@volatile private var firstWords: Set[String] = _
@volatile private var nounWords: Set[String] = _
// Stemming is already done by the generator.
NCUtils.executeParallel(
- () => posWords = read("stopwords/possessive_words.txt.gz"),
() => firstWords = read("stopwords/first_words.txt.gz"),
() => nounWords = read("stopwords/noun_words.txt.gz")
)(ExecutionContext.Implicits.global)
@@ -488,15 +486,8 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
val origToks: Seq[(Seq[NCToken], String)] =
(for (toks <- mix) yield toks.toSeq).map(s => s -> toStemKey(s)).toSeq
- // +--------------------------------------------+
- // | Pass #3. |
- // | Check external possessive stop-word file. |
- // +--------------------------------------------+
- for (tup <- origToks; key = tup._2 if posWords.contains(key) && !isException(tup._1))
- tup._1.foreach(tok => stops += tok)
-
// +--------------------------------------------------+
- // | Pass #4. |
+ // | Pass #3. |
// | Check for sentence beginners from external file. |
// +--------------------------------------------------+
@@ -509,7 +500,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
foundKeys += key
// +-------------------------------------------------+
- // | Pass #5. |
+ // | Pass #4. |
// | Check for sentence beginners with ending nouns. |
// +-------------------------------------------------+
for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
@@ -518,13 +509,13 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
case None => ()
// +-------------------------------------------------+
- // | Pass #6. |
+ // | Pass #5. |
// | Mark words with POSes before stop-words. |
// +-------------------------------------------------+
markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
// +-------------------------------------------------+
- // | Pass #7. |
+ // | Pass #6. |
// | Processing additional and excluded stop words. |
// +-------------------------------------------------+
for (t <- toks if addStems.contains(t.getStem))
@@ -534,7 +525,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
stops -= t
// +-------------------------------------------------+
- // | Pass #8. |
+ // | Pass #7. |
// | Marks as stop words those words whose POS is |
// | in the configured list and which are also |
// | placed before other stop words. |
@@ -542,7 +533,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
processCommonStops(toks, stops)
// +-------------------------------------------------+
- // | Pass #9. |
+ // | Pass #8. |
// | Deletes stop words if they are marked as quoted.|
// +-------------------------------------------------+
var quotes = toks.filter(isQuote)