You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/12/24 00:31:09 UTC

[incubator-nlpcraft] branch NLPCRAFT-469 updated: WIP

This is an automated email from the ASF dual-hosted git repository.

aradzinski pushed a commit to branch NLPCRAFT-469
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-469 by this push:
     new d096149  WIP
d096149 is described below

commit d096149270455b9d51f3c4e487fba4c83fa3d0b6
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Thu Dec 23 16:31:04 2021 -0800

    WIP
---
 .../resources/stopwords/possessive_words.txt.gz    | Bin 990 -> 0 bytes
 .../opennlp/impl/NCEnStopWordGenerator.scala       |  35 ---------------------
 .../parser/opennlp/impl/NCEnStopWordsFinder.scala  |  21 ++++---------
 3 files changed, 6 insertions(+), 50 deletions(-)

diff --git a/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz b/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz
deleted file mode 100644
index 20ed420..0000000
Binary files a/nlpcraft/src/main/resources/stopwords/possessive_words.txt.gz and /dev/null differ
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
index 0751715..959d149 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordGenerator.scala
@@ -31,40 +31,6 @@ object NCEnStopWordGenerator:
     // Output files.
     private val FIRST_WORDS_FILE = "first_words.txt"
     private val NOUN_WORDS_FILE = "noun_words.txt"
-    private val POS_WORDS_FILE = "possessive_words.txt"
-
-    private final val POS1 = Seq(
-        "for",
-        "in",
-        "on",
-        "within"
-    )
-
-    private final val POS2 = Seq(
-        "our",
-        "my"
-    )
-
-    private final val POS3 = Seq(
-        "website",
-        "web-site",
-        "web site",
-        "company website",
-        "personal website",
-        "site",
-        "team",
-        "organization",
-        "group",
-        "company",
-        "page",
-        "property",
-        "online property",
-        "company online property"
-    )
-
-    private[impl] def mkPossessiveStopWords: Seq[String] =
-        (for (w1 <- POS1; w2 <- POS2; w3 <- POS3) yield s"$w1 $w2 $w3") ++
-            (for (w2 <- POS2; w3 <- POS3) yield s"$w2 $w3")
 
     private final val QWORDS = Seq(
         "what",
@@ -350,6 +316,5 @@ object NCEnStopWordGenerator:
     def main(args: Array[String]): Unit =
         mkFirstWords()
         mkNounWords()
-        mkGzip(POS_WORDS_FILE, stem(mkPossessiveStopWords))
 
         sys.exit()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
index 167dd89..0a49adf 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCEnStopWordsFinder.scala
@@ -204,13 +204,11 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
         "percent"
     ).map(stemmer.stem)
 
-    @volatile private var posWords: Set[String] = _ // Possessive words.
     @volatile private var firstWords: Set[String] = _
     @volatile private var nounWords: Set[String] = _
 
     // Stemmatization is done already by generator.
     NCUtils.executeParallel(
-        () => posWords = read("stopwords/possessive_words.txt.gz"),
         () => firstWords = read("stopwords/first_words.txt.gz"),
         () => nounWords = read("stopwords/noun_words.txt.gz")
     )(ExecutionContext.Implicits.global)
@@ -488,15 +486,8 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
         val origToks: Seq[(Seq[NCToken], String)] =
             (for (toks <- mix) yield toks.toSeq).map(s => s -> toStemKey(s)).toSeq
 
-        // +--------------------------------------------+
-        // | Pass #3.                                   |
-        // | Check external possessive stop-word file.  |
-        // +--------------------------------------------+
-        for (tup <- origToks; key = tup._2 if posWords.contains(key) && !isException(tup._1))
-            tup._1.foreach(tok => stops += tok)
-
         // +--------------------------------------------------+
-        // | Pass #4.                                         |
+        // | Pass #3.                                         |
         // | Check for sentence beginners from external file. |
         // +--------------------------------------------------+
 
@@ -509,7 +500,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
             foundKeys += key
 
         // +-------------------------------------------------+
-        // | Pass #5.                                        |
+        // | Pass #4.                                        |
         // | Check for sentence beginners with ending nouns. |
         // +-------------------------------------------------+
         for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
@@ -518,13 +509,13 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
                 case None => ()
 
         // +-------------------------------------------------+
-        // | Pass #6.                                        |
+        // | Pass #5.                                        |
         // | Mark words with POSes before stop-words.        |
         // +-------------------------------------------------+
         markBefore(toks, STOP_BEFORE_STOP, toks.size - 1, isException, stops)
 
         // +-------------------------------------------------+
-        // | Pass #7.                                        |
+        // | Pass #6.                                        |
         // | Processing additional and excluded stop words.  |
         // +-------------------------------------------------+
         for (t <- toks if addStems.contains(t.getStem))
@@ -534,7 +525,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
             stops -= t
 
         // +-------------------------------------------------+
-        // | Pass #8.                                        |
+        // | Pass #7.                                        |
         // | Marks as stopwords, words with POS from         |
         // | configured list, which also placed before       |
         // | another stop words.                             |
@@ -542,7 +533,7 @@ private[impl] class NCEnStopWordsFinder(addStems: Set[String], exclStems: Set[St
         processCommonStops(toks, stops)
 
         // +-------------------------------------------------+
-        // | Pass #9.                                        |
+        // | Pass #8.                                        |
         // | Deletes stop words if they are marked as quoted.|
         // +-------------------------------------------------+
         var quotes = toks.filter(isQuote)