You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/01/20 12:09:59 UTC

[incubator-nlpcraft] branch master updated: `swear_words` configuration moved to extra resources.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/master by this push:
     new c68171e  `swear_words` configuration moved to extra resources.
c68171e is described below

commit c68171e40b362e089b4725543766b5d4bb8be51d
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Wed Jan 20 15:09:29 2021 +0300

    `swear_words` configuration moved to extra resources.
---
 .../src/main/resources => external}/badfilter/swear_words.txt  |  0
 external/md5.txt                                               |  1 +
 .../nlpcraft/common/extcfg/NCExternalConfigManager.scala       |  5 ++++-
 .../apache/nlpcraft/common/extcfg/NCExternalConfigType.scala   |  2 +-
 .../mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala   | 10 ++++++----
 5 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/nlpcraft/src/main/resources/badfilter/swear_words.txt b/external/badfilter/swear_words.txt
similarity index 100%
rename from nlpcraft/src/main/resources/badfilter/swear_words.txt
rename to external/badfilter/swear_words.txt
diff --git a/external/md5.txt b/external/md5.txt
index 96c3ef4..d660ea3 100644
--- a/external/md5.txt
+++ b/external/md5.txt
@@ -16,6 +16,7 @@
 #
 
 geo/cc_by40_geo_config.zip 52f209bd2b2105f7163280bbc428aafe
+badfilter/swear_words.txt f423c9e889afe4386e0aa4c86c460c5b
 spell/cc_by40_spell_config.zip a53e4cfc9517edd5b94024e3aaf03d83
 opennlp/en-pos-maxent.bin db2cd70395b9e2e4c6b9957015a10607
 opennlp/en-ner-location.bin 0e67e6ad484406c125faead56b5c225e
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
index cbe292a..773bfe7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
@@ -44,6 +44,9 @@ object NCExternalConfigManager extends NCService {
             GEO → Set(
                 "cc_by40_geo_config.zip"
             ),
+            BADFILTER → Set(
+                "swear_words.txt"
+            ),
             SPELL → Set(
                 "cc_by40_spell_config.zip"
             ),
@@ -98,7 +101,7 @@ object NCExternalConfigManager extends NCService {
 
             try
                 managed(Source.fromURL(url)) acquireAndGet { src ⇒
-                    src.getLines().map(_.trim()).filter(s ⇒ !s.isEmpty && !s.startsWith("#")).map(p ⇒ {
+                    src.getLines().map(_.trim()).filter(s ⇒ s.nonEmpty && !s.startsWith("#")).map(p ⇒ {
                         def splitPair(s: String, sep: String): (String, String) = {
                             val seq = s.split(sep).map(_.trim)
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
index b62a8bc..71ec914 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
@@ -23,5 +23,5 @@ package org.apache.nlpcraft.common.extcfg
 object NCExternalConfigType extends Enumeration {
     type NCResourceType = Value
 
-    val SPELL, GEO, OPENNLP = Value
+    val SPELL, GEO, OPENNLP, BADFILTER = Value
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
index 325c90c..2f8a343 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
@@ -17,9 +17,9 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary
 
-import java.io.Serializable
-
 import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigManager
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigType.BADFILTER
 import org.apache.nlpcraft.common.nlp._
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.dict._
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.common.{NCService, _}
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
+import java.io.Serializable
 import scala.collection.Map
 
 /**
@@ -35,6 +36,8 @@ import scala.collection.Map
   * This enricher must be used after all enrichers which can manipulate 'quote' and 'stopword' notes of token.
   */
 object NCDictionaryEnricher extends NCProbeEnricher {
+    private final val RESOURCE = "swear_words.txt"
+
     @volatile private var swearWords: Set[String] = _
 
     /**
@@ -44,8 +47,7 @@ object NCDictionaryEnricher extends NCProbeEnricher {
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
         ackStarting()
-
-        swearWords = U.readTextResource(s"badfilter/swear_words.txt", "UTF-8", logger).
+        swearWords = U.readTextStream(NCExternalConfigManager.getStream(BADFILTER, RESOURCE), "UTF-8", logger).
             map(NCNlpCoreManager.stem).
             toSet