You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/01/19 08:54:53 UTC

[incubator-nlpcraft] branch NLPCRAFT-221 updated: WIP.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-221
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-221 by this push:
     new c0c8d2c  WIP.
c0c8d2c is described below

commit c0c8d2cf51990a043c101c4e7eacba0345fc8b1b
Author: skhdl <sk...@gmail.com>
AuthorDate: Tue Jan 19 11:54:01 2021 +0300

    WIP.
---
 .../src/main/resources => external}/badfilter/swear_words.txt  |  0
 external/md5.txt                                               |  1 +
 .../nlpcraft/common/extcfg/NCExternalConfigManager.scala       |  5 ++++-
 .../apache/nlpcraft/common/extcfg/NCExternalConfigType.scala   |  2 +-
 .../mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala   | 10 ++++++----
 5 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/nlpcraft/src/main/resources/badfilter/swear_words.txt b/external/badfilter/swear_words.txt
similarity index 100%
rename from nlpcraft/src/main/resources/badfilter/swear_words.txt
rename to external/badfilter/swear_words.txt
diff --git a/external/md5.txt b/external/md5.txt
index 96c3ef4..bde5b6a 100644
--- a/external/md5.txt
+++ b/external/md5.txt
@@ -16,6 +16,7 @@
 #
 
 geo/cc_by40_geo_config.zip 52f209bd2b2105f7163280bbc428aafe
+badfilter/swear_words.txt be8a5713d0dbdb595ea00bfb9442df1b
 spell/cc_by40_spell_config.zip a53e4cfc9517edd5b94024e3aaf03d83
 opennlp/en-pos-maxent.bin db2cd70395b9e2e4c6b9957015a10607
 opennlp/en-ner-location.bin 0e67e6ad484406c125faead56b5c225e
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
index cbe292a..773bfe7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
@@ -44,6 +44,9 @@ object NCExternalConfigManager extends NCService {
             GEO → Set(
                 "cc_by40_geo_config.zip"
             ),
+            BADFILTER → Set(
+                "swear_words.txt"
+            ),
             SPELL → Set(
                 "cc_by40_spell_config.zip"
             ),
@@ -98,7 +101,7 @@ object NCExternalConfigManager extends NCService {
 
             try
                 managed(Source.fromURL(url)) acquireAndGet { src ⇒
-                    src.getLines().map(_.trim()).filter(s ⇒ !s.isEmpty && !s.startsWith("#")).map(p ⇒ {
+                    src.getLines().map(_.trim()).filter(s ⇒ s.nonEmpty && !s.startsWith("#")).map(p ⇒ {
                         def splitPair(s: String, sep: String): (String, String) = {
                             val seq = s.split(sep).map(_.trim)
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
index b62a8bc..71ec914 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
@@ -23,5 +23,5 @@ package org.apache.nlpcraft.common.extcfg
 object NCExternalConfigType extends Enumeration {
     type NCResourceType = Value
 
-    val SPELL, GEO, OPENNLP = Value
+    val SPELL, GEO, OPENNLP, BADFILTER = Value
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
index 325c90c..2f8a343 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
@@ -17,9 +17,9 @@
 
 package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary
 
-import java.io.Serializable
-
 import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigManager
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigType.BADFILTER
 import org.apache.nlpcraft.common.nlp._
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.dict._
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.common.{NCService, _}
 import org.apache.nlpcraft.probe.mgrs.NCProbeModel
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 
+import java.io.Serializable
 import scala.collection.Map
 
 /**
@@ -35,6 +36,8 @@ import scala.collection.Map
   * This enricher must be used after all enrichers which can manipulate 'quote' and 'stopword' notes of token.
   */
 object NCDictionaryEnricher extends NCProbeEnricher {
+    private final val RESOURCE = "swear_words.txt"
+
     @volatile private var swearWords: Set[String] = _
 
     /**
@@ -44,8 +47,7 @@ object NCDictionaryEnricher extends NCProbeEnricher {
      */
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
         ackStarting()
-
-        swearWords = U.readTextResource(s"badfilter/swear_words.txt", "UTF-8", logger).
+        swearWords = U.readTextStream(NCExternalConfigManager.getStream(BADFILTER, RESOURCE), "UTF-8", logger).
             map(NCNlpCoreManager.stem).
             toSet