You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/01/19 08:54:53 UTC
[incubator-nlpcraft] branch NLPCRAFT-221 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-221
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-221 by this push:
new c0c8d2c WIP.
c0c8d2c is described below
commit c0c8d2cf51990a043c101c4e7eacba0345fc8b1b
Author: skhdl <sk...@gmail.com>
AuthorDate: Tue Jan 19 11:54:01 2021 +0300
WIP.
---
.../src/main/resources => external}/badfilter/swear_words.txt | 0
external/md5.txt | 1 +
.../nlpcraft/common/extcfg/NCExternalConfigManager.scala | 5 ++++-
.../apache/nlpcraft/common/extcfg/NCExternalConfigType.scala | 2 +-
.../mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala | 10 ++++++----
5 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/nlpcraft/src/main/resources/badfilter/swear_words.txt b/external/badfilter/swear_words.txt
similarity index 100%
rename from nlpcraft/src/main/resources/badfilter/swear_words.txt
rename to external/badfilter/swear_words.txt
diff --git a/external/md5.txt b/external/md5.txt
index 96c3ef4..bde5b6a 100644
--- a/external/md5.txt
+++ b/external/md5.txt
@@ -16,6 +16,7 @@
#
geo/cc_by40_geo_config.zip 52f209bd2b2105f7163280bbc428aafe
+badfilter/swear_words.txt be8a5713d0dbdb595ea00bfb9442df1b
spell/cc_by40_spell_config.zip a53e4cfc9517edd5b94024e3aaf03d83
opennlp/en-pos-maxent.bin db2cd70395b9e2e4c6b9957015a10607
opennlp/en-ner-location.bin 0e67e6ad484406c125faead56b5c225e
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
index cbe292a..773bfe7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigManager.scala
@@ -44,6 +44,9 @@ object NCExternalConfigManager extends NCService {
GEO → Set(
"cc_by40_geo_config.zip"
),
+ BADFILTER → Set(
+ "swear_words.txt"
+ ),
SPELL → Set(
"cc_by40_spell_config.zip"
),
@@ -98,7 +101,7 @@ object NCExternalConfigManager extends NCService {
try
managed(Source.fromURL(url)) acquireAndGet { src ⇒
- src.getLines().map(_.trim()).filter(s ⇒ !s.isEmpty && !s.startsWith("#")).map(p ⇒ {
+ src.getLines().map(_.trim()).filter(s ⇒ s.nonEmpty && !s.startsWith("#")).map(p ⇒ {
def splitPair(s: String, sep: String): (String, String) = {
val seq = s.split(sep).map(_.trim)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
index b62a8bc..71ec914 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/extcfg/NCExternalConfigType.scala
@@ -23,5 +23,5 @@ package org.apache.nlpcraft.common.extcfg
object NCExternalConfigType extends Enumeration {
type NCResourceType = Value
- val SPELL, GEO, OPENNLP = Value
+ val SPELL, GEO, OPENNLP, BADFILTER = Value
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
index 325c90c..2f8a343 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/dictionary/NCDictionaryEnricher.scala
@@ -17,9 +17,9 @@
package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary
-import java.io.Serializable
-
import io.opencensus.trace.Span
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigManager
+import org.apache.nlpcraft.common.extcfg.NCExternalConfigType.BADFILTER
import org.apache.nlpcraft.common.nlp._
import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.dict._
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.probe.mgrs.NCProbeModel
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import java.io.Serializable
import scala.collection.Map
/**
@@ -35,6 +36,8 @@ import scala.collection.Map
* This enricher must be used after all enrichers which can manipulate 'quote' and 'stopword' notes of token.
*/
object NCDictionaryEnricher extends NCProbeEnricher {
+ private final val RESOURCE = "swear_words.txt"
+
@volatile private var swearWords: Set[String] = _
/**
@@ -44,8 +47,7 @@ object NCDictionaryEnricher extends NCProbeEnricher {
*/
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
ackStarting()
-
- swearWords = U.readTextResource(s"badfilter/swear_words.txt", "UTF-8", logger).
+ swearWords = U.readTextStream(NCExternalConfigManager.getStream(BADFILTER, RESOURCE), "UTF-8", logger).
map(NCNlpCoreManager.stem).
toSet