You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2022/12/21 17:42:40 UTC
[incubator-nlpcraft] branch NLPCRAFT-520 updated: Scaladoc fixes.
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push:
new f5ee0f27 Scaladoc fixes.
f5ee0f27 is described below
commit f5ee0f27fe264c2151a879afe1f4dfc9f9b95892
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Wed Dec 21 09:42:34 2022 -0800
Scaladoc fixes.
---
.../org/apache/nlpcraft/internal/util/NCUtils.scala | 2 +-
.../nlp/enrichers/NCBracketsTokenEnricher.scala | 6 ++----
.../nlp/enrichers/NCDictionaryTokenEnricher.scala | 9 +++++----
.../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 5 +----
.../nlp/enrichers/NCOpenNLPTokenEnricher.scala | 8 +++-----
.../nlp/enrichers/NCQuotesTokenEnricher.scala | 5 ++---
.../nlp/enrichers/NCSwearWordsTokenEnricher.scala | 19 +++++++------------
7 files changed, 21 insertions(+), 33 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 1b81acd0..f9ad8792 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -392,7 +392,7 @@ object NCUtils extends LazyLogging:
data
/**
- * Reads lines from given resource.
+ * Reads lines from given resource.
*
* @param res Resource, file absolute or relative path or input stream.
* @param enc Encoding. Default value is "UTF-8".
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
index 70ce779a..9b0fc863 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala
@@ -26,7 +26,7 @@ import scala.collection.{Map, mutable}
/**
* Companion helper.
*/
-object NCBracketsTokenEnricher:
+private object NCBracketsTokenEnricher:
private val BRACKETS = Map("(" -> ")", "{" -> "}", "[" -> "]", "<" -> ">")
private val BRACKETS_REVERSED = BRACKETS.map { case (key, value) => value -> key }
@@ -36,9 +36,7 @@ import NCBracketsTokenEnricher.*
* Brackets [[NCTokenEnricher token enricher]].
*
* This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
- * instance if the word it represents is enclosed in brackets.
- *
- * Supported brackets are: `()`, `{}`, `[]` and `<>`.
+ * instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`, `[]` and `<>`.
*
* **NOTE:** invalid enclosed brackets are ignored, and the `brackets` property is set to `false` for all input tokens.
*/
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
index 0d28f3ad..490a59bb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCDictionaryTokenEnricher.scala
@@ -22,7 +22,7 @@ import org.apache.nlpcraft.*
import org.apache.nlpcraft.internal.util.NCUtils as U
/**
- * "Known-word" [[NCTokenEnricher token enricher]].
+ * Dictionary-based "known-word" [[NCTokenEnricher token enricher]].
*
* This enricher adds `dict` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance if the word it represents is a known dictionary word, i.e. the configured dictionary contains this word's
@@ -34,12 +34,13 @@ import org.apache.nlpcraft.internal.util.NCUtils as U
* metadata property before this enricher in your [[NCPipeline pipeline]].
*
* @param dictRes Relative path, absolute path, classpath resource or URL to the dictionary.
- * The dictionary should have a simple plain text format with *one lemma per line*, empty lines are skipped, duplicates ignored, header or other comments allowed.
- * Headers are lines started with **#** symbol. Search in the dictionary is implemented by input words **lemms**, case is ignored.
+ * The dictionary should have a simple plain text format with *one lemma per line*, empty lines are skipped,
+ * duplicates ignored, lines starting with **#** symbol will be treated as comments and ignored.
+ * Note that the search in the dictionary is implemented using words' **lemma** and case is ignored.
*/
//noinspection DuplicatedCode,ScalaWeakerAccess
class NCDictionaryTokenEnricher(dictRes: String) extends NCTokenEnricher with LazyLogging:
- require(dictRes != null, "Dictonary resource cannot be null.")
+ require(dictRes != null, "Dictionary resource cannot be null.")
private var dict: Set[String] = _
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 3eaa7f6e..698c43f0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -172,15 +172,12 @@ import org.apache.nlpcraft.nlp.enrichers.NCEnStopWordsTokenEnricher.*
* instance if the word it represents is an English stopword. The value `true` of this metadata property indicates that
* this word is detected as a stopword, `false` value indicates otherwise. This implementation works off the
* algorithm that uses an internal list of English stopwords as well as a procedural logic to determine the stopword
- * status of the token. This algorithm should work fine for most of the general uses cases. User, however, can add
+ * status of the token. This algorithm should work fine for most general use cases. Users can also add
* additional stopwords or exceptions for the existing ones using corresponding parameters in [[NCEnStopWordsTokenEnricher]]
* constructor.
*
* More information about stopwords can be found at [[https://en.wikipedia.org/wiki/Stop_word]].
*
- * `stemmer` implementation language should be corresponded to other components of [[NCPipeline]], but
- * required `stemmer` implementation is independent from other components' stemmers.
- *
* **NOTE:** this implementation requires `lemma` and `pos` string [[NCPropertyMap metadata]] properties that
* contain token's lemma and part of speech accordingly. You can configure [[NCOpenNLPTokenEnricher]] with the model
* for English language that would provide these metadata properties before this enricher in your [[NCPipeline pipeline]].
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
index a51284da..f5aa2f29 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCOpenNLPTokenEnricher.scala
@@ -27,15 +27,13 @@ import java.io.*
import scala.concurrent.ExecutionContext
/**
- * [[https://opennlp.apache.org/ OpenNLP]] based language independent [[NCTokenEnricher token enricher]]. This
+ * [[https://opennlp.apache.org/ OpenNLP]]-based language-independent [[NCTokenEnricher token enricher]]. This
* enricher adds `lemma` and `pos` (part-of-speech) string [[NCPropertyMap metadata]] property to the [[NCToken token]]
* instance. Learn more about lemmas [[https://en.wikipedia.org/wiki/Lemma_(morphology) here]] and about part-of-speech
* [[https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html here]].
*
- * At least one of model must be defined.
- *
- * This OpenNLP enricher requires PoS and lemma models. Some of OpenNLP community models can be found
- * [[https://opennlp.sourceforge.net/models-1.5/ here]].
+ * This OpenNLP enricher requires PoS and lemma models. Some free, community-maintained OpenNLP models can be found
+ * [[https://opennlp.sourceforge.net/models-1.5/ here]]. Note that at least one of the models must be defined.
*
* @param posMdlRes Relative path, absolute path, classpath resource or URL to
* [[https://opennlp.apache.org/docs/2.0.0/apidocs/opennlp-tools/opennlp/tools/postag/POSTaggerME.html POSTaggerME]] model.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
index f2abb1c8..e55be2fb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala
@@ -24,12 +24,11 @@ import scala.collection.*
/**
* Companion helper.
*/
-object NCQuotesTokenEnricher:
+private object NCQuotesTokenEnricher:
private case class Range(from: Int, to: Int):
def in(idx: Int): Boolean = idx >= from && idx <= to
private val QUOTES = Map("«" -> "»", "\"" -> "\"", "`" -> "`", "'" -> "'")
-
private val QUOTES_REVERSED = QUOTES.map { case (key, value) => value -> key }
private val QUOTES_SYMBOLS = QUOTES.flatMap { case (key, value) => Set(key, value) }.toSet
@@ -46,7 +45,7 @@ import NCQuotesTokenEnricher.*
*
* Supported quotes are: **«**, **»**, **"**, **'**, **`**.
*
- * **NOTE:** invalid enclosed quotes are ignored and for all input tokens property `quoted` assigned as `false`.
+ * **NOTE:** invalid enclosed quotes are ignored.
*/
//noinspection ScalaWeakerAccess
class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging:
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
index ff0ebc98..86afb5c7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCSwearWordsTokenEnricher.scala
@@ -26,23 +26,18 @@ import java.io.*
import java.util.Objects
/**
- * "Swear-word" [[NCTokenEnricher token enricher]].
+ * Swear-word [[NCTokenEnricher token enricher]].
*
* This enricher adds `swear` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]]
- * instance if word it represents is a swear word dictionary, i.e. the swear dictionary contains this word's
+ * instance if the word it represents is in a swear word dictionary, i.e. the swear dictionary contains this word's
* stem. The value `true` of the metadata property indicates that this word's stem is found in the dictionary,
* `false` value indicates otherwise.
*
- * Read more about stemming [[https://en.wikipedia.org/wiki/Stemming here]].
- * Dictionary language and `stemmer` implementation language should be corresponded to other components of [[NCPipeline]], but
- * required `stemmer` implementation is independent from other components' stemmers.
- *
- * Stemming is used here because it is too difficult to be based on more accurate `lemma` approach for swear words.
- *
- * @param dictRes Relative path, absolute path, classpath resource or URL to the swear dictionary. The dictionary should have a simple
- * plain text format with *one word per line*, empty lines are skipped, duplicates ignored, header or other comments allowed.
- * Headers are lines started with **#** symbol. Search in the dictionary is implemented by input words **stems**, case is ignored.
- * @param stemmer Stemmer implementation for the dictionary language.
+ * @param dictRes Relative path, absolute path, classpath resource or URL to the dictionary.
+ * The dictionary should have a simple plain text format with *one lemma per line*, empty lines are skipped,
+ * duplicates ignored, lines starting with **#** symbol will be treated as comments and ignored.
+ * Note that the search in the dictionary is implemented using words' **stem** and case is ignored.
+ * @param stemmer Stemmer implementation for the language used in the supplied swear-word dictionary.
*/
//noinspection ScalaWeakerAccess
class NCSwearWordsTokenEnricher(dictRes: String, stemmer: NCStemmer) extends NCTokenEnricher with LazyLogging: