You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/05/29 12:53:39 UTC
[incubator-nlpcraft] branch NLPCRAFT-41 updated: Suggestions
service runner added.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-41 by this push:
new ef8a2fa Suggestions service runner added.
ef8a2fa is described below
commit ef8a2fa8580c488927e7edaa6d8f6dbbfb5a79ad
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Fri May 29 15:53:19 2020 +0300
Suggestions service runner added.
---
.../NCSuggestionsGenerator.scala} | 236 +++++++++++++++++----
1 file changed, 196 insertions(+), 40 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala b/src/main/scala/org/apache/nlpcraft/model/tools/suggestions/NCSuggestionsGenerator.scala
similarity index 63%
rename from src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
rename to src/main/scala/org/apache/nlpcraft/model/tools/suggestions/NCSuggestionsGenerator.scala
index 8e3072b..27d28da 100644
--- a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
+++ b/src/main/scala/org/apache/nlpcraft/model/tools/suggestions/NCSuggestionsGenerator.scala
@@ -14,7 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.nlpcraft.model.tools.synonyms
+
+package org.apache.nlpcraft.model.tools.suggestions
import java.util.concurrent.atomic.AtomicInteger
import java.util.concurrent.{CopyOnWriteArrayList, CountDownLatch, TimeUnit}
@@ -31,27 +32,22 @@ import org.apache.nlpcraft.common.ascii.NCAsciiTable
import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer
import org.apache.nlpcraft.common.util.NCUtils
+import org.apache.nlpcraft.common.version.NCVersion
import org.apache.nlpcraft.model.NCModelFileAdapter
import scala.collection.JavaConverters._
import scala.collection._
-case class NCSynonymsGeneratorData(
- url: String = "http://localhost:5000/synonyms",
- modelPath: String,
- responseLimit: Int = 10, // TODO: add scoreLimit
- minScore: Double = 0,
- supportedSynonymsWords: Int = 1,
- debugRequests: Boolean = false
-) {
- require(url != null, "URL cannot be null")
- require(modelPath != null, "Model path cannot be null")
- require(responseLimit > 0, "Response limit value must be positive")
- require(minScore >= 0, "Minimal score value cannot be negative")
- require(supportedSynonymsWords > 0, "Supported synonyms words count value must be positive")
-}
+case class ParametersHolder(
+ modelPath: String,
+ url: String,
+ limit: Int,
+ minScore: Double,
+ synonymsWords: Int,
+ debug: Boolean
+)
-object NCSynonymsGenerator {
+object NCSuggestionsGeneratorImpl {
/**
* Suggestion data holder.
*
@@ -65,6 +61,7 @@ object NCSynonymsGenerator {
case class Suggestion(
word: String, bert: Double, normalized: Double, ftext: Double, `ftext-sentence`: Double, score: Double
)
+
case class RequestData(sentence: String, example: String, elementId: String, lower: Int, upper: Int, limit: Int)
case class RestRequest(sentence: String, simple: Boolean, lower: Int, upper: Int, limit: Int)
case class RestResponse(data: java.util.ArrayList[Suggestion])
@@ -97,6 +94,7 @@ object NCSynonymsGenerator {
private def split(s: String): Seq[String] = s.split(" ").toSeq.map(_.trim).filter(_.nonEmpty)
private def toStem(s: String): String = split(s).map(NCNlpPorterStemmer.stem).mkString(" ")
private def toStemWord(s: String): String = NCNlpPorterStemmer.stem(s)
+
private def getAllSlices(seq1: Seq[String], seq2: Seq[String]): Seq[Int] = {
val seq = mutable.Buffer.empty[Int]
@@ -111,7 +109,7 @@ object NCSynonymsGenerator {
seq
}
- def process(data: NCSynonymsGeneratorData): Unit = {
+ def process(data: ParametersHolder): Unit = {
val mdl = new NCModelFileAdapter(data.modelPath) {}
val parser = new NCMacroParser()
@@ -126,8 +124,8 @@ object NCSynonymsGenerator {
require(
word.forall(ch ⇒
ch.isLetterOrDigit ||
- ch == '\'' ||
- SEPARATORS.contains(ch)
+ ch == '\'' ||
+ SEPARATORS.contains(ch)
),
s"Unsupported symbols: $word"
)
@@ -145,12 +143,12 @@ object NCSynonymsGenerator {
val elemSyns =
mdl.getElements.asScala.map(e ⇒ e.getId → e.getSynonyms.asScala.flatMap(parser.expand)).
- map { case (id, seq) ⇒ id → seq.map(txt ⇒ split(txt).map(p ⇒ Word(p, toStemWord(p))))}.toMap
+ map { case (id, seq) ⇒ id → seq.map(txt ⇒ split(txt).map(p ⇒ Word(p, toStemWord(p)))) }.toMap
val allReqs =
elemSyns.map {
case (elemId, syns) ⇒
- val normSyns: Seq[Seq[Word]] = syns.filter(_.size <= data.supportedSynonymsWords)
+ val normSyns: Seq[Seq[Word]] = syns.filter(_.size <= data.synonymsWords)
val synsStems = normSyns.map(_.map(_.stem))
val synsWords = normSyns.map(_.map(_.word))
@@ -175,13 +173,13 @@ object NCSynonymsGenerator {
elementId = elemId,
lower = idx,
upper = idx + synStems.length - 1,
- limit = data.responseLimit
+ limit = data.limit
)
}
- (for (idx ← exampleIdxs; (synStems, i) ← synsStems.zipWithIndex) yield mkRequestData(idx, synStems, i)).
- distinct
- }
+ (for (idx ← exampleIdxs; (synStems, i) ← synsStems.zipWithIndex) yield mkRequestData(idx, synStems, i)).
+ distinct
+ }
elemId → reqs.toSet
}.filter(_._2.nonEmpty)
@@ -190,7 +188,7 @@ object NCSynonymsGenerator {
println(s"Synonyms count: ${elemSyns.map(_._2.size).sum}")
println(s"Request prepared: ${allReqs.map(_._2.size).sum}")
- val allSuggs = new java.util.concurrent.ConcurrentHashMap[String, java.util.List[Suggestion]] ()
+ val allSuggs = new java.util.concurrent.ConcurrentHashMap[String, java.util.List[Suggestion]]()
val cdl = new CountDownLatch(allReqs.map { case (_, seq) ⇒ seq.size }.sum)
val debugs = mutable.HashMap.empty[RequestData, Seq[Suggestion]]
val cnt = new AtomicInteger(0)
@@ -222,7 +220,7 @@ object NCSynonymsGenerator {
finally
post.releaseConnection()
- if (data.debugRequests)
+ if (data.debug)
debugs += req → resp
val i = cnt.incrementAndGet()
@@ -285,11 +283,11 @@ object NCSynonymsGenerator {
}.
toSeq
- val normFactor = seq.map(_._2).sum.toDouble / seq.size / avgScores(elemId)
+ val normFactor = seq.map(_._2).sum.toDouble / seq.size / avgScores(elemId)
seq.
map { case (sugg, cnt) ⇒ (sugg, cnt, sugg.score * normFactor * cnt.toDouble / counts(elemId)) }.
- sortBy { case (_, _, cumFactor) ⇒ -cumFactor }.
+ sortBy { case (_, _, cumFactor) ⇒ -cumFactor }.
zipWithIndex.
foreach { case ((sugg, cnt, cumFactor), sugIdx) ⇒
def f(d: Double): String = "%1.3f" format d
@@ -308,7 +306,7 @@ object NCSynonymsGenerator {
}
}
- if (data.debugRequests) {
+ if (data.debug) {
var i = 1
debugs.groupBy(_._1.example).foreach { case (_, m) ⇒
@@ -341,15 +339,173 @@ object NCSynonymsGenerator {
}
}
-object NCSynonymsGeneratorRunner extends App {
- NCSynonymsGenerator.process(
- NCSynonymsGeneratorData(
- url = "http://localhost:5000/synonyms",
- modelPath = "src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json",
- minScore = 0,
- responseLimit = 10,
- supportedSynonymsWords = 1,
- debugRequests = true
+object NCSuggestionsGenerator extends App {
+ private lazy val DFLT_URL: String = "http://localhost:5000/synonyms"
+ private lazy val DFLT_LIMIT: Int = 10 // TODO: add scoreLimit
+ private lazy val DFLT_MIN_SCORE: Double = 0
+ private lazy val DFLT_SYNONYMNS_WORDS: Int = 1
+ private lazy val DFLT_DEBUG: Boolean = false
+
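+ /**
+ * Prints the given error (or, if none, the tool description) followed by parameters usage to stderr and exits with code 1.
+ * @param msg Optional error message.
+ */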
+ private def errorExit(msg: String = null): Unit = {
+ if (msg != null)
+ System.err.println(
+ s"""
+ |ERROR:
+ | $msg""".stripMargin
+ )
+
+ if (msg == null)
+ System.err.println(
+ s"""
+ |NAME:
+ | NCSuggestionsGenerator -- NLPCraft synonyms suggestions generator for given model.
+ |
+ |SYNOPSIS:
+ | java -cp apache-nlpcraft-incubating-${NCVersion.getCurrent}-all-deps.jar org.apache.nlpcraft.model.tools.suggestions.NCSuggestionsGenerator [PARAMETERS]
+ |
+ |DESCRIPTION:
+ | This utility generates synonyms suggestions for given NLPCraft model.
+ | Note that Python NLP server should be started and accessible at the URL given by the '--url' parameter.
+ |
+ | This Java class can be run from the command line or from an IDE like any other
+ | Java application.""".stripMargin
+ )
+
+ System.err.println(
+ s"""
+ |PARAMETERS:
+ | [--model|-m] model path
+ | Mandatory file model path.
+ | It should have one of the following extensions: .js, .json, .yml, or .yaml
+ |
+ | [--url|-u] url
+ | Optional Python NLP server URL.
+ | Default is $DFLT_URL.
+ |
+ | [--limit|-l] limit
+ | Optional maximum suggestions per synonyms count value.
+ | Default is $DFLT_LIMIT.
+ |
+ | [--score|-c] score
+ | Optional minimal suggestion score value.
+ | Default is $DFLT_MIN_SCORE.
+ |
+ | [--syns|-s] synonyms count
+ | Optional maximum number of words in a synonym for that synonym to be processed.
+ | Default is $DFLT_SYNONYMNS_WORDS.
+ |
+ | [--debug|-d] [true|false]
+ | Optional flag on whether or not to debug output.
+ | Default is $DFLT_DEBUG.
+ |
+ | [--help|-h|-?]
+ | Prints this usage information.
+ |
+ |EXAMPLES:
+ | java -cp apache-nlpcraft-incubating-${NCVersion.getCurrent}-all-deps.jar org.apache.nlpcraft.model.tools.suggestions.NCSuggestionsGenerator
+ | -m src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json
+ | -u $DFLT_URL
+ """.stripMargin
)
- )
+
+ System.exit(1)
+ }
+
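+ /**
+ * Ensures that mandatory parameter value is set; throws `IllegalArgumentException` if it is `null`.
+ * @param v Parameter value to check.
+ * @param name Parameter name used in the error message.
+ */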
+ private def mandatoryParam(v: String, name: String): Unit =
+ if (v == null)
+ throw new IllegalArgumentException(s"Parameter is mandatory and must be set: $name")
+
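+ /**
+ * Parses numeric value with `extract` and checks that it lies in the inclusive range [`fromIncl`, `toIncl`].
+ * @param v String value to parse (lower-cased before extraction).
+ * @param name Parameter name used in error messages.
+ * @return Parsed value; `IllegalArgumentException` is thrown on malformed or out-of-range input.
+ */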
+ private def parseNum[T](v: String, name: String, extract: String ⇒ T, fromIncl: T, toIncl: T)(implicit e: T ⇒ Number): T = {
+ val t =
+ try
+ extract(v.toLowerCase)
+ catch {
+ case _: NumberFormatException ⇒ throw new IllegalArgumentException(s"Invalid numeric: $name")
+ }
+
+ val td = t.doubleValue()
+
+ if (td < fromIncl.doubleValue() || td > toIncl.doubleValue())
+ throw new IllegalArgumentException(s"Invalid `$name` range. Must be between: $fromIncl and $toIncl")
+
+ t
+ }
+
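+ /**
+ * Parses boolean value; accepts `true` or `false` in any letter case.
+ * @param v String value to parse.
+ * @param name Parameter name used in the error message.
+ * @return Parsed flag; `IllegalArgumentException` is thrown for any other value.
+ */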
+ private def parseBoolean(v: String, name: String): Boolean =
+ v.toLowerCase match {
+ case "true" ⇒ true
+ case "false" ⇒ false
+
+ case _ ⇒ throw new IllegalArgumentException(s"Invalid boolean value in: $name $v")
+ }
+
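+ /**
+ * Parses command line arguments; on help request, empty input or invalid argument prints usage and exits.
+ * @param cmdArgs Raw command line arguments as `key value` pairs.
+ * @return Holder with parsed values, defaults applied for omitted optional parameters.
+ */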
+ private def parseCmdParameters(cmdArgs: Array[String]): ParametersHolder = {
+ if (cmdArgs.isEmpty || !cmdArgs.intersect(Seq("--help", "-h", "-help", "--?", "-?", "/?", "/help")).isEmpty)
+ errorExit()
+
+ var mdlPath: String = null
+
+ var url = DFLT_URL
+ var limit = DFLT_LIMIT
+ var minScore = DFLT_MIN_SCORE
+ var synsWords = DFLT_SYNONYMNS_WORDS
+ var debug = DFLT_DEBUG
+
+ var i = 0
+ // Arguments are consumed as key/value pairs, two array elements per iteration.
+ try {
+ while (i < cmdArgs.length) {
+ val k = cmdArgs(i).toLowerCase
+ // Every key needs a value; a dangling last key is reported rather than silently dropped.
+ val v = if (i + 1 < cmdArgs.length) cmdArgs(i + 1) else throw new IllegalArgumentException(s"Missing value for argument: ${cmdArgs(i)}")
+ k match {
+ case "--model" | "-m" ⇒ mdlPath = v
+ case "--url" | "-u" ⇒ url = v
+ case "--limit" | "-l" ⇒ limit = parseNum(v, k, (s: String) ⇒ s.toInt, 1, Integer.MAX_VALUE)
+ case "--score" | "-c" ⇒ minScore = parseNum(v, k, (s: String) ⇒ s.toDouble, 0, Integer.MAX_VALUE)
+ case "--syns" | "-s" ⇒ synsWords = parseNum(v, k, (s: String) ⇒ s.toInt, 1, Integer.MAX_VALUE)
+ case "--debug" | "-d" ⇒ debug = parseBoolean(v, k)
+
+ case _ ⇒ throw new IllegalArgumentException(s"Invalid argument: ${cmdArgs(i)}")
+ }
+
+ i = i + 2
+ }
+ // Model path has no default value, so it must have been supplied explicitly.
+ mandatoryParam(mdlPath, "--model")
+ }
+ catch {
+ case e: Exception ⇒ errorExit(e.getMessage)
+ }
+
+ ParametersHolder(mdlPath, url, limit, minScore, synsWords, debug)
+ }
+
+ NCSuggestionsGeneratorImpl.process(parseCmdParameters(args))
}
\ No newline at end of file