You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/05/18 15:29:18 UTC
[incubator-nlpcraft] 01/02: Synonyms static generator CLI version added.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 0e08caa948e9422bc175f9b2cfdc7c0be1658133
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Mon May 18 18:17:36 2020 +0300
Synonyms static generator CLI version added.
---
enricher/bin/start_server.sh | 2 +-
.../nlpcraft/examples/weather/weather_model.json | 14 +-
.../model/tools/synonyms/NCSynonymsGenerator.scala | 339 +++++++++++++++++++++
3 files changed, 353 insertions(+), 2 deletions(-)
diff --git a/enricher/bin/start_server.sh b/enricher/bin/start_server.sh
index 8e382e4..ec4e816 100755
--- a/enricher/bin/start_server.sh
+++ b/enricher/bin/start_server.sh
@@ -16,4 +16,4 @@
# limitations under the License.
#
-FLASK_APP=server.py python -m flask run
+FLASK_APP=server.py python3 -m flask run
diff --git a/src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json b/src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json
index f1c682a..14d43ab 100644
--- a/src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json
+++ b/src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json
@@ -22,7 +22,19 @@
"description": "Weather example model.",
"examples": [
"What's the local weather forecast?",
- "What's the weather in Moscow?"
+ "What's the weather in Moscow?",
+ "What is the weather like outside?",
+ "How's the weather?",
+ "What's the weather forecast for the rest of the week?",
+ "What's the weather forecast this week?",
+ "What's the weather out there?",
+ "Is it cold outside?",
+ "Is it hot outside?",
+ "Will it rain today?",
+ "When it will rain in Delhi?",
+ "Is there any possibility of rain in Delhi?",
+ "Is it raining now?",
+ "Is there any chance of rain today?"
],
"macros": [
{
diff --git a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
new file mode 100644
index 0000000..f7a8783
--- /dev/null
+++ b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
@@ -0,0 +1,339 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nlpcraft.model.tools.synonyms
+
+import java.util.concurrent.atomic.AtomicInteger
+import java.util.concurrent.{CopyOnWriteArrayList, CountDownLatch, TimeUnit}
+
+import com.google.gson.Gson
+import com.google.gson.reflect.TypeToken
+import org.apache.http.HttpResponse
+import org.apache.http.client.ResponseHandler
+import org.apache.http.client.methods.HttpPost
+import org.apache.http.entity.StringEntity
+import org.apache.http.impl.client.HttpClients
+import org.apache.http.util.EntityUtils
+import org.apache.nlpcraft.common.ascii.NCAsciiTable
+import org.apache.nlpcraft.common.makro.NCMacroParser
+import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer
+import org.apache.nlpcraft.common.util.NCUtils
+import org.apache.nlpcraft.model.NCModelFileAdapter
+
+import scala.collection.JavaConverters._
+import scala.collection._
+
+/**
+ * Configuration of the synonyms generator.
+ *
+ * @param url URL the suggestion requests are posted to.
+ * @param modelPath Path of the model file to load.
+ * @param minScore Minimal score; suggestions scored below this value are dropped.
+ * @param supportMultiple If set, synonyms of up to two words are used (single-word synonyms only otherwise)
+ *      and the `Ftext-Sentence` column is added to the resulting table.
+ * @param debugRequests If set, every executed request is printed together with its suggestions.
+ */
+case class NCSynonymsGeneratorData(
+    url: String = "http://localhost:5000",
+    modelPath: String,
+    minScore: Double = 0.0,
+    supportMultiple: Boolean = false,
+    debugRequests: Boolean = false
+)
+
+object NCSynonymsGenerator {
+    /**
+     * Single suggestion as returned by the REST service.
+     * Field names are part of the JSON contract (Gson maps JSON properties to fields by name).
+     *
+     * @param word Suggested word.
+     * @param bert Bert factor.
+     * @param normalized Normalized bert factor.
+     * @param ftext FText factor.
+     * @param `ftext-sentence` TODO: not described by the author; it is printed only in the
+     *      `Ftext-Sentence` column when multiple words support is enabled.
+     * @param score Calculated summary factor: normalized * weight1 + ftext * weight2
+     *      (weights values are 1 currently).
+     */
+    case class Suggestion(
+        word: String,
+        bert: Double,
+        normalized: Double,
+        ftext: Double,
+        `ftext-sentence`: Double,
+        score: Double
+    )
+
+    /**
+     * Prepared request.
+     *
+     * @param sentence Example text with the matched fragment replaced by a synonym.
+     * @param example Original (tokenized) example text.
+     * @param elementId ID of the model element the synonym belongs to.
+     * @param lower Index of the first replaced token (inclusive).
+     * @param upper Index of the last replaced token (inclusive).
+     */
+    case class RequestData(
+        sentence: String,
+        example: String,
+        elementId: String,
+        lower: Int,
+        upper: Int
+    )
+
+    /**
+     * JSON body posted to the REST service.
+     */
+    case class RestRequest(
+        sentence: String,
+        simple: Boolean,
+        lower: Int,
+        upper: Int
+    )
+
+    /**
+     * JSON body received from the REST service.
+     */
+    case class RestResponse(
+        data: java.util.ArrayList[Suggestion]
+    )
+
+    // JSON serializer/deserializer shared by all requests.
+    private final val GSON: Gson = new Gson()
+
+    // Type descriptor used to deserialize REST responses.
+    private final val TYPE_RESP: java.lang.reflect.Type = new TypeToken[RestResponse]() {}.getType
+
+    // Punctuation characters which are separated from the words and treated as standalone tokens.
+    private final val SEPARATORS: Seq[Char] = Seq('?', ',', '.', '-', '!')
+
+    /**
+     * Creates HTTP response handler for the given request.
+     *
+     * The handler reads the response body and, for status code 200, parses it as [[RestResponse]]
+     * and returns its suggestions. Any other outcome is reported as `RuntimeException`:
+     *  - empty body;
+     *  - status code 400 (the body is used as the error message);
+     *  - any other status code;
+     *  - status code 200 with a body that doesn't contain the `data` list.
+     *
+     * @param req Request data, used for error messages only.
+     * @return Response handler producing the list of suggestions.
+     */
+    private def mkHandler(req: RequestData): ResponseHandler[Seq[Suggestion]] =
+        (resp: HttpResponse) ⇒ {
+            val code = resp.getStatusLine.getStatusCode
+            val e = resp.getEntity
+
+            // Explicit default charset: it is used only when the response doesn't declare its own one,
+            // so non-ASCII suggestions don't depend on the library's fallback encoding.
+            val js = if (e != null) EntityUtils.toString(e, "UTF-8") else null
+
+            if (js == null)
+                throw new RuntimeException(s"Unexpected empty response [req=$req, code=$code]")
+
+            code match {
+                case 200 ⇒
+                    val data: RestResponse = GSON.fromJson(js, TYPE_RESP)
+
+                    // Gson returns null for the `null` document and leaves missing fields unset.
+                    // Fail here with a clear message instead of a later NullPointerException.
+                    if (data == null || data.data == null)
+                        throw new RuntimeException(s"Unexpected response [req=$req, code=$code, response=$js]")
+
+                    data.data.asScala
+
+                case 400 ⇒ throw new RuntimeException(js)
+                case _ ⇒ throw new RuntimeException(s"Unexpected response [req=$req, code=$code, response=$js]")
+            }
+        }
+
+    /**
+     * Splits the text by single spaces, trims the parts and drops the empty ones.
+     *
+     * @param s Text to split.
+     * @return Non-empty trimmed tokens in their original order.
+     */
+    private def split(s: String): Seq[String] =
+        for (part ← s.split(" ").toSeq; token = part.trim if token.nonEmpty) yield token
+    /**
+     * Stems a single word.
+     *
+     * @param s Word to stem.
+     */
+    private def toStemWord(s: String): String = NCNlpPorterStemmer.stem(s)
+
+    /**
+     * Stems every token of the text and joins the stems with a single space.
+     *
+     * @param s Text to stem (may consist of multiple words).
+     */
+    private def toStem(s: String): String = {
+        val stems = split(s).map(word ⇒ NCNlpPorterStemmer.stem(word))
+
+        stems.mkString(" ")
+    }
+    /**
+     * Finds start indexes of all occurrences (overlapping ones included) of `seq2` in `seq1`.
+     *
+     * @param seq1 Sequence to search in.
+     * @param seq2 Slice to search for.
+     * @return Start indexes in ascending order, empty if the slice is not found.
+     */
+    private def getAllSlices(seq1: Seq[String], seq2: Seq[String]): Seq[Int] =
+        Iterator.
+            // Each next search starts right after the previously found position.
+            iterate(seq1.indexOfSlice(seq2))(prev ⇒ seq1.indexOfSlice(seq2, prev + 1)).
+            // Negative index means there are no more occurrences.
+            takeWhile(_ >= 0).
+            toBuffer
+
+    /**
+     * Generates synonym suggestions for the elements of the given model and prints them as a table.
+     *
+     * Steps:
+     *  1. Loads the model from `data.modelPath` and expands element synonyms using the model macros.
+     *  2. For each model example that contains a synonym (matched by stems) prepares requests,
+     *     in which the matched position is replaced by the synonyms of the same element.
+     *  3. Executes all requests against `data.url` via `NCUtils.asFuture` and waits for all of them.
+     *     Failed requests are reported with a stack trace and skipped.
+     *  4. Drops suggestions scored below `data.minScore` and suggestions whose stem equals a stem of
+     *     any existing synonym word, ranks the rest and prints the table. If `data.debugRequests`
+     *     is set, the executed requests are printed as well.
+     *
+     * @param data Generator configuration.
+     */
+    def process(data: NCSynonymsGeneratorData): Unit = {
+        val mdl = new NCModelFileAdapter(data.modelPath) {}
+
+        val parser = new NCMacroParser()
+
+        if (mdl.getMacros != null)
+            mdl.getMacros.asScala.foreach { case (name, str) ⇒ parser.addMacro(name, str) }
+
+        // Single synonym word with its stem. Only letters, digits, apostrophes and separators are allowed.
+        case class Word(word: String, stem: String) {
+            require(!word.contains(" "), s"Word cannot contains spaces: $word")
+            require(
+                word.forall(ch ⇒
+                    ch.isLetterOrDigit ||
+                    ch == '\'' ||
+                    SEPARATORS.contains(ch)
+                ),
+                s"Unsupported symbols: $word"
+            )
+        }
+
+        // Examples as pairs (words, stems). Separators are padded with spaces to become standalone tokens.
+        // `split` (unlike the raw `String.split`) drops the empty tokens which such padding produces
+        // (i.e. for ", "), so the token indexes stay aligned with the indexes used for requests logging.
+        val examples =
+            mdl.getExamples.asScala.toSeq.
+                map(ex ⇒ SEPARATORS.foldLeft(ex)((s, ch) ⇒ s.replaceAll(s"\\$ch", s" $ch "))).
+                map(ex ⇒ {
+                    val words = split(ex)
+
+                    words → words.map(toStemWord)
+                })
+
+        // Element ID → its synonyms (macros expanded), each synonym is a sequence of words.
+        val elemSyns =
+            mdl.getElements.asScala.map(e ⇒ e.getId → e.getSynonyms.asScala.flatMap(parser.expand)).
+                map { case (id, seq) ⇒ id → seq.map(txt ⇒ split(txt).map(p ⇒ Word(p, toStemWord(p))))}.toMap
+
+        // Element ID → set of requests. Elements without requests are skipped.
+        val allReqs =
+            elemSyns.map {
+                case (elemId, syns) ⇒
+                    val normSyns: Seq[Seq[Word]] =
+                        if (data.supportMultiple) syns.filter(_.size <= 2) else syns.filter(_.size == 1)
+                    val synsStems = normSyns.map(_.map(_.stem))
+                    val synsWords = normSyns.map(_.map(_.word))
+
+                    val reqs =
+                        examples.flatMap { case (exampleWords, exampleStems) ⇒
+                            // Positions in the example where any synonym of this element is found.
+                            val exampleIdxs = synsStems.flatMap(synStems ⇒ getAllSlices(exampleStems, synStems))
+
+                            def mkRequestData(idx: Int, synStems: Seq[String], synStemsIdx: Int): RequestData = {
+                                val fromIncl = idx
+                                val toExcl = idx + synStems.length
+
+                                RequestData(
+                                    sentence = exampleWords.zipWithIndex.flatMap {
+                                        case (exampleWord, i) ⇒
+                                            i match {
+                                                case x if x == fromIncl ⇒ synsWords(synStemsIdx)
+                                                case x if x > fromIncl && x < toExcl ⇒ Seq.empty
+                                                case _ ⇒ Seq(exampleWord)
+                                            }
+                                    }.mkString(" "),
+                                    example = exampleWords.mkString(" "),
+                                    elementId = elemId,
+                                    lower = idx,
+                                    upper = idx + synStems.length - 1
+                                )
+                            }
+
+                            // NOTE(review): every synonym is substituted at every matched position and the
+                            // length of the replaced fragment is taken from the substituting synonym, not
+                            // from the matched one. It makes no difference for single-word synonyms
+                            // (`supportMultiple` = false) - confirm the intent for the multiple words mode.
+                            (for (idx ← exampleIdxs; (synStems, i) ← synsStems.zipWithIndex) yield mkRequestData(idx, synStems, i)).
+                                distinct
+                        }
+
+                    elemId → reqs.toSet
+            }.filter(_._2.nonEmpty)
+
+        println(s"Examples count: ${examples.size}")
+        println(s"Synonyms count: ${elemSyns.map(_._2.size).sum}")
+        println(s"Request prepared: ${allReqs.map(_._2.size).sum}")
+
+        val allSuggs = new java.util.concurrent.ConcurrentHashMap[String, java.util.List[Suggestion]] ()
+        val cdl = new CountDownLatch(allReqs.map { case (_, seq) ⇒ seq.size }.sum)
+        // Written from the request tasks, so it has to be a thread-safe map.
+        val debugs = scala.collection.concurrent.TrieMap.empty[RequestData, Seq[Suggestion]]
+        val cnt = new AtomicInteger(0)
+
+        val client = HttpClients.createDefault
+
+        try {
+            for ((elemId, reqs) ← allReqs; req ← reqs) {
+                NCUtils.asFuture(
+                    _ ⇒ {
+                        val post = new HttpPost(data.url)
+
+                        post.setHeader("Content-Type", "application/json")
+                        post.setEntity(
+                            new StringEntity(
+                                GSON.toJson(
+                                    RestRequest(
+                                        sentence = req.sentence,
+                                        simple = false,
+                                        lower = req.lower,
+                                        upper = req.upper
+                                    )
+                                ),
+                                "UTF-8"
+                            )
+                        )
+
+                        val resp: Seq[Suggestion] =
+                            try
+                                client.execute(post, mkHandler(req))
+                            finally
+                                post.releaseConnection()
+
+                        if (data.debugRequests)
+                            debugs += req → resp
+
+                        val i = cnt.incrementAndGet()
+
+                        if (i % 10 == 0)
+                            println(s"Executed: $i requests.")
+
+                        allSuggs.
+                            computeIfAbsent(elemId, (_: String) ⇒ new CopyOnWriteArrayList[Suggestion]()).
+                            addAll(resp.asJava)
+                    },
+                    (e: Throwable) ⇒ {
+                        e.printStackTrace()
+
+                        cdl.countDown()
+                    },
+                    (_: Boolean) ⇒ cdl.countDown()
+                )
+            }
+
+            // Each request counts the latch down once, on success or on failure.
+            cdl.await()
+        }
+        finally
+            // All requests are finished (or the submission failed) - release the HTTP connections.
+            client.close()
+
+        println("All requests executed.")
+
+        val allSynsStems = elemSyns.flatMap(_._2).flatten.map(_.stem).toSet
+
+        val filteredSuggs =
+            allSuggs.asScala.map {
+                case (elemId, elemSuggs) ⇒ elemId → elemSuggs.asScala.filter(_.score >= data.minScore)
+            }.filter(_._2.nonEmpty)
+
+        val avgScores = filteredSuggs.map { case (elemId, suggs) ⇒ elemId → (suggs.map(_.score).sum / suggs.size) }
+        val counts = filteredSuggs.map { case (elemId, suggs) ⇒ elemId → suggs.size }
+
+        val tbl = NCAsciiTable()
+
+        val headers = Seq("Element", "Suggestion", "Summary factor", "Count", "Bert/Ftext score", "Bert", "Bert norm", "Ftext")
+
+        tbl #= ((if (data.supportMultiple) headers ++ Seq("Ftext-Sentence") else headers) :_*)
+
+        filteredSuggs.
+            foreach { case (elemId, elemSuggs) ⇒
+                // Suggestions grouped by stem: the best scored suggestion of the group and the group size.
+                val seq: Seq[(Suggestion, Int)] = elemSuggs.
+                    map(sugg ⇒ (sugg, toStem(sugg.word))).
+                    groupBy { case (_, stem) ⇒ stem }.
+                    filter { case (stem, _) ⇒ !allSynsStems.contains(stem) }.
+                    map { case (_, group) ⇒
+                        val seq = group.map { case (sugg, _) ⇒ sugg }.sortBy(-_.score)
+
+                        // Drops repeated.
+                        (seq.head, seq.length)
+                    }.
+                    toSeq
+
+                // Average group size divided by the average score of the element suggestions.
+                val normFactor = seq.map(_._2).sum.toDouble / seq.size / avgScores(elemId)
+
+                seq.
+                    map { case (sugg, cnt) ⇒ (sugg, cnt, sugg.score * normFactor * cnt.toDouble / counts(elemId)) }.
+                    sortBy { case (_, _, cumFactor) ⇒ -cumFactor }.
+                    zipWithIndex.
+                    foreach { case ((sugg, cnt, cumFactor), sugIdx) ⇒
+                        def f(d: Double): String = "%1.3f" format d
+
+                        val vals = Seq(
+                            // Element ID is shown for the first row of the element only.
+                            if (sugIdx == 0) elemId else " ",
+                            sugg.word,
+                            f(cumFactor),
+                            cnt,
+                            f(sugg.score),
+                            f(sugg.bert),
+                            f(sugg.normalized),
+                            f(sugg.ftext)
+                        )
+
+                        tbl += ((if (data.supportMultiple) vals ++ Seq(f(sugg.`ftext-sentence`)) else vals) :_*)
+                    }
+            }
+
+        if (data.debugRequests) {
+            var n = 1
+
+            debugs.groupBy(_._1.example).foreach { case (_, m) ⇒
+                m.toSeq.sortBy(_._1.sentence).foreach { case (req, suggs) ⇒
+                    // Request sentence with the replaced fragment marked as <<<fragment>>>.
+                    val s =
+                        split(req.sentence).zipWithIndex.map { case (w, pos) ⇒
+                            pos match {
+                                case x if x == req.lower && x == req.upper ⇒ s"<<<$w>>>"
+                                case x if x == req.lower ⇒ s"<<<$w"
+                                case x if x == req.upper ⇒ s"$w>>>"
+                                case _ ⇒ w
+                            }
+                        }.mkString(" ")
+
+                    println(
+                        s"$n. " +
+                            s"Request=$s, " +
+                            s"suggestions=[${suggs.map(_.word).mkString(", ")}], " +
+                            s"element=${req.elementId}"
+                    )
+
+                    n = n + 1
+                }
+            }
+        }
+
+        println("Suggestions:")
+
+        tbl.render()
+    }
+}
+
+/**
+ * Runs the synonyms generator for the weather example model with requests debugging enabled.
+ */
+object NCSynonymsGeneratorRunner extends App {
+    private val cfg =
+        NCSynonymsGeneratorData(
+            url = "http://localhost:5000",
+            modelPath = "src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json",
+            minScore = 0,
+            supportMultiple = false, // TODO: change it to words count.
+            debugRequests = true
+        )
+
+    NCSynonymsGenerator.process(cfg)
+}
\ No newline at end of file