You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/04/25 09:43:19 UTC
[incubator-nlpcraft] branch NLPCRAFT-41 updated: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-41
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-41 by this push:
new 515b98a WIP.
515b98a is described below
commit 515b98a5642074385c9c29812ee07de40014843a
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Sat Apr 25 12:43:10 2020 +0300
WIP.
---
.../model/tools/synonyms/NCSynonymsGenerator.scala | 151 ++++++++++++++-------
1 file changed, 99 insertions(+), 52 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
index 34df51a..4d37250 100644
--- a/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
+++ b/src/main/scala/org/apache/nlpcraft/model/tools/synonyms/NCSynonymsGenerator.scala
@@ -31,31 +31,35 @@ import org.apache.nlpcraft.common.makro.NCMacroParser
import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer
import org.apache.nlpcraft.model.NCModelFileAdapter
+import scala.collection._
import scala.collection.JavaConverters._
-object NCSynonymsGenerator extends App {
+case class NCSynonymsGenerator(url: String, modelPath: String, minFactor: Double) {
// TODO: all string fields
- case class Holder(word: String, bert: String, normalized: String, ftext: String, score: String) {
+ // normalized - normalized bert value.
+ // score = normalized * weight + ftext * weight
+ // both `weights` = 1
+ case class Suggestion(word: String, bert: String, normalized: String, ftext: String, score: String) {
override def toString: String = s"$word [bert=$bert, ftext=$ftext, normalized=$normalized, score=$score]"
}
+
case class Request(sentence: String, simple: Boolean)
- case class Response(data: java.util.ArrayList[Holder])
+
+ case class Response(data: java.util.ArrayList[Suggestion])
private val GSON = new Gson
private val TYPE_RESP: Type = new TypeToken[Response]() {}.getType
private def split(s: String): Seq[String] = s.split(" ").toSeq.map(_.trim).filter(_.nonEmpty)
- private def ask(client: CloseableHttpClient, url: String, words: Seq[String], idx: Int, minFactor: Double): Seq[Holder]= {
- val sen = words.zipWithIndex.map { case (word, wordIdx) ⇒ if (wordIdx == idx) s"$word#" else word }.mkString(" ")
-
+ private def ask(client: CloseableHttpClient, sen: String): Seq[Suggestion] = {
val post = new HttpPost(url)
post.setHeader("Content-Type", "application/json")
post.setEntity(new StringEntity(GSON.toJson(Request(sen, simple = false)), "UTF-8"))
- val h = new ResponseHandler[Seq[Holder]]() {
- override def handleResponse(resp: HttpResponse): Seq[Holder] = {
+ val h = new ResponseHandler[Seq[Suggestion]]() {
+ override def handleResponse(resp: HttpResponse): Seq[Suggestion] = {
val code = resp.getStatusLine.getStatusCode
val e = resp.getEntity
@@ -66,7 +70,6 @@ object NCSynonymsGenerator extends App {
code match {
case 200 ⇒
- // TODO: add filter by minFactor.
val data: Response = GSON.fromJson(js, TYPE_RESP)
data.data.asScala
@@ -83,65 +86,109 @@ object NCSynonymsGenerator extends App {
post.releaseConnection()
}
- private def process(mdlPath: String, url: String): Unit = {
- val mdl = new NCModelFileAdapter(mdlPath) {
- // No-op.
- }
+ def process(): Unit = {
+ val mdl = new NCModelFileAdapter(modelPath) {}
val parser = new NCMacroParser()
if (mdl.getMacros != null)
mdl.getMacros.asScala.foreach { case (name, str) ⇒ parser.addMacro(name, str) }
- val table = NCAsciiTable()
- val client: CloseableHttpClient = HttpClients.createDefault
+ val client = HttpClients.createDefault
- table #= ("Single synonym", "Suggestions")
+ case class Word(word: String) {
+ val stem: String = NCNlpPorterStemmer.stem(word)
+ }
- val examples: Set[(Seq[String], Seq[String])] =
+ val examples: Seq[Seq[Word]] =
mdl.getExamples.asScala.
// TODO: Is it enough?
- map(_.replaceAll("\\?", "")).
- map(_.replaceAll("\\.", "")).
- map(_.replaceAll("!", "")).
- map(split).map(p ⇒ p → p.map(NCNlpPorterStemmer.stem)).toSet
-
- val suggestions =
- mdl.getElements.asScala.flatMap(e ⇒ {
- val elemSyns = e.getSynonyms.asScala.flatMap(p ⇒ parser.expand(p)).map(s ⇒ s → split(s)).toMap
+ map(_.replaceAll("\\?", " ?")).
+ map(_.replaceAll("\\.", " .")).
+ map(_.replaceAll(",", " ,")).
+ map(_.replaceAll("!", " !")).
+ map(split).
+ map(_.map(Word)).
+ toSeq
+
+ val elemSyns = mdl.getElements.asScala.map(e ⇒ e.getId → e.getSynonyms.asScala.flatMap(parser.expand)).toMap
+
+ val cache = mutable.HashMap.empty[String, Seq[Suggestion]]
+
+ val allSuggs =
+ elemSyns.map {
+ case (elemId, elemSyns) ⇒
+ val stemsSyns: Seq[(String, String)] =
+ elemSyns.
+ map(text ⇒ text → split(text).map(Word)).
+ filter { case( _, words) ⇒ words.size == 1 }.
+ map { case(text, words) ⇒ words.head.stem → text }
+
+ val hs: Seq[Suggestion] =
+ examples.flatMap(exWords ⇒ {
+ val exStems = exWords.map(_.stem)
+
+ val idxs =
+ exStems.flatMap(stem ⇒
+ stemsSyns.find(_._1 == stem) match {
+ case Some(p) ⇒ Some(exStems.indexOf(p._1))
+ case None ⇒ None
+ }
+ )
+
+ if (idxs.nonEmpty)
+ stemsSyns.map(_._2).flatMap(syn ⇒ {
+ val wordsTxt =
+ exWords.zipWithIndex.map { case (word, idx) ⇒ if (idxs.contains(idx)) syn else word.word }
+
+ idxs.flatMap(idx ⇒ {
+ val sen =
+ wordsTxt.zipWithIndex.map {
+ case (word, wordIdx) ⇒ if (wordIdx == idx) s"$word#" else word
+ }.mkString(" ")
+
+ cache.get(sen) match {
+ case Some(res) ⇒ res
+ case None ⇒
+ val res: Seq[Suggestion] = ask(client, sen).filter(_.score.toDouble >= minFactor)
+
+ cache += sen → res
+
+ res
+ }
+ })
+ })
+ else
+ Seq.empty
+ })
+
+ elemId → hs
+ }.filter(_._2.nonEmpty)
+
+ val allSyns = elemSyns.flatMap(_._2).toSet
- elemSyns.filter(_._2.length == 1).
- map(_._2.head).
- map(p ⇒ p → NCNlpPorterStemmer.stem(p)).
- flatMap { case (syn, synStem) ⇒
- val suggestions: Set[Holder] =
- examples.filter(_._2.contains(synStem)).flatMap { case (eWords, eStems) ⇒
- val idx = eStems.indexOf(synStem)
-
- require(idx >= 0)
-
- ask(client, url, eWords, idx, 0.0)
- }.filter(p ⇒ !elemSyns.contains(p.word))
-
- if (suggestions.nonEmpty) Some(syn → suggestions) else None
- }.toMap
- }).toMap
-
- val n = suggestions.size
+ val table = NCAsciiTable()
- suggestions.zipWithIndex.map { case ((syn, hs), idx) ⇒
- // TODO: sort
- hs.toSeq.sortBy(_.score.toDouble).reverse.foreach(h ⇒ table += (syn, h))
+ table #= ("Element", "Suggestions")
- if (idx != n - 1)
- table += ("-------", "-------")
+ allSuggs.foreach { case (elemId, elemSuggs) ⇒
+ elemSuggs.
+ groupBy(_.word).
+ map { case (_, group) ⇒ group.sortBy(_.score.toDouble).reverse.head }. // Drops repeated.
+ toSeq.sortBy(_.score.toDouble).reverse.
+ filter(p ⇒ !allSyns.contains(p.word)). // TODO: drop by stem, not by word as is
+ zipWithIndex.
+ foreach { case (sugg, sugIdx) ⇒ table += (if (sugIdx == 0) elemId else " ", sugg) }
}
table.render()
}
+}
- process(
- "src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json",
- "http://localhost:5000"
- )
+object NCSynonymsGeneratorRunner extends App {
+ NCSynonymsGenerator(
+ url = "http://localhost:5000",
+ modelPath = "src/main/scala/org/apache/nlpcraft/examples/weather/weather_model.json",
+ minFactor = 0
+ ).process()
}