You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/06/23 10:36:08 UTC
[incubator-nlpcraft] 01/03: Spell and Geo enricher refactored.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-85
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 57d490f821a79e7cf228f96cf8efef1937ae7dce
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Tue Jun 23 13:24:32 2020 +0300
Spell and Geo enricher refactored.
---
.../apache/nlpcraft/server/geo/NCGeoManager.scala | 16 +-
.../org/apache/nlpcraft/server/json/NCJson.scala | 11 +
.../server/nlp/enrichers/geo/NCGeoEnricher.scala | 336 +++++++++++----------
.../server/nlp/spell/NCSpellCheckManager.scala | 43 ++-
4 files changed, 230 insertions(+), 176 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
index c4c3fa4..182cdc2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft.server.geo
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.dict.{NCDictionaryManager, NCDictionaryType}
+import org.apache.nlpcraft.common.util.NCUtils
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.server.json.NCJson
@@ -82,7 +83,20 @@ object NCGeoManager extends NCService {
*/
@throws[NCE]
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
- model = readAndConstructModel(true)
+ val ok =
+ Seq(
+ COUNTRY_DIR ,
+ CONT_PATH,
+ METRO_PATH,
+ SYNONYMS_DIR_PATH,
+ CASE_SENSITIVE_DIR_PATH
+ ).forall(NCUtils.hasResource)
+
+ if (ok)
+ model = readAndConstructModel(true)
+ else
+ // TODO: warning text.
+ logger.warn(s"Some GEO Data not found for some reasons")
super.start()
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
index 99c5fe6..08e3981 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
@@ -300,6 +300,17 @@ object NCJson {
case e: Throwable ⇒ throw new NCE(s"Failed to parse: $res", e)
}
+ /**
+ * Extracts an optional object of type `T` from the given JSON resource.
+ * Returns `Some` with the extracted object when `U.hasResource(res)` reports that the
+ * resource exists; otherwise returns `None` without attempting any extraction.
+ *
+ * @param res Resource to extract from.
+ * @param ignoreCase Whether or not to ignore case (passed through to `extractResource`).
+ * @tparam T Type of the object to extract.
+ * @return Extracted object wrapped in `Some`, or `None` if the resource is not found.
+ */
+ @throws[NCE]
+ def extractResourceOpt[T: Manifest](res: String, ignoreCase: Boolean): Option[T] =
+ if (U.hasResource(res)) Some(extractResource(res, ignoreCase)) else None
+
// Gets string with removed symbol + from exponent part of numbers.
// It is developed to avoid Lift parsing errors during processing numbers like '2E+2'.
@tailrec
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
index dc93989..a9a9434 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
@@ -52,35 +52,18 @@ object NCGeoEnricher extends NCServerEnricher {
// Common word exceptions configuration folder.
private final val EXCEPTIONS_PATH = "geo/exceptions"
- @throws[NCE]
- private[geo] final val LOCATIONS: Map[String, Set[NCGeoEntry]] = NCGeoManager.getModel.synonyms
-
- // GEO names matched with common english words and user defined exception GEO names.
- // Note that 'ignore case' parameter set as false because DLGeoLocationKind definition (CITY ect)
- @throws[NCE]
- // TODO: refactor... incomprehensible!
- private final val COMMONS: Map[NCGeoLocationKind, Set[String]] =
- U.getFilesResources(EXCEPTIONS_PATH).
- flatMap(f ⇒
- NCJson.extractResource[immutable.Map[String, immutable.Set[String]]](f, ignoreCase = false).
- map(p ⇒ NCGeoLocationKind.withName(p._1.toUpperCase) → p._2)
- ).groupBy(_._1).map(p ⇒ p._1 → p._2.flatMap(_._2).toSet).map(p ⇒ p._1 → p._2.map(_.toLowerCase))
-
private final val GEO_TYPES: Set[String] = NCGeoLocationKind.values.map(mkName)
+ @volatile private[geo] var locations: Map[String, Set[NCGeoEntry]] = _
+ @volatile private var commons: Map[NCGeoLocationKind, Set[String]] = _
+ @volatile private var topUsa: Set[String] = _
+ @volatile private var topWorld: Set[String] = _
+
// JSON extractor for largest cities.
case class TopCity(name: String, region: String)
private def glue(s: String*): String = s.map(_.toLowerCase).mkString("|")
- private final val TOP_USA: Set[String] =
- NCJson.extractResource[List[TopCity]](US_TOP_PATH, ignoreCase = true).
- map(city ⇒ glue(city.name, city.region)).toSet
-
- private final val TOP_WORLD: Set[String] =
- NCJson.extractResource[List[TopCity]](WORLD_TOP_PATH, ignoreCase = true).
- map(city ⇒ glue(city.name, city.region)).toSet
-
private def isConflictName(name: String): Boolean =
US_CONFLICT_STATES.contains(name.toLowerCase) && name.exists(_.isLower)
@@ -92,167 +75,202 @@ object NCGeoEnricher extends NCServerEnricher {
private def getGeoNotes(t: NCNlpSentenceToken): Set[NCNlpSentenceNote] = GEO_TYPES.flatMap(t.getNotes)
override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
+ locations = null
+ commons = null
+ topUsa = null
+ topWorld = null
+
super.stop()
}
+ @throws[NCE]
override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
+ NCGeoManager.getModel match {
+ case null ⇒
+ // TODO: warning text.
+ logger.warn(s"Some GEO Data not found for some reasons")
+
+ case mdl ⇒
+ locations = mdl.synonyms
+
+ // TODO: refactor... incomprehensible!
+ // GEO names that match common English words, plus user-defined exception GEO names.
+ // Note that the 'ignore case' parameter is set to false because of the NCGeoLocationKind definitions (CITY, etc.).
+ commons =
+ U.getFilesResources(EXCEPTIONS_PATH).
+ flatMap(f ⇒
+ NCJson.extractResource[immutable.Map[String, immutable.Set[String]]](f, ignoreCase = false).
+ map(p ⇒ NCGeoLocationKind.withName(p._1.toUpperCase) → p._2)
+ ).groupBy(_._1).map(p ⇒ p._1 → p._2.flatMap(_._2).toSet).map(p ⇒ p._1 → p._2.map(_.toLowerCase))
+
+ topUsa =
+ NCJson.extractResource[List[TopCity]](US_TOP_PATH, ignoreCase = true).
+ map(city ⇒ glue(city.name, city.region)).toSet
+
+ topWorld =
+ NCJson.extractResource[List[TopCity]](WORLD_TOP_PATH, ignoreCase = true).
+ map(city ⇒ glue(city.name, city.region)).toSet
+ }
+
super.start()
}
@throws[NCE]
override def enrich(ns: NCNlpSentence, parent: Span = null): Unit =
startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
- // This stage must not be 1st enrichment stage.
- assume(ns.nonEmpty)
+ if (locations != null) {
+ // This stage must not be 1st enrichment stage.
+ assume(ns.nonEmpty)
- for (toks ← ns.tokenMixWithStopWords(withQuoted = true)) {
- def mkNote(kind: NCGeoLocationKind, seq: (String, Any)*): NCNlpSentenceNote =
- NCNlpSentenceNote(toks.map(_.index), mkName(kind), seq :_*)
+ for (toks ← ns.tokenMixWithStopWords(withQuoted = true)) {
+ def mkNote(kind: NCGeoLocationKind, seq: (String, Any)*): NCNlpSentenceNote =
+ NCNlpSentenceNote(toks.map(_.index), mkName(kind), seq: _*)
- def toSerializable(m: Map[String, Any]): java.io.Serializable= {
- val ser = new util.HashMap[String, Object]()
+ def toSerializable(m: Map[String, Any]): java.io.Serializable = {
+ val ser = new util.HashMap[String, Object]()
- m.foreach { case (k, v) ⇒ ser.put(k, v.asInstanceOf[Object]) }
+ m.foreach { case (k, v) ⇒ ser.put(k, v.asInstanceOf[Object]) }
- ser
- }
-
- def make(e: NCGeoEntry): NCNlpSentenceNote =
- e match {
- case x: NCGeoMetro ⇒
- mkNote(
- METRO,
- "metro" → x.name
- )
-
- case x: NCGeoContinent ⇒
- mkNote(
- CONTINENT,
- "continent" → x.name
- )
-
- case x: NCGeoSubContinent ⇒
- mkNote(
- SUBCONTINENT,
- "continent" → x.continent.name,
- "subcontinent" → x.name
- )
-
- case x: NCGeoCountry ⇒
- mkNote(
- COUNTRY,
- "continent" → x.subContinent.continent.name,
- "subcontinent" → x.subContinent.name,
- "country" → x.name,
- "countrymeta" → toSerializable(x.meta)
- )
-
- case x: NCGeoRegion ⇒
- mkNote(
- REGION,
- "continent" → x.country.subContinent.continent.name,
- "subcontinent" → x.country.subContinent.name,
- "country" → x.country.name,
- "region" → x.name,
- "countrymeta" → toSerializable(x.country.meta)
- )
-
- case x: NCGeoCity ⇒
- mkNote(
- CITY,
- "continent" → x.region.country.subContinent.continent.name,
- "subcontinent" → x.region.country.subContinent.name,
- "country" → x.region.country.name,
- "region" → x.region.name,
- "city" → x.name,
- "countrymeta" → toSerializable(x.region.country.meta),
- "citymeta" → toSerializable(x.meta)
- )
-
- case _ ⇒ throw new AssertionError(s"Unexpected data: $e")
+ ser
}
- def addAll(locs: Set[NCGeoEntry]): Unit =
- for (loc ← locs) {
- val note = make(loc)
+ def make(e: NCGeoEntry): NCNlpSentenceNote =
+ e match {
+ case x: NCGeoMetro ⇒
+ mkNote(
+ METRO,
+ "metro" → x.name
+ )
+
+ case x: NCGeoContinent ⇒
+ mkNote(
+ CONTINENT,
+ "continent" → x.name
+ )
+
+ case x: NCGeoSubContinent ⇒
+ mkNote(
+ SUBCONTINENT,
+ "continent" → x.continent.name,
+ "subcontinent" → x.name
+ )
+
+ case x: NCGeoCountry ⇒
+ mkNote(
+ COUNTRY,
+ "continent" → x.subContinent.continent.name,
+ "subcontinent" → x.subContinent.name,
+ "country" → x.name,
+ "countrymeta" → toSerializable(x.meta)
+ )
+
+ case x: NCGeoRegion ⇒
+ mkNote(
+ REGION,
+ "continent" → x.country.subContinent.continent.name,
+ "subcontinent" → x.country.subContinent.name,
+ "country" → x.country.name,
+ "region" → x.name,
+ "countrymeta" → toSerializable(x.country.meta)
+ )
+
+ case x: NCGeoCity ⇒
+ mkNote(
+ CITY,
+ "continent" → x.region.country.subContinent.continent.name,
+ "subcontinent" → x.region.country.subContinent.name,
+ "country" → x.region.country.name,
+ "region" → x.region.name,
+ "city" → x.name,
+ "countrymeta" → toSerializable(x.region.country.meta),
+ "citymeta" → toSerializable(x.meta)
+ )
+
+ case _ ⇒ throw new AssertionError(s"Unexpected data: $e")
+ }
- toks.foreach(t ⇒ t.add(note))
+ def addAll(locs: Set[NCGeoEntry]): Unit =
+ for (loc ← locs) {
+ val note = make(loc)
- // Other types(JJ etc) and quoted word are not re-marked.
- toks.filter(t ⇒ !NCPennTreebank.NOUNS_POS.contains(t.pos) && t.pos != "FW").
- foreach(t ⇒ ns.fixNote(t.getNlpNote, "pos" → NCPennTreebank.SYNTH_POS))
- }
+ toks.foreach(t ⇒ t.add(note))
- LOCATIONS.get(toks.map(_.normText).mkString(" ")) match {
- case Some(locs) ⇒
- // If multiple token match - add it.
- if (toks.length > 1)
- addAll(locs)
- else {
- // Only one token - toks.length == 1
- val t = toks.head
-
- // If LOCATION or noun - add it.
- if (NCPennTreebank.NOUNS_POS.contains(t.pos))
+ // Other types(JJ etc) and quoted word are not re-marked.
+ toks.filter(t ⇒ !NCPennTreebank.NOUNS_POS.contains(t.pos) && t.pos != "FW").
+ foreach(t ⇒ ns.fixNote(t.getNlpNote, "pos" → NCPennTreebank.SYNTH_POS))
+ }
+
+ locations.get(toks.map(_.normText).mkString(" ")) match {
+ case Some(locs) ⇒
+ // If multiple token match - add it.
+ if (toks.length > 1)
addAll(locs)
- // If US state - add it.
- else
- // For now - simply ignore abbreviations for US states that
- // conflict with commonly used English words. User will have to
- // use full names.
- if (!isConflictName(t.origText)) {
- def isTopCity(g: NCGeoCity): Boolean = {
- val name = glue(g.name, g.region.name)
-
- TOP_USA.contains(name) || TOP_WORLD.contains(name)
- }
+ else {
+ // Only one token - toks.length == 1
+ val t = toks.head
+
+ // If LOCATION or noun - add it.
+ if (NCPennTreebank.NOUNS_POS.contains(t.pos))
+ addAll(locs)
+ // If US state - add it.
+ else
+ // For now - simply ignore abbreviations for US states that
+ // conflict with commonly used English words. User will have to
+ // use full names.
+ if (!isConflictName(t.origText)) {
+ def isTopCity(g: NCGeoCity): Boolean = {
+ val name = glue(g.name, g.region.name)
+
+ topUsa.contains(name) || topWorld.contains(name)
+ }
- addAll(locs.collect {
- case g: NCGeoContinent ⇒ g
- case g: NCGeoSubContinent ⇒ g
- case g: NCGeoCountry ⇒ g
- case g: NCGeoMetro ⇒ g
- case g: NCGeoRegion if g.country.name == "united states" ⇒ g
- case g: NCGeoCity if isTopCity(g) ⇒ g
- })
+ addAll(locs.collect {
+ case g: NCGeoContinent ⇒ g
+ case g: NCGeoSubContinent ⇒ g
+ case g: NCGeoCountry ⇒ g
+ case g: NCGeoMetro ⇒ g
+ case g: NCGeoRegion if g.country.name == "united states" ⇒ g
+ case g: NCGeoCity if isTopCity(g) ⇒ g
+ })
+ }
+ // In all other cases - ignore one-token match.
}
- // In all other cases - ignore one-token match.
- }
- case None ⇒
- // Case sensitive synonyms.
- LOCATIONS.get(toks.map(_.origText).mkString(" ")) match {
- case Some(locs) ⇒ addAll(locs)
- case None ⇒
- // If there is no direct match try to convert JJs to NNs and re-check
- // for a possible match, e.g. "american" ⇒ "america".
- if (toks.size == 1) {
- val tok = toks.head
-
- if (NCPennTreebank.JJS_POS.contains(tok.pos)) {
- var endLoop = false
-
- for (noun ← NCWordNetManager.getNNsForJJ(tok.normText); if !endLoop) {
- def onResult(locs: Set[NCGeoEntry]): Unit = {
- addAll(locs)
- endLoop = true
- }
-
- LOCATIONS.get(noun) match {
- case Some(locs) ⇒ onResult(locs)
- case None ⇒
- LOCATIONS.get(noun.toLowerCase) match {
- case Some(locs) ⇒ onResult(locs)
- case None ⇒ // No-op.
- }
+ case None ⇒
+ // Case sensitive synonyms.
+ locations.get(toks.map(_.origText).mkString(" ")) match {
+ case Some(locs) ⇒ addAll(locs)
+ case None ⇒
+ // If there is no direct match try to convert JJs to NNs and re-check
+ // for a possible match, e.g. "american" ⇒ "america".
+ if (toks.size == 1) {
+ val tok = toks.head
+
+ if (NCPennTreebank.JJS_POS.contains(tok.pos)) {
+ var endLoop = false
+
+ for (noun ← NCWordNetManager.getNNsForJJ(tok.normText); if !endLoop) {
+ def onResult(locs: Set[NCGeoEntry]): Unit = {
+ addAll(locs)
+ endLoop = true
+ }
+
+ locations.get(noun) match {
+ case Some(locs) ⇒ onResult(locs)
+ case None ⇒
+ locations.get(noun.toLowerCase) match {
+ case Some(locs) ⇒ onResult(locs)
+ case None ⇒ // No-op.
+ }
+ }
}
}
}
- }
- }
+ }
+ }
}
- }
- collapse(ns)
+ collapse(ns)
+ }
}
private def getValue(note: NCNlpSentenceNote, key: String): String = note(key).asInstanceOf[String]
@@ -313,7 +331,7 @@ object NCGeoEnricher extends NCServerEnricher {
val excls = new mutable.HashSet[NCNlpSentenceNote]() ++ getGeoNotes(ns).filter(note ⇒ {
val kind = extractKind(note)
- COMMONS.get(kind) match {
+ commons.get(kind) match {
// GEO is common word defined directly or via synonym.
case Some(cs) ⇒
cs.contains(getName(kind, note)) ||
@@ -416,9 +434,9 @@ object NCGeoEnricher extends NCServerEnricher {
case CITY ⇒
val cityReg = glue(get("city"), get("region"))
- if (TOP_WORLD.contains(cityReg))
+ if (topWorld.contains(cityReg))
2
- else if (TOP_USA.contains(cityReg))
+ else if (topUsa.contains(cityReg))
1
else
0
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
index 0782359..5f9f0e6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
@@ -28,12 +28,10 @@ import scala.collection._
*/
object NCSpellCheckManager extends NCService {
case class Record(correct: String, misspellings: Seq[String])
+
+ private final val PATH = "spell/dictionary.json"
- private val dict: Map[String, String] = (
- for (rec ← NCJson.extractResource[List[Record]]("spell/dictionary.json", ignoreCase = true)) yield {
- for (v ← rec.misspellings) yield v → rec.correct
- })
- .flatten.toMap
+ @volatile private var dict: Map[String, String] = _
private def isWordUpper(s: String): Boolean = s.forall(_.isUpper)
private def isHeadUpper(s: String): Boolean = s.head.isUpper
@@ -47,10 +45,22 @@ object NCSpellCheckManager extends NCService {
s // Full lower case by default.
override def start(parent: Span): NCService = startScopedSpan("start", parent) { _ ⇒
+ NCJson.extractResourceOpt[List[Record]](PATH, ignoreCase = true) match {
+ case Some(recs) ⇒
+ dict = (for (rec ← recs) yield { for (v ← rec.misspellings) yield v → rec.correct } ).flatten.toMap
+ case None ⇒
+ // TODO: warning text.
+ logger.warn(s"Data not found for some reasons: $PATH")
+
+ dict = Map.empty
+ }
+
super.start()
}
override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
+ dict = null
+
super.stop()
}
@@ -62,15 +72,16 @@ object NCSpellCheckManager extends NCService {
*
* @param in Word to check.
*/
- def check(in: String): String = dict.get(in.toLowerCase) match {
- case None ⇒ in
- case Some(out) ⇒
- val inSeq = split(in)
- val outSeq = split(out)
-
- if (inSeq.lengthCompare(outSeq.size) == 0)
- outSeq.zip(inSeq).map(p ⇒ processCase(p._1, p._2)).mkString(" ")
- else
- processCase(out, in)
- }
+ def check(in: String): String =
+ dict.get(in.toLowerCase) match {
+ case None ⇒ in
+ case Some(out) ⇒
+ val inSeq = split(in)
+ val outSeq = split(out)
+
+ if (inSeq.lengthCompare(outSeq.size) == 0)
+ outSeq.zip(inSeq).map(p ⇒ processCase(p._1, p._2)).mkString(" ")
+ else
+ processCase(out, in)
+ }
}