Posted to commits@nlpcraft.apache.org by se...@apache.org on 2020/06/23 10:36:08 UTC

[incubator-nlpcraft] 01/03: Spell and Geo enricher refactored.

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-85
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit 57d490f821a79e7cf228f96cf8efef1937ae7dce
Author: Sergey Kamov <se...@apache.org>
AuthorDate: Tue Jun 23 13:24:32 2020 +0300

    Spell and Geo enricher refactored.
---
 .../apache/nlpcraft/server/geo/NCGeoManager.scala  |  16 +-
 .../org/apache/nlpcraft/server/json/NCJson.scala   |  11 +
 .../server/nlp/enrichers/geo/NCGeoEnricher.scala   | 336 +++++++++++----------
 .../server/nlp/spell/NCSpellCheckManager.scala     |  43 ++-
 4 files changed, 230 insertions(+), 176 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
index c4c3fa4..182cdc2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/geo/NCGeoManager.scala
@@ -19,6 +19,7 @@ package org.apache.nlpcraft.server.geo
 
 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common.nlp.dict.{NCDictionaryManager, NCDictionaryType}
+import org.apache.nlpcraft.common.util.NCUtils
 import org.apache.nlpcraft.common.{NCService, _}
 import org.apache.nlpcraft.server.json.NCJson
 
@@ -82,7 +83,20 @@ object NCGeoManager extends NCService {
       */
     @throws[NCE]
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
-        model = readAndConstructModel(true)
+        val ok =
+            Seq(
+                COUNTRY_DIR ,
+                CONT_PATH,
+                METRO_PATH,
+                SYNONYMS_DIR_PATH,
+                CASE_SENSITIVE_DIR_PATH
+            ).forall(NCUtils.hasResource)
+
+        if (ok)
+            model = readAndConstructModel(true)
+        else
+            // TODO: warning text.
+            logger.warn(s"Some GEO Data not found for some reasons")
         
         super.start()
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
index 99c5fe6..08e3981 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/json/NCJson.scala
@@ -300,6 +300,17 @@ object NCJson {
             case e: Throwable ⇒ throw new NCE(s"Failed to parse: $res", e)
         }
 
+    /**
+      * Extracts type optional `T` from given JSON `file`.
+      *
+      * @param res Resource to extract from.
+      * @param ignoreCase Whether or not to ignore case.
+      * @tparam T Type of the object to extract.
+      */
+    @throws[NCE]
+    def extractResourceOpt[T: Manifest](res: String, ignoreCase: Boolean): Option[T] =
+        if (U.hasResource(res)) Some(extractResource(res, ignoreCase)) else None
+
     // Gets string with removed symbol + from exponent part of numbers.
     // It is developed to avoid Lift parsing errors during processing numbers like '2E+2'.
     @tailrec
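
The new `extractResourceOpt` wraps the existing throwing extractor in an `Option` when the resource is absent. A simplified sketch of that shape, using a plain classpath lookup and string read instead of the real Lift-based JSON extraction (the method name and signature below are assumptions for illustration only):

    object ResourceOptSketch {
        import scala.io.Source

        // Returns None when the classpath resource is absent instead of throwing.
        def readResourceOpt(res: String): Option[String] =
            Option(getClass.getClassLoader.getResourceAsStream(res)).map { in =>
                try Source.fromInputStream(in, "UTF-8").mkString
                finally in.close()
            }
    }

A caller can then pattern-match on `Some`/`None` to decide between normal initialization and a degraded mode, which is exactly how the spell-check manager uses the new method further down in this commit.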
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
index dc93989..a9a9434 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/geo/NCGeoEnricher.scala
@@ -52,35 +52,18 @@ object NCGeoEnricher extends NCServerEnricher {
     // Common word exceptions configuration folder.
     private final val EXCEPTIONS_PATH = "geo/exceptions"
 
-    @throws[NCE]
-    private[geo] final val LOCATIONS: Map[String, Set[NCGeoEntry]] = NCGeoManager.getModel.synonyms
-
-    // GEO names matched with common english words and user defined exception GEO names.
-    // Note that 'ignore case' parameter set as false because DLGeoLocationKind definition (CITY ect)
-    @throws[NCE]
-    // TODO: refactor... incomprehensible!
-    private final val COMMONS: Map[NCGeoLocationKind, Set[String]]  =
-        U.getFilesResources(EXCEPTIONS_PATH).
-            flatMap(f ⇒
-                NCJson.extractResource[immutable.Map[String, immutable.Set[String]]](f, ignoreCase = false).
-                    map(p ⇒ NCGeoLocationKind.withName(p._1.toUpperCase) → p._2)
-            ).groupBy(_._1).map(p ⇒ p._1 → p._2.flatMap(_._2).toSet).map(p ⇒ p._1 → p._2.map(_.toLowerCase))
-
     private final val GEO_TYPES: Set[String] = NCGeoLocationKind.values.map(mkName)
 
+    @volatile private[geo] var locations: Map[String, Set[NCGeoEntry]] = _
+    @volatile private var commons: Map[NCGeoLocationKind, Set[String]] = _
+    @volatile private var topUsa: Set[String] = _
+    @volatile private var topWorld: Set[String] = _
+
     // JSON extractor for largest cities.
     case class TopCity(name: String, region: String)
 
     private def glue(s: String*): String = s.map(_.toLowerCase).mkString("|")
 
-    private final val TOP_USA: Set[String] =
-        NCJson.extractResource[List[TopCity]](US_TOP_PATH, ignoreCase = true).
-            map(city ⇒ glue(city.name, city.region)).toSet
-
-    private final val TOP_WORLD: Set[String] =
-        NCJson.extractResource[List[TopCity]](WORLD_TOP_PATH, ignoreCase = true).
-            map(city ⇒ glue(city.name, city.region)).toSet
-
     private def isConflictName(name: String): Boolean =
         US_CONFLICT_STATES.contains(name.toLowerCase) && name.exists(_.isLower)
 
@@ -92,167 +75,202 @@ object NCGeoEnricher extends NCServerEnricher {
     private def getGeoNotes(t: NCNlpSentenceToken): Set[NCNlpSentenceNote] = GEO_TYPES.flatMap(t.getNotes)
 
     override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ ⇒
+        locations = null
+        commons = null
+        topUsa = null
+        topWorld = null
+
         super.stop()
     }
 
+    @throws[NCE]
     override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { _ ⇒
+         NCGeoManager.getModel match {
+            case null ⇒
+                // TODO: warning text.
+                logger.warn(s"Some GEO Data not found for some reasons")
+
+            case mdl ⇒
+                locations = mdl.synonyms
+
+                // TODO: refactor... incomprehensible!
+                // GEO names matched with common english words and user defined exception GEO names.
+                // Note that 'ignore case' parameter set as false because DLGeoLocationKind definition (CITY ect)
+                commons =
+                    U.getFilesResources(EXCEPTIONS_PATH).
+                        flatMap(f ⇒
+                            NCJson.extractResource[immutable.Map[String, immutable.Set[String]]](f, ignoreCase = false).
+                                map(p ⇒ NCGeoLocationKind.withName(p._1.toUpperCase) → p._2)
+                        ).groupBy(_._1).map(p ⇒ p._1 → p._2.flatMap(_._2).toSet).map(p ⇒ p._1 → p._2.map(_.toLowerCase))
+
+                topUsa =
+                    NCJson.extractResource[List[TopCity]](US_TOP_PATH, ignoreCase = true).
+                        map(city ⇒ glue(city.name, city.region)).toSet
+
+                topWorld =
+                    NCJson.extractResource[List[TopCity]](WORLD_TOP_PATH, ignoreCase = true).
+                        map(city ⇒ glue(city.name, city.region)).toSet
+         }
+
         super.start()
     }
 
     @throws[NCE]
     override def enrich(ns: NCNlpSentence, parent: Span = null): Unit =
         startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "txt" → ns.text) { _ ⇒
-            // This stage must not be 1st enrichment stage.
-            assume(ns.nonEmpty)
+            if (locations != null) {
+                // This stage must not be 1st enrichment stage.
+                assume(ns.nonEmpty)
 
-            for (toks ← ns.tokenMixWithStopWords(withQuoted = true)) {
-                def mkNote(kind: NCGeoLocationKind, seq: (String, Any)*): NCNlpSentenceNote =
-                    NCNlpSentenceNote(toks.map(_.index), mkName(kind), seq :_*)
+                for (toks ← ns.tokenMixWithStopWords(withQuoted = true)) {
+                    def mkNote(kind: NCGeoLocationKind, seq: (String, Any)*): NCNlpSentenceNote =
+                        NCNlpSentenceNote(toks.map(_.index), mkName(kind), seq: _*)
 
-                def toSerializable(m: Map[String, Any]): java.io.Serializable= {
-                    val ser = new util.HashMap[String, Object]()
+                    def toSerializable(m: Map[String, Any]): java.io.Serializable = {
+                        val ser = new util.HashMap[String, Object]()
 
-                    m.foreach { case (k, v) ⇒ ser.put(k, v.asInstanceOf[Object]) }
+                        m.foreach { case (k, v) ⇒ ser.put(k, v.asInstanceOf[Object]) }
 
-                    ser
-                }
-
-                def make(e: NCGeoEntry): NCNlpSentenceNote =
-                    e match {
-                        case x: NCGeoMetro ⇒
-                            mkNote(
-                                METRO,
-                                "metro" → x.name
-                            )
-
-                        case x: NCGeoContinent ⇒
-                            mkNote(
-                                CONTINENT,
-                                "continent" → x.name
-                            )
-
-                        case x: NCGeoSubContinent ⇒
-                            mkNote(
-                                SUBCONTINENT,
-                                "continent" → x.continent.name,
-                                "subcontinent" → x.name
-                            )
-
-                        case x: NCGeoCountry ⇒
-                            mkNote(
-                                COUNTRY,
-                                "continent" → x.subContinent.continent.name,
-                                "subcontinent" → x.subContinent.name,
-                                "country" → x.name,
-                                "countrymeta" → toSerializable(x.meta)
-                            )
-
-                        case x: NCGeoRegion ⇒
-                            mkNote(
-                                REGION,
-                                "continent" → x.country.subContinent.continent.name,
-                                "subcontinent" → x.country.subContinent.name,
-                                "country" → x.country.name,
-                                "region" → x.name,
-                                "countrymeta" → toSerializable(x.country.meta)
-                            )
-
-                        case x: NCGeoCity ⇒
-                            mkNote(
-                                CITY,
-                                "continent" → x.region.country.subContinent.continent.name,
-                                "subcontinent" → x.region.country.subContinent.name,
-                                "country" → x.region.country.name,
-                                "region" → x.region.name,
-                                "city" → x.name,
-                                "countrymeta" → toSerializable(x.region.country.meta),
-                                "citymeta" → toSerializable(x.meta)
-                            )
-                            
-                        case _ ⇒ throw new AssertionError(s"Unexpected data: $e")
+                        ser
                     }
 
-                def addAll(locs: Set[NCGeoEntry]): Unit =
-                    for (loc ← locs) {
-                        val note = make(loc)
+                    def make(e: NCGeoEntry): NCNlpSentenceNote =
+                        e match {
+                            case x: NCGeoMetro ⇒
+                                mkNote(
+                                    METRO,
+                                    "metro" → x.name
+                                )
+
+                            case x: NCGeoContinent ⇒
+                                mkNote(
+                                    CONTINENT,
+                                    "continent" → x.name
+                                )
+
+                            case x: NCGeoSubContinent ⇒
+                                mkNote(
+                                    SUBCONTINENT,
+                                    "continent" → x.continent.name,
+                                    "subcontinent" → x.name
+                                )
+
+                            case x: NCGeoCountry ⇒
+                                mkNote(
+                                    COUNTRY,
+                                    "continent" → x.subContinent.continent.name,
+                                    "subcontinent" → x.subContinent.name,
+                                    "country" → x.name,
+                                    "countrymeta" → toSerializable(x.meta)
+                                )
+
+                            case x: NCGeoRegion ⇒
+                                mkNote(
+                                    REGION,
+                                    "continent" → x.country.subContinent.continent.name,
+                                    "subcontinent" → x.country.subContinent.name,
+                                    "country" → x.country.name,
+                                    "region" → x.name,
+                                    "countrymeta" → toSerializable(x.country.meta)
+                                )
+
+                            case x: NCGeoCity ⇒
+                                mkNote(
+                                    CITY,
+                                    "continent" → x.region.country.subContinent.continent.name,
+                                    "subcontinent" → x.region.country.subContinent.name,
+                                    "country" → x.region.country.name,
+                                    "region" → x.region.name,
+                                    "city" → x.name,
+                                    "countrymeta" → toSerializable(x.region.country.meta),
+                                    "citymeta" → toSerializable(x.meta)
+                                )
+
+                            case _ ⇒ throw new AssertionError(s"Unexpected data: $e")
+                        }
 
-                        toks.foreach(t ⇒ t.add(note))
+                    def addAll(locs: Set[NCGeoEntry]): Unit =
+                        for (loc ← locs) {
+                            val note = make(loc)
 
-                        // Other types(JJ etc) and quoted word are not re-marked.
-                        toks.filter(t ⇒ !NCPennTreebank.NOUNS_POS.contains(t.pos) && t.pos != "FW").
-                            foreach(t ⇒ ns.fixNote(t.getNlpNote, "pos" → NCPennTreebank.SYNTH_POS))
-                    }
+                            toks.foreach(t ⇒ t.add(note))
 
-                LOCATIONS.get(toks.map(_.normText).mkString(" ")) match {
-                    case Some(locs) ⇒
-                        // If multiple token match - add it.
-                        if (toks.length > 1)
-                            addAll(locs)
-                        else {
-                            // Only one token - toks.length == 1
-                            val t = toks.head
-
-                            // If LOCATION or noun - add it.
-                            if (NCPennTreebank.NOUNS_POS.contains(t.pos))
+                            // Other types(JJ etc) and quoted word are not re-marked.
+                            toks.filter(t ⇒ !NCPennTreebank.NOUNS_POS.contains(t.pos) && t.pos != "FW").
+                                foreach(t ⇒ ns.fixNote(t.getNlpNote, "pos" → NCPennTreebank.SYNTH_POS))
+                        }
+
+                    locations.get(toks.map(_.normText).mkString(" ")) match {
+                        case Some(locs) ⇒
+                            // If multiple token match - add it.
+                            if (toks.length > 1)
                                 addAll(locs)
-                            // If US state - add it.
-                            else
-                            // For now - simply ignore abbreviations for US states that
-                            // conflict with commonly used English words. User will have to
-                            // use full names.
-                            if (!isConflictName(t.origText)) {
-                                def isTopCity(g: NCGeoCity): Boolean = {
-                                    val name = glue(g.name, g.region.name)
-
-                                    TOP_USA.contains(name) || TOP_WORLD.contains(name)
-                                }
+                            else {
+                                // Only one token - toks.length == 1
+                                val t = toks.head
+
+                                // If LOCATION or noun - add it.
+                                if (NCPennTreebank.NOUNS_POS.contains(t.pos))
+                                    addAll(locs)
+                                // If US state - add it.
+                                else
+                                // For now - simply ignore abbreviations for US states that
+                                // conflict with commonly used English words. User will have to
+                                // use full names.
+                                if (!isConflictName(t.origText)) {
+                                    def isTopCity(g: NCGeoCity): Boolean = {
+                                        val name = glue(g.name, g.region.name)
+
+                                        topUsa.contains(name) || topWorld.contains(name)
+                                    }
 
-                                addAll(locs.collect {
-                                    case g: NCGeoContinent ⇒ g
-                                    case g: NCGeoSubContinent ⇒ g
-                                    case g: NCGeoCountry ⇒ g
-                                    case g: NCGeoMetro ⇒ g
-                                    case g: NCGeoRegion if g.country.name == "united states" ⇒ g
-                                    case g: NCGeoCity if isTopCity(g) ⇒ g
-                                })
+                                    addAll(locs.collect {
+                                        case g: NCGeoContinent ⇒ g
+                                        case g: NCGeoSubContinent ⇒ g
+                                        case g: NCGeoCountry ⇒ g
+                                        case g: NCGeoMetro ⇒ g
+                                        case g: NCGeoRegion if g.country.name == "united states" ⇒ g
+                                        case g: NCGeoCity if isTopCity(g) ⇒ g
+                                    })
+                                }
+                                // In all other cases - ignore one-token match.
                             }
-                            // In all other cases - ignore one-token match.
-                        }
-                    case None ⇒
-                        // Case sensitive synonyms.
-                        LOCATIONS.get(toks.map(_.origText).mkString(" ")) match {
-                            case Some(locs) ⇒ addAll(locs)
-                            case None ⇒
-                                // If there is no direct match try to convert JJs to NNs and re-check
-                                // for a possible match, e.g. "american" ⇒ "america".
-                                if (toks.size == 1) {
-                                    val tok = toks.head
-
-                                    if (NCPennTreebank.JJS_POS.contains(tok.pos)) {
-                                        var endLoop = false
-
-                                        for (noun ← NCWordNetManager.getNNsForJJ(tok.normText); if !endLoop) {
-                                            def onResult(locs: Set[NCGeoEntry]): Unit = {
-                                                addAll(locs)
-                                                endLoop = true
-                                            }
-
-                                            LOCATIONS.get(noun) match {
-                                                case Some(locs) ⇒ onResult(locs)
-                                                case None ⇒
-                                                    LOCATIONS.get(noun.toLowerCase) match {
-                                                        case Some(locs) ⇒ onResult(locs)
-                                                        case None ⇒ // No-op.
-                                                    }
+                        case None ⇒
+                            // Case sensitive synonyms.
+                            locations.get(toks.map(_.origText).mkString(" ")) match {
+                                case Some(locs) ⇒ addAll(locs)
+                                case None ⇒
+                                    // If there is no direct match try to convert JJs to NNs and re-check
+                                    // for a possible match, e.g. "american" ⇒ "america".
+                                    if (toks.size == 1) {
+                                        val tok = toks.head
+
+                                        if (NCPennTreebank.JJS_POS.contains(tok.pos)) {
+                                            var endLoop = false
+
+                                            for (noun ← NCWordNetManager.getNNsForJJ(tok.normText); if !endLoop) {
+                                                def onResult(locs: Set[NCGeoEntry]): Unit = {
+                                                    addAll(locs)
+                                                    endLoop = true
+                                                }
+
+                                                locations.get(noun) match {
+                                                    case Some(locs) ⇒ onResult(locs)
+                                                    case None ⇒
+                                                        locations.get(noun.toLowerCase) match {
+                                                            case Some(locs) ⇒ onResult(locs)
+                                                            case None ⇒ // No-op.
+                                                        }
+                                                }
                                             }
                                         }
                                     }
-                                }
-                        }
+                            }
+                    }
                 }
-            }
 
-            collapse(ns)
+                collapse(ns)
+            }
         }
 
     private def getValue(note: NCNlpSentenceNote, key: String): String = note(key).asInstanceOf[String]
@@ -313,7 +331,7 @@ object NCGeoEnricher extends NCServerEnricher {
         val excls = new mutable.HashSet[NCNlpSentenceNote]() ++ getGeoNotes(ns).filter(note ⇒ {
             val kind = extractKind(note)
 
-            COMMONS.get(kind) match {
+            commons.get(kind) match {
                 // GEO is common word defined directly or via synonym.
                 case Some(cs) ⇒
                     cs.contains(getName(kind, note)) ||
@@ -416,9 +434,9 @@ object NCGeoEnricher extends NCServerEnricher {
                 case CITY ⇒
                     val cityReg = glue(get("city"), get("region"))
 
-                    if (TOP_WORLD.contains(cityReg))
+                    if (topWorld.contains(cityReg))
                         2
-                    else if (TOP_USA.contains(cityReg))
+                    else if (topUsa.contains(cityReg))
                         1
                     else
                         0
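
The enricher refactoring above follows one lifecycle pattern throughout: eager `final val`s become `@volatile var`s, `start()` populates them only when the geo model is available, `stop()` clears them, and `enrich()` is a no-op when the state was never loaded. A condensed sketch of that pattern, with illustrative names and data rather than the enricher's real types:

    object LazyStateServiceSketch {
        // State is loaded in start() and cleared in stop(); null means "not available".
        @volatile private var synonyms: Map[String, Set[String]] = _

        def start(data: Option[Map[String, Set[String]]]): Unit =
            data match {
                case Some(m) => synonyms = m
                case None    => println("Data not found - enrichment disabled.")
            }

        def stop(): Unit = synonyms = null

        def enrich(text: String): Set[String] =
            if (synonyms != null) synonyms.getOrElse(text.toLowerCase, Set.empty)
            else Set.empty
    }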
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
index 0782359..5f9f0e6 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/spell/NCSpellCheckManager.scala
@@ -28,12 +28,10 @@ import scala.collection._
   */
 object NCSpellCheckManager extends NCService {
     case class Record(correct: String, misspellings: Seq[String])
+
+    private final val PATH = "spell/dictionary.json"
     
-    private val dict: Map[String, String] = (
-        for (rec ← NCJson.extractResource[List[Record]]("spell/dictionary.json", ignoreCase = true)) yield {
-            for (v ← rec.misspellings) yield v → rec.correct
-        })
-        .flatten.toMap
+    @volatile private var dict: Map[String, String] = _
     
     private def isWordUpper(s: String): Boolean = s.forall(_.isUpper)
     private def isHeadUpper(s: String): Boolean = s.head.isUpper
@@ -47,10 +45,22 @@ object NCSpellCheckManager extends NCService {
             s // Full lower case by default.
     
     override def start(parent: Span): NCService = startScopedSpan("start", parent) { _ ⇒
+        NCJson.extractResourceOpt[List[Record]](PATH, ignoreCase = true) match {
+            case Some(recs) ⇒
+                dict = (for (rec ← recs) yield { for (v ← rec.misspellings) yield v → rec.correct } ).flatten.toMap
+            case None ⇒
+                // TODO: warning text.
+                logger.warn(s"Data not found for some reasons: $PATH")
+
+                dict = Map.empty
+        }
+
         super.start()
     }
     
     override def stop(parent: Span): Unit = startScopedSpan("stop", parent) { _ ⇒
+        dict = null
+
         super.stop()
     }
     
@@ -62,15 +72,16 @@ object NCSpellCheckManager extends NCService {
       *
       * @param in Word to check.
       */
-    def check(in: String): String = dict.get(in.toLowerCase) match {
-        case None ⇒ in
-        case Some(out) ⇒
-            val inSeq = split(in)
-            val outSeq = split(out)
-            
-            if (inSeq.lengthCompare(outSeq.size) == 0)
-                outSeq.zip(inSeq).map(p ⇒ processCase(p._1, p._2)).mkString(" ")
-            else
-                processCase(out, in)
-    }
+    def check(in: String): String =
+        dict.get(in.toLowerCase) match {
+            case None ⇒ in
+            case Some(out) ⇒
+                val inSeq = split(in)
+                val outSeq = split(out)
+
+                if (inSeq.lengthCompare(outSeq.size) == 0)
+                    outSeq.zip(inSeq).map(p ⇒ processCase(p._1, p._2)).mkString(" ")
+                else
+                    processCase(out, in)
+        }
 }
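
The dictionary construction moved into `start()` above flattens each record's misspellings into a single lookup map keyed by misspelling. A small runnable sketch of that construction; `Record` mirrors the case class in the diff, while the sample data is made up:

    object SpellDictSketch extends App {
        case class Record(correct: String, misspellings: Seq[String])

        val recs = List(
            Record("received", Seq("recieved", "receeved")),
            Record("separate", Seq("seperate"))
        )

        // Each misspelling becomes a key pointing at its correct form.
        val dict: Map[String, String] =
            recs.flatMap(r => r.misspellings.map(_ -> r.correct)).toMap

        println(dict.getOrElse("recieved", "recieved")) // prints: received
    }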