You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by se...@apache.org on 2021/06/28 14:09:43 UTC
[incubator-nlpcraft] 01/02: WIP.
This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 544d8a40860393698ec80b5c992e890770469c16
Author: Sergey Kamov <sk...@gmail.com>
AuthorDate: Mon Jun 28 14:33:32 2021 +0300
WIP.
---
nlpcraft/pom.xml | 4 ++++
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 2 +-
.../enrichers/ctxword/NCContextWordEnricher.scala | 26 ++++++++++++++++++++--
pom.xml | 7 ++++++
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/nlpcraft/pom.xml b/nlpcraft/pom.xml
index 4d8c292..62d3683 100644
--- a/nlpcraft/pom.xml
+++ b/nlpcraft/pom.xml
@@ -232,6 +232,10 @@
<groupId>org.jline</groupId>
<artifactId>jline</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.jibx</groupId>
+ <artifactId>jibx-tools</artifactId>
+ </dependency>
<!-- Test dependencies. -->
<dependency>
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index d237558..ed22935 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -52,7 +52,7 @@ class NCNlpSentence(
val srvReqId: String,
val text: String,
val enabledBuiltInToks: Set[String],
- val mlConfig: Option[NCModelMLConfigMdo],
+ val mlConfig: Option[NCModelMLConfigMdo] = None,
var mlData: Map[Int, Map[String, Double]] = Map.empty,
override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
var firstProbePhase: Boolean = true,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 4982e29..79c970e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -18,12 +18,16 @@
package org.apache.nlpcraft.server.nlp.enrichers.ctxword
import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.core.NCNlpPorterStemmer.stem
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
+import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCE, NCService, U}
import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
+import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, NCSuggestionRequest, NCWordSuggestion}
+import org.jibx.schema.codegen.extend.DefaultNameConverter
import scala.collection.mutable
import scala.concurrent.Await
@@ -33,9 +37,14 @@ import scala.concurrent.duration.Duration
* ContextWord enricher.
*/
object NCContextWordEnricher extends NCServerEnricher {
+ private final val POS_PLURALS = Set("NNS", "NNPS")
+ private final val POS_SINGULAR = Set("NN", "NNP")
+
private final val MAX_CTXWORD_SCORE = 2
private final val EXCLUSIVE_MIN_SCORE = -1.0
+ private final val CONVERTER = new DefaultNameConverter
+
private case class ModelProbeKey(probeId: String, modelId: String)
private case class ElementScore(elementId: String, averageScore: Double, senScore: Double, sampleScore: Double)
@@ -171,6 +180,19 @@ object NCContextWordEnricher extends NCServerEnricher {
@throws[NCE]
private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = {
val sampleWords = cfg.samples.map(spaceTokenize).toSeq
+
+
+ sampleWords.map(s => {
+ val sampleSen = new NCNlpSentence("sampleReqId", sampleWords.mkString(" "), Set.empty)
+
+ NCBaseNlpEnricher.enrich(sampleSen)
+
+ sampleSen.
+ })
+
+
+
+
val sampleWordsStems = sampleWords.map(_.map(stem))
val recs: Map[String, Seq[NCSuggestionRequest]] =
@@ -227,7 +249,7 @@ object NCContextWordEnricher extends NCServerEnricher {
}
}
- val nounToks = ns.tokens.filter(_.pos.startsWith("N"))
+ val nounToks = ns.tokens.filter(t => NCPennTreebank.NOUNS_POS.contains(t.pos))
if (nounToks.nonEmpty) {
val key = ModelProbeKey(cfg.probeId, cfg.modelId)
diff --git a/pom.xml b/pom.xml
index fd0d687..59e871a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -154,6 +154,7 @@
<lightstep.grpc.ver>0.15.8</lightstep.grpc.ver>
<junit.ver>5.5.1</junit.ver>
<jsonpath.ver>2.4.0</jsonpath.ver>
+ <jibx.tools.ver>1.3.3</jibx.tools.ver>
<!-- Force specific encoding on text resources. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -474,6 +475,12 @@
</dependency>
<dependency>
+ <groupId>org.jibx</groupId>
+ <artifactId>jibx-tools</artifactId>
+ <version>${jibx.tools.ver}</version>
+ </dependency>
+
+ <dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>${stanford.corenlp.ver}</version>