You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/12/17 03:46:01 UTC
[incubator-nlpcraft] branch master_test updated: WIP
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch master_test
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master_test by this push:
new 94985df WIP
94985df is described below
commit 94985df3440f528004c60464d7cc34c93e6257fd
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Thu Dec 16 19:41:58 2021 -0800
WIP
---
.../parser/opennlp/NCOpenNlpTokenParserImpl.scala | 231 ---------------------
.../token/parser/opennlp/impl/NCOpenNlpImpl.scala | 70 ++++++-
2 files changed, 68 insertions(+), 233 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
deleted file mode 100644
index 7d19ed1..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/NCOpenNlpTokenParserImpl.scala
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.internal.nlp.token.parser.opennlp
-
-import opennlp.tools.lemmatizer.DictionaryLemmatizer
-import opennlp.tools.postag.{POSModel, POSTagger, POSTaggerME}
-import opennlp.tools.stemmer.PorterStemmer
-import opennlp.tools.tokenize.{Tokenizer, TokenizerME, TokenizerModel}
-import org.apache.nlpcraft.*
-
-import java.io.{File, FileNotFoundException, IOException, InputStream as IS, BufferedInputStream as BIS, FileInputStream as FIS}
-import java.net.URL
-import java.util
-import java.util.{List, Objects}
-import scala.concurrent.ExecutionContext
-import scala.jdk.CollectionConverters.SeqHasAsJava
-import scala.util.Using
-import scala.util.control.Exception.catching
-
-private[opennlp] object NCOpenNlpTokenParserImpl {
- /**
- *
- * @param tokenizer
- * @param tagger
- * @param lemmatizer
- */
- @throws[NullPointerException]
- @throws[NCException]
- def apply(
- tokenizer: File,
- tagger: File,
- lemmatizer: File
- ): NCOpenNlpTokenParserImpl = {
- verify(tokenizer, tagger, lemmatizer)
-
- try
- new NCOpenNlpTokenParserImpl(
- new BIS(new FIS(tokenizer)), new BIS(new FIS(tagger)), new BIS(new FIS(lemmatizer))
- )
- catch {
- // TODO:
- case e: FileNotFoundException => throw new NCException("Error reading configuration files.", e)
- }
- }
-
- /**
- *
- * @param tokenizer
- * @param tagger
- * @param lemmatizer
- */
- @throws[NullPointerException]
- @throws[NCException]
- def apply(
- tokenizer: String,
- tagger: String,
- lemmatizer: String
- ): NCOpenNlpTokenParserImpl = {
- verify(tokenizer, tagger, lemmatizer)
-
- new NCOpenNlpTokenParserImpl(
- getStream(tokenizer),
- getStream(tagger),
- getStream(lemmatizer)
- )
- }
-
- /**
- *
- * @param tokenizer
- * @param tagger
- * @param lemmatizer
- */
- @throws[NullPointerException]
- @throws[NCException]
- def apply(
- tokenizer: URL,
- tagger: URL,
- lemmatizer: URL
- ): NCOpenNlpTokenParserImpl = {
- verify(tokenizer, tagger, lemmatizer)
-
- try
- new NCOpenNlpTokenParserImpl(
- new BIS(tokenizer.openStream), new BIS(tagger.openStream), new BIS(lemmatizer.openStream)
- )
- catch {
- // TODO:
- case e: IOException => throw new NCException("Error reading configuration URLs.", e)
- }
- }
-
- /**
- *
- * @param tokenizer
- * @param tagger
- * @param lemmatizer
- */
- @throws[NullPointerException]
- def apply(tokenizer: IS, tagger: IS, lemmatizer: IS): NCOpenNlpTokenParserImpl = {
- verify(tokenizer, tagger, lemmatizer)
-
- new NCOpenNlpTokenParserImpl(tokenizer, tagger, lemmatizer)
- }
-
- @throws[NullPointerException]
- private def verify(tokenizer: Any, tagger: Any, lemmatizer: Any): Unit = {
- Objects.requireNonNull(tokenizer, "Argument `tokenizer` cannot be null")
- Objects.requireNonNull(tagger, "Argument `tagger` cannot be null")
- Objects.requireNonNull(lemmatizer, "Argument `lemmatizer` cannot be null")
- }
-
- @throws[NCException]
- private def getStream(res: String) = {
- val in = this.getClass.getClassLoader.getResourceAsStream(res)
-
- if (in == null) // TODO:
- throw new NCException("Error reading resource: " + res)
-
- new BIS(in)
- }
-}
-
-/**
- *
- * @param tokenizerStream
- * @param taggerStream
- * @param lemmatizerStream
- */
-private[opennlp] class NCOpenNlpTokenParserImpl(tokenizerStream: IS, taggerStream: IS, lemmatizerStream: IS) extends NCTokenParser {
- private val stemmer = new PorterStemmer
-
- var extraStopWords: util.List[String] = _
- var excludedStopWords: util.List[String] = _
-
- @volatile private var tokenizer: Tokenizer = _
- @volatile private var tagger: POSTagger = _
- @volatile private var lemmatizer: DictionaryLemmatizer = _
-
- override def parse(req: NCRequest): util.List[NCToken] = {
- case class Holder(origin: String, normalized: String, start: Int, end: Int, lenght: Int)
- abstract class NCOpenNlpToken extends NCParameterizedAdapter with NCToken
-
- val sen = req.getNormalizedText
-
- val hs =
- tokenizer.tokenizePos(sen).map(
- t => {
- val txt = t.getCoveredText(sen).toString
-
- Holder(txt, txt.toLowerCase, t.getStart, t.getEnd, t.length())
- }
- )
-
- val words = hs.map(_.origin)
- val poses = tagger.tag(words)
-
- require(hs.length == poses.length)
-
- var lemmas = lemmatizer.lemmatize(words, poses).toSeq
-
- // Hack.
- // For some reasons lemmatizer dictionary (en-lemmatizer.dict) marks some words with non-existent POS 'NNN'
- // Valid POS list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
- // Example of dictionary records:
- // ...
- // time JJ time
- // time NNN time
- // ...
- // time-ball NN time-ball
- // ...
- val suspIdxs =
- lemmas.
- zip(poses).
- zipWithIndex.flatMap {
- // "0" is flag that lemma cannot be obtained for some reasons.
- case ((lemma, pos), i) => if (lemma == "O" && pos == "NN") Some(i) else None
- }
-
- if (suspIdxs.nonEmpty) {
- val fixes: Map[Int, String] =
- lemmatizer.
- lemmatize(suspIdxs.map(i => words(i)).toArray, suspIdxs.map(_ => "NNN").toArray).
- zipWithIndex.
- flatMap { case (lemma, i) => if (lemma != "0") Some(suspIdxs(i) -> lemma) else None }.toMap
-
- lemmas = lemmas.zipWithIndex.map { case (lemma, idx) => fixes.getOrElse(idx, lemma) }
- }
-
- hs.zip(poses).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
- new NCOpenNlpToken() {
- override def getOriginalText: String = h.origin
- override def getNormalizedText: String = h.normalized
- override def getLemma: String = lemma
- override def getStem: String = stemmer.stem(h.normalized)
- override def getPos: String = pos
- override def isStopWord: Boolean = true // TODO: implement
- override def getStartCharIndex: Int = h.start
- override def getEndCharIndex: Int = h.end
- override def getLength: Int = h.lenght
- }
- }.asJava
- }
-
- override def start(): Unit = {
- tokenizer = new TokenizerME(new TokenizerModel(tokenizerStream))
- tagger = new POSTaggerME(new POSModel(taggerStream))
- lemmatizer = new DictionaryLemmatizer(lemmatizerStream)
- }
-
- override def stop(): Unit = {
- lemmatizer = null
- tagger = null
- tokenizer = null
- }
-}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
index fa600c4..bd4c272 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/nlp/token/parser/opennlp/impl/NCOpenNlpImpl.scala
@@ -18,9 +18,75 @@
package org.apache.nlpcraft.internal.nlp.token.parser.opennlp.impl
import org.apache.nlpcraft.*
+import java.io.*
+
+import opennlp.tools.lemmatizer.*
+import opennlp.tools.postag.*
+import opennlp.tools.stemmer.*
+import opennlp.tools.tokenize.*
+
+import scala.jdk.CollectionConverters.*
/**
*
+ * @param tokIn
+ * @param tagIn
+ * @param lemmaIn
*/
-class NCOpenNlpImpl:
- def parse(req: NCRequest): java.util.List[NCToken] = ???
+class NCOpenNlpImpl(tokIn: InputStream, tagIn: InputStream, lemmaIn: InputStream):
+ private val tokenizer = new TokenizerME(new TokenizerModel(tokIn))
+ private val tagger = new POSTaggerME(new POSModel(tagIn))
+ private val lemmatizer = new DictionaryLemmatizer(lemmaIn)
+ private val stemmer = new PorterStemmer
+
+ /**
+ *
+ * @param req
+ * @return
+ */
+ def parse(req: NCRequest): java.util.List[NCToken] =
+ val sen = req.getNormalizedText
+
+ case class TokenHolder(origin: String, normalized: String, start: Int, end: Int, length: Int)
+
+ val holders = tokenizer.tokenizePos(sen).map( t => {
+ val txt = t.getCoveredText(sen).toString
+ TokenHolder(txt, txt.toLowerCase, t.getStart, t.getEnd, t.length())
+ })
+
+ val words = holders.map(_.origin)
+ val posTags = tagger.tag(words)
+ var lemmas = lemmatizer.lemmatize(words, posTags).toSeq
+
+ require(holders.length == posTags.length)
+
+ // For some reason, the lemmatizer dictionary (en-lemmatizer.dict) marks some words with the non-existent POS tag 'NNN'.
+ // Valid POS list: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+ val suspIdxs = lemmas.zip(posTags).zipWithIndex.flatMap {
+ // "O" is the flag value indicating that the lemma could not be obtained.
+ case ((lemma, pos), i) => if lemma == "O" && pos == "NN" then Some(i) else None
+ }
+
+ if suspIdxs.nonEmpty then
+ val fixes: Map[Int, String] = lemmatizer.
+ lemmatize(suspIdxs.map(i => words(i)).toArray, suspIdxs.map(_ => "NNN").toArray).
+ zipWithIndex.
+ flatMap {
+ case (lemma, i) => if lemma != "0" then Some(suspIdxs(i) -> lemma) else None
+ }.toMap
+ lemmas = lemmas.zipWithIndex.map {
+ case (lemma, idx) => fixes.getOrElse(idx, lemma)
+ }
+
+ holders.zip(posTags).zip(lemmas).toIndexedSeq.map { case ((h, pos), lemma) =>
+ new NCParameterizedAdapter with NCToken:
+ override def getOriginalText: String = h.origin
+ override def getNormalizedText: String = h.normalized
+ override def getLemma: String = lemma
+ override def getStem: String = stemmer.stem(h.normalized)
+ override def getPos: String = pos
+ override def isStopWord: Boolean = true // TODO: implement
+ override def getStartCharIndex: Int = h.start
+ override def getEndCharIndex: Int = h.end
+ override def getLength: Int = h.length
+ }.asJava