You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nlpcraft.apache.org by ar...@apache.org on 2021/12/06 22:04:32 UTC
[incubator-nlpcraft] branch master updated: WIP + renaming.
This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 35d18a8 WIP + renaming.
35d18a8 is described below
commit 35d18a80488ff4a33b930f3769ae068b1641a3ca
Author: Aaron Radzinski <ar...@datalingvo.com>
AuthorDate: Mon Dec 6 14:04:24 2021 -0800
WIP + renaming.
---
.../main/scala/org/apache/nlpcraft/NCContext.java | 5 +-
.../scala/org/apache/nlpcraft/NCConversation.java | 4 +-
.../main/scala/org/apache/nlpcraft/NCEntity.java | 169 ++++++++++++++++
.../{NCWordParser.java => NCEntityParser.java} | 7 +-
.../main/scala/org/apache/nlpcraft/NCModel.java | 2 +-
.../scala/org/apache/nlpcraft/NCModelConfig.java | 4 +-
.../apache/nlpcraft/NCModelConfigFileAdapter.java | 4 +-
.../main/scala/org/apache/nlpcraft/NCResult.java | 6 +-
.../main/scala/org/apache/nlpcraft/NCToken.java | 214 ++-------------------
.../scala/org/apache/nlpcraft/NCTokenParser.java | 5 +-
.../src/main/scala/org/apache/nlpcraft/NCWord.java | 71 -------
11 files changed, 199 insertions(+), 292 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
index e7eede7..31832ad 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCContext.java
@@ -17,9 +17,6 @@
package org.apache.nlpcraft;
-import java.util.Collection;
-import java.util.List;
-
/**
*
*/
@@ -30,7 +27,7 @@ public interface NCContext {
* @param tok Token to check.
* @return {@code true} if given token is associated with this context, {@code false} otherwise.
*/
- boolean isOwnerOf(NCToken tok);
+ boolean isOwnerOf(NCEntity tok);
/**
* Gets model configuration for this query.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCConversation.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCConversation.java
index aefce08..7f96285 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCConversation.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCConversation.java
@@ -35,7 +35,7 @@ public interface NCConversation {
* @return List of tokens for this conversation's STM. The list can be empty which indicates that
* conversation is brand new (or timed out).
*/
- List<NCToken> getTokens();
+ List<NCEntity> getTokens();
/**
* Gets the chronologically ordered list of previously matched intents sorted from oldest to newest
@@ -55,7 +55,7 @@ public interface NCConversation {
*
* @param filter Token remove filter.
*/
- void clearStm(Predicate<NCToken> filter);
+ void clearStm(Predicate<NCEntity> filter);
/**
* Removes all previously matched intents using given dialog flow item predicate.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
new file mode 100644
index 0000000..35aac20
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntity.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft;
+
+import java.util.List;
+
+/**
+ *
+ */
+public interface NCEntity {
+ /**
+ * Gets ID of the request this entity is part of.
+ *
+ * @return ID of the request this entity is part of.
+ */
+ String getRequestId();
+
+ /**
+ *
+ * @return
+ */
+ String getId();
+
+ /**
+ * Gets the optional parent ID of the model element this entity represents. This is only available
+ * for user-defined model elements - built-in entities do not have parents and this will return {@code null}.
+ *
+ * @return ID of the entity's element immediate parent or {@code null} if not available.
+ * @see NCElement#getParentId()
+ * @see #getAncestors()
+ */
+ String getParentId();
+
+ /**
+ * Gets the list of all parent IDs from this entity up to the root. This is only available
+ * for user-defined model elements - built-in entities do not have parents and will return an empty list.
+ *
+ * @return List, potentially empty but never {@code null}, of all parent IDs from this entity up to the root.
+ * @see #getParentId()
+ */
+ List<String> getAncestors();
+
+ /**
+ * Tests whether this entity is a child of given entity ID. It is equivalent to:
+ * <pre class="brush: java">
+ * return getAncestors().contains(tokId);
+ * </pre>
+ *
+ * @param tokId Ancestor entity ID.
+ * @return <code>true</code> if this entity is a child of given entity ID, <code>false</code> otherwise.
+ */
+ default boolean isChildOf(String tokId) {
+ return getAncestors().contains(tokId);
+ }
+
+ /**
+ * Gets the value if this entity was detected via element's value (or its synonyms). Otherwise,
+ * returns {@code null}. Only applicable for user-defined model elements - built-in entities
+ * do not have values, and it will return {@code null}.
+ *
+ * @return Value for the user-defined model element or {@code null}, if not available.
+ * @see NCElement#getValues()
+ */
+ String getValue();
+
+ /**
+ * Gets the list of groups this entity belongs to. Note that, by default, if not specified explicitly,
+ * entity always belongs to one group with ID equal to entity ID.
+ *
+ * @return Entity groups list. Never {@code null} - but can be empty.
+ * @see NCElement#getGroups()
+ */
+ List<String> getGroups();
+
+ /**
+ * Tests whether this entity belongs to the given group. It is equivalent to:
+ * <pre class="brush: java">
+ * return getGroups().contains(grp);
+ * </pre>
+ *
+ * @param grp Group to test.
+ * @return <code>true</code> if this entity belongs to the group <code>grp</code>, {@code false} otherwise.
+ */
+ default boolean isMemberOf(String grp) {
+ return getGroups().contains(grp);
+ }
+
+ /**
+ * Gets start character index of this entity in the original text.
+ *
+ * @return Start character index of this entity.
+ */
+ int getStartCharIndex();
+
+ /**
+ * Gets end character index of this entity in the original text.
+ *
+ * @return End character index of this entity.
+ */
+ int getEndCharIndex();
+
+ /**
+ *
+ * @return Whether this entity is a stopword.
+ */
+ boolean isStopWord();
+
+ /**
+ *
+ * @return Original user input text for this entity.
+ */
+ String getOriginalText();
+
+ /**
+ *
+ * @return Index of this entity in the sentence.
+ */
+ int getIndex();
+
+ /**
+ *
+ * @return Normalized user input text for this entity.
+ */
+ String getNormalizedText();
+ /**
+ *
+ * @return Lemma of this entity, i.e. a canonical form of this word.
+ */
+ String getLemma();
+
+ /**
+ *
+ * @return Stem of this entity.
+ */
+ String getStem();
+
+ /**
+ *
+ * @return Penn Treebank POS tag for this entity.
+ */
+ String getPos();
+
+ /**
+ * A shortcut method that gets internal globally unique system ID of the entity.
+ * <p>
+ * This method is equivalent to:
+ * <pre class="brush: java">
+ * return meta("nlpcraft:nlp:unid");
+ * </pre>
+ *
+ * @return Internal globally unique system ID of the entity.
+ */
+ String getGuid();
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWordParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
similarity index 76%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/NCWordParser.java
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
index 4d2d862..fedf377 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWordParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCEntityParser.java
@@ -22,11 +22,14 @@ import java.util.List;
/**
*
*/
-public interface NCWordParser {
+public interface NCEntityParser {
/**
*
* @param req
+ * @param cfg
+ * @param toks
+ * @param ents List of already parsed entities prior to this step. Can be empty but never {@code null}.
* @return
*/
- List<NCWord> parse(NCRequest req);
+ List<NCEntity> parse(NCRequest req, NCModelConfig cfg, List<NCToken> toks, List<NCEntity> ents);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModel.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModel.java
index 3c4e656..876142d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModel.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModel.java
@@ -34,7 +34,7 @@ public interface NCModel {
* @param var
* @return
*/
- default boolean onVariant(List<NCToken> var) {
+ default boolean onVariant(List<NCEntity> var) {
return true;
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
index e2b4e49..0637ff1 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfig.java
@@ -71,13 +71,13 @@ public interface NCModelConfig {
*
* @return
*/
- NCWordParser getWordParser();
+ NCTokenParser getTokenParser();
/**
*
* @return
*/
- NCTokenParser getTokenParser();
+ NCEntityParser getEntityParser();
/**
* Gets unique, <i>immutable</i> ID of this model.
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigFileAdapter.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigFileAdapter.java
index 97fddd1..13f2f2f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigFileAdapter.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelConfigFileAdapter.java
@@ -48,12 +48,12 @@ public class NCModelConfigFileAdapter implements NCModelConfig {
}
@Override
- public NCWordParser getWordParser() {
+ public NCTokenParser getTokenParser() {
return null; // TODO
}
@Override
- public NCTokenParser getTokenParser() {
+ public NCEntityParser getEntityParser() {
return null; // TODO
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
index 1d26f50..2d16a54 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCResult.java
@@ -64,7 +64,7 @@ public class NCResult implements Serializable {
private String type;
/** Sequence of tokens represents a fully parsed (see {@link NCContext#getVariants()} method) user input. */
- private Collection<NCToken> tokens;
+ private Collection<NCEntity> tokens;
/** ID of the intent. */
private String intentId;
@@ -197,7 +197,7 @@ public class NCResult implements Serializable {
* @return Gets tokens that were used to produce this query result.
* @see #setTokens(Collection)
*/
- public Collection<NCToken> getTokens() {
+ public Collection<NCEntity> getTokens() {
return tokens;
}
@@ -213,7 +213,7 @@ public class NCResult implements Serializable {
* @param tokens Collection of tokens that was used to produce this query result.
* @see #getTokens()
*/
- public void setTokens(Collection<NCToken> tokens) {
+ public void setTokens(Collection<NCEntity> tokens) {
this.tokens = tokens;
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
index 6064189..14927df 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCToken.java
@@ -17,243 +17,55 @@
package org.apache.nlpcraft;
-import java.util.List;
-
/**
*
*/
public interface NCToken {
-
- /**
- * Gets ID of the request this token is part of.
- *
- * @return ID of the request this token is part of.
- */
- String getRequestId();
-
/**
*
* @return
*/
- String getId();
-
- /**
- * Gets the optional parent ID of the model element this token represents. This only available
- * for user-defined model elements - built-in tokens do not have parents and this will return {@code null}.
- *
- * @return ID of the token's element immediate parent or {@code null} if not available.
- * @see NCElement#getParentId()
- * @see #getAncestors()
- */
- String getParentId();
-
- /**
- * Gets the list of all parent IDs from this token up to the root. This only available
- * for user-defined model elements = built-in tokens do not have parents and will return an empty list.
- *
- * @return List, potentially empty but never {@code null}, of all parent IDs from this token up to the root.
- * @see #getParentId()
- */
- List<String> getAncestors();
+ String getOriginalText();
/**
- * Tests whether this token is a child of given token ID. It is equivalent to:
- * <pre class="brush: java">
- * return getAncestors().contains(tokId);
- * </pre>
*
- * @param tokId Ancestor token ID.
- * @return <code>true</code> this token is a child of given token ID, <code>false</code> otherwise.
+ * @return
*/
- default boolean isChildOf(String tokId) {
- return getAncestors().contains(tokId);
- }
+ String getNormalizedText();
/**
- * Gets the value if this token was detected via element's value (or its synonyms). Otherwise,
- * returns {@code null}. Only applicable for user-defined model elements - built-in tokens
- * do not have values, and it will return {@code null}.
*
- * @return Value for the user-defined model element or {@code null}, if not available.
- * @see NCElement#getValues()
+ * @return
*/
- String getValue();
+ String getLemma();
/**
- * Gets the list of groups this token belongs to. Note that, by default, if not specified explicitly,
- * token always belongs to one group with ID equal to token ID.
*
- * @return Token groups list. Never {@code null} - but can be empty.
- * @see NCElement#getGroups()
+ * @return
*/
- List<String> getGroups();
+ String getStem();
/**
- * Tests whether this token belongs to the given group. It is equivalent to:
- * <pre class="brush: java">
- * return getGroups().contains(grp);
- * </pre>
*
- * @param grp Group to test.
- * @return <code>True</code> if this token belongs to the group <code>grp</code>, {@code false} otherwise.
+ * @return
*/
- default boolean isMemberOf(String grp) {
- return getGroups().contains(grp);
- }
+ String getPos();
/**
- * Gets start character index of this token in the original text.
*
- * @return Start character index of this token.
+ * @return
*/
int getStartCharIndex();
/**
- * Gets end character index of this token in the original text.
*
- * @return End character index of this token.
+ * @return
*/
int getEndCharIndex();
/**
- * A shortcut method checking whether this token is a stopword. Stopwords are some extremely internal
- * words which add little value in helping to understand user input and are excluded from the
- * processing entirely. For example, words like a, the, can, of, about, over, etc. are
- * typical stopwords in English. NLPCraft has built-in set of stopwords.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:stopword");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Whether this token is a stopword.
- */
- boolean isStopWord();
-
- /**
- * A shortcut method checking whether this token represents a free word. A free word is a
- * token that was detected neither as a part of user defined nor system tokens.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:freeword");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Whether this token is a freeword.
- */
- boolean isFreeWord();
-
- /**
- * A shortcut method that gets original user input text for this token.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:origtext");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Original user input text for this token.
- */
- String getOriginalText();
-
- /**
- * A shortcut method that gets index of this token in the sentence.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:index");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Index of this token in the sentence.
- */
- int getIndex();
-
- /**
- * A shortcut method that gets normalized user input text for this token.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:normtext");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Normalized user input text for this token.
- */
- String getNormalizedText();
-
- /**
- * A shortcut method on whether this token is a swear word. NLPCraft has built-in list of
- * internal English swear words.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:swear");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Whether this token is a swear word.
- */
- boolean isSwear();
-
- /**
- * A shortcut method to get lemma of this token, i.e. a canonical form of this word. Note that
- * stemming and lemmatization allow reducing inflectional forms and sometimes derivationally related
- * forms of a word to a internal base form. Lemmatization refers to the use of a vocabulary and
- * morphological analysis of words, normally aiming to remove inflectional endings only and to
- * return the base or dictionary form of a word, which is known as the lemma.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:lemma");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
*
- * @return Lemma of this token, i.e. a canonical form of this word.
- */
- String getLemma();
-
- /**
- * A shortcut method to get stem of this token. Note that stemming and lemmatization allow to reduce
- * inflectional forms and sometimes derivationally related forms of a word to a internal base form.
- * Unlike lemma, stemming is a basic heuristic process that chops off the ends of words in the
- * hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:stem");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Stem of this token.
- */
- String getStem();
-
- /**
- * A shortcut method to get Penn Treebank POS tag for this token. Note that additionally to standard Penn
- * Treebank POS tags NLPCraft introduced '---' synthetic tag to indicate a POS tag for multiword tokens.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:pos");
- * </pre>
- * See more information on token metadata <a target=_ href="https://nlpcraft.apache.org/data-model.html">here</a>.
- *
- * @return Penn Treebank POS tag for this token.
- */
- String getPos();
-
- /**
- * A shortcut method that gets internal globally unique system ID of the token.
- * <p>
- * This method is equivalent to:
- * <pre class="brush: java">
- * return meta("nlpcraft:nlp:unid");
- * </pre>
- *
- * @return Internal globally unique system ID of the token.
+ * @return
*/
- String getUnid();
+ int getLength();
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
index 36051bb..0f06198 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCTokenParser.java
@@ -26,10 +26,7 @@ public interface NCTokenParser {
/**
*
* @param req
- * @param cfg
- * @param words
- * @param toks List of already parsed tokens prio to this step. Can be empty but never {@code null}.
* @return
*/
- List<NCToken> parse(NCRequest req, NCModelConfig cfg, List<NCWord> words, List<NCToken> toks);
+ List<NCToken> parse(NCRequest req);
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
deleted file mode 100644
index dc1408c..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCWord.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft;
-
-/**
- *
- */
-public interface NCWord {
- /**
- *
- * @return
- */
- String getOriginalText();
-
- /**
- *
- * @return
- */
- String getNormalizedText();
-
- /**
- *
- * @return
- */
- String getLemma();
-
- /**
- *
- * @return
- */
- String getStem();
-
- /**
- *
- * @return
- */
- String getPos();
-
- /**
- *
- * @return
- */
- int getStartCharIndex();
-
- /**
- *
- * @return
- */
- int getEndCharIndex();
-
- /**
- *
- * @return
- */
- int getLength();
-}