You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/10/13 13:13:34 UTC
svn commit: r1182782 - in
/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor:
namefinder/EntityContentProvider.java util/UIMAUtil.java
Author: joern
Date: Thu Oct 13 11:13:34 2011
New Revision: 1182782
URL: http://svn.apache.org/viewvc?rev=1182782&view=rev
Log:
OPENNLP-303 Now uses token annotations from input CAS instead of simple tokenizer.
Added:
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java (with props)
Modified:
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1182782&r1=1182781&r2=1182782&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Thu Oct 13 11:13:34 2011
@@ -27,6 +27,8 @@ import opennlp.tools.tokenize.SimpleToke
import opennlp.tools.util.Span;
import org.apache.opennlp.caseditor.OpenNLPPreferenceConstants;
+import org.apache.opennlp.caseditor.util.ContainingConstraint;
+import org.apache.opennlp.caseditor.util.UIMAUtil;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FeatureStructure;
@@ -260,6 +262,9 @@ public class EntityContentProvider imple
@Override
public void run() {
+
+ // TODO: Check whether the view is still available; this might be called after the view is disposed.
+
IStatus status = event.getResult();
if (status.isOK()) {
@@ -378,18 +383,15 @@ public class EntityContentProvider imple
}
void runNameFinder() {
+
IPreferenceStore store = editor.getCasDocumentProvider().getTypeSystemPreferenceStore(editor.getEditorInput());
String sentenceTypeName = store.getString(OpenNLPPreferenceConstants.SENTENCE_TYPE);
- // TODO: Add check for sentence type name
if (sentenceTypeName.isEmpty()) {
nameFinderView.setMessage("Sentence type is not set!");
return;
}
-
- // TODO: Add check for additional sentence type names
-
String modelPathes[] = store.getString(OpenNLPPreferenceConstants.NAME_FINDER_MODEL_PATH).split(",");
for (int i = 0; i < modelPathes.length; i++) {
@@ -409,51 +411,60 @@ public class EntityContentProvider imple
if (text != null) {
+ Type sentenceTypes[] = UIMAUtil.splitTypes(
+ sentenceTypeName + "," + additionalSentenceTypes, ',', cas.getTypeSystem());
+
+ if (sentenceTypes == null) {
+ nameFinderView.setMessage("Sentence type does not exist in type system!");
+ return;
+ }
+
+ String tokenName = store.getString(OpenNLPPreferenceConstants.TOKEN_TYPE);
+
+ if (tokenName.isEmpty()) {
+ nameFinderView.setMessage("Token type name is not set!");
+ return;
+ }
+
+ Type tokenType = cas.getTypeSystem().getType(tokenName);
+
+ if (tokenType == null) {
+ nameFinderView.setMessage("Token type does not exist in type system!");
+ return;
+ }
+
List<Span> sentences = new ArrayList<Span>();
-
- String sentenceTypeNames[] = (sentenceTypeName + "," + additionalSentenceTypes).split(",");
+ List<Span> tokens = new ArrayList<Span>();
- for (String typeName : sentenceTypeNames) {
- Type sentenceType = cas.getTypeSystem().getType(typeName.trim());
+ for (Iterator<AnnotationFS> sentenceIterator =
+ UIMAUtil.createMultiTypeIterator(cas, sentenceTypes);
+ sentenceIterator.hasNext();) {
- if (sentenceType == null) {
- nameFinderView.setMessage("Sentence type does not exist in type system!");
- return;
- }
+ AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
+ .next();
- FSIndex<AnnotationFS> sentenceAnnotations = cas
- .getAnnotationIndex(sentenceType);
+ // TODO: Add code to detect overlapping sentences ... not allowed!
- for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
- .iterator(); sentenceIterator.hasNext();) {
-
- AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
- .next();
+ sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+
+ // Performance Note:
+ // The following code has O(n^2) complexity; it can be optimized
+ // by iterating once over all tokens and weaving them into the sentences manually.
+
+ FSIndex<AnnotationFS> allTokens = cas.getAnnotationIndex(tokenType);
+
+ ContainingConstraint containingConstraint =
+ new ContainingConstraint(sentenceAnnotation);
+
+ Iterator<AnnotationFS> containingTokens = cas.createFilteredIterator(
+ allTokens.iterator(), containingConstraint);
+
+ while (containingTokens.hasNext()) {
+ AnnotationFS token = (AnnotationFS) containingTokens.next();
- sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+ tokens.add(new Span(token.getBegin(), token.getEnd()));
}
}
-
- // sort sentences list ... ascending
- Collections.sort(sentences);
-
- // iterate again and create tokens ...
- List<Span> tokens = new ArrayList<Span>();
-
- for (Span sentence : sentences) {
- String sentText = sentence.getCoveredText(text).toString();
-
- // TODO: Extract tokens here! Instead of using the simple tokenizer!
-
- Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
-
- int sentenceOffset = sentence.getStart();
-
- for (Span token : tokenSpans) {
- tokens.add(new Span(sentenceOffset + token.getStart(),
- sentenceOffset + token.getEnd()));
- }
- }
List<Span> nameSpans = new ArrayList<Span>();
@@ -479,9 +490,22 @@ public class EntityContentProvider imple
}
}
- // This will cause issues when it is done while it is running!
+ // Bug: Changing the data of the name finder will cause an issue if it is already running!
+
nameFinder.setText(text);
+
+ if (sentences.size() == 0) {
+ nameFinderView.setMessage("CAS must at least contain one sentence!");
+ return;
+ }
+
nameFinder.setSentences(sentences.toArray(new Span[sentences.size()]));
+
+ if (tokens.size() == 0) {
+ nameFinderView.setMessage("CAS must at least contain one token within a sentence!");
+ return;
+ }
+
nameFinder.setTokens(tokens.toArray(new Span[tokens.size()]));
nameFinder.setVerifiedNames(nameSpans.toArray(new Span[nameSpans.size()]));
nameFinder.setModelPath(modelPathes, nameTypeNames);
Added: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java?rev=1182782&view=auto
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java (added)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java Thu Oct 13 11:13:34 2011
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.caseditor.util;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.ConstraintFactory;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.FSTypeConstraint;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
+
+public class UIMAUtil {
+
+ public static String[] split(String parameter, char splitChar) {
+
+ String parts[] = parameter.split(Character.toString(splitChar));
+
+ for (int i = 0; i < parts.length; i++) {
+ parts[i] = parts[i].trim();
+ }
+
+ return parts;
+ }
+
+ // TODO: Should throw an exception
+ public static Type[] splitTypes(String typeList, char splitChar, TypeSystem typeSystem) {
+ String typeNames[] = split(typeList, splitChar);
+
+ Type types[] = new Type[typeNames.length];
+
+ for (int i = 0; i < typeNames.length; i++) {
+ types[i] = typeSystem.getType(typeNames[i]);
+
+ if (types[i] == null) {
+ return null; // TODO: Throw an exception instead!
+ }
+ }
+
+ return types;
+ }
+
+ public static FSIterator<AnnotationFS> createMultiTypeIterator(CAS cas, Type... types) {
+
+ if (types.length == 0)
+ throw new IllegalArgumentException("Need at least one type to create an iterator!");
+
+ ConstraintFactory cf = ConstraintFactory.instance();
+ FSIterator<AnnotationFS> iterator = cas.getAnnotationIndex().iterator();
+
+ FSTypeConstraint typeConstraint = cf.createTypeConstraint();
+
+ for (Type type : types) {
+ typeConstraint.add(type);
+ }
+
+ // Create and use the filtered iterator
+ FSIterator<AnnotationFS> filteredIterator = cas.createFilteredIterator(iterator, typeConstraint);
+
+ return filteredIterator;
+ }
+}
Propchange: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/util/UIMAUtil.java
------------------------------------------------------------------------------
svn:mime-type = text/plain