You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/09/08 15:58:00 UTC
svn commit: r1166692 - in
/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor:
OpenNLPPreferenceConstants.java namefinder/EntityContentProvider.java
namefinder/NameFinderPreferencePage.java
Author: joern
Date: Thu Sep 8 13:57:59 2011
New Revision: 1166692
URL: http://svn.apache.org/viewvc?rev=1166692&view=rev
Log:
OPENNLP-235 Name Finder now supports multiple sentence types.
Modified:
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/OpenNLPPreferenceConstants.java Thu Sep 8 13:57:59 2011
@@ -24,6 +24,7 @@ public class OpenNLPPreferenceConstants
public static final String PARAGRAPH_TYPE = "PARAGRAPH_TYPE";
public static final String SENTENCE_TYPE = "SENTENCE_TYPE";
+ public static final String ADDITIONAL_SENTENCE_TYPE = "ADDITIONAL_SENTENCE_TYPE";
public static final String TOKEN_TYPE = "TOKEN_TYPE";
public static final String NAME_TYPE = "NAME_TYPE";
public static final String TOKENIZER_MODEL_PATH = "TOKENIZER_MODEL_PATH";
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Thu Sep 8 13:57:59 2011
@@ -19,6 +19,7 @@ package org.apache.opennlp.caseditor.nam
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.Iterator;
import java.util.List;
@@ -325,6 +326,7 @@ public class EntityContentProvider imple
void runNameFinder() {
IPreferenceStore store = OpenNLPPlugin.getDefault().getPreferenceStore();
String sentenceTypeName = store.getString(OpenNLPPreferenceConstants.SENTENCE_TYPE);
+ String additionalSentenceTypes = store.getString(OpenNLPPreferenceConstants.ADDITIONAL_SENTENCE_TYPE);
String nameTypeName = store.getString(OpenNLPPreferenceConstants.NAME_TYPE);
String modelPath = store.getString(OpenNLPPreferenceConstants.NAME_FINDER_MODEL_PATH);
@@ -332,43 +334,56 @@ public class EntityContentProvider imple
// just get it from preference store?!
// Should have a good way to display an error when the type is incorrect ...
- Type sentenceType = cas.getTypeSystem().getType(sentenceTypeName);
+
String text = cas.getDocumentText();
if (text != null) {
- // TODO: get list of token annotations
-
List<Span> sentences = new ArrayList<Span>();
- List<Span> tokens = new ArrayList<Span>();
- // get a list on name annotations, they will force the
- // name finder to detect them ... and maybe maintain a negative list
-
- FSIndex<AnnotationFS> sentenceAnnotations = cas
- .getAnnotationIndex(sentenceType);
-
- for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
- .iterator(); sentenceIterator.hasNext();) {
-
- AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
- .next();
- sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
+ String sentenceTypeNames[] = (sentenceTypeName + "," + additionalSentenceTypes) .split(",");
+
+ for (String typeName : sentenceTypeNames) {
+ Type sentenceType = cas.getTypeSystem().getType(typeName.trim());
+
+ // TODO: If type cannot be mapped, it throws a null pointer exception ...
+
+ FSIndex<AnnotationFS> sentenceAnnotations = cas
+ .getAnnotationIndex(sentenceType);
+
+ for (Iterator<AnnotationFS> sentenceIterator = sentenceAnnotations
+ .iterator(); sentenceIterator.hasNext();) {
+
+ AnnotationFS sentenceAnnotation = (AnnotationFS) sentenceIterator
+ .next();
+
+ sentences.add(new Span(sentenceAnnotation.getBegin(), sentenceAnnotation.getEnd()));
- String sentText = sentenceAnnotation.getCoveredText();
+ }
+ }
- Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
+ // sort sentences list ... ascending
+ Collections.sort(sentences);
+
+ // iterate again and create tokens ...
+ List<Span> tokens = new ArrayList<Span>();
+
+ for (Span sentence : sentences) {
+ String sentText = sentence.getCoveredText(text).toString();
+
+ // TODO: Extract tokens here! Instead of using the simple tokenizer!
+
+ Span tokenSpans[] = SimpleTokenizer.INSTANCE.tokenizePos(sentText);
- int sentenceOffset = sentenceAnnotation.getBegin();
+ int sentenceOffset = sentence.getStart();
- for (Span token : tokenSpans) {
- tokens.add(new Span(sentenceOffset + token.getStart(),
- sentenceOffset + token.getEnd()));
- }
+ for (Span token : tokenSpans) {
+ tokens.add(new Span(sentenceOffset + token.getStart(),
+ sentenceOffset + token.getEnd()));
+ }
}
-
- // Note: When an annotation is removed, it might still be in the cas ...
+
List<Span> nameSpans = new ArrayList<Span>();
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java?rev=1166692&r1=1166691&r2=1166692&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderPreferencePage.java Thu Sep 8 13:57:59 2011
@@ -43,6 +43,11 @@ public class NameFinderPreferencePage ex
"Model Path", getFieldEditorParent());
addField(modelPath);
+ StringFieldEditor additionalSentenceTypes = new StringFieldEditor(
+ OpenNLPPreferenceConstants.ADDITIONAL_SENTENCE_TYPE,
+ "Additional Sentence Types", getFieldEditorParent());
+ addField(additionalSentenceTypes);
+
StringFieldEditor nameType = new StringFieldEditor(
OpenNLPPreferenceConstants.NAME_TYPE,
"Name Type", getFieldEditorParent());