You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/12/06 14:34:00 UTC
svn commit: r1210904 - in
/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder:
EntityContentProvider.java NameFinderJob.java
Author: joern
Date: Tue Dec 6 13:34:00 2011
New Revision: 1210904
URL: http://svn.apache.org/viewvc?rev=1210904&view=rev
Log:
OPENNLP-324 Added configuration options and added initial capital letter filter.
Modified:
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java?rev=1210904&r1=1210903&r2=1210904&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/EntityContentProvider.java Tue Dec 6 13:34:00 2011
@@ -50,6 +50,10 @@ import org.eclipse.jface.viewers.TableVi
import org.eclipse.jface.viewers.Viewer;
import org.eclipse.swt.widgets.Display;
+/**
+ * The EntityContentProvider is responsible for triggering the detection of entities
+ * and turning these into potential entity annotations.
+ */
// Need its own list (or map), otherwise it is complicated to compute updates ...
// Maybe we should create again, a "View" map of indexes to its annotations?!
public class EntityContentProvider implements IStructuredContentProvider {
@@ -97,7 +101,7 @@ public class EntityContentProvider imple
// Remove all entities from the view and candidate list
// TODO: Refactor this code branch ...
- // Now it only needs to remove all intersecting entites from the
+ // Now it only needs to remove all intersecting entities from the
// candidate list and add the entity itself to the confirmed list
int selectionIndex = EntityContentProvider.this.entityListViewer.
@@ -108,7 +112,6 @@ public class EntityContentProvider imple
entity.setBeginIndex(annotation.getBegin());
entity.setEndIndex(annotation.getEnd());
entity.setEntityText(annotation.getCoveredText());
-// entity.setConfirmed(true);
entity.setConfidence(null);
entityListViewer.remove(entity);
@@ -340,8 +343,11 @@ public class EntityContentProvider imple
void runNameFinder() {
- IPreferenceStore store = editor.getCasDocumentProvider().getTypeSystemPreferenceStore(editor.getEditorInput());
+ // TODO: Check if sentences do overlap
+ // TODO: Check if tokens do overlap
+ // TODO: Check that tokens do not intersect with sentence span
+ IPreferenceStore store = editor.getCasDocumentProvider().getTypeSystemPreferenceStore(editor.getEditorInput());
// TODO: All preferences should be retrieved when the name finder is executed!
// Just move it down the run method ...
@@ -521,6 +527,15 @@ public class EntityContentProvider imple
nameFinder.setVerifiedNames(null);
}
+ nameFinder.setIgnoreShortTokens(store.getBoolean(
+ OpenNLPPreferenceConstants.IGNORE_SHORT_TOKENS));
+
+ nameFinder.setOnlyConsiderAllLetterTokens(store.getBoolean(
+ OpenNLPPreferenceConstants.ONLY_CONSIDER_ALL_LETTER_TOKENS));
+
+ nameFinder.setOnlyConsiderInitialCapitalLetterTokens(store.getBoolean(
+ OpenNLPPreferenceConstants.ONLY_CONSIDER_INITIAL_CAPITAL_TOKENS));
+
nameFinder.schedule();
}
}
Modified: incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java?rev=1210904&r1=1210903&r2=1210904&view=diff
==============================================================================
--- incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java (original)
+++ incubator/opennlp/sandbox/caseditor-opennlp-plugin/src/main/java/org/apache/opennlp/caseditor/namefinder/NameFinderJob.java Tue Dec 6 13:34:00 2011
@@ -54,6 +54,12 @@ public class NameFinderJob extends Job {
private Span verifiedNames[] = new Span[0];
private List<PotentialAnnotation> nameList;
+
+ private boolean ignoreShortTokens;
+
+ private boolean onlyConsiderAllLetterTokens;
+
+ private boolean onlyConsiderInitialLetterTokens;
NameFinderJob() {
super("Name Finder Job");
@@ -83,6 +89,18 @@ public class NameFinderJob extends Job {
this.verifiedNames = verifiedNames;
}
+ synchronized void setIgnoreShortTokens(boolean ignoreShortTokens) {
+ this.ignoreShortTokens = ignoreShortTokens;
+ }
+
+ synchronized void setOnlyConsiderAllLetterTokens(boolean onlyConsiderAllLetterTokens) {
+ this.onlyConsiderAllLetterTokens = onlyConsiderAllLetterTokens;
+ }
+
+ synchronized void setOnlyConsiderInitialCapitalLetterTokens(boolean onlyConsiderInitialLetterTokens) {
+ this.onlyConsiderInitialLetterTokens = onlyConsiderInitialLetterTokens;
+ }
+
// maybe report result, through an interface?!
// Note: Concurrency issue ... here! Editor might already be closed after model is loaded!
// The job change listener in the Entity Content Provider must handle that!
@@ -156,12 +174,21 @@ public class NameFinderJob extends Job {
// as part of the outcome!
verifiedNameTokens.put(i, verifiedName.getType() + "-" + outcome);
- // TODO: Do not put stop word
- // Only put, if char length is two
- // Only put only letters in token
StringPattern pattern = StringPattern.recognize(tokenStrings[i]);
- if (pattern.isAllLetter() && tokenStrings[i].length() > 1) {
+ boolean useToken = true;
+
+ if (ignoreShortTokens && tokenStrings[i].length() < 4) {
+ useToken = false;
+ }
+ else if (onlyConsiderAllLetterTokens && !pattern.isAllLetter()) {
+ useToken = false;
+ }
+ else if (onlyConsiderInitialLetterTokens && !pattern.isInitialCapitalLetter()) {
+ useToken = false;
+ }
+
+ if (useToken) {
nameTokens.add(verifiedName.getType() + "-" + tokenStrings[i]);
}
}