You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by mb...@apache.org on 2007/12/07 15:50:13 UTC
svn commit: r602111 - in /incubator/uima/sandbox/trunk/WhitespaceTokenizer:
./ desc/WhitespaceTokenizer.xml
src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java
Author: mbaessler
Date: Fri Dec 7 06:50:11 2007
New Revision: 602111
URL: http://svn.apache.org/viewvc?rev=602111&view=rev
Log:
UIMA-669
make WhitespaceTokenizer Sofa aware
https://issues.apache.org/jira/browse/UIMA-669
Modified:
incubator/uima/sandbox/trunk/WhitespaceTokenizer/ (props changed)
incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml
incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java
Propchange: incubator/uima/sandbox/trunk/WhitespaceTokenizer/
------------------------------------------------------------------------------
--- svn:ignore (original)
+++ svn:ignore Fri Dec 7 06:50:11 2007
@@ -1,5 +1,5 @@
-
-.settings
-target
-.classpath
-.project
+.settings
+target
+.classpath
+.project
+metadata
Modified: incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml?rev=602111&r1=602110&r2=602111&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml (original)
+++ incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml Fri Dec 7 06:50:11 2007
@@ -38,11 +38,34 @@
separated languages
</description>
<version>1.0</version>
- <vendor>Michael Baessler</vendor>
+ <vendor>The Apache Software Foundation</vendor>
- <configurationParameters></configurationParameters>
+ <configurationParameters>
+ <configurationParameter>
+ <name>SofaNames</name>
+ <description>
+ The Sofa names the annotator should work on. If no
+ names are specified, the annotator works on the
+ default sofa.
+ </description>
+ <type>String</type>
+ <multiValued>true</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+
+ </configurationParameters>
<configurationParameterSettings>
+ <!--
+ <nameValuePair>
+ <name>SofaNames</name>
+ <value>
+ <array>
+ <string>sofaName</string>
+ </array>
+ </value>
+ </nameValuePair>
+ -->
</configurationParameterSettings>
<typeSystemDescription>
@@ -76,7 +99,9 @@
<inputs />
<outputs>
<type>org.apache.uima.TokenAnnotation</type>
- <feature>org.apache.uima.TokenAnnotation:tokentype</feature>
+ <feature>
+ org.apache.uima.TokenAnnotation:tokentype
+ </feature>
<type>org.apache.uima.SentenceAnnotation</type>
</outputs>
<languagesSupported>
Modified: incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java?rev=602111&r1=602110&r2=602111&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java (original)
+++ incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java Fri Dec 7 06:50:11 2007
@@ -18,25 +18,23 @@
*/
package org.apache.uima.annotator;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Iterator;
import java.util.List;
-import org.apache.uima.analysis_engine.ResultSpecification;
-import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.analysis_engine.annotator.JTextAnnotator_ImplBase;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
-public class WhitespaceTokenizer extends JTextAnnotator_ImplBase {
+public class WhitespaceTokenizer extends CasAnnotator_ImplBase {
private static final int CH_SPECIAL = 0;
@@ -68,129 +66,150 @@
private Logger logger;
+ private String[] sofaNames;
+
private static List<String> punctuations = Arrays.asList(new String[] { ".",
"!", "?" });
public static final String MESSAGE_BUNDLE = "org.apache.uima.annotator.whitespaceTokenizerMessages";
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.JTextAnnotator#process(org.apache.uima.jcas.impl.JCas,
- * org.apache.uima.analysis_engine.ResultSpecification)
+ /* (non-Javadoc)
+ * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS)
*/
- public void process(JCas aJCas, ResultSpecification aResultSpec)
- throws AnnotatorProcessException {
+ public void process(CAS aCas) throws AnalysisEngineProcessException {
this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
MESSAGE_BUNDLE, "whitespace_tokenizer_info_start_processing");
- // get text content from the CAS
- char[] textContent = aJCas.getCas().getDocumentText().toCharArray();
- this.cas = aJCas.getCas();
- int tokenStart = UNDEFINED;
- int currentCharPos = 0;
- int sentenceStart = 0;
- int nextCharType = UNDEFINED;
- char nextChar = INVALID_CHAR;
-
- while (currentCharPos < textContent.length) {
- char currentChar = textContent[currentCharPos];
- int currentCharType = getCharacterType(currentChar);
-
- // get character class for current and next character
- if ((currentCharPos + 1) < textContent.length) {
- nextChar = textContent[currentCharPos + 1];
- nextCharType = getCharacterType(nextChar);
- } else {
- nextCharType = UNDEFINED;
- nextChar = INVALID_CHAR;
+ ArrayList<CAS> casList = new ArrayList<CAS>();
+ // check if sofa names are available
+ if (this.sofaNames != null && this.sofaNames.length > 0) {
+
+ // get sofa names
+ for (int i = 0; i < this.sofaNames.length; i++) {
+ Iterator it = aCas.getViewIterator(this.sofaNames[i]);
+ while (it.hasNext()) {
+ // add sofas to the cas List to process
+ casList.add((CAS) it.next());
+ }
}
+ } else {
+ // use default sofa for the processing
+ casList.add(aCas);
+ }
- // check if current character is a letter or number
- if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
+ for (int x = 0; x < casList.size(); x++) {
- // check if it is the first letter of a token
- if (tokenStart == UNDEFINED) {
- // start new token here
- tokenStart = currentCharPos;
- }
- }
+ this.cas = casList.get(x);
- // check if current character is a whitespace character
- else if (currentCharType == CH_WHITESPACE) {
+ // get text content from the CAS
+ char[] textContent = this.cas.getDocumentText().toCharArray();
- // terminate current token
- if (tokenStart != UNDEFINED) {
- // end of current word
- createAnnotation(this.tokenType, tokenStart, currentCharPos);
- tokenStart = UNDEFINED;
+ int tokenStart = UNDEFINED;
+ int currentCharPos = 0;
+ int sentenceStart = 0;
+ int nextCharType = UNDEFINED;
+ char nextChar = INVALID_CHAR;
+
+ while (currentCharPos < textContent.length) {
+ char currentChar = textContent[currentCharPos];
+ int currentCharType = getCharacterType(currentChar);
+
+ // get character class for current and next character
+ if ((currentCharPos + 1) < textContent.length) {
+ nextChar = textContent[currentCharPos + 1];
+ nextCharType = getCharacterType(nextChar);
+ } else {
+ nextCharType = UNDEFINED;
+ nextChar = INVALID_CHAR;
}
- }
- // check if current character is a special character
- else if (currentCharType == CH_SPECIAL) {
+ // check if current character is a letter or number
+ if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
- // terminate current token
- if (tokenStart != UNDEFINED) {
- // end of current word
- createAnnotation(this.tokenType, tokenStart, currentCharPos);
- tokenStart = UNDEFINED;
+ // check if it is the first letter of a token
+ if (tokenStart == UNDEFINED) {
+ // start new token here
+ tokenStart = currentCharPos;
+ }
}
- // create token for special character
- createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
- }
+ // check if current character is a whitespace character
+ else if (currentCharType == CH_WHITESPACE) {
- // check if current character is new line character
- else if (currentCharType == CH_NEWLINE) {
- // terminate current token
- if (tokenStart != UNDEFINED) {
- // end of current word
- createAnnotation(this.tokenType, tokenStart, currentCharPos);
- tokenStart = UNDEFINED;
+ // terminate current token
+ if (tokenStart != UNDEFINED) {
+ // end of current word
+ createAnnotation(this.tokenType, tokenStart, currentCharPos);
+ tokenStart = UNDEFINED;
+ }
}
- }
- // check if current character is new punctuation character
- else if (currentCharType == CH_PUNCTUATION) {
+ // check if current character is a special character
+ else if (currentCharType == CH_SPECIAL) {
- // terminates the current token
- if (tokenStart != UNDEFINED) {
- createAnnotation(this.tokenType, tokenStart, currentCharPos);
- tokenStart = UNDEFINED;
+ // terminate current token
+ if (tokenStart != UNDEFINED) {
+ // end of current word
+ createAnnotation(this.tokenType, tokenStart, currentCharPos);
+ tokenStart = UNDEFINED;
+ }
+
+ // create token for special character
+ createAnnotation(this.tokenType, currentCharPos,
+ currentCharPos + 1);
}
- // check next token type so see if we have a sentence end
- if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE))
- && (punctuations.contains(new String(
- new char[] { currentChar })))) {
- // terminate sentence
- createAnnotation(this.sentenceType, sentenceStart,
+ // check if current character is new line character
+ else if (currentCharType == CH_NEWLINE) {
+ // terminate current token
+ if (tokenStart != UNDEFINED) {
+ // end of current word
+ createAnnotation(this.tokenType, tokenStart, currentCharPos);
+ tokenStart = UNDEFINED;
+ }
+ }
+
+ // check if current character is new punctuation character
+ else if (currentCharType == CH_PUNCTUATION) {
+
+ // terminates the current token
+ if (tokenStart != UNDEFINED) {
+ createAnnotation(this.tokenType, tokenStart, currentCharPos);
+ tokenStart = UNDEFINED;
+ }
+
+ // check next token type so see if we have a sentence end
+ if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE))
+ && (punctuations.contains(new String(
+ new char[] { currentChar })))) {
+ // terminate sentence
+ createAnnotation(this.sentenceType, sentenceStart,
+ currentCharPos + 1);
+ sentenceStart = currentCharPos + 1;
+ }
+ // create token for punctuation character
+ createAnnotation(this.tokenType, currentCharPos,
currentCharPos + 1);
- sentenceStart = currentCharPos + 1;
}
- // create token for punctuation character
- createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
+ // go to the next token
+ currentCharPos++;
+ } // end of character loop
+
+ // we are at the end of the text terminate open token annotations
+ if (tokenStart != UNDEFINED) {
+ // end of current word
+ createAnnotation(this.tokenType, tokenStart, currentCharPos);
+ tokenStart = UNDEFINED;
}
- // go to the next token
- currentCharPos++;
- } // end of character loop
-
- // we are at the end of the text terminate open token annotations
- if (tokenStart != UNDEFINED) {
- // end of current word
- createAnnotation(this.tokenType, tokenStart, currentCharPos);
- tokenStart = UNDEFINED;
- }
- // we are at the end of the text terminate open sentence annotations
- if (sentenceStart != UNDEFINED) {
- // end of current word
- createAnnotation(this.sentenceType, sentenceStart, currentCharPos);
- sentenceStart = UNDEFINED;
+ // we are at the end of the text terminate open sentence annotations
+ if (sentenceStart != UNDEFINED) {
+ // end of current word
+ createAnnotation(this.sentenceType, sentenceStart, currentCharPos);
+ sentenceStart = UNDEFINED;
+ }
}
-
this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
MESSAGE_BUNDLE, "whitespace_tokenizer_info_stop_processing");
}
@@ -277,41 +296,33 @@
}
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.Annotator_ImplBase#initialize(org.apache.uima.analysis_engine.annotator.AnnotatorContext)
- */
- public void initialize(AnnotatorContext aContext)
- throws AnnotatorInitializationException,
- AnnotatorConfigurationException {
-
- // initialize logger
- try {
- this.logger = aContext.getLogger();
- } catch (AnnotatorContextException ex) {
- throw new AnnotatorInitializationException(ex);
- }
-
- this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "initialize",
- MESSAGE_BUNDLE, "whitespace_tokenizer_info_initialized");
- }
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.Annotator_ImplBase#typeSystemInit(org.apache.uima.cas.TypeSystem)
- */
- public void typeSystemInit(TypeSystem aTypeSystem)
- throws AnnotatorInitializationException,
- AnnotatorConfigurationException {
+ @Override
+ public void typeSystemInit(TypeSystem typeSystem)
+ throws AnalysisEngineProcessException {
+ super.typeSystemInit(typeSystem);
// initialize cas token type
- this.tokenType = aTypeSystem.getType(TOKEN_ANNOTATION_NAME);
+ this.tokenType = typeSystem.getType(TOKEN_ANNOTATION_NAME);
- this.sentenceType = aTypeSystem.getType(SENTENCE_ANNOTATION_NAME);
+ this.sentenceType = typeSystem.getType(SENTENCE_ANNOTATION_NAME);
this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "typeSystemInit",
MESSAGE_BUNDLE, "whitespace_tokenizer_info_typesystem_initialized");
+
+ }
+
+ @Override
+ public void initialize(UimaContext context)
+ throws ResourceInitializationException {
+ super.initialize(context);
+
+ this.sofaNames = (String[]) getContext().getConfigParameterValue(
+ "SofaNames");
+
+ this.logger = context.getLogger();
+
+ this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "initialize",
+ MESSAGE_BUNDLE, "whitespace_tokenizer_info_initialized");
+
}
}