You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by mb...@apache.org on 2007/12/07 15:50:13 UTC
svn commit: r602111 - in /incubator/uima/sandbox/trunk/WhitespaceTokenizer: ./ desc/WhitespaceTokenizer.xml src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java

Author: mbaessler
Date: Fri Dec  7 06:50:11 2007
New Revision: 602111

URL: http://svn.apache.org/viewvc?rev=602111&view=rev
Log:
UIMA-669

make WhitespaceTokenizer Sofa aware

https://issues.apache.org/jira/browse/UIMA-669

Modified:
    incubator/uima/sandbox/trunk/WhitespaceTokenizer/   (props changed)
    incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml
    incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java

Propchange: incubator/uima/sandbox/trunk/WhitespaceTokenizer/
------------------------------------------------------------------------------
--- svn:ignore (original)
+++ svn:ignore Fri Dec  7 06:50:11 2007
@@ -1,5 +1,5 @@
-
-.settings
-target
-.classpath
-.project
+.settings
+target
+.classpath
+.project
+metadata

Modified: incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml?rev=602111&r1=602110&r2=602111&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml (original)
+++ incubator/uima/sandbox/trunk/WhitespaceTokenizer/desc/WhitespaceTokenizer.xml Fri Dec  7 06:50:11 2007
@@ -38,11 +38,34 @@
 			separated languages
 		</description>
 		<version>1.0</version>
-		<vendor>Michael Baessler</vendor>
+		<vendor>The Apache Software Foundation</vendor>
 
-		<configurationParameters></configurationParameters>
+		<configurationParameters>
+			<configurationParameter>
+				<name>SofaNames</name>
+				<description>
+					The Sofa names the annotator should work on. If no
+					names are specified, the annotator works on the
+					default sofa.
+				</description>
+				<type>String</type>
+				<multiValued>true</multiValued>
+				<mandatory>false</mandatory>
+			</configurationParameter>
+
+		</configurationParameters>
 
 		<configurationParameterSettings>
+		<!-- 
+			<nameValuePair>
+				<name>SofaNames</name>
+				<value>
+					<array>
+						<string>sofaName</string>
+					</array>
+				</value>
+			</nameValuePair>
+		-->
 		</configurationParameterSettings>
 
 		<typeSystemDescription>
@@ -76,7 +99,9 @@
 				<inputs />
 				<outputs>
 					<type>org.apache.uima.TokenAnnotation</type>
-					<feature>org.apache.uima.TokenAnnotation:tokentype</feature>
+					<feature>
+						org.apache.uima.TokenAnnotation:tokentype
+					</feature>
 					<type>org.apache.uima.SentenceAnnotation</type>
 				</outputs>
 				<languagesSupported>

Modified: incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java
URL: http://svn.apache.org/viewvc/incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java?rev=602111&r1=602110&r2=602111&view=diff
==============================================================================
--- incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java (original)
+++ incubator/uima/sandbox/trunk/WhitespaceTokenizer/src/main/java/org/apache/uima/annotator/WhitespaceTokenizer.java Fri Dec  7 06:50:11 2007
@@ -18,25 +18,23 @@
  */
 package org.apache.uima.annotator;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.List;
 
-import org.apache.uima.analysis_engine.ResultSpecification;
-import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.analysis_engine.annotator.JTextAnnotator_ImplBase;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationFS;
-import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.util.Level;
 import org.apache.uima.util.Logger;
 
-public class WhitespaceTokenizer extends JTextAnnotator_ImplBase {
+public class WhitespaceTokenizer extends CasAnnotator_ImplBase {
 
    private static final int CH_SPECIAL = 0;
 
@@ -68,129 +66,150 @@
 
    private Logger logger;
 
+   private String[] sofaNames;
+
    private static List<String> punctuations = Arrays.asList(new String[] { ".",
          "!", "?" });
 
    public static final String MESSAGE_BUNDLE = "org.apache.uima.annotator.whitespaceTokenizerMessages";
 
-   /*
-    * (non-Javadoc)
-    * 
-    * @see org.apache.uima.analysis_engine.annotator.JTextAnnotator#process(org.apache.uima.jcas.impl.JCas,
-    *      org.apache.uima.analysis_engine.ResultSpecification)
+   /* (non-Javadoc)
+    * @see org.apache.uima.analysis_component.CasAnnotator_ImplBase#process(org.apache.uima.cas.CAS)
     */
-   public void process(JCas aJCas, ResultSpecification aResultSpec)
-         throws AnnotatorProcessException {
+   public void process(CAS aCas) throws AnalysisEngineProcessException {
 
       this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
             MESSAGE_BUNDLE, "whitespace_tokenizer_info_start_processing");
 
-      // get text content from the CAS
-      char[] textContent = aJCas.getCas().getDocumentText().toCharArray();
-      this.cas = aJCas.getCas();
-      int tokenStart = UNDEFINED;
-      int currentCharPos = 0;
-      int sentenceStart = 0;
-      int nextCharType = UNDEFINED;
-      char nextChar = INVALID_CHAR;
-
-      while (currentCharPos < textContent.length) {
-         char currentChar = textContent[currentCharPos];
-         int currentCharType = getCharacterType(currentChar);
-
-         // get character class for current and next character
-         if ((currentCharPos + 1) < textContent.length) {
-            nextChar = textContent[currentCharPos + 1];
-            nextCharType = getCharacterType(nextChar);
-         } else {
-            nextCharType = UNDEFINED;
-            nextChar = INVALID_CHAR;
+      ArrayList<CAS> casList = new ArrayList<CAS>();
+      // check if sofa names are available
+      if (this.sofaNames != null && this.sofaNames.length > 0) {
+
+         // get sofa names
+         for (int i = 0; i < this.sofaNames.length; i++) {
+            Iterator it = aCas.getViewIterator(this.sofaNames[i]);
+            while (it.hasNext()) {
+               // add sofas to the cas List to process
+               casList.add((CAS) it.next());
+            }
          }
+      } else {
+         // use default sofa for the processing
+         casList.add(aCas);
+      }
 
-         // check if current character is a letter or number
-         if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
+      for (int x = 0; x < casList.size(); x++) {
 
-            // check if it is the first letter of a token
-            if (tokenStart == UNDEFINED) {
-               // start new token here
-               tokenStart = currentCharPos;
-            }
-         }
+         this.cas = casList.get(x);
 
-         // check if current character is a whitespace character
-         else if (currentCharType == CH_WHITESPACE) {
+         // get text content from the CAS
+         char[] textContent = this.cas.getDocumentText().toCharArray();
 
-            // terminate current token
-            if (tokenStart != UNDEFINED) {
-               // end of current word
-               createAnnotation(this.tokenType, tokenStart, currentCharPos);
-               tokenStart = UNDEFINED;
+         int tokenStart = UNDEFINED;
+         int currentCharPos = 0;
+         int sentenceStart = 0;
+         int nextCharType = UNDEFINED;
+         char nextChar = INVALID_CHAR;
+
+         while (currentCharPos < textContent.length) {
+            char currentChar = textContent[currentCharPos];
+            int currentCharType = getCharacterType(currentChar);
+
+            // get character class for current and next character
+            if ((currentCharPos + 1) < textContent.length) {
+               nextChar = textContent[currentCharPos + 1];
+               nextCharType = getCharacterType(nextChar);
+            } else {
+               nextCharType = UNDEFINED;
+               nextChar = INVALID_CHAR;
             }
-         }
 
-         // check if current character is a special character
-         else if (currentCharType == CH_SPECIAL) {
+            // check if current character is a letter or number
+            if (currentCharType == CH_LETTER || currentCharType == CH_NUMBER) {
 
-            // terminate current token
-            if (tokenStart != UNDEFINED) {
-               // end of current word
-               createAnnotation(this.tokenType, tokenStart, currentCharPos);
-               tokenStart = UNDEFINED;
+               // check if it is the first letter of a token
+               if (tokenStart == UNDEFINED) {
+                  // start new token here
+                  tokenStart = currentCharPos;
+               }
             }
 
-            // create token for special character
-            createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
-         }
+            // check if current character is a whitespace character
+            else if (currentCharType == CH_WHITESPACE) {
 
-         // check if current character is new line character
-         else if (currentCharType == CH_NEWLINE) {
-            // terminate current token
-            if (tokenStart != UNDEFINED) {
-               // end of current word
-               createAnnotation(this.tokenType, tokenStart, currentCharPos);
-               tokenStart = UNDEFINED;
+               // terminate current token
+               if (tokenStart != UNDEFINED) {
+                  // end of current word
+                  createAnnotation(this.tokenType, tokenStart, currentCharPos);
+                  tokenStart = UNDEFINED;
+               }
             }
-         }
 
-         // check if current character is new punctuation character
-         else if (currentCharType == CH_PUNCTUATION) {
+            // check if current character is a special character
+            else if (currentCharType == CH_SPECIAL) {
 
-            // terminates the current token
-            if (tokenStart != UNDEFINED) {
-               createAnnotation(this.tokenType, tokenStart, currentCharPos);
-               tokenStart = UNDEFINED;
+               // terminate current token
+               if (tokenStart != UNDEFINED) {
+                  // end of current word
+                  createAnnotation(this.tokenType, tokenStart, currentCharPos);
+                  tokenStart = UNDEFINED;
+               }
+
+               // create token for special character
+               createAnnotation(this.tokenType, currentCharPos,
+                     currentCharPos + 1);
             }
 
-            // check next token type so see if we have a sentence end
-            if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE))
-                  && (punctuations.contains(new String(
-                        new char[] { currentChar })))) {
-               // terminate sentence
-               createAnnotation(this.sentenceType, sentenceStart,
+            // check if current character is new line character
+            else if (currentCharType == CH_NEWLINE) {
+               // terminate current token
+               if (tokenStart != UNDEFINED) {
+                  // end of current word
+                  createAnnotation(this.tokenType, tokenStart, currentCharPos);
+                  tokenStart = UNDEFINED;
+               }
+            }
+
+            // check if current character is new punctuation character
+            else if (currentCharType == CH_PUNCTUATION) {
+
+               // terminates the current token
+               if (tokenStart != UNDEFINED) {
+                  createAnnotation(this.tokenType, tokenStart, currentCharPos);
+                  tokenStart = UNDEFINED;
+               }
+
+               // check next character type to see if we have a sentence end
+               if (((nextCharType == CH_WHITESPACE) || (nextCharType == CH_NEWLINE))
+                     && (punctuations.contains(new String(
+                           new char[] { currentChar })))) {
+                  // terminate sentence
+                  createAnnotation(this.sentenceType, sentenceStart,
+                        currentCharPos + 1);
+                  sentenceStart = currentCharPos + 1;
+               }
+               // create token for punctuation character
+               createAnnotation(this.tokenType, currentCharPos,
                      currentCharPos + 1);
-               sentenceStart = currentCharPos + 1;
             }
-            // create token for punctuation character
-            createAnnotation(this.tokenType, currentCharPos, currentCharPos + 1);
+            // go to the next character
+            currentCharPos++;
+         } // end of character loop
+
+         // we are at the end of the text terminate open token annotations
+         if (tokenStart != UNDEFINED) {
+            // end of current word
+            createAnnotation(this.tokenType, tokenStart, currentCharPos);
+            tokenStart = UNDEFINED;
          }
-         // go to the next token
-         currentCharPos++;
-      } // end of character loop
-
-      // we are at the end of the text terminate open token annotations
-      if (tokenStart != UNDEFINED) {
-         // end of current word
-         createAnnotation(this.tokenType, tokenStart, currentCharPos);
-         tokenStart = UNDEFINED;
-      }
 
-      // we are at the end of the text terminate open sentence annotations
-      if (sentenceStart != UNDEFINED) {
-         // end of current word
-         createAnnotation(this.sentenceType, sentenceStart, currentCharPos);
-         sentenceStart = UNDEFINED;
+         // we are at the end of the text terminate open sentence annotations
+         if (sentenceStart != UNDEFINED) {
+            // end of current sentence
+            createAnnotation(this.sentenceType, sentenceStart, currentCharPos);
+            sentenceStart = UNDEFINED;
+         }
       }
-
       this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "process",
             MESSAGE_BUNDLE, "whitespace_tokenizer_info_stop_processing");
    }
@@ -277,41 +296,33 @@
 
    }
 
-   /*
-    * (non-Javadoc)
-    * 
-    * @see org.apache.uima.analysis_engine.annotator.Annotator_ImplBase#initialize(org.apache.uima.analysis_engine.annotator.AnnotatorContext)
-    */
-   public void initialize(AnnotatorContext aContext)
-         throws AnnotatorInitializationException,
-         AnnotatorConfigurationException {
-
-      // initialize logger
-      try {
-         this.logger = aContext.getLogger();
-      } catch (AnnotatorContextException ex) {
-         throw new AnnotatorInitializationException(ex);
-      }
-
-      this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "initialize",
-            MESSAGE_BUNDLE, "whitespace_tokenizer_info_initialized");
-   }
-
-   /*
-    * (non-Javadoc)
-    * 
-    * @see org.apache.uima.analysis_engine.annotator.Annotator_ImplBase#typeSystemInit(org.apache.uima.cas.TypeSystem)
-    */
-   public void typeSystemInit(TypeSystem aTypeSystem)
-         throws AnnotatorInitializationException,
-         AnnotatorConfigurationException {
+   @Override
+   public void typeSystemInit(TypeSystem typeSystem)
+         throws AnalysisEngineProcessException {
 
+      super.typeSystemInit(typeSystem);
       // initialize cas token type
-      this.tokenType = aTypeSystem.getType(TOKEN_ANNOTATION_NAME);
+      this.tokenType = typeSystem.getType(TOKEN_ANNOTATION_NAME);
 
-      this.sentenceType = aTypeSystem.getType(SENTENCE_ANNOTATION_NAME);
+      this.sentenceType = typeSystem.getType(SENTENCE_ANNOTATION_NAME);
 
       this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "typeSystemInit",
             MESSAGE_BUNDLE, "whitespace_tokenizer_info_typesystem_initialized");
+
+   }
+
+   @Override
+   public void initialize(UimaContext context)
+         throws ResourceInitializationException {
+      super.initialize(context);
+
+      this.sofaNames = (String[]) getContext().getConfigParameterValue(
+            "SofaNames");
+
+      this.logger = context.getLogger();
+
+      this.logger.logrb(Level.INFO, "WhitespaceTokenizer", "initialize",
+            MESSAGE_BUNDLE, "whitespace_tokenizer_info_initialized");
+
    }
 }