You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/04/17 00:44:42 UTC
svn commit: r1588089 -
/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java
Author: tmill
Date: Wed Apr 16 22:44:42 2014
New Revision: 1588089
URL: http://svn.apache.org/r1588089
Log:
CTAKES-295: Updated TokenizerAnnotatorPTB to use UIMAFit configuration.
Modified:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java
Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java?rev=1588089&r1=1588088&r2=1588089&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTB.java Wed Apr 16 22:44:42 2014
@@ -18,28 +18,26 @@
*/
package org.apache.ctakes.core.ae;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.log4j.Logger;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.JFSIndexRepository;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceAccessException;
-import org.apache.uima.resource.ResourceInitializationException;
-
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
import org.apache.ctakes.core.nlp.tokenizer.TokenizerPTB;
-import org.apache.ctakes.core.util.ParamUtil;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
-import org.apache.ctakes.typesystem.type.textspan.Segment;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JFSIndexRepository;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
/**
* UIMA annotator that tokenizes based on Penn Treebank rules.
@@ -56,44 +54,34 @@ public class TokenizerAnnotatorPTB exten
* of type String, should be multi-valued and optional.
*/
public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";
-
-
- private UimaContext context;
- private Set<String> skipSegmentsSet;
+ @ConfigurationParameter(
+ name = PARAM_SEGMENTS_TO_SKIP,
+ mandatory = false,
+ description = "Set of segments that can be skipped"
+ )
+ private String[] skipSegmentsArray;
+ private Set<String> skipSegmentsSet;
private TokenizerPTB tokenizer;
private int tokenCount = 0;
- public void initialize(UimaContext aContext) throws ResourceInitializationException {
-
+ @Override
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
-
- logger.info("Initializing " + this.getClass().getName());
- context = aContext;
- try {
- configInit();
- } catch (ResourceAccessException e) {
- throw new ResourceInitializationException(e);
- } finally {};
- }
-
- /**
- * Reads configuration parameters.
- * @throws ResourceAccessException
- */
- private void configInit() throws ResourceAccessException {
-
- skipSegmentsSet = ParamUtil.getStringParameterValuesSet(PARAM_SEGMENTS_TO_SKIP, context);
-
- tokenizer = new TokenizerPTB();
-
+ logger.info("Initializing " + this.getClass().getName());
+ tokenizer = new TokenizerPTB();
+ skipSegmentsSet = new HashSet<>();
+ if(skipSegmentsArray != null){
+ Collections.addAll(skipSegmentsSet, skipSegmentsArray);
+ }
}
/**
* Entry point for processing.
*/
- public void process(JCas jcas) throws AnalysisEngineProcessException {
+ @Override
+ public void process(JCas jcas) throws AnalysisEngineProcessException {
logger.info("process(JCas) in " + this.getClass().getName());
@@ -105,11 +93,7 @@ public class TokenizerAnnotatorPTB exten
Segment sa = (Segment) segmentItr.next();
String segmentID = sa.getId();
if (!skipSegmentsSet.contains(segmentID)) {
- try {
- annotateRange(jcas, sa.getBegin(), sa.getEnd());
- } catch (AnnotatorProcessException e) {
- throw new AnalysisEngineProcessException(e);
- }
+ annotateRange(jcas, sa.getBegin(), sa.getEnd());
}
}
}
@@ -122,9 +106,10 @@ public class TokenizerAnnotatorPTB exten
* Tokenizes a range of text, adding the tokens to the CAS
* Tokenizes one sentence at a time. Only tokenizes what is within Sentence annotation.
* There must have been Sentence annotations created beforehand in order for this method
- * to tokenize anything.
+ * to tokenize anything.
+ * @throws AnalysisEngineProcessException if the tokenizer returns an object that is not a BaseToken
*/
- protected void annotateRange(JCas jcas, int rangeBegin, int rangeEnd) throws AnnotatorProcessException {
+ protected void annotateRange(JCas jcas, int rangeBegin, int rangeEnd) throws AnalysisEngineProcessException {
// int tokenCount = 0; // can't start with tokenCount=0 here because this method can be called multiple times
JFSIndexRepository indexes = jcas.getJFSIndexRepository();
@@ -162,15 +147,19 @@ public class TokenizerAnnotatorPTB exten
if (sentence.getBegin() < rangeBegin || sentence.getEnd() > rangeEnd) {
continue;
}
- List<BaseToken> tokens = (List<BaseToken>)tokenizer.tokenizeTextSegment(jcas, sentence.getCoveredText(), sentence.getBegin(), true);
- for (BaseToken bta: tokens) {
+ List<?> tokens = tokenizer.tokenizeTextSegment(jcas, sentence.getCoveredText(), sentence.getBegin(), true);
+ for (Object bta: tokens) {
if (bta==null) {
Exception e = new RuntimeException("bta==null tokenCount=" + tokenCount + " tokens.size()==" + tokens.size());
e.printStackTrace();
} else{
//logger.info("Token #" + tokenCount + " len = " + bta.getCoveredText().length() + " " + bta.getCoveredText());
- // add the BaseToken to CAS index
- bta.addToIndexes();
+ // add the BaseToken to CAS index
+ if(BaseToken.class.isAssignableFrom(bta.getClass())){
+ BaseToken.class.cast(bta).addToIndexes();
+ }else{
+ throw new AnalysisEngineProcessException("Token returned cannot be cast as BaseToken", new Object[]{bta});
+ }
//tokenCount++;
}
}