You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by al...@apache.org on 2006/12/07 23:40:30 UTC
svn commit: r483709 [5/5] - in
/incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src:
com/ibm/uima/examples/opennlp/ com/ibm/uima/examples/opennlp/annotator/
org/ org/apache/ org/apache/uima/ org/apache/uima/examples/
org/apache/uima...
Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Parser.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Parser.java?view=diff&rev=483709&r1=477887&r2=483709
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Parser.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Parser.java Thu Dec 7 14:40:23 2006
@@ -30,394 +30,368 @@
import opennlp.tools.parser.ParserME;
import opennlp.tools.util.Span;
-import org.apache.uima.analysis_engine.ResultSpecification;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.analysis_engine.annotator.JTextAnnotator_ImplBase;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.examples.opennlp.Sentence;
import org.apache.uima.examples.opennlp.SyntaxAnnotation;
import org.apache.uima.examples.opennlp.Token;
import org.apache.uima.jcas.impl.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
/**
- * UIMA Analysis Engine that invokes the OpenNLP Parser. The OpenNLP Parser
- * generates PennTreeBank style syntax tags. These tags are mapped into
- * annotation types according to the tag mapping table (MAPPINGS_PARAM)
- * parameter and corresponding annotations are created in the CAS. The directory
- * containing the various model files used by the OpenNLP Parser must also be
- * specified as a parameter (MODEL_DIR_PARAM).
+ * UIMA Analysis Engine that invokes the OpenNLP Parser. The OpenNLP Parser generates PennTreeBank
+ * style syntax tags. These tags are mapped into annotation types according to the tag mapping table
+ * (MAPPINGS_PARAM) parameter and corresponding annotations are created in the CAS. The directory
+ * containing the various model files used by the OpenNLP Parser must also be specified as a
+ * parameter (MODEL_DIR_PARAM).
*
*/
-public class Parser extends JTextAnnotator_ImplBase {
+public class Parser extends JCasAnnotator_ImplBase {
- /** Parse tag mappings array parameter name. */
- private static final String MAPPINGS_PARAM = "ParseTagMappings";
+ /** Parse tag mappings array parameter name. */
+ private static final String MAPPINGS_PARAM = "ParseTagMappings";
- /** Model directory parameter name. */
- private static final String MODEL_DIR_PARAM = "ModelDirectory";
+ /** Model directory parameter name. */
+ private static final String MODEL_DIR_PARAM = "ModelDirectory";
- /** Use tag dictionary flag parameter name. */
- private static final String USE_TAG_DICT_PARAM = "UseTagDictionary";
-
- /** Case sensitive tag dictionary flag parameter name. */
- private static final String CASE_INSESNITIVE_TD_PARAM = "CaseSensitiveTagDictionary";
-
- /** Beam size paramter name. */
- private static final String BEAM_SIZE_PARAM = "BeamSize";
-
- /** Advance percentage parameter name. */
- private static final String ADV_PERCENT_PARAM = "AdvancePercentage";
-
- /** Name to use for this Analysis Engine component. */
- private static final String COMPONENT_NAME = "OpenNLP Parser";
-
- /** The OpenNLP parser */
- private ParserME parser;
-
- /**
- * Hashtable for characters that must be escaped because they have special
- * meaning for the parser.
- */
- private Hashtable escapeMap = new Hashtable();
+ /** Use tag dictionary flag parameter name. */
+ private static final String USE_TAG_DICT_PARAM = "UseTagDictionary";
+
+ /** Case sensitive tag dictionary flag parameter name. */
+ private static final String CASE_INSESNITIVE_TD_PARAM = "CaseSensitiveTagDictionary";
+
+ /** Beam size parameter name. */
+ private static final String BEAM_SIZE_PARAM = "BeamSize";
+
+ /** Advance percentage parameter name. */
+ private static final String ADV_PERCENT_PARAM = "AdvancePercentage";
+
+ /** Name to use for this Analysis Engine component. */
+ private static final String COMPONENT_NAME = "OpenNLP Parser";
+
+ /** The OpenNLP parser */
+ private ParserME parser;
+
+ /**
+ * Hashtable for characters that must be escaped because they have special meaning for the parser.
+ */
+ private Hashtable escapeMap = new Hashtable();
+
+ /**
+ * Table to keep track of span offsets when characters are escaped. Required to properly set spans
+ * in parse annotations.
+ */
+ private OffsetMap offsetMap = new OffsetMap();
+
+ /**
+ * Hash that maps parse tags to the constructor for the corresponding annotation type class.
+ */
+ private Hashtable parseTagMap = new Hashtable();
+
+ /**
+ * Initialize the Annotator.
+ *
+ * @see JCasAnnotator_ImplBase#initialize(UimaContext)
+ */
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
+
+ try {
+ String[] mappingStrings = null;
+
+ mappingStrings = (String[]) aContext.getConfigParameterValue(MAPPINGS_PARAM);
+ if (mappingStrings == null) {
+ throw new AnnotatorConfigurationException();
+ }
+ loadMappings(mappingStrings);
+
+ String modelDirName = (String) aContext.getConfigParameterValue(MODEL_DIR_PARAM);
+
+ File modelDir = new File(modelDirName);
+ if (!modelDir.isDirectory()) {
+ throw new AnnotatorConfigurationException();
+ }
+
+ // set parameter defaults
+ boolean useTagDictionary = false;
+ boolean caseSensitiveTagDictionary = false;
+ int beamSize = ParserME.defaultBeamSize;
+ double advancePercentage = ParserME.defaultAdvancePercentage;
+
+ Boolean useTagDictP = (Boolean) aContext.getConfigParameterValue(USE_TAG_DICT_PARAM);
+ if (useTagDictP != null)
+ useTagDictionary = useTagDictP.booleanValue();
+ Boolean caseSensitiveTagDictP = (Boolean) aContext
+ .getConfigParameterValue(CASE_INSESNITIVE_TD_PARAM);
+ if (caseSensitiveTagDictP != null)
+ caseSensitiveTagDictionary = caseSensitiveTagDictP.booleanValue();
+ Integer beamSizeInt = (Integer) aContext.getConfigParameterValue(BEAM_SIZE_PARAM);
+ if (beamSizeInt != null)
+ beamSize = beamSizeInt.intValue();
+ Float advPercentFlt = (Float) aContext.getConfigParameterValue(ADV_PERCENT_PARAM);
+ if (advPercentFlt != null)
+ advancePercentage = advPercentFlt.doubleValue();
+
+ parser = TreebankParser.getParser(modelDirName, useTagDictionary, caseSensitiveTagDictionary,
+ beamSize, advancePercentage);
+ } catch (Exception e) {
+ throw new ResourceInitializationException(e);
+ }
+ initEscapeMap();
+ }
- /**
- * Table to keep track of span offsets when characters are escaped. Required to
- * properly set spans in parse annotations.
- */
- private OffsetMap offsetMap = new OffsetMap();
+ /**
+ * Processes the parse tag mappings parameter. The constructor for each class identified in the
+ * array is loaded and stored in the mapping hashtable, using the label provided in the parameter
+ * as the key.
+ *
+ * @param mappingStrings
+ * Array of mapping strings of the form "tag,class"
+ * @throws AnnotatorConfigurationException
+ */
+ private void loadMappings(String[] mappingStrings) throws AnnotatorConfigurationException {
+ // populate the mappings hash table (key: parse tag,CAS Annotation Type
+ // Constructor)
+ for (int i = 0; i < mappingStrings.length; i++) {
+ String[] mappingPair = mappingStrings[i].split(",");
+ if (mappingPair.length < 2)
+ throw new AnnotatorConfigurationException();
+
+ String parseTag = mappingPair[0];
+ String className = mappingPair[1];
+
+ Constructor annotationConstructor;
+ // get the name of the JCAS type with this name
+ Class annotationClass;
+ try {
+ annotationClass = Class.forName(className);
+ // get the constructor for that JCAS type
+ annotationConstructor = annotationClass.getConstructor(new Class[] { JCas.class });
+ } catch (Exception e) {
+ throw new AnnotatorConfigurationException(e);
+ }
+ parseTagMap.put(parseTag, annotationConstructor);
+ }
+ }
- /**
- * Hash that maps parse tags to the constructor for the corresponding
- * annotation type class.
- */
- private Hashtable parseTagMap = new Hashtable();
+ /**
+ * Process a CAS.
+ *
+ * @see JCasAnnotator_ImplBase#process(JCas)
+ */
+ public void process(JCas aJCas) throws AnalysisEngineProcessException {
+
+ ArrayList wordList = new ArrayList();
+ StringBuffer sentenceBuffer = new StringBuffer();
+ offsetMap.clear();
+
+ AnnotationIndex sentenceIndex = (AnnotationIndex) aJCas.getJFSIndexRepository()
+ .getAnnotationIndex(Sentence.type);
+ AnnotationIndex tokenIndex = (AnnotationIndex) aJCas.getJFSIndexRepository()
+ .getAnnotationIndex(Token.type);
+
+ // iterate over Sentences
+ FSIterator sentenceIterator = sentenceIndex.iterator();
+ while (sentenceIterator.hasNext()) {
+ Sentence sentence = (Sentence) sentenceIterator.next();
+
+ wordList.clear();
+ sentenceBuffer.setLength(0);
+
+ int mapIdx = 0;
+
+ // iterate over Tokens
+ FSIterator tokenIterator = tokenIndex.subiterator(sentence);
+ while (tokenIterator.hasNext()) {
+ Token token = (Token) tokenIterator.next();
+
+ String word = escapeToken(token.getCoveredText());
+
+ int start = sentenceBuffer.length();
+ int end = start + word.length();
+
+ int origIdx = token.getBegin();
+ for (mapIdx = start; mapIdx <= end; mapIdx++) {
+ offsetMap.putMapping(mapIdx, origIdx);
+ if (origIdx < token.getEnd())
+ origIdx++;
+ }
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(org.apache.uima.analysis_engine.annotator.AnnotatorContext)
- */
- public void initialize(AnnotatorContext aContext)
- throws AnnotatorInitializationException,
- AnnotatorConfigurationException {
- super.initialize(aContext);
+ sentenceBuffer.append(word + " ");
+ wordList.add(word);
+ }
+
+ if (sentenceBuffer.length() == 0) // check for empty sentence
+ continue;
+
+ String sentenceText = sentenceBuffer.substring(0, sentenceBuffer.length() - 1);
+
+ Parse parse = new Parse(sentenceText, new Span(0, sentenceText.length()), "INC", 1, null);
+
+ int tokenStart = 0;
+ int tokenEnd = 0;
+ Iterator wordIterator = wordList.iterator();
+ while (wordIterator.hasNext()) {
+ String word = (String) wordIterator.next();
+ tokenEnd = tokenStart + word.length();
+ parse.insert(new Parse(sentenceText, new Span(tokenStart, tokenEnd), ParserME.TOK_NODE, 0));
+ tokenStart = tokenEnd + 1; // advance past space
+ }
+ parse = parser.parse(parse);
+
+ makeAnnotations(parse, aJCas);
+
+ // parse.show();
+ // System.out.println("");
+ // System.out.println(show(parse));
+ }
+ }
+ /**
+ * Initializes the table of characters that must be "escaped". These characters have special
+ * meaning to the parser, so they are replaced with a special string, which is understood by the
+ * parser to represent that character.
+ */
+ private void initEscapeMap() {
+ escapeMap.put("(", "-LRB-");
+ escapeMap.put(")", "-RRB-");
+ escapeMap.put("{", "-LCB-");
+ escapeMap.put("}", "-RCB-");
+ escapeMap.put("[", "-LSB-");
+ escapeMap.put("]", "-RSB-");
+ }
+
+ /**
+ * Escape the input token, if necessary. Consult the EscapeMap to see if the input token is a
+ * character that must be escaped and, if so, return the escape sequence. Otherwise, return the
+ * input token.
+ *
+ * @param token
+ * The token to escape.
+ * @return If token must be escaped, then the escaped token, otherwise the original token.
+ */
+ private String escapeToken(String token) {
+ String newToken = (String) escapeMap.get(token);
+ if (newToken == null)
+ return token;
+ return newToken;
+ }
+
+ /**
+ * Create the parse annotations in the CAS corresponding to the results of the OpenNLP parse.
+ *
+ * @param parse
+ * The parse generated by the OpenNLP parser.
+ * @param jCas
+ * The JCas in which to create the annotations.
+ * @throws AnalysisEngineProcessException
+ */
+ private void makeAnnotations(Parse parse, JCas jCas) throws AnalysisEngineProcessException {
+ Span span = parse.getSpan();
+ String tag = parse.getType();
+ if (!tag.equals(ParserME.TOK_NODE)) {
+
+ // make the annotation
+ int start = offsetMap.getMapping(span.getStart());
+ int end = offsetMap.getMapping(span.getEnd());
+ Constructor annotationMaker = (Constructor) parseTagMap.get(tag);
+ if (annotationMaker != null) {
+ SyntaxAnnotation syntaxAnnot;
try {
- String[] mappingStrings = null;
-
- mappingStrings = (String[]) aContext
- .getConfigParameterValue(MAPPINGS_PARAM);
- if (mappingStrings == null) {
- throw new AnnotatorConfigurationException();
- }
- loadMappings(mappingStrings);
-
- String modelDirName = (String) aContext
- .getConfigParameterValue(MODEL_DIR_PARAM);
-
- File modelDir = new File(modelDirName);
- if (!modelDir.isDirectory()) {
- throw new AnnotatorConfigurationException();
- }
-
- // set parameter defaults
- boolean useTagDictionary = false;
- boolean caseSensitiveTagDictionary = false;
- int beamSize = ParserME.defaultBeamSize;
- double advancePercentage = ParserME.defaultAdvancePercentage;
-
- Boolean useTagDictP = (Boolean) aContext
- .getConfigParameterValue(USE_TAG_DICT_PARAM);
- if (useTagDictP != null)
- useTagDictionary = useTagDictP.booleanValue();
- Boolean caseSensitiveTagDictP = (Boolean) aContext
- .getConfigParameterValue(CASE_INSESNITIVE_TD_PARAM);
- if (caseSensitiveTagDictP != null)
- caseSensitiveTagDictionary = caseSensitiveTagDictP
- .booleanValue();
- Integer beamSizeInt = (Integer) aContext
- .getConfigParameterValue(BEAM_SIZE_PARAM);
- if (beamSizeInt != null)
- beamSize = beamSizeInt.intValue();
- Float advPercentFlt = (Float) aContext
- .getConfigParameterValue(ADV_PERCENT_PARAM);
- if (advPercentFlt != null)
- advancePercentage = advPercentFlt.doubleValue();
-
- parser = TreebankParser.getParser(modelDirName,
- useTagDictionary, caseSensitiveTagDictionary, beamSize,
- advancePercentage);
+ syntaxAnnot = (SyntaxAnnotation) annotationMaker.newInstance(new Object[] { jCas });
} catch (Exception e) {
- throw new AnnotatorInitializationException(e);
+ throw new AnalysisEngineProcessException(e);
}
- initEscapeMap();
+ syntaxAnnot.setBegin(start);
+ syntaxAnnot.setEnd(end);
+ syntaxAnnot.setComponentId(COMPONENT_NAME);
+ syntaxAnnot.addToIndexes();
+ }
+ Parse[] children = parse.getChildren();
+ for (int i = 0; i < children.length; i++) {
+ makeAnnotations(children[i], jCas);
+ }
}
+ }
- /**
- * Processes the parse tag mappaings parameter. The constructor for each
- * class identified in the array is loaded and stored in the mapping
- * hashtable, using the label provided in the parameter as the key.
- *
- * @param mappingStrings
- * Array of mapping strings of the form "tag,class"
- * @throws AnnotatorConfigurationException
- */
- private void loadMappings(String[] mappingStrings)
- throws AnnotatorConfigurationException {
- // populate the mappings hash table (key: parse tag,CAS Annotation Type
- // Constructor)
- for (int i = 0; i < mappingStrings.length; i++) {
- String[] mappingPair = mappingStrings[i].split(",");
- if (mappingPair.length < 2)
- throw new AnnotatorConfigurationException();
-
- String parseTag = mappingPair[0];
- String className = mappingPair[1];
-
- Constructor annotationConstructor;
- // get the name of the JCAS type with this name
- Class annotationClass;
- try {
- annotationClass = Class.forName(className);
- // get the constructor for that JCAS type
- annotationConstructor = annotationClass
- .getConstructor(new Class[] { JCas.class });
- } catch (Exception e) {
- throw new AnnotatorConfigurationException(e);
- }
- parseTagMap.put(parseTag, annotationConstructor);
- }
+ public String show(Parse parse) {
+ Span span = parse.getSpan();
+ if (parse.getType().equals(ParserME.TOK_NODE)) {
+ return (parse.getText().substring(span.getStart(), span.getEnd()));
}
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.JTextAnnotator#process(org.apache.uima.jcas.impl.JCas,
- * org.apache.uima.analysis_engine.ResultSpecification)
- */
- public void process(JCas aJCas, ResultSpecification aResultSpec)
- throws AnnotatorProcessException {
-
- ArrayList wordList = new ArrayList();
- StringBuffer sentenceBuffer = new StringBuffer();
- offsetMap.clear();
-
- AnnotationIndex sentenceIndex = (AnnotationIndex) aJCas
- .getJFSIndexRepository().getAnnotationIndex(Sentence.type);
- AnnotationIndex tokenIndex = (AnnotationIndex) aJCas
- .getJFSIndexRepository().getAnnotationIndex(Token.type);
-
- // iterate over Sentences
- FSIterator sentenceIterator = sentenceIndex.iterator();
- while (sentenceIterator.hasNext()) {
- Sentence sentence = (Sentence) sentenceIterator.next();
-
- wordList.clear();
- sentenceBuffer.setLength(0);
-
- int mapIdx = 0;
-
- // iterate over Tokens
- FSIterator tokenIterator = tokenIndex.subiterator(sentence);
- while (tokenIterator.hasNext()) {
- Token token = (Token) tokenIterator.next();
-
- String word = escapeToken(token.getCoveredText());
-
- int start = sentenceBuffer.length();
- int end = start + word.length();
-
- int origIdx = token.getBegin();
- for (mapIdx = start; mapIdx <= end; mapIdx++) {
- offsetMap.putMapping(mapIdx, origIdx);
- if (origIdx < token.getEnd())
- origIdx++;
- }
-
- sentenceBuffer.append(word + " ");
- wordList.add(word);
- }
-
- if (sentenceBuffer.length() == 0) // check for empty sentence
- continue;
-
- String sentenceText = sentenceBuffer.substring(0, sentenceBuffer
- .length() - 1);
-
- Parse parse = new Parse(sentenceText, new Span(0, sentenceText
- .length()), "INC", 1, null);
-
- int tokenStart = 0;
- int tokenEnd = 0;
- Iterator wordIterator = wordList.iterator();
- while (wordIterator.hasNext()) {
- String word = (String) wordIterator.next();
- tokenEnd = tokenStart + word.length();
- parse.insert(new Parse(sentenceText, new Span(tokenStart,
- tokenEnd), ParserME.TOK_NODE, 0));
- tokenStart = tokenEnd + 1; // advance past space
- }
- parse = parser.parse(parse);
-
- makeAnnotations(parse, aJCas);
-
- // parse.show();
- // System.out.println("");
- // System.out.println(show(parse));
- }
+ Parse[] children = parse.getChildren();
+ if (children.length == 1) {
+ Parse childParse = children[0];
+ if (childParse.getType().equals(ParserME.TOK_NODE)) {
+ return (show(childParse) + "/" + parse.getType());
+ }
}
-
- /**
- * Initializes the table of characters that must be "escaped". These
- * characters have special meaning to the parser, so they are replaced with
- * a special string, which is understood by the parser to represent that
- * character.
- */
- private void initEscapeMap() {
- escapeMap.put("(", "-LRB-");
- escapeMap.put(")", "-RRB-");
- escapeMap.put("{", "-LCB-");
- escapeMap.put("}", "-RCB-");
- escapeMap.put("[", "-LSB-");
- escapeMap.put("]", "-RSB-");
+ String retVal = "(" + parse.getType() + " ";
+ for (int i = 0; i < children.length; i++) {
+ retVal += show(children[i]) + " ";
}
+ return (retVal + ")");
+ }
- /**
- * Escape the input token, if necessary. Consult the EscapeMap to see if the
- * input token is a character that must be escaped and, if so, return the
- * escape sequence. Otherwise, return the input token.
- *
- * @param token
- * The token to escape.
- * @return If token must be escaped, then the escaped token, otherwise the
- * original token.
- */
- private String escapeToken(String token) {
- String newToken = (String) escapeMap.get(token);
- if (newToken == null)
- return token;
- return newToken;
+ public void printParse(Parse parse, String prefix) {
+ System.out.println(prefix + "Label: " + parse.getLabel());
+ System.out.println(prefix + "Type: " + parse.getType());
+ Span span = parse.getSpan();
+ System.out.println(prefix + "Span: " + span.getStart() + ":" + span.getEnd());
+ System.out.println(prefix + "Text: "
+ + parse.getText().substring(span.getStart(), span.getEnd()));
+ Parse[] children = parse.getChildren();
+ for (int i = 0; i < children.length; i++) {
+ printParse(children[i], prefix + " ");
}
+ }
+
+ /**
+ * Private class to hold span offset mappings. When the input text contains special characters
+ * that must be escaped, the escape sequences are longer than the original text. This table keeps
+ * track of modified span offsets so that the results of the parse (performed on the
+ * length-modified text) can be mapped back to the original text spans.
+ */
+ private class OffsetMap extends ArrayList {
+
+ private static final long serialVersionUID = 1L;
+
/**
- * Create the parse annotations in the CAS corresponding to the results of
- * the OpenNLP parse.
+ * Store a span mapping in the table.
*
- * @param parse
- * The parse generated by the OpenNLP parser.
- * @param jCas
- * The JCas in which to create the annotations.
- * @throws AnnotatorProcessException
+ * @param index
+ * The new offset.
+ * @param offset
+ * The original offset.
*/
- private void makeAnnotations(Parse parse, JCas jCas)
- throws AnnotatorProcessException {
- Span span = parse.getSpan();
- String tag = parse.getType();
- if (!tag.equals(ParserME.TOK_NODE)) {
-
- // make the annotation
- int start = offsetMap.getMapping(span.getStart());
- int end = offsetMap.getMapping(span.getEnd());
- Constructor annotationMaker = (Constructor) parseTagMap.get(tag);
- if (annotationMaker != null) {
- SyntaxAnnotation syntaxAnnot;
- try {
- syntaxAnnot = (SyntaxAnnotation) annotationMaker
- .newInstance(new Object[] { jCas });
- } catch (Exception e) {
- throw new AnnotatorProcessException(e);
- }
- syntaxAnnot.setBegin(start);
- syntaxAnnot.setEnd(end);
- syntaxAnnot.setComponentId(COMPONENT_NAME);
- syntaxAnnot.addToIndexes();
- }
- Parse[] children = parse.getChildren();
- for (int i = 0; i < children.length; i++) {
- makeAnnotations(children[i], jCas);
- }
- }
- }
-
- public String show(Parse parse) {
- Span span = parse.getSpan();
- if (parse.getType().equals(ParserME.TOK_NODE)) {
- return (parse.getText().substring(span.getStart(), span.getEnd()));
- }
- Parse[] children = parse.getChildren();
- if (children.length == 1) {
- Parse childParse = children[0];
- if (childParse.getType().equals(ParserME.TOK_NODE)) {
- return (show(childParse) + "/" + parse.getType());
- }
- }
- String retVal = "(" + parse.getType() + " ";
- for (int i = 0; i < children.length; i++) {
- retVal += show(children[i]) + " ";
- }
- return (retVal + ")");
- }
-
- public void printParse(Parse parse, String prefix) {
- System.out.println(prefix + "Label: " + parse.getLabel());
- System.out.println(prefix + "Type: " + parse.getType());
- Span span = parse.getSpan();
- System.out.println(prefix + "Span: " + span.getStart() + ":"
- + span.getEnd());
- System.out.println(prefix + "Text: "
- + parse.getText().substring(span.getStart(), span.getEnd()));
- Parse[] children = parse.getChildren();
- for (int i = 0; i < children.length; i++) {
- printParse(children[i], prefix + " ");
- }
+ public void putMapping(int index, int offset) {
+ Integer element = new Integer(offset);
+ if (index < size()) {
+ set(index, element);
+ } else {
+ for (int i = size(); i < index; i++)
+ add(null);
+ add(element);
+ }
}
/**
- * Private class to hold span offset mappings. When the input text contains
- * special characters that must be escaped, the escape sequences are longer
- * than the original text. This table keeps track of modified span offsets
- * so that the results of the parse (performed on the length-modified text)
- * can be mapped back to the original text spans.
+ * Retrieve a span mapping from the table.
+ *
+ * @param index
+ * The new offset.
+ * @return The original offset.
*/
- private class OffsetMap extends ArrayList {
-
- private static final long serialVersionUID = 1L;
-
- /**
- * Store a span mapping in the table.
- *
- * @param index
- * The new offset.
- * @param offset
- * The original offset.
- */
- public void putMapping(int index, int offset) {
- Integer element = new Integer(offset);
-
- if (index < size()) {
- set(index, element);
- } else {
- for (int i = size(); i < index; i++)
- add(null);
- add(element);
- }
- }
-
- /**
- * Retrieve a span mapping from the table.
- *
- * @param index
- * The new offset.
- * @return The original offset.
- */
- public int getMapping(int index) {
- Integer element = (Integer) get(index);
- return element.intValue();
- }
+ public int getMapping(int index) {
+ Integer element = (Integer) get(index);
+ return element.intValue();
}
+ }
}
Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/SentenceDetector.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/SentenceDetector.java?view=diff&rev=483709&r1=477887&r2=483709
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/SentenceDetector.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/SentenceDetector.java Thu Dec 7 14:40:23 2006
@@ -21,69 +21,57 @@
import java.io.IOException;
-import org.apache.uima.analysis_engine.ResultSpecification;
-import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.analysis_engine.annotator.JTextAnnotator_ImplBase;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.examples.opennlp.Sentence;
import org.apache.uima.jcas.impl.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
/**
- * Simple Annotator to detect sentences and create Sentence annotations in the
- * CAS. Uses the OpenNLP MaxEnt Sentence Detector.
+ * Simple Annotator to detect sentences and create Sentence annotations in the CAS. Uses the OpenNLP
+ * MaxEnt Sentence Detector.
*
*/
-public class SentenceDetector extends JTextAnnotator_ImplBase {
+public class SentenceDetector extends JCasAnnotator_ImplBase {
- public static final String MODEL_FILE_PARAM = "ModelFile";
-
- public static final String COMPONENT_ID = "OpenNLP Sentence Detector";
-
- private opennlp.tools.lang.english.SentenceDetector sentenceDetector;
-
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(org.apache.uima.analysis_engine.annotator.AnnotatorContext)
- */
- public void initialize(AnnotatorContext aContext)
- throws AnnotatorInitializationException,
- AnnotatorConfigurationException {
- super.initialize(aContext);
- String modelFile;
- try {
- modelFile = (String) aContext
- .getConfigParameterValue(MODEL_FILE_PARAM);
- } catch (AnnotatorContextException e) {
- throw new AnnotatorConfigurationException(e);
- }
- try {
- sentenceDetector = new opennlp.tools.lang.english.SentenceDetector(modelFile);
- } catch (IOException e) {
- throw new AnnotatorInitializationException(e);
- }
+ public static final String MODEL_FILE_PARAM = "ModelFile";
+
+ public static final String COMPONENT_ID = "OpenNLP Sentence Detector";
+
+ private opennlp.tools.lang.english.SentenceDetector sentenceDetector;
+
+ /**
+ * Initialize the Annotator.
+ *
+ * @see JCasAnnotator_ImplBase#initialize(UimaContext)
+ */
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
+ String modelFile;
+ modelFile = (String) aContext.getConfigParameterValue(MODEL_FILE_PARAM);
+ try {
+ sentenceDetector = new opennlp.tools.lang.english.SentenceDetector(modelFile);
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
}
+ }
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.JTextAnnotator#process(org.apache.uima.jcas.impl.JCas,
- * org.apache.uima.analysis_engine.ResultSpecification)
- */
- public void process(JCas aJCas, ResultSpecification aResultSpec)
- throws AnnotatorProcessException {
- String docText = aJCas.getDocumentText();
- int sentenceOffsets[] = sentenceDetector.sentPosDetect(docText);
- int begin = 0;
- for (int i = 0; i < sentenceOffsets.length; i++) {
- Sentence sentence = new Sentence(aJCas, begin, sentenceOffsets[i]);
- sentence.setComponentId(COMPONENT_ID);
- sentence.addToIndexes();
- begin = sentenceOffsets[i];
- }
+ /**
+ * Process a CAS.
+ *
+ * @see JCasAnnotator_ImplBase#process(JCas)
+ */
+ public void process(JCas aJCas) throws AnalysisEngineProcessException {
+ String docText = aJCas.getDocumentText();
+ int sentenceOffsets[] = sentenceDetector.sentPosDetect(docText);
+ int begin = 0;
+ for (int i = 0; i < sentenceOffsets.length; i++) {
+ Sentence sentence = new Sentence(aJCas, begin, sentenceOffsets[i]);
+ sentence.setComponentId(COMPONENT_ID);
+ sentence.addToIndexes();
+ begin = sentenceOffsets[i];
}
+ }
}
Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Tokenizer.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Tokenizer.java?view=diff&rev=483709&r1=477887&r2=483709
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Tokenizer.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/opennlp_wrappers/src/org/apache/uima/examples/opennlp/annotator/Tokenizer.java Thu Dec 7 14:40:23 2006
@@ -23,90 +23,76 @@
import opennlp.tools.util.Span;
-import org.apache.uima.analysis_engine.ResultSpecification;
-import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContext;
-import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException;
-import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.analysis_engine.annotator.JTextAnnotator_ImplBase;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.examples.opennlp.Sentence;
import org.apache.uima.examples.opennlp.Token;
import org.apache.uima.jcas.impl.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
/**
- * UIMA Analysis Engine that invokes the OpenNLP Tokenizer. The OpenNLP
- * Tokenizer generates a PennTreeBank style tokenization. This annotator assumes
- * that sentences have already been annotated in the CAS with Sentence
- * annotations. We iterate over sentences and invoke the OpenNLP Tokenizer on
- * each sentence. For each token, a Token annotation is created in the CAS. The
- * model file for the OpenNLP Tokenizer is specified as a parameter
- * (MODEL_FILE_PARAM).
+ * UIMA Analysis Engine that invokes the OpenNLP Tokenizer. The OpenNLP Tokenizer generates a
+ * PennTreeBank style tokenization. This annotator assumes that sentences have already been
+ * annotated in the CAS with Sentence annotations. We iterate over sentences and invoke the OpenNLP
+ * Tokenizer on each sentence. For each token, a Token annotation is created in the CAS. The model
+ * file for the OpenNLP Tokenizer is specified as a parameter (MODEL_FILE_PARAM).
*
*/
-public class Tokenizer extends JTextAnnotator_ImplBase {
+public class Tokenizer extends JCasAnnotator_ImplBase {
- public static final String MODEL_FILE_PARAM = "ModelFile";
+ public static final String MODEL_FILE_PARAM = "ModelFile";
- public static final String COMPONENT_ID = "OpenNLP Tokenizer";
+ public static final String COMPONENT_ID = "OpenNLP Tokenizer";
- private opennlp.tools.lang.english.Tokenizer tokenizer;
+ private opennlp.tools.lang.english.Tokenizer tokenizer;
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.BaseAnnotator#initialize(org.apache.uima.analysis_engine.annotator.AnnotatorContext)
- */
- public void initialize(AnnotatorContext aContext)
- throws AnnotatorInitializationException,
- AnnotatorConfigurationException {
- super.initialize(aContext);
- String modelFile;
-
- try {
- modelFile = (String) aContext
- .getConfigParameterValue(MODEL_FILE_PARAM);
- } catch (AnnotatorContextException e) {
- throw new AnnotatorConfigurationException(e);
- }
-
- try {
- tokenizer = new opennlp.tools.lang.english.Tokenizer(modelFile);
- } catch (IOException e) {
- throw new AnnotatorInitializationException(e);
- }
- }
+ /**
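+ * Initializes the annotator: reads the MODEL_FILE_PARAM configuration parameter from the context and constructs the OpenNLP tokenizer from that model file; an IOException raised while doing so is rethrown as a ResourceInitializationException.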
+ *
+ * @see JCasAnnotator_ImplBase#initialize(UimaContext)
+ */
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+ super.initialize(aContext);
+ String modelFile;
+
+ modelFile = (String) aContext.getConfigParameterValue(MODEL_FILE_PARAM);
- /*
- * (non-Javadoc)
- *
- * @see org.apache.uima.analysis_engine.annotator.JTextAnnotator#process(org.apache.uima.jcas.impl.JCas,
- * org.apache.uima.analysis_engine.ResultSpecification)
- */
- public void process(JCas aJCas, ResultSpecification aResultSpec)
- throws AnnotatorProcessException {
-
- AnnotationIndex sentenceIndex = (AnnotationIndex) aJCas
- .getJFSIndexRepository().getAnnotationIndex(Sentence.type);
-
- // iterate over Sentences
- FSIterator sentenceIterator = sentenceIndex.iterator();
- while (sentenceIterator.hasNext()) {
- Sentence sentence = (Sentence) sentenceIterator.next();
-
- String text = sentence.getCoveredText();
- Span[] tokenSpans = tokenizer.tokenizePos(text);
- for (int i = 0; i < tokenSpans.length; i++) {
- Span span = tokenSpans[i];
- Token token = new Token(aJCas);
- token.setBegin(sentence.getBegin() + span.getStart());
- token.setEnd(sentence.getBegin() + span.getEnd());
- token.setComponentId(COMPONENT_ID);
- token.addToIndexes();
- }
- }
+ try {
+ tokenizer = new opennlp.tools.lang.english.Tokenizer(modelFile);
+ } catch (IOException e) {
+ throw new ResourceInitializationException(e);
+ }
+ }
+ /**
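+ * Processes a CAS: iterates over the Sentence annotations in the index, tokenizes each sentence's covered text, and adds one Token annotation per returned span, with begin/end shifted by the sentence's begin offset.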
+ *
+ * @see JCasAnnotator_ImplBase#process(JCas)
+ */
+ public void process(JCas aJCas) throws AnalysisEngineProcessException {
+
+ AnnotationIndex sentenceIndex = (AnnotationIndex) aJCas.getJFSIndexRepository()
+ .getAnnotationIndex(Sentence.type);
+
+ // iterate over Sentences
+ FSIterator sentenceIterator = sentenceIndex.iterator();
+ while (sentenceIterator.hasNext()) {
+ Sentence sentence = (Sentence) sentenceIterator.next();
+
+ String text = sentence.getCoveredText();
+ Span[] tokenSpans = tokenizer.tokenizePos(text);
+ for (int i = 0; i < tokenSpans.length; i++) {
+ Span span = tokenSpans[i];
+ Token token = new Token(aJCas);
+ token.setBegin(sentence.getBegin() + span.getStart());
+ token.setEnd(sentence.getBegin() + span.getEnd());
+ token.setComponentId(COMPONENT_ID);
+ token.addToIndexes();
+ }
}
+
+ }
}