You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2015/10/07 22:14:55 UTC
svn commit: r1707377 - /ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java
Author: tmill
Date: Wed Oct 7 20:14:55 2015
New Revision: 1707377
URL: http://svn.apache.org/viewvc?rev=1707377&view=rev
Log:
Fix POS tagger to only tag printable tokens.
Modified:
ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java
Modified: ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java?rev=1707377&r1=1707376&r2=1707377&view=diff
==============================================================================
--- ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java (original)
+++ ctakes/trunk/ctakes-pos-tagger/src/main/java/org/apache/ctakes/postagger/POSTagger.java Wed Oct 7 20:14:55 2015
@@ -51,27 +51,29 @@
package org.apache.ctakes.postagger;
import java.io.InputStream;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
-import opennlp.tools.postag.POSModel;
-
import org.apache.ctakes.core.resource.FileLocator;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.TypePrioritiesFactory;
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import opennlp.tools.postag.POSModel;
public class POSTagger extends JCasAnnotator_ImplBase {
@@ -114,19 +116,24 @@ public class POSTagger extends JCasAnnot
Collection<Sentence> sentences = JCasUtil.select(jCas, Sentence.class);
for (Sentence sentence : sentences) {
- List<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class,
- sentence);
- String[] words = new String[tokens.size()];
+ List<BaseToken> printableTokens = new ArrayList<>();
+
+ for(BaseToken token : JCasUtil.selectCovered(BaseToken.class, sentence)){
+ if(token instanceof NewlineToken) continue;
+ printableTokens.add(token);
+ }
+
+ String[] words = new String[printableTokens.size()];
for (int i = 0; i < words.length; i++) {
- words[i] = tokens.get(i).getCoveredText();
+ words[i] = printableTokens.get(i).getCoveredText();
}
if (words.length > 0) {
String[] wordTagList = tagger.tag(words);
try {
- for (int i = 0; i < tokens.size(); i++) {
- BaseToken token = tokens.get(i);
+ for (int i = 0; i < printableTokens.size(); i++) {
+ BaseToken token = printableTokens.get(i);
String posTag = wordTagList[i];
token.setPartOfSpeech(posTag);
}