You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2016/10/05 01:56:42 UTC
svn commit: r1763337 - /ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java

Author: seanfinan
Date: Wed Oct  5 01:56:42 2016
New Revision: 1763337

URL: http://svn.apache.org/viewvc?rev=1763337&view=rev
Log:
Check for zero printable tokens and skip such sentences before dependency parsing

Modified:
    ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java

Modified: ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java?rev=1763337&r1=1763336&r2=1763337&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java (original)
+++ ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/ae/ClearNLPDependencyParserAE.java Wed Oct  5 01:56:42 2016
@@ -1,55 +1,54 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.dependency.parser.ae;
 
-import java.io.InputStream;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.ctakes.core.resource.FileLocator;
-import org.apache.ctakes.dependency.parser.util.ClearDependencyUtility;
-import org.apache.ctakes.dependency.parser.util.DependencyUtility;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
-import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
-import org.apache.log4j.Logger;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
-import org.apache.uima.fit.descriptor.ConfigurationParameter;
-import org.apache.uima.fit.descriptor.TypeCapability;
-import org.apache.uima.fit.factory.AnalysisEngineFactory;
-import org.apache.uima.fit.util.JCasUtil;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-
-import com.googlecode.clearnlp.component.AbstractComponent;
-import com.googlecode.clearnlp.dependency.DEPFeat;
-import com.googlecode.clearnlp.dependency.DEPNode;
-import com.googlecode.clearnlp.dependency.DEPTree;
-import com.googlecode.clearnlp.engine.EngineGetter;
-import com.googlecode.clearnlp.morphology.AbstractMPAnalyzer;
-import com.googlecode.clearnlp.nlp.NLPLib;
-import com.googlecode.clearnlp.reader.AbstractReader;
+import com.googlecode.clearnlp.component.AbstractComponent;
+import com.googlecode.clearnlp.dependency.DEPFeat;
+import com.googlecode.clearnlp.dependency.DEPNode;
+import com.googlecode.clearnlp.dependency.DEPTree;
+import com.googlecode.clearnlp.engine.EngineGetter;
+import com.googlecode.clearnlp.morphology.AbstractMPAnalyzer;
+import com.googlecode.clearnlp.nlp.NLPLib;
+import com.googlecode.clearnlp.reader.AbstractReader;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.dependency.parser.util.ClearDependencyUtility;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.descriptor.TypeCapability;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.InputStream;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * <br>
@@ -75,12 +74,12 @@ import com.googlecode.clearnlp.reader.Ab
 				"org.apache.ctakes.typesystem.type.syntax.BaseToken:begin"
 		})
 public class ClearNLPDependencyParserAE extends JCasAnnotator_ImplBase {
-
-  final String language = AbstractReader.LANG_EN;
+
+  final String language = AbstractReader.LANG_EN;
   public Logger logger = Logger.getLogger(getClass().getName());
 
   // Default model values
-  public static final String DEFAULT_MODEL_FILE_NAME = "org/apache/ctakes/dependency/parser/models/dependency/mayo-en-dep-1.3.0.jar";
+  public static final String DEFAULT_MODEL_FILE_NAME = "org/apache/ctakes/dependency/parser/models/dependency/mayo-en-dep-1.3.0.jar";
   public static final String ENG_LEMMATIZER_DATA_FILE = "org/apache/ctakes/dependency/parser/models/lemmatizer/dictionary-1.3.1.jar";
 
 
@@ -90,19 +89,19 @@ public class ClearNLPDependencyParserAE
 		  name = PARAM_PARSER_MODEL_FILE_NAME,
 		  description = "This parameter provides the file name of the dependency parser model required " +
 					      "by the factory method provided by ClearNLPUtil.  If not specified, this " +
-					      "analysis engine will use a default model from the resources directory",
+					      "analysis engine will use a default model from the resources directory",
 		  defaultValue = DEFAULT_MODEL_FILE_NAME)
   protected URI parserModelUri;
 
-  public static final String PARAM_LEMMATIZER_DATA_FILE = "LemmatizerDataFile";
-
-  @ConfigurationParameter(
-      name = PARAM_LEMMATIZER_DATA_FILE,
-      description = "This parameter provides the data file required for the MorphEnAnalyzer. If not "
-          + "specified, this analysis engine will use a default model from the resources directory",
-      defaultValue = ENG_LEMMATIZER_DATA_FILE)
-  protected URI lemmatizerDataFile;
-
+  public static final String PARAM_LEMMATIZER_DATA_FILE = "LemmatizerDataFile";
+
+  @ConfigurationParameter(
+      name = PARAM_LEMMATIZER_DATA_FILE,
+      description = "This parameter provides the data file required for the MorphEnAnalyzer. If not "
+          + "specified, this analysis engine will use a default model from the resources directory",
+      defaultValue = ENG_LEMMATIZER_DATA_FILE)
+  protected URI lemmatizerDataFile;
+
 	public static final String PARAM_USE_LEMMATIZER = "UseLemmatizer";
 	@ConfigurationParameter(
 			name = PARAM_USE_LEMMATIZER,
@@ -124,39 +123,43 @@ public class ClearNLPDependencyParserAE
 		try {
 			if (useLemmatizer) {
 				// Note: If the lemmatizer is not used (useLemmatizer is false), lemmas are taken from the BaseToken normalizedForm field instead.
-				// Initialize lemmatizer
-				
-                InputStream lemmatizerModel = (this.lemmatizerDataFile == null)
-                        ? FileLocator.getAsStream(ENG_LEMMATIZER_DATA_FILE)
-                        : FileLocator.getAsStream(this.lemmatizerDataFile.getPath());
-                        
+				// Initialize lemmatizer
+				
+                InputStream lemmatizerModel = (this.lemmatizerDataFile == null)
+                        ? FileLocator.getAsStream(ENG_LEMMATIZER_DATA_FILE)
+                        : FileLocator.getAsStream(this.lemmatizerDataFile.getPath());
+                        
                     this.lemmatizer = EngineGetter.getMPAnalyzer(language, lemmatizerModel);
-			}
-				InputStream parserModel = (this.parserModelUri == null)
-                    ? FileLocator.getAsStream(DEFAULT_MODEL_FILE_NAME)
-                    : FileLocator.getAsStream(this.parserModelUri.getPath());
-                 
-                    this.parser = EngineGetter.getComponent(parserModel, this.language, NLPLib.MODE_DEP);
+			}
+				InputStream parserModel = (this.parserModelUri == null)
+                    ? FileLocator.getAsStream(DEFAULT_MODEL_FILE_NAME)
+                    : FileLocator.getAsStream(this.parserModelUri.getPath());
+                 
+                    this.parser = EngineGetter.getComponent(parserModel, this.language, NLPLib.MODE_DEP);
 
-        } catch (Exception e) {
-            throw new ResourceInitializationException(e);
+        } catch (Exception e) {
+            throw new ResourceInitializationException(e);
         }
 	}
 
 	@Override
 	public void process(JCas jCas) throws AnalysisEngineProcessException {
 		for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
-			List<BaseToken> printableTokens = new ArrayList<>();
-			for(BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, sentence)){
-			  if(token instanceof NewlineToken) continue;
-			  printableTokens.add(token);
-			}
-			
+			List<BaseToken> printableTokens = new ArrayList<>();
+			for(BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, sentence)){
+			  if(token instanceof NewlineToken) continue;
+			  printableTokens.add(token);
+			}
+
+			if ( printableTokens.isEmpty() ) {
+				// ClearDependencyUtility#convert fails when there are no printable tokens, so skip this sentence
+				continue;
+			}
 			DEPTree tree = new DEPTree();
 
 			// Convert CAS data into structures usable by ClearNLP
 			for (int i = 0; i < printableTokens.size(); i++) {
-				BaseToken token = printableTokens.get(i);
+				BaseToken token = printableTokens.get(i);
 				String lemma = useLemmatizer ? lemmatizer.getLemma(token.getCoveredText(), token.getPartOfSpeech()) : token.getNormalizedForm();
 				DEPNode node = new DEPNode(i+1, token.getCoveredText(), lemma, token.getPartOfSpeech(), new DEPFeat());
 				tree.add(node);
@@ -164,14 +167,14 @@ public class ClearNLPDependencyParserAE
 
 			// Run parser and convert output back to CAS friendly data types
 			parser.process(tree);
-			ArrayList<ConllDependencyNode> nodes = ClearDependencyUtility.convert(jCas, tree, sentence, printableTokens);
-			DependencyUtility.addToIndexes(jCas, nodes);
+			ArrayList<ConllDependencyNode> nodes = ClearDependencyUtility.convert( jCas, tree, sentence, printableTokens );
+			DependencyUtility.addToIndexes( jCas, nodes );
 		}
-		
 		
-	}
-	
-	public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException{
-	  return AnalysisEngineFactory.createEngineDescription(ClearNLPDependencyParserAE.class);
+		
+	}
+	
+	public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException{
+	  return AnalysisEngineFactory.createEngineDescription(ClearNLPDependencyParserAE.class);
 	}
 }