You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2015/08/18 23:30:19 UTC

svn commit: r1696507 - in /ctakes/sandbox/ctakes-wsd: pom.xml src/main/java/org/apache/ctakes/consumers/SentencePrinter.java src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java

Author: dligach
Date: Tue Aug 18 21:30:19 2015
New Revision: 1696507

URL: http://svn.apache.org/r1696507
Log:
Switched BasicPipeline to Tim's sentence segmenter, which handles line breaks correctly.

Modified:
    ctakes/sandbox/ctakes-wsd/pom.xml
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java

Modified: ctakes/sandbox/ctakes-wsd/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/pom.xml?rev=1696507&r1=1696506&r2=1696507&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/pom.xml (original)
+++ ctakes/sandbox/ctakes-wsd/pom.xml Tue Aug 18 21:30:19 2015
@@ -112,7 +112,12 @@
       <artifactId>diffutils</artifactId>
       <version>1.3.0</version>
     </dependency>
-  </dependencies>
+    <dependency>
+    	<groupId>org.apache.ctakes</groupId>
+    	<artifactId>ctakes-sentdetect-cleartk</artifactId>
+    	<version>3.2.3-SNAPSHOT</version>
+    </dependency>
+    </dependencies>
   <!-- The below is all necessary to unpack the UMLS resources since they 
     can't be used from the classpath -->
   <build>

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java?rev=1696507&r1=1696506&r2=1696507&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/consumers/SentencePrinter.java Tue Aug 18 21:30:19 2015
@@ -68,7 +68,8 @@ public class SentencePrinter {
     public void process(JCas jCas) throws AnalysisEngineProcessException {
       
       for(Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
-        System.out.println(sentence.getCoveredText());
+        System.out.println("* " + sentence.getCoveredText());
+        System.out.println();
       }
     }
   }

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java?rev=1696507&r1=1696506&r2=1696507&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/BasicPipeline.java Tue Aug 18 21:30:19 2015
@@ -15,13 +15,12 @@ import org.apache.ctakes.chunker.ae.Defa
 import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
 import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
 import org.apache.ctakes.core.ae.OverlapAnnotator;
-import org.apache.ctakes.core.ae.SentenceDetector;
 import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
 import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.cleartk.ae.SentenceDetectorAnnotator;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.core.resource.FileResourceImpl;
 import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
-import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
 import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator;
 import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
 import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
@@ -82,11 +81,9 @@ public class BasicPipeline {
     // identify segments 
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
 
-    // identify sentences
-    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
-        SentenceDetector.class,
-        SentenceDetector.SD_MODEL_FILE_PARAM,
-        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify sentences (use Tim's sentence segmenter that handles line breaks correctly)
+    aggregateBuilder.add(SentenceDetectorAnnotator.getDescription("/org/apache/ctakes/core/sentdetect/model.jar"));
+    
     // identify tokens
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
     // merge some tokens
@@ -159,8 +156,8 @@ public class BasicPipeline {
     // add dependency parser
     aggregateBuilder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
     
-    // add semantic role labeler
-    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
+    // add semantic role labeler (removing for now -- crashes after switched to Tim's sentence segmenter)
+    // aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
 
     // write out the CAS after all the above annotations
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(