You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2017/03/24 16:03:01 UTC

svn commit: r1788492 - in /ctakes/trunk: ./ ctakes-assertion/ ctakes-chunker/ ctakes-core/ ctakes-core/src/main/java/org/apache/ctakes/core/ae/ ctakes-core/src/main/java/org/apache/ctakes/core/sentence/ ctakes-pos-tagger/ ctakes-relation-extractor/ ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/

Author: james-masanz
Date: Fri Mar 24 16:03:01 2017
New Revision: 1788492

URL: http://svn.apache.org/viewvc?rev=1788492&view=rev
Log:
First pass at updating OpenNLP to 1.7.2 (CTAKES-191).

Modified:
    ctakes/trunk/ctakes-assertion/pom.xml
    ctakes/trunk/ctakes-chunker/pom.xml
    ctakes/trunk/ctakes-core/pom.xml
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
    ctakes/trunk/ctakes-pos-tagger/pom.xml
    ctakes/trunk/ctakes-relation-extractor/pom.xml
    ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceDetector.java
    ctakes/trunk/pom.xml

Modified: ctakes/trunk/ctakes-assertion/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/pom.xml (original)
+++ ctakes/trunk/ctakes-assertion/pom.xml Fri Mar 24 16:03:01 2017
@@ -123,11 +123,11 @@
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-maxent</artifactId>
+			<artifactId>opennlp-tools</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
+			<artifactId>opennlp-maxent</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.lucene</groupId>

Modified: ctakes/trunk/ctakes-chunker/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-chunker/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-chunker/pom.xml (original)
+++ ctakes/trunk/ctakes-chunker/pom.xml Fri Mar 24 16:03:01 2017
@@ -55,11 +55,11 @@
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-maxent</artifactId>
+			<artifactId>opennlp-tools</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
+			<artifactId>opennlp-maxent</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>junit</groupId>

Modified: ctakes/trunk/ctakes-core/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/pom.xml (original)
+++ ctakes/trunk/ctakes-core/pom.xml Fri Mar 24 16:03:01 2017
@@ -74,11 +74,11 @@
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-maxent</artifactId>
+			<artifactId>opennlp-tools</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
+			<artifactId>opennlp-maxent</artifactId>
 		</dependency>
       <!--  Todo : is lucene necessary at this level?  -->
 		<dependency>

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java Fri Mar 24 16:03:01 2017
@@ -20,6 +20,7 @@ package org.apache.ctakes.core.ae;
 
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.*;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -98,8 +99,7 @@ public class SentenceDetector extends JC
 		  sdmodel = new SentenceModel(is);
 		  EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
 		  DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
-		  sentenceDetector = new SentenceDetectorCtakes(
-		      sdmodel.getMaxentModel(), cg, eoss);
+		  sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);
 
 		  skipSegmentsSet = new HashSet<>();
 		  if(skipSegmentsArray != null){
@@ -289,10 +289,15 @@ public class SentenceDetector extends JC
 
 
 		Charset charset = Charset.forName("UTF-8");
-    SentenceModel mod = null;
+		SentenceModel mod = null;
 		
-		try(FileInputStream inStream = new FileInputStream(inFile)){
-		  ObjectStream<String> lineStream = new PlainTextByLineStream(inStream, charset);
+    	
+    
+		MarkableFileInputStreamFactory mfisf = new MarkableFileInputStreamFactory(inFile);
+		ObjectStream<String> lineStream = null;
+		try {
+			
+		  lineStream = new PlainTextByLineStream(mfisf, charset);
 		  ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
 
 		  // Training Parameters
@@ -310,6 +315,8 @@ public class SentenceDetector extends JC
 		  } finally {
 		    sampleStream.close();
 		  }
+		} catch (IOException e) {
+			lineStream.close();
 		}
 		
 		try(FileOutputStream outStream = new FileOutputStream(outFile)){

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java Fri Mar 24 16:03:01 2017
@@ -29,11 +29,12 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import opennlp.maxent.GIS;
-import opennlp.maxent.GISModel;
+import opennlp.tools.ml.maxent.GISTrainer;
+import opennlp.tools.ml.maxent.GISModel;
 import opennlp.model.EventStream;
-import opennlp.model.MaxentModel;
+import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.sentdetect.DefaultSDContextGenerator;
 import opennlp.tools.sentdetect.EndOfSentenceScanner;
 import opennlp.tools.sentdetect.SDContextGenerator;
 import opennlp.tools.sentdetect.SDEventStream;
@@ -42,7 +43,10 @@ import opennlp.tools.sentdetect.Sentence
 import opennlp.tools.sentdetect.SentenceSample;
 import opennlp.tools.sentdetect.SentenceSampleStream;
 import opennlp.tools.sentdetect.lang.Factory;
-import opennlp.tools.util.HashSumEventStream;
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.ml.model.HashSumEventStream;
+import opennlp.tools.util.AbstractObjectStream;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
@@ -99,7 +103,7 @@ public class SentenceDetectorCtakes {
 	   *
 	   * @param model the {@link SentenceModel}
 	   */
-	  public SentenceDetectorCtakes(MaxentModel model, SDContextGenerator cg, EndOfSentenceScanner eoss) {
+	  public SentenceDetectorCtakes(MaxentModel model, DefaultSDContextGenerator cg, EndOfSentenceScanner eoss) {
 		  this.model = model;
 		  cgen = cg;
 		  scanner = eoss;
@@ -107,7 +111,7 @@ public class SentenceDetectorCtakes {
 	  }
 
 
-	  /**
+	/**
 	   * Detect sentences in a String.
 	   *
 	   * @param s  The string to be processed.
@@ -244,18 +248,18 @@ public class SentenceDetectorCtakes {
 	    Factory factory = new Factory();
 
 	    // TODO: Fix the EventStream to throw exceptions when training goes wrong
-	    EventStream eventStream = new SDEventStream(samples,
+	    SDEventStream eventStream = new SDEventStream(samples,
 	        factory.createSentenceContextGenerator(languageCode),
 	        factory.createEndOfSentenceScanner(languageCode));
 	    
-	    HashSumEventStream hses = new HashSumEventStream(eventStream);
-	    GISModel sentModel = GIS.trainModel(hses, iterations, cutoff);
+	    HashSumEventStream hses = new HashSumEventStream(eventStream); // AbstractObjectStream<Event>
+	    GISTrainer trainer = new GISTrainer();
+	    MaxentModel sentModel = trainer.trainModel(hses, iterations, cutoff);
 
 	    manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY, 
 	        hses.calculateHashSum().toString(16));
 	    
-	    return new SentenceModel(languageCode, sentModel,
-	        useTokenEnd, abbreviations, manifestInfoEntries);
+	    return new SentenceModel(languageCode, sentModel, useTokenEnd, abbreviations, manifestInfoEntries);
 	  }
 
 	  private static void usage() {
@@ -324,10 +328,14 @@ public class SentenceDetectorCtakes {
 	      if ((lang == null) || (encoding == null)) {
 	        usage();
 	      }
-
 	      
-	      SentenceModel model = train(lang, new SentenceSampleStream(new PlainTextByLineStream(
-	          new InputStreamReader(new FileInputStream(inFile), encoding))), true, null, cutoff, iters);
+	      MarkableFileInputStreamFactory mfisf = new MarkableFileInputStreamFactory(inFile);
+		  ObjectStream<String> lineStream = null;
+		  lineStream = new PlainTextByLineStream(mfisf, encoding);
+		  ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
+		  
+	      //new PlainTextByLineStream(new InputStreamReader(new FileInputStream(inFile), encoding))
+		  SentenceModel model = train(lang, sampleStream, true, null, cutoff, iters);
 
 	      // TODO: add support for iterations and cutoff settings
 

Modified: ctakes/trunk/ctakes-pos-tagger/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-pos-tagger/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-pos-tagger/pom.xml (original)
+++ ctakes/trunk/ctakes-pos-tagger/pom.xml Fri Mar 24 16:03:01 2017
@@ -51,11 +51,11 @@
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-maxent</artifactId>
+			<artifactId>opennlp-tools</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
+			<artifactId>opennlp-maxent</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>com.googlecode.clearnlp</groupId>

Modified: ctakes/trunk/ctakes-relation-extractor/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/pom.xml (original)
+++ ctakes/trunk/ctakes-relation-extractor/pom.xml Fri Mar 24 16:03:01 2017
@@ -107,11 +107,11 @@
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-maxent</artifactId>
+			<artifactId>opennlp-tools</artifactId>
 		</dependency>
 		<dependency>
 			<groupId>org.apache.opennlp</groupId>
-			<artifactId>opennlp-tools</artifactId>
+			<artifactId>opennlp-maxent</artifactId>
 		</dependency>
 		<dependency>
             <groupId>org.apache.lucene</groupId>

Modified: ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceDetector.java?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceDetector.java (original)
+++ ctakes/trunk/ctakes-ytex-uima/src/main/java/org/apache/ctakes/ytex/uima/annotators/SentenceDetector.java Fri Mar 24 16:03:01 2017
@@ -169,8 +169,7 @@ public class SentenceDetector extends JC
 			char[] eosc = eoss.getEndOfSentenceCharacters();
 			// SentenceDContextGenerator cg = new SentenceDContextGenerator();
 			DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eosc);
-			sentenceDetector = new SentenceDetectorCtakes(
-					sdmodel.getMaxentModel(), cg, eoss);
+			sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);
 
 			skipSegmentsSet = ParamUtil.getStringParameterValuesSet(
 					PARAM_SEGMENTS_TO_SKIP, context);

Modified: ctakes/trunk/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/pom.xml?rev=1788492&r1=1788491&r2=1788492&view=diff
==============================================================================
--- ctakes/trunk/pom.xml (original)
+++ ctakes/trunk/pom.xml Fri Mar 24 16:03:01 2017
@@ -483,20 +483,20 @@
 			</dependency>	
 			<dependency>
 				<groupId>org.apache.opennlp</groupId>
-				<artifactId>opennlp-maxent</artifactId>
-				<version>3.0.3</version>
-			</dependency>
-			<dependency>
-				<groupId>org.apache.opennlp</groupId>
 				<artifactId>opennlp-tools</artifactId>
-				<version>1.5.3</version>
+				<version>1.7.2</version>
 			</dependency>
 			<dependency>
 				<groupId>org.apache.opennlp</groupId>
 				<artifactId>opennlp-uima</artifactId>
-				<version>1.5.3</version>
+				<version>1.7.2</version>
 			</dependency>			
 			<dependency>
+				<groupId>org.apache.opennlp</groupId>
+				<artifactId>opennlp-maxent</artifactId>
+				<version>3.0.3</version>
+			</dependency>
+			<dependency>
 				<groupId>org.apache.uima</groupId>
 				<artifactId>uimaj-examples</artifactId>
 				<version>2.9.0</version>