You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/11/17 13:06:11 UTC

svn commit: r1640146 - in /ctakes/branches/sent-detector-newline-fix/ctakes-core: desc/analysis_engine/ src/main/java/org/apache/ctakes/core/ae/ src/main/java/org/apache/ctakes/core/sentence/

Author: tmill
Date: Mon Nov 17 12:06:10 2014
New Revision: 1640146

URL: http://svn.apache.org/r1640146
Log:
Checking in sent detector code fix to branch.

Modified:
    ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml Mon Nov 17 12:06:10 2014
@@ -53,7 +53,7 @@
 <nameValuePair>
 <name>SentenceModelFile</name>
 <value>
-<string>org/apache/ctakes/core/sentdetect/sd-med-model.zip</string>
+<string>org/apache/ctakes/core/sentdetect/sample_sd.mod</string>
 </value>
 </nameValuePair>
 </configurationParameterSettings>

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java Mon Nov 17 12:06:10 2014
@@ -32,6 +32,7 @@ import java.util.Set;
 
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.DefaultSDContextGenerator;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
 import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.sentdetect.SentenceSample;
@@ -51,12 +52,12 @@ import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
 import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
 
 /**
  * Wraps the OpenNLP sentence detector in a UIMA annotator
@@ -87,6 +88,15 @@ public class SentenceDetector extends JC
 	    )
 	private String sdModelPath;
 	
+	public static final String PARAM_BREAK_ON_NEWLINE = "BreakOnNewline";
+	@ConfigurationParameter(
+	    name = PARAM_BREAK_ON_NEWLINE,
+	    mandatory = false,
+	    description = "Whether the sentence detector should put breaks at every newline"
+	    )
+	private boolean breakOnNewline=false;
+	
+	
 	private opennlp.tools.sentdetect.SentenceModel sdmodel;
 
 	private SentenceDetectorCtakes sentenceDetector;
@@ -216,10 +226,14 @@ public class SentenceDetector extends JC
 		ArrayList<SentenceSpan> sentenceSpans = new ArrayList<>(0);
 		for (int i = 0; i < potentialSentSpans.length; i++) {
 			if (potentialSentSpans[i] != null) {
-				sentenceSpans.addAll(potentialSentSpans[i]
-						.splitAtLineBreaksAndTrim(NEWLINE)); // TODO Determine
-																// line break
-																// type
+			  if(breakOnNewline){
+			    sentenceSpans.addAll(potentialSentSpans[i]
+			        .splitAtLineBreaksAndTrim(NEWLINE)); // TODO Determine
+			                                             // line break
+			                                             // type
+			  }else{
+			    sentenceSpans.addAll(potentialSentSpans[i].trimOnly());
+			  }
 			}
 		}
 
@@ -315,7 +329,9 @@ public class SentenceDetector extends JC
 		  Dictionary dict = new Dictionary();
 
 		  try {
-		    mod = SentenceDetectorME.train("en", sampleStream, true, dict, mlParams);
+		    SentenceDetectorFactory sdFactory = new SentenceDetectorFactory(
+		        "en", true, dict, scanner.getEndOfSentenceCharacters());
+		    mod = SentenceDetectorME.train("en", sampleStream, sdFactory, mlParams);
 		  } finally {
 		    sampleStream.close();
 		  }

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java Mon Nov 17 12:06:10 2014
@@ -30,7 +30,7 @@ import opennlp.tools.sentdetect.EndOfSen
  */
 public class EndOfSentenceScannerImpl implements EndOfSentenceScanner {
 
-    private static final char[] eosCandidates =  {'.', '!', ')', ']', '>', '\"', ':', ';'}; // CTAKES-227
+    private static final char[] eosCandidates =  {'.', '!', ')', ']', '>', '\"', ':', ';', '\n'}; // CTAKES-227
 
 	public EndOfSentenceScannerImpl() {
         super();

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java Mon Nov 17 12:06:10 2014
@@ -29,15 +29,15 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-import opennlp.maxent.GIS;
-import opennlp.maxent.GISModel;
-import opennlp.model.EventStream;
-import opennlp.model.MaxentModel;
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.ml.maxent.GIS;
+import opennlp.tools.ml.maxent.GISModel;
+import opennlp.tools.ml.model.EventStream;
+import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.sentdetect.EndOfSentenceScanner;
 import opennlp.tools.sentdetect.SDContextGenerator;
 import opennlp.tools.sentdetect.SDEventStream;
-import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
 import opennlp.tools.sentdetect.SentenceModel;
 import opennlp.tools.sentdetect.SentenceSample;
 import opennlp.tools.sentdetect.SentenceSampleStream;
@@ -45,7 +45,6 @@ import opennlp.tools.sentdetect.lang.Fac
 import opennlp.tools.util.HashSumEventStream;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
 import opennlp.tools.util.StringUtil;
 import opennlp.tools.util.model.BaseModel;
 import opennlp.tools.util.model.ModelUtil;
@@ -63,12 +62,12 @@ public class SentenceDetectorCtakes {
 	  /**
 	   * Constant indicates a sentence split.
 	   */
-	  public static final String SPLIT ="s";
+	  public static final String SPLIT = opennlp.tools.sentdetect.SentenceDetectorME.SPLIT;
 
 	  /**
 	   * Constant indicates no sentence split.
 	   */
-	  public static final String NO_SPLIT ="n";
+	  public static final String NO_SPLIT = opennlp.tools.sentdetect.SentenceDetectorME.NO_SPLIT;
 	  
 	  private static final Double ONE = new Double(1);
 
@@ -166,7 +165,7 @@ public class SentenceDetectorCtakes {
 	      int cint = candidate;
 	      // skip over the leading parts of non-token final delimiters
 	      int fws = getFirstWS(s,cint + 1);
-	      if (i + 1 < end && enders.get(i + 1) < fws) {
+	      if (!Character.isWhitespace(s.charAt(cint)) && i + 1 < end && enders.get(i + 1) < fws) {
 	        continue;
 	      }
 
@@ -180,7 +179,7 @@ public class SentenceDetectorCtakes {
 	            positions.add(getFirstNonWS(s, getFirstWS(s,cint + 1)));
 	          }
 	          else {
-	            positions.add(getFirstNonWS(s,cint));
+	            positions.add(cint);
 	          }
 	          sentProbs.add(new Double(probs[model.getIndex(bestOutcome)]));
 	        }
@@ -229,7 +228,7 @@ public class SentenceDetectorCtakes {
 	  protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) {
 	    return true;
 	  }
-	  
+	  /*
 	  public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
 	      boolean useTokenEnd, Dictionary abbreviations) throws IOException {
 	    return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
@@ -244,7 +243,7 @@ public class SentenceDetectorCtakes {
 	    Factory factory = new Factory();
 
 	    // TODO: Fix the EventStream to throw exceptions when training goes wrong
-	    EventStream eventStream = new SDEventStream(samples,
+	    ObjectStream eventStream = new SDEventStream(samples,
 	        factory.createSentenceContextGenerator(languageCode),
 	        factory.createEndOfSentenceScanner(languageCode));
 	    
@@ -257,7 +256,7 @@ public class SentenceDetectorCtakes {
 	    return new SentenceModel(languageCode, sentModel,
 	        useTokenEnd, abbreviations, manifestInfoEntries);
 	  }
-
+*/
 	  private static void usage() {
 	    System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");
 	    System.err.println("-encoding charset specifies the encoding which should be used ");
@@ -279,6 +278,7 @@ public class SentenceDetectorCtakes {
 	   * @param args
 	   * @throws IOException
 	   */
+	  /*
 	  public static void main(String[] args) throws IOException {
 	    int ai=0;
 	    String encoding = null;
@@ -344,7 +344,7 @@ public class SentenceDetectorCtakes {
 	    }
 	  }
 
-
+*/
 	private static int convertToInt(String s) {
 
 		int i = Integer.parseInt(s); 

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java Mon Nov 17 12:06:10 2014
@@ -186,5 +186,15 @@ public class SentenceSpan {
 		String s =  "(" + start + ", " + end + ") " + text;  
 		return s;
 	}
+
+  public List<SentenceSpan> trimOnly() {
+    List<SentenceSpan> trimmedLists = new ArrayList<>();
+    String trimmed = text.trim();
+    if(trimmed.length() > 0){
+      int offset = text.indexOf(trimmed.charAt(0));
+      trimmedLists.add(new SentenceSpan(start + offset, start + offset + trimmed.length(), trimmed));
+    }
+    return trimmedLists;
+  }
 	
 }