You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/11/17 13:06:11 UTC
svn commit: r1640146 - in
/ctakes/branches/sent-detector-newline-fix/ctakes-core:
desc/analysis_engine/ src/main/java/org/apache/ctakes/core/ae/
src/main/java/org/apache/ctakes/core/sentence/
Author: tmill
Date: Mon Nov 17 12:06:10 2014
New Revision: 1640146
URL: http://svn.apache.org/r1640146
Log:
Checking in sent detector code fix to branch.
Modified:
ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml Mon Nov 17 12:06:10 2014
@@ -53,7 +53,7 @@
<nameValuePair>
<name>SentenceModelFile</name>
<value>
-<string>org/apache/ctakes/core/sentdetect/sd-med-model.zip</string>
+<string>org/apache/ctakes/core/sentdetect/sample_sd.mod</string>
</value>
</nameValuePair>
</configurationParameterSettings>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java Mon Nov 17 12:06:10 2014
@@ -32,6 +32,7 @@ import java.util.Set;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.DefaultSDContextGenerator;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
@@ -51,12 +52,12 @@ import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
/**
* Wraps the OpenNLP sentence detector in a UIMA annotator
@@ -87,6 +88,15 @@ public class SentenceDetector extends JC
)
private String sdModelPath;
+ public static final String PARAM_BREAK_ON_NEWLINE = "BreakOnNewline";
+ @ConfigurationParameter(
+ name = PARAM_BREAK_ON_NEWLINE,
+ mandatory = false,
+ description = "Whether the sentence detector should put breaks at every newline"
+ )
+ private boolean breakOnNewline=false;
+
+
private opennlp.tools.sentdetect.SentenceModel sdmodel;
private SentenceDetectorCtakes sentenceDetector;
@@ -216,10 +226,14 @@ public class SentenceDetector extends JC
ArrayList<SentenceSpan> sentenceSpans = new ArrayList<>(0);
for (int i = 0; i < potentialSentSpans.length; i++) {
if (potentialSentSpans[i] != null) {
- sentenceSpans.addAll(potentialSentSpans[i]
- .splitAtLineBreaksAndTrim(NEWLINE)); // TODO Determine
- // line break
- // type
+ if(breakOnNewline){
+ sentenceSpans.addAll(potentialSentSpans[i]
+ .splitAtLineBreaksAndTrim(NEWLINE)); // TODO Determine
+ // line break
+ // type
+ }else{
+ sentenceSpans.addAll(potentialSentSpans[i].trimOnly());
+ }
}
}
@@ -315,7 +329,9 @@ public class SentenceDetector extends JC
Dictionary dict = new Dictionary();
try {
- mod = SentenceDetectorME.train("en", sampleStream, true, dict, mlParams);
+ SentenceDetectorFactory sdFactory = new SentenceDetectorFactory(
+ "en", true, dict, scanner.getEndOfSentenceCharacters());
+ mod = SentenceDetectorME.train("en", sampleStream, sdFactory, mlParams);
} finally {
sampleStream.close();
}
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/EndOfSentenceScannerImpl.java Mon Nov 17 12:06:10 2014
@@ -30,7 +30,7 @@ import opennlp.tools.sentdetect.EndOfSen
*/
public class EndOfSentenceScannerImpl implements EndOfSentenceScanner {
- private static final char[] eosCandidates = {'.', '!', ')', ']', '>', '\"', ':', ';'}; // CTAKES-227
+ private static final char[] eosCandidates = {'.', '!', ')', ']', '>', '\"', ':', ';', '\n'}; // CTAKES-227
public EndOfSentenceScannerImpl() {
super();
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java Mon Nov 17 12:06:10 2014
@@ -29,15 +29,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import opennlp.maxent.GIS;
-import opennlp.maxent.GISModel;
-import opennlp.model.EventStream;
-import opennlp.model.MaxentModel;
import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.ml.maxent.GIS;
+import opennlp.tools.ml.maxent.GISModel;
+import opennlp.tools.ml.model.EventStream;
+import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.sentdetect.EndOfSentenceScanner;
import opennlp.tools.sentdetect.SDContextGenerator;
import opennlp.tools.sentdetect.SDEventStream;
-import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
@@ -45,7 +45,6 @@ import opennlp.tools.sentdetect.lang.Fac
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
-import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ModelUtil;
@@ -63,12 +62,12 @@ public class SentenceDetectorCtakes {
/**
* Constant indicates a sentence split.
*/
- public static final String SPLIT ="s";
+ public static final String SPLIT = opennlp.tools.sentdetect.SentenceDetectorME.SPLIT;
/**
* Constant indicates no sentence split.
*/
- public static final String NO_SPLIT ="n";
+ public static final String NO_SPLIT = opennlp.tools.sentdetect.SentenceDetectorME.NO_SPLIT;
private static final Double ONE = new Double(1);
@@ -166,7 +165,7 @@ public class SentenceDetectorCtakes {
int cint = candidate;
// skip over the leading parts of non-token final delimiters
int fws = getFirstWS(s,cint + 1);
- if (i + 1 < end && enders.get(i + 1) < fws) {
+ if (!Character.isWhitespace(s.charAt(cint)) && i + 1 < end && enders.get(i + 1) < fws) {
continue;
}
@@ -180,7 +179,7 @@ public class SentenceDetectorCtakes {
positions.add(getFirstNonWS(s, getFirstWS(s,cint + 1)));
}
else {
- positions.add(getFirstNonWS(s,cint));
+ positions.add(cint);
}
sentProbs.add(new Double(probs[model.getIndex(bestOutcome)]));
}
@@ -229,7 +228,7 @@ public class SentenceDetectorCtakes {
protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) {
return true;
}
-
+ /*
public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
boolean useTokenEnd, Dictionary abbreviations) throws IOException {
return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
@@ -244,7 +243,7 @@ public class SentenceDetectorCtakes {
Factory factory = new Factory();
// TODO: Fix the EventStream to throw exceptions when training goes wrong
- EventStream eventStream = new SDEventStream(samples,
+ ObjectStream eventStream = new SDEventStream(samples,
factory.createSentenceContextGenerator(languageCode),
factory.createEndOfSentenceScanner(languageCode));
@@ -257,7 +256,7 @@ public class SentenceDetectorCtakes {
return new SentenceModel(languageCode, sentModel,
useTokenEnd, abbreviations, manifestInfoEntries);
}
-
+*/
private static void usage() {
System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");
System.err.println("-encoding charset specifies the encoding which should be used ");
@@ -279,6 +278,7 @@ public class SentenceDetectorCtakes {
* @param args
* @throws IOException
*/
+ /*
public static void main(String[] args) throws IOException {
int ai=0;
String encoding = null;
@@ -344,7 +344,7 @@ public class SentenceDetectorCtakes {
}
}
-
+*/
private static int convertToInt(String s) {
int i = Integer.parseInt(s);
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java?rev=1640146&r1=1640145&r2=1640146&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceSpan.java Mon Nov 17 12:06:10 2014
@@ -186,5 +186,15 @@ public class SentenceSpan {
String s = "(" + start + ", " + end + ") " + text;
return s;
}
+
+ public List<SentenceSpan> trimOnly() {
+ List<SentenceSpan> trimmedLists = new ArrayList<>();
+ String trimmed = text.trim();
+ if(trimmed.length() > 0){
+ int offset = text.indexOf(trimmed.charAt(0));
+ trimmedLists.add(new SentenceSpan(start + offset, start + offset + trimmed.length(), trimmed));
+ }
+ return trimmedLists;
+ }
}