You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/12/05 20:18:05 UTC
svn commit: r1643405 - in
/ctakes/branches/sent-detector-newline-fix/ctakes-core: ./ .settings/
resources/launch/ src/main/java/org/apache/ctakes/core/ae/
src/main/java/org/apache/ctakes/core/cr/
src/main/java/org/apache/ctakes/core/sentence/
Author: tmill
Date: Fri Dec 5 19:18:05 2014
New Revision: 1643405
URL: http://svn.apache.org/r1643405
Log:
New features for sentence detector on clinical notes.
Added:
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorFactoryCtakes.java
Modified:
ctakes/branches/sent-detector-newline-fix/ctakes-core/.settings/org.eclipse.core.resources.prefs
ctakes/branches/sent-detector-newline-fix/ctakes-core/pom.xml
ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector--train_a_new_model.launch
ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector_annotator.launch
ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/Tokenizer_annotator.launch
ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CPE_GUI--core.launch
ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CVD--core.launch
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/DocumentIdPrinterAnalysisEngine.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SHARPKnowtatorXMLReader.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java
ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/.settings/org.eclipse.core.resources.prefs
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/.settings/org.eclipse.core.resources.prefs?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/.settings/org.eclipse.core.resources.prefs (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/.settings/org.eclipse.core.resources.prefs Fri Dec 5 19:18:05 2014
@@ -1,5 +1,4 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
-encoding//src/main/resources=UTF-8
encoding//src/test/java=UTF-8
encoding/<project>=UTF-8
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/pom.xml?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/pom.xml (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/pom.xml Fri Dec 5 19:18:05 2014
@@ -26,7 +26,7 @@
<parent>
<groupId>org.apache.ctakes</groupId>
<artifactId>ctakes</artifactId>
- <version>3.2.1-SNAPSHOT</version>
+ <version>3.2.2-SNAPSHOT</version>
</parent>
<dependencies>
<dependency>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector--train_a_new_model.launch
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector--train_a_new_model.launch?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector--train_a_new_model.launch (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector--train_a_new_model.launch Fri Dec 5 19:18:05 2014
@@ -2,7 +2,7 @@
<launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
<stringAttribute key="bad_container_name" value="\core\resources\launch\SentenceDetector - argument checking"/>
<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
-<listEntry value="/ctakes-core/src/main/java/org/apache/ctakes/ctakes-core/ae/SentenceDetector.java"/>
+<listEntry value="/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java"/>
</listAttribute>
<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
<listEntry value="1"/>
@@ -10,7 +10,7 @@
<booleanAttribute key="org.eclipse.debug.core.appendEnvironmentVariables" value="true"/>
<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.ctakes.core.ae.SentenceDetector"/>
-<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="data/test/sample_sd_training_sentences.txt resources/sentdetect/sample_sd.mod 100 5"/>
+<stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="/home/tmill/mnt/rc-pub/resources/corpora/sentence-training/finished/all_finished.train ../ctakes-core-res/src/main/resources/org/apache/ctakes/core/sentdetect/sample_sd.mod 100 5"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-core"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
</launchConfiguration>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector_annotator.launch
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector_annotator.launch?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector_annotator.launch (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/SentenceDetector_annotator.launch Fri Dec 5 19:18:05 2014
@@ -11,5 +11,5 @@
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.uima.tools.cpm.CpmFrame"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-core"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
-<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx3g"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx500M"/>
</launchConfiguration>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/Tokenizer_annotator.launch
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/Tokenizer_annotator.launch?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/Tokenizer_annotator.launch (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/Tokenizer_annotator.launch Fri Dec 5 19:18:05 2014
@@ -11,5 +11,5 @@
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.uima.tools.cpm.CpmFrame"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-core"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
-<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx3g"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx500M"/>
</launchConfiguration>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CPE_GUI--core.launch
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CPE_GUI--core.launch?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CPE_GUI--core.launch (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CPE_GUI--core.launch Fri Dec 5 19:18:05 2014
@@ -24,5 +24,5 @@
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.uima.tools.cpm.CpmFrame"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-core"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
-<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx3g"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="-Xms500M -Xmx500M"/>
</launchConfiguration>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CVD--core.launch
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CVD--core.launch?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CVD--core.launch (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/resources/launch/UIMA_CVD--core.launch Fri Dec 5 19:18:05 2014
@@ -13,5 +13,5 @@
<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.uima.tools.annot_view.Gladis"/>
<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-core"/>
<stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>
-<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="&quot;-Djava.util.logging.config.file=${env_var:UIMA_HOME}/Logger.properties&quot; -Xms500M -Xmx3g"/>
+<stringAttribute key="org.eclipse.jdt.launching.VM_ARGUMENTS" value="&quot;-Djava.util.logging.config.file=${env_var:UIMA_HOME}/Logger.properties&quot; -Xms500M -Xmx500M"/>
</launchConfiguration>
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/DocumentIdPrinterAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/DocumentIdPrinterAnalysisEngine.java?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/DocumentIdPrinterAnalysisEngine.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/DocumentIdPrinterAnalysisEngine.java Fri Dec 5 19:18:05 2014
@@ -36,7 +36,7 @@ public class DocumentIdPrinterAnalysisEn
String documentId = DocumentIDAnnotationUtil.getDocumentID(jcas);
String logMessage = String.format("##### current file document id: \"%s\"", documentId);
logger.info(logMessage);
- System.out.println(logMessage);
+// System.out.println(logMessage);
}
}
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SHARPKnowtatorXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SHARPKnowtatorXMLReader.java?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SHARPKnowtatorXMLReader.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SHARPKnowtatorXMLReader.java Fri Dec 5 19:18:05 2014
@@ -41,6 +41,7 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.refsem.Date;
import org.apache.ctakes.typesystem.type.refsem.Event;
import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.refsem.LabDeltaFlag;
import org.apache.ctakes.typesystem.type.refsem.LabReferenceRange;
import org.apache.ctakes.typesystem.type.refsem.LabValue;
import org.apache.ctakes.typesystem.type.refsem.MedicationDosage;
@@ -59,10 +60,14 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation;
import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
import org.apache.ctakes.typesystem.type.relation.ComplicatesDisruptsTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ContraindicatesTextRelation;
import org.apache.ctakes.typesystem.type.relation.DegreeOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.DiagnosesTextRelation;
+import org.apache.ctakes.typesystem.type.relation.IndicatesTextRelation;
import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
import org.apache.ctakes.typesystem.type.relation.ManagesTreatsTextRelation;
import org.apache.ctakes.typesystem.type.relation.ManifestationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.PreventsTextRelation;
import org.apache.ctakes.typesystem.type.relation.RelationArgument;
import org.apache.ctakes.typesystem.type.relation.ResultOfTextRelation;
import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
@@ -78,6 +83,7 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.textsem.GenericModifier;
import org.apache.ctakes.typesystem.type.textsem.HistoryOfModifier;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.LabDeltaFlagModifier;
import org.apache.ctakes.typesystem.type.textsem.LabEstimatedModifier;
import org.apache.ctakes.typesystem.type.textsem.LabInterpretationModifier;
import org.apache.ctakes.typesystem.type.textsem.LabMention;
@@ -154,7 +160,7 @@ public class SHARPKnowtatorXMLReader ext
/**
* Get the URI that the text in this class was loaded from
*/
- protected URI getTextURI(JCas jCas) throws AnalysisEngineProcessException {
+ protected URI getTextURI(JCas jCas) {
String textPath = JCasUtil.selectSingle(jCas, DocumentID.class).getDocumentID();
if (this.textDirectory != null) {
textPath = this.textDirectory + File.separator + textPath;
@@ -176,13 +182,18 @@ public class SHARPKnowtatorXMLReader ext
* @throws URISyntaxException
*/
protected URI getKnowtatorURI(JCas jCas) throws AnalysisEngineProcessException {
- String textURI = this.getTextURI(jCas).toString();
- String xmlURI = textURI.replaceAll("Knowtator[/\\\\]text", "Knowtator_XML") + ".knowtator.xml";
- File fileTest = new File(URI.create(xmlURI));
- if(!fileTest.exists()){
- xmlURI = xmlURI.replace("_XML", " XML");
- }
- return UriUtils.create(xmlURI);
+ File textURI = new File(this.getTextURI(jCas));
+ String filename = textURI.getName().replace(".txt", "");
+
+ File xmlPath = new File(textURI.getParentFile().getParentFile().getParentFile().getParentFile().getParentFile(), "by-document/" + filename + "/" + filename + ".umls.knowtator.xml");
+
+// String xmlURI = textURI.replaceAll("Knowtator[/\\\\]text", "Knowtator_XML") + ".knowtator.xml";
+// File fileTest = new File(URI.create(xmlURI));
+// if(!fileTest.exists()){
+// xmlURI = xmlURI.replace("_XML", " XML");
+// }
+// return URI.create(xmlPath.getAbsolutePath());
+ return UriUtils.create("file:" + xmlPath.getAbsolutePath());
}
/**
@@ -229,6 +240,7 @@ public class SHARPKnowtatorXMLReader ext
entityRelationTypes.add("location_of");
entityRelationTypes.add("manages/treats");
entityRelationTypes.add("manifestation_of"); // note the misspelling
+ entityRelationTypes.add("prevents");
entityRelationTypes.add("result_of");
Set<String> eventRelationTypes = new HashSet<String>();
eventRelationTypes.add("TLINK");
@@ -274,6 +286,7 @@ public class SHARPKnowtatorXMLReader ext
AnatomicalSiteMention mention = new AnatomicalSiteMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_ANATOMICAL_SITE,
@@ -281,7 +294,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation bodyLaterality = annotationSlots.remove("body_laterality");
delayedFeatures.add(new DelayedFeature(mention, "bodyLaterality", bodyLaterality));
KnowtatorAnnotation bodySide = annotationSlots.remove("body_side");
@@ -291,6 +305,7 @@ public class SHARPKnowtatorXMLReader ext
EventMention mention = new EventMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_CLINICAL_ATTRIBUTE,
@@ -298,12 +313,18 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
} else if ("Devices".equals(annotation.type)) {
+ if(coveringSpan.begin < 0 || coveringSpan.end < 0){
+ LOGGER.error(String.format("Device annotation (id=%s) has invalid span [%d,%d]", annotation.id, coveringSpan.begin, coveringSpan.end));
+ continue;
+ }
EntityMention mention = new EntityMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_DEVICE,
@@ -311,12 +332,14 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
} else if ("Disease_Disorder".equals(annotation.type)) {
DiseaseDisorderMention mention = new DiseaseDisorderMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_DISORDER,
@@ -324,7 +347,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation alleviatingFactor = annotationSlots.remove("alleviating_factor");
delayedFeatures.add(DelayedRelationFeature.forArg2(
mention,
@@ -376,6 +400,7 @@ public class SHARPKnowtatorXMLReader ext
LabMention mention = new LabMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_LAB,
@@ -383,7 +408,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation ordinal = annotationSlots.remove("ordinal_interpretation");
delayedFeatures.add(DelayedRelationFeature.forArg1(
mention,
@@ -400,11 +426,13 @@ public class SHARPKnowtatorXMLReader ext
labValue,
ResultOfTextRelation.class,
LabValueModifier.class));
-
+ KnowtatorAnnotation deltaFlag = annotationSlots.remove("delta_flag");
+ delayedFeatures.add(new DelayedFeature(mention, "deltaFlag", deltaFlag));
} else if ("Medications/Drugs".equals(annotation.type)) {
MedicationMention mention = new MedicationMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_DRUG,
@@ -412,7 +440,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation allergy = annotationSlots.remove("allergy_indicator");
delayedFeatures.add(new DelayedFeature(mention, "medicationAllergy", allergy));
KnowtatorAnnotation changeStatus = annotationSlots.remove("change_status_model");
@@ -429,6 +458,8 @@ public class SHARPKnowtatorXMLReader ext
delayedFeatures.add(new DelayedFeature(mention, "medicationRoute", route));
KnowtatorAnnotation startDate = annotationSlots.remove("start_date");
delayedFeatures.add(new DelayedFeature(mention, "startDate", startDate));
+ KnowtatorAnnotation endDate = annotationSlots.remove("end_date");
+ delayedFeatures.add(new DelayedFeature(mention, "endDate", endDate));
KnowtatorAnnotation strength = annotationSlots.remove("strength_model");
delayedFeatures.add(new DelayedFeature(mention, "medicationStrength", strength));
@@ -436,6 +467,7 @@ public class SHARPKnowtatorXMLReader ext
EventMention mention = new EventMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_PHENOMENA,
@@ -443,12 +475,14 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
} else if ("Procedure".equals(annotation.type)) {
ProcedureMention mention = new ProcedureMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_PROCEDURE,
@@ -456,7 +490,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation bodyLaterality = annotationSlots.remove("body_laterality");
delayedFeatures.add(new DelayedFeature(mention, "bodyLaterality", bodyLaterality));
KnowtatorAnnotation bodyLocation = annotationSlots.remove("body_location");
@@ -477,6 +512,7 @@ public class SHARPKnowtatorXMLReader ext
SignSymptomMention mention = new SignSymptomMention(jCas, coveringSpan.begin, coveringSpan.end);
addIdentifiedAnnotationFeatures(
annotation,
+ knowtatorURI,
mention,
jCas,
CONST.NE_TYPE_ID_FINDING,
@@ -484,7 +520,8 @@ public class SHARPKnowtatorXMLReader ext
booleanSlots,
annotationSlots,
idAnnotationMap,
- delayedFeatures);
+ delayedFeatures,
+ delayedRelations);
KnowtatorAnnotation alleviatingFactor = annotationSlots.remove("alleviating_factor");
delayedFeatures.add(DelayedRelationFeature.forArg2(
mention,
@@ -845,7 +882,16 @@ public class SHARPKnowtatorXMLReader ext
modifier.setNormalizedForm(attribute);
modifier.addToIndexes();
idAnnotationMap.put(annotation.id, modifier);
-
+ } else if ("delta_flag_indicator".equals(annotation.type)) {
+ String value = stringSlots.remove("delta_flag_normalization");
+ LabDeltaFlagModifier modifier = new LabDeltaFlagModifier(jCas, coveringSpan.begin, coveringSpan.end);
+ LabDeltaFlag attribute = new LabDeltaFlag(jCas);
+ attribute.setValue(value);
+ attribute.addToIndexes();
+// modifier.setValue(value)
+ modifier.setNormalizedForm(attribute);
+ modifier.addToIndexes();
+ idAnnotationMap.put(annotation.id, modifier);
} else if ("Value".equals(annotation.type)) {
KnowtatorAnnotation unit = annotationSlots.remove("value_unit");
KnowtatorAnnotation number = annotationSlots.remove("value_number");
@@ -970,18 +1016,23 @@ public class SHARPKnowtatorXMLReader ext
KnowtatorAnnotation unit = annotationSlots.remove("strength_unit");
KnowtatorAnnotation number = annotationSlots.remove("strength_number");
MedicationStrength attribute = new MedicationStrength(jCas);
+ int spanStart=text.length()-1,spanEnd=0; // the strength annotation is spanless so we get the modifier span by its components
if (unit != null) {
KnowtatorAnnotation.Span unitSpan = unit.getCoveringSpan();
String unitString = text.substring(unitSpan.begin, unitSpan.end);
attribute.setUnit(unitString);
+ if(unitSpan.begin < spanStart) spanStart = unitSpan.begin;
+ if(unitSpan.end > spanEnd) spanEnd = unitSpan.end;
}
if (number != null) {
KnowtatorAnnotation.Span numberSpan = number.getCoveringSpan();
String numberString = text.substring(numberSpan.begin, numberSpan.end);
attribute.setNumber(numberString);
+ if(numberSpan.begin < spanStart) spanStart = numberSpan.begin;
+ if(numberSpan.end > spanEnd) spanEnd = numberSpan.end;
}
attribute.addToIndexes();
- MedicationStrengthModifier modifier = new MedicationStrengthModifier(jCas, coveringSpan.begin, coveringSpan.end);
+ MedicationStrengthModifier modifier = new MedicationStrengthModifier(jCas, spanStart, spanEnd);
modifier.setNormalizedForm(attribute);
modifier.addToIndexes();
idAnnotationMap.put(annotation.id, modifier);
@@ -1023,9 +1074,11 @@ public class SHARPKnowtatorXMLReader ext
} else if ("Date".equals(annotation.type)) {
String month = stringSlots.remove("month");
String day = stringSlots.remove("day");
+ String year = stringSlots.remove("year");
Date date = new Date(jCas);
date.setMonth(month);
date.setDay(day);
+ date.setYear(year);
date.addToIndexes();
TimeMention mention = new TimeMention(jCas, coveringSpan.begin, coveringSpan.end);
mention.setDate(date);
@@ -1086,7 +1139,12 @@ public class SHARPKnowtatorXMLReader ext
// all mentions should be added, so add features that required other annotations
for (DelayedFeature delayedFeature : delayedFeatures) {
+ try{
delayedFeature.setValueFrom(idAnnotationMap);
+ }catch(Exception e){
+ System.err.println("Exception reading input: " + e.getMessage());
+ e.printStackTrace(System.err);
+ }
}
}
@@ -1100,8 +1158,22 @@ public class SHARPKnowtatorXMLReader ext
return String.format("%s(%s)", ann.getClass().getSimpleName(), result);
}
+// private static void addIdentifiedAnnotationFeatures(
+// KnowtatorAnnotation annotation,
+// final IdentifiedAnnotation mention,
+// JCas jCas,
+// int typeID,
+// Map<String, String> stringSlots,
+// Map<String, Boolean> booleanSlots,
+// Map<String, KnowtatorAnnotation> annotationSlots,
+// Map<String, TOP> idAnnotationMap,
+// List<DelayedFeature> delayedFeatures) {
+// addIdentifiedAnnotationFeatures(annotation, null, mention, jCas, typeID, stringSlots, booleanSlots, annotationSlots, idAnnotationMap, delayedFeatures, null);
+// }
+
private static void addIdentifiedAnnotationFeatures(
KnowtatorAnnotation annotation,
+ URI knowtatorURI,
final IdentifiedAnnotation mention,
JCas jCas,
int typeID,
@@ -1109,7 +1181,8 @@ public class SHARPKnowtatorXMLReader ext
Map<String, Boolean> booleanSlots,
Map<String, KnowtatorAnnotation> annotationSlots,
Map<String, TOP> idAnnotationMap,
- List<DelayedFeature> delayedFeatures) {
+ List<DelayedFeature> delayedFeatures,
+ List<DelayedRelation> delayedRelations) {
mention.setTypeID(typeID);
mention.setConfidence(1.0f);
mention.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
@@ -1129,6 +1202,15 @@ public class SHARPKnowtatorXMLReader ext
delayedFeatures.add(new DelayedFeatureFromFeature(mention, "historyOf", historyOf));
KnowtatorAnnotation negationIndicator = annotationSlots.remove("negation_indicator_CU");
delayedFeatures.add(new DelayedFeatureFromFeature(mention, "polarity", negationIndicator));
+ if(negationIndicator != null){
+ DelayedRelation polRel = new DelayedRelation();
+ polRel.source = negationIndicator;
+ polRel.target = annotation;
+ polRel.type = "polarityModifier";
+ polRel.sourceFile = knowtatorURI;
+ if(delayedRelations != null) delayedRelations.add(polRel);
+ }
+// delayedFeatures.add(DelayedRelationFeature.forArg2(mention, "polarityModifier", negationIndicator, BinaryTextRelation.class, PolarityModifier.class));
KnowtatorAnnotation subject = annotationSlots.remove("subject_CU");
delayedFeatures.add(new DelayedFeatureFromFeature(mention, "subject", subject) {
@Override
@@ -1285,15 +1367,26 @@ public class SHARPKnowtatorXMLReader ext
// add the relation to the CAS
BinaryTextRelation relation = null;
- if ("affects".equals(this.annotation.type)) {
+ if(this.annotation == null){
+ relation = new BinaryTextRelation(jCas);
+ } else if ("affects".equals(this.annotation.type)) {
this.assertTypes(sourceMention, EventMention.class, targetMention, IdentifiedAnnotation.class);
relation = new AffectsTextRelation(jCas);
} else if ("complicates/disrupts".equals(this.annotation.type)) {
this.assertTypes(sourceMention, EventMention.class, targetMention, EventMention.class);
relation = new ComplicatesDisruptsTextRelation(jCas);
+ } else if ("contraindicates".equals(this.annotation.type)) {
+ this.assertTypes(sourceMention, IdentifiedAnnotation.class, targetMention, EventMention.class);
+ relation = new ContraindicatesTextRelation(jCas);
} else if ("degree_of".equals(this.annotation.type)) {
this.assertTypes(sourceMention, EventMention.class, targetMention, Modifier.class);
relation = new DegreeOfTextRelation(jCas);
+ } else if ("diagnoses".equals(this.annotation.type)) {
+ this.assertTypes(sourceMention, EventMention.class, targetMention, IdentifiedAnnotation.class);
+ relation = new DiagnosesTextRelation(jCas);
+ } else if ("indicates".equals(this.annotation.type)) {
+ this.assertTypes(sourceMention, EventMention.class, targetMention, EventMention.class);
+ relation = new IndicatesTextRelation(jCas);
} else if ("location_of".equals(this.annotation.type)) {
if (!(targetMention instanceof AnatomicalSiteMention) && (sourceMention instanceof AnatomicalSiteMention)) {
// fix reversed arguments in manual annotations
@@ -1314,9 +1407,15 @@ public class SHARPKnowtatorXMLReader ext
this.assertTypes(sourceMention, EventMention.class, targetMention, EventMention.class);
relation = new ManifestationOfTextRelation(jCas);
relation.setCategory("manifestation_of"); // fix typo in Knowtator type system
+ } else if ("prevents".equals(this.annotation.type)) {
+ this.assertTypes(sourceMention, EventMention.class, targetMention, EventMention.class);
+ relation = new PreventsTextRelation(jCas);
} else if ("result_of".equals(this.annotation.type)) {
this.assertTypes(sourceMention, EventMention.class, targetMention, IdentifiedAnnotation.class);
relation = new ResultOfTextRelation(jCas);
+// } else if ("prevents".equals(this.annotation.type)) {
+// this.assertTypes(sourceMention, expectedSourceClass, targetMention, expectedTargetClass);
+// relation = new PreventsTextRelation(jCas);
} else if ("TLINK".equals(this.annotation.type)) {
relation = new TemporalTextRelation(jCas);
relation.setCategory(this.type);
@@ -1329,7 +1428,11 @@ public class SHARPKnowtatorXMLReader ext
// set the relation cateory (if not already set)
if (relation.getCategory() == null) {
- relation.setCategory(this.annotation.type);
+ if(this.type != null){
+ relation.setCategory(this.type);
+ }else{
+ relation.setCategory(this.annotation.type);
+ }
}
// link the relation to its arguments and add it to the CAS
@@ -1344,7 +1447,7 @@ public class SHARPKnowtatorXMLReader ext
relation.addToIndexes();
// add the relation to the map so it can be used in features of other annotations
- idAnnotationMap.put(this.annotation.id, relation);
+ if(this.annotation != null) idAnnotationMap.put(this.annotation.id, relation);
}
private void assertTypes(Annotation sourceMention, Class<? extends Annotation> expectedSourceClass, Annotation targetMention, Class<? extends Annotation> expectedTargetClass) {
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java Fri Dec 5 19:18:05 2014
@@ -30,8 +30,11 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import opennlp.tools.cmdline.sentdetect.SentenceDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.sentdetect.SentenceEvaluationErrorListener;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.sentdetect.DefaultSDContextGenerator;
+import opennlp.tools.sentdetect.SDCrossValidator;
import opennlp.tools.sentdetect.SentenceDetectorFactory;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
@@ -42,8 +45,10 @@ import opennlp.tools.util.PlainTextByLin
import opennlp.tools.util.TrainingParameters;
import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.sentence.SDContextGeneratorCtakes;
import org.apache.ctakes.core.sentence.EndOfSentenceScannerImpl;
import org.apache.ctakes.core.sentence.SentenceDetectorCtakes;
+import org.apache.ctakes.core.sentence.SentenceDetectorFactoryCtakes;
import org.apache.ctakes.core.sentence.SentenceSpan;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
@@ -114,7 +119,7 @@ public class SentenceDetector extends JC
logger.info("Sentence detector model file: " + sdModelPath);
sdmodel = new SentenceModel(is);
EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
- DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
+ SDContextGeneratorCtakes cg = new SDContextGeneratorCtakes(eoss.getEndOfSentenceCharacters());
sentenceDetector = new SentenceDetectorCtakes(
sdmodel.getMaxentModel(), cg, eoss);
@@ -324,23 +329,17 @@ public class SentenceDetector extends JC
mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(iters));
mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
- // Abbreviations dictionary
- // TODO: Actually import a Dictionary of abbreviations
- Dictionary dict = new Dictionary();
-
try {
- SentenceDetectorFactory sdFactory = new SentenceDetectorFactory(
- "en", true, dict, scanner.getEndOfSentenceCharacters());
- mod = SentenceDetectorME.train("en", sampleStream, sdFactory, mlParams);
+ SentenceDetectorFactoryCtakes sdFactory = new SentenceDetectorFactoryCtakes(scanner.getEndOfSentenceCharacters());
+ mod = SentenceDetectorME.train("en", sampleStream, sdFactory, mlParams);
} finally {
sampleStream.close();
}
}
-
- try(FileOutputStream outStream = new FileOutputStream(outFile)){
- logger.info("Saving the model as: " + outFile.getAbsolutePath());
- mod.serialize(outStream);
- }
+ try(FileOutputStream outStream = new FileOutputStream(outFile)){
+ logger.info("Saving the model as: " + outFile.getAbsolutePath());
+ mod.serialize(outStream);
+ }
}
public static void usage(Logger log) {
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java Fri Dec 5 19:18:05 2014
@@ -138,7 +138,7 @@ public class FilesInDirectoryCollectionR
File[] files = directory.listFiles();
for (int i = 0; i < files.length; i++)
{
- if (!files[i].isDirectory() && hasValidExtension(files[i]))
+ if (!files[i].isDirectory() && hasValidExtension(files[i]) && !files[i].isHidden())
{
iv_files.add(files[i]);
}
Added: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java?rev=1643405&view=auto
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java (added)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java Fri Dec 5 19:18:05 2014
@@ -0,0 +1,135 @@
+package org.apache.ctakes.core.sentence;
+
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+import opennlp.tools.sentdetect.DefaultSDContextGenerator;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Sentence-detector context generator that adds cTAKES-specific features to
+ * the ones produced by {@link DefaultSDContextGenerator}: the line feeds that
+ * follow a candidate end-of-sentence character and the collapsed character
+ * "shape" of the surrounding tokens.
+ *
+ * <p>Not thread-safe: {@code getContext} stores the trailing whitespace in the
+ * {@code ws} field and {@code collectFeatures} reads it back, so an instance
+ * must not be shared between threads.
+ */
+public class SDContextGeneratorCtakes extends DefaultSDContextGenerator {
+
+  private static final Pattern UPPER = Pattern.compile("\\p{Upper}");
+  private static final Pattern LOWER = Pattern.compile("\\p{Lower}");
+  private static final Pattern DIGIT = Pattern.compile("\\p{Digit}");
+  private static final Pattern PUNCT = Pattern.compile("\\p{Punct}");
+  private static final Pattern REPEATED = Pattern.compile("(.)\\1+");
+
+  // Whitespace that follows the candidate currently being examined. Written by
+  // getContext and read by collectFeatures; never null.
+  String ws = "";
+
+  public SDContextGeneratorCtakes(char[] eosCharacters) {
+    super(eosCharacters);
+  }
+
+  /**
+   * Records the whitespace that follows {@code position}, then delegates to the
+   * parent implementation.
+   *
+   * @param sb       the text being scanned
+   * @param position index of the candidate end-of-sentence character
+   * @return the features returned by the parent class (presumably including
+   *         those added by {@code collectFeatures} -- confirm against OpenNLP)
+   */
+  @Override
+  public String[] getContext(CharSequence sb, int position) {
+    int lastIndex = sb.length() - 1;
+    if (position < lastIndex) {
+      int wsEnd = nextNonspaceIndex(sb, position, lastIndex);
+      ws = sb.subSequence(position + 1, wsEnd).toString();
+    } else {
+      // Nothing follows the candidate. Reset the field so the features are not
+      // built from the previous candidate's whitespace (or from null).
+      ws = "";
+    }
+    return super.getContext(sb, position);
+  }
+
+  // Renders CR/LF as visible tokens so they can be embedded in a feature name.
+  private static String escapeChar(Character c) {
+    if (c == '\n') {
+      return "<LF>";
+    }
+    if (c == '\r') {
+      return "<CR>";
+    }
+    return String.valueOf(c.charValue());
+  }
+
+  /**
+   * Adds the cTAKES features after the parent's own features:
+   * {@code nbold} when {@code next} is non-empty and all upper case,
+   * {@code lfs=} (line feeds following the candidate), {@code eolws=}
+   * (candidate character plus those line feeds) and the collapsed shapes of
+   * {@code next}, {@code previous} and {@code prefix}.
+   */
+  @Override
+  protected void collectFeatures(String prefix, String suffix, String previous, String next, Character eosChar) {
+    super.collectFeatures(prefix, suffix, previous, next, eosChar);
+
+    if (!next.isEmpty() && isAllUpper(next)) {
+      collectFeats.add("nbold");
+    }
+
+    // Tabs and blanks become <SPACE>, line feeds <LF>; carriage returns are
+    // dropped. The raw "ws=" and "nextshape=" features are not emitted.
+    String featValue = ws.replace("\n", "<LF>").replace("\t", "<SPACE>").replace(" ", "<SPACE>").replace("\r", "");
+    String lfs = featValue.replace("<SPACE>", "");
+    collectFeats.add("lfs=" + lfs);
+    collectFeats.add("eolws=" + escapeChar(eosChar) + ',' + lfs);
+
+    collectFeats.add("collapsedNext=" + getCollapsedShape(next));
+    // The misspelling below is kept on purpose: renaming a feature would
+    // presumably invalidate models trained with the existing name.
+    collectFeats.add("collapasedPrev=" + getCollapsedShape(previous));
+    collectFeats.add("collapsedPrefix=" + getCollapsedShape(prefix));
+
+    // Leave the inherited scratch buffer empty, as the original code did.
+    buf.setLength(0);
+  }
+
+  // True when every character is upper case (vacuously true for "").
+  private static boolean isAllUpper(String s) {
+    for (int i = 0; i < s.length(); i++) {
+      if (!Character.isUpperCase(s.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Maps upper/lower/digit/punctuation characters to U/L/D/P, in that order.
+  private static String getShape(String s) {
+    String shape = UPPER.matcher(s).replaceAll("U");
+    shape = LOWER.matcher(shape).replaceAll("L");
+    shape = DIGIT.matcher(shape).replaceAll("D");
+    return PUNCT.matcher(shape).replaceAll("P");
+  }
+
+  // Shape with each run of a repeated character collapsed to "<char>+".
+  private static String getCollapsedShape(String s) {
+    return REPEATED.matcher(getShape(s)).replaceAll("$1+");
+  }
+
+  // Index of the first non-whitespace character after seek, or lastIndex when
+  // none is found (so a whitespace character at lastIndex ends the run).
+  private static int nextNonspaceIndex(CharSequence sb, int seek, int lastIndex) {
+    while (seek < lastIndex) {
+      char c = sb.charAt(++seek);
+      if (!StringUtil.isWhitespace(c)) {
+        return seek;
+      }
+    }
+    return lastIndex;
+  }
+}
Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java?rev=1643405&r1=1643404&r2=1643405&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java Fri Dec 5 19:18:05 2014
@@ -29,11 +29,8 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import opennlp.model.MaxentModel;
import opennlp.tools.dictionary.Dictionary;
-import opennlp.tools.ml.maxent.GIS;
-import opennlp.tools.ml.maxent.GISModel;
-import opennlp.tools.ml.model.EventStream;
-import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.sentdetect.EndOfSentenceScanner;
import opennlp.tools.sentdetect.SDContextGenerator;
import opennlp.tools.sentdetect.SDEventStream;
@@ -42,7 +39,6 @@ import opennlp.tools.sentdetect.Sentence
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.sentdetect.lang.Factory;
-import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.StringUtil;
@@ -228,123 +224,12 @@ public class SentenceDetectorCtakes {
protected boolean isAcceptableBreak(String s, int fromIndex, int candidateIndex) {
return true;
}
- /*
- public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
- boolean useTokenEnd, Dictionary abbreviations) throws IOException {
- return train(languageCode, samples, useTokenEnd, abbreviations,5,100);
- }
-
- public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples,
- boolean useTokenEnd, Dictionary abbreviations, int cutoff, int iterations) throws IOException {
-
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cutoff, iterations);
-
- Factory factory = new Factory();
-
- // TODO: Fix the EventStream to throw exceptions when training goes wrong
- ObjectStream eventStream = new SDEventStream(samples,
- factory.createSentenceContextGenerator(languageCode),
- factory.createEndOfSentenceScanner(languageCode));
-
- HashSumEventStream hses = new HashSumEventStream(eventStream);
- GISModel sentModel = GIS.trainModel(hses, iterations, cutoff);
-
- manifestInfoEntries.put(BaseModel.TRAINING_EVENTHASH_PROPERTY,
- hses.calculateHashSum().toString(16));
-
- return new SentenceModel(languageCode, sentModel,
- useTokenEnd, abbreviations, manifestInfoEntries);
- }
-*/
- private static void usage() {
- System.err.println("Usage: SentenceDetectorME -encoding charset -lang language trainData modelName [cutoff iterations]");
- System.err.println("-encoding charset specifies the encoding which should be used ");
- System.err.println(" for reading and writing text.");
- System.err.println("-lang language specifies the language which ");
- System.err.println(" is being processed.");
- System.err.println("trainData specifies the name of the input training file");
- System.err.println(" to train the resulting model.");
- System.err.println("modelName specifies the resulting saved model after");
- System.err.println(" training.");
- System.exit(1);
- }
- /**
- * <p>Trains a new sentence detection model.</p>
- *
- * <p>Usage: opennlp.tools.sentdetect.SentenceDetectorME data_file new_model_name (iterations cutoff)?</p>
- *
- * @param args
- * @throws IOException
- */
- /*
- public static void main(String[] args) throws IOException {
- int ai=0;
- String encoding = null;
- String lang = null;
- if (args.length == 0) {
- usage();
- }
- while (args[ai].startsWith("-")) {
- if (args[ai].equals("-encoding")) {
- ai++;
- if (ai < args.length) {
- encoding = args[ai];
- ai++;
- }
- else {
- usage();
- }
- }
- else if (args[ai].equals("-lang")) {
- ai++;
- if (ai < args.length) {
- lang = args[ai];
- ai++;
- }
- else {
- usage();
- }
- }
- else {
- usage();
- }
- }
-
- File inFile = new File(args[ai++]);
- File outFile = new File(args[ai++]);
-
- int numberOfArgs = args.length;
- int iters = (ai < numberOfArgs ? convertToInt(args[ai++]) : 100);
- int cutoff = (ai < numberOfArgs ? convertToInt(args[ai++]) : 4);
-
-
- try {
- if ((lang == null) || (encoding == null)) {
- usage();
- }
-
-
- SentenceModel model = train(lang, new SentenceSampleStream(new PlainTextByLineStream(
- new InputStreamReader(new FileInputStream(inFile), encoding))), true, null, cutoff, iters);
-
- // TODO: add support for iterations and cutoff settings
-
-// if (args.length > ai)
-// mod = train(es, Integer.parseInt(args[ai++]), Integer.parseInt(args[ai++]));
-// else
-// mod = train(es, 100, 5);
-
- System.out.println("Saving the model as: " + outFile);
- model.serialize(new FileOutputStream(outFile));
- }
- catch (Exception e) {
- e.printStackTrace();
- }
- }
-
-*/
+ // RE: Missing main method for training -- there were two versions -- one in here and one
+ // in SentenceDetector.java, and the one in here was old so it was removed.
+ // Please use the org.apache.ctakes.core.ae.SentenceDetector for training
+ // sentence detector models in cTAKES.
+
private static int convertToInt(String s) {
int i = Integer.parseInt(s);
Added: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorFactoryCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorFactoryCtakes.java?rev=1643405&view=auto
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorFactoryCtakes.java (added)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorFactoryCtakes.java Fri Dec 5 19:18:05 2014
@@ -0,0 +1,37 @@
+package org.apache.ctakes.core.sentence;
+
+import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.sentdetect.SDContextGenerator;
+import opennlp.tools.sentdetect.SentenceDetectorFactory;
+
+/**
+ * {@link SentenceDetectorFactory} whose context generator is
+ * {@link SDContextGeneratorCtakes}.
+ */
+public class SentenceDetectorFactoryCtakes extends SentenceDetectorFactory {
+
+  /**
+   * No-argument constructor; OpenNLP instantiates the factory reflectively.
+   */
+  public SentenceDetectorFactoryCtakes() {
+  }
+
+  /**
+   * Builds a factory for language code "en" with an empty dictionary.
+   *
+   * @param eosChars end-of-sentence characters handed to the parent factory
+   */
+  public SentenceDetectorFactoryCtakes(char[] eosChars) {
+    super("en", true, new Dictionary(), eosChars);
+  }
+
+  /**
+   * @return a new {@link SDContextGeneratorCtakes} built from this factory's
+   *         end-of-sentence characters
+   */
+  @Override
+  public SDContextGenerator getSDContextGenerator() {
+    char[] eosChars = getEOSCharacters();
+    return new SDContextGeneratorCtakes(eosChars);
+  }
+}