You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2021/09/22 19:00:06 UTC
svn commit: r1893521 - in
/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor:
eval/ metastasis/ pipelines/
Author: tmill
Date: Wed Sep 22 19:00:05 2021
New Revision: 1893521
URL: http://svn.apache.org/viewvc?rev=1893521&view=rev
Log:
Major refactor of relation evaluation code to generalize the idea of different corpora, add reader for DeepPhe, and make it easier to evaluate cross-domain.
Added:
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/CorpusXMI.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/DeepPheXMI.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationEvaluation_ImplBase.java
Modified:
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/ModifierExtractorEvaluation.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationExtractorEvaluation.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/metastasis/MetastasisXmiGenerationPipeline.java
ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/RelationExtractorTrain.java
Added: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/CorpusXMI.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/CorpusXMI.java?rev=1893521&view=auto
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/CorpusXMI.java (added)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/CorpusXMI.java Wed Sep 22 19:00:05 2021
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.relationextractor.eval;
+
+
+import com.google.common.collect.Lists;
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.util.ViewUriUtil;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Corpus-independent helpers for the relation-extraction evaluation code. This class names the
+ * supported corpora and evaluation splits, resolves the text files of a corpus/split pair by
+ * delegating to the corpus-specific classes ({@link SHARPXMI}, {@link DeepPheXMI}), and maps text
+ * files to the XMI files written for them.
+ *
+ * Created by tmill on 1/31/17.
+ */
+public abstract class CorpusXMI {
+    /** Corpora that can be selected for training and/or testing. */
+    public enum Corpus {SHARP, SHARP_RELEASE, DeepPhe}
+
+    /** The portion of a corpus an evaluation is run on. */
+    public enum EvaluateOn {
+        TRAIN, DEV, TEST, OTHER
+    }
+
+    /** Name of the CAS view that {@link CopyDocumentTextToGoldView} copies the document text into. */
+    public static final String GOLD_VIEW_NAME = "GoldView";
+
+    /**
+     * Rejects option combinations that cannot be run.
+     *
+     * @param options the parsed command line options
+     * @throws IllegalArgumentException if a grid search is requested together with the TEST split
+     */
+    public static void validate(RelationEvaluation_ImplBase.EvaluationOptions options) throws Exception {
+        // grid search tunes parameters, so it must never see the held-out test data
+        if (options.getEvaluateOn() == EvaluateOn.TEST && options.getGridSearch()) {
+            throw new IllegalArgumentException("grid search can only be run on the train or dev sets");
+        }
+    }
+
+    /**
+     * Collects the text files to train on.
+     *
+     * The result always contains the training split of {@code trainCorpus}. When evaluating on the
+     * TEST split, the dev split of the same corpus is appended, because it is no longer needed as
+     * held-out data.
+     *
+     * @param trainCorpus     the corpus to train on
+     * @param split           the split the evaluation will be run on
+     * @param corpusDirectory root directory of {@code trainCorpus}
+     * @return a new, mutable list owned by the caller
+     * @throws RuntimeException if {@code trainCorpus} is not a recognized corpus
+     */
+    public static List<File> getTrainTextFiles(Corpus trainCorpus, EvaluateOn split, File corpusDirectory) {
+        if (trainCorpus == null) {
+            throw new IllegalArgumentException("Unrecognized train corpus option: " + trainCorpus);
+        }
+
+        // Copy, so that appending the dev files never modifies a list owned by the corpus class.
+        List<File> trainFiles = new ArrayList<>(filesForSplit(trainCorpus, EvaluateOn.TRAIN, corpusDirectory));
+        if (split == EvaluateOn.TEST) {
+            trainFiles.addAll(filesForSplit(trainCorpus, EvaluateOn.DEV, corpusDirectory));
+        }
+        return trainFiles;
+    }
+
+    /**
+     * Looks up the text files to evaluate on.
+     *
+     * @param testCorpus      the corpus to evaluate on
+     * @param split           which split of {@code testCorpus} to use
+     * @param corpusDirectory root directory of {@code testCorpus}
+     * @return the files of the requested split, or {@code null} when no files are defined for the
+     *         combination (for example {@link EvaluateOn#OTHER})
+     */
+    public static List<File> getTestTextFiles(Corpus testCorpus, EvaluateOn split, File corpusDirectory) {
+        return filesForSplit(testCorpus, split, corpusDirectory);
+    }
+
+    /**
+     * Single dispatch point from a corpus/split pair to the corpus-specific file lookup.
+     *
+     * @return the files reported by the corpus class, or {@code null} if the pair is not handled
+     */
+    private static List<File> filesForSplit(Corpus corpus, EvaluateOn split, File corpusDirectory) {
+        if (corpus == null || split == null) {
+            return null;
+        }
+        switch (split) {
+            case TRAIN:
+                switch (corpus) {
+                    case SHARP:
+                        return SHARPXMI.getTrainTextFiles(corpusDirectory);
+                    case SHARP_RELEASE:
+                        return SHARPXMI.getTrainTextFilesFromCorpus(corpusDirectory);
+                    case DeepPhe:
+                        return DeepPheXMI.getTrainTextFiles(corpusDirectory);
+                    default:
+                        return null;
+                }
+            case DEV:
+                switch (corpus) {
+                    case SHARP:
+                        return SHARPXMI.getDevTextFiles(corpusDirectory);
+                    case SHARP_RELEASE:
+                        return SHARPXMI.getDevTextFilesFromCorpus(corpusDirectory);
+                    case DeepPhe:
+                        return DeepPheXMI.getDevTextFiles(corpusDirectory);
+                    default:
+                        return null;
+                }
+            case TEST:
+                switch (corpus) {
+                    case SHARP:
+                        return SHARPXMI.getTestTextFiles(corpusDirectory);
+                    case SHARP_RELEASE:
+                        return SHARPXMI.getTestTextFilesFromCorpus(corpusDirectory);
+                    case DeepPhe:
+                        return DeepPheXMI.getTestTextFiles(corpusDirectory);
+                    default:
+                        return null;
+                }
+            default:
+                return null;
+        }
+    }
+
+    /**
+     * Maps each text file to its XMI file inside {@code xmiDirectory}, preserving order.
+     *
+     * @param xmiDirectory directory holding the XMI files
+     * @param textFiles    the text files to map
+     * @return a new list with one XMI file per text file
+     */
+    public static List<File> toXMIFiles( File xmiDirectory, List<File> textFiles ) {
+        List<File> xmiFiles = new ArrayList<>( textFiles.size() );
+        for ( File textFile : textFiles ) {
+            xmiFiles.add( toXMIFile( xmiDirectory, textFile ) );
+        }
+        return xmiFiles;
+    }
+
+    /**
+     * @return {@code xmiDirectory/<text file name>.xmi}; only the last path element of
+     *         {@code textFile} is used
+     */
+    protected static File toXMIFile( File xmiDirectory, File textFile ) {
+        return new File( xmiDirectory, textFile.getName() + ".xmi" );
+    }
+
+    /**
+     * Adds a {@link DocumentID} annotation whose value is the file path taken from the URI stored
+     * in the CAS.
+     */
+    public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {
+
+        @Override
+        public void process( JCas jCas ) throws AnalysisEngineProcessException {
+            String documentID = new File( ViewUriUtil.getURI( jCas ) ).getPath();
+            DocumentID documentIDAnnotation = new DocumentID( jCas );
+            documentIDAnnotation.setDocumentID( documentID );
+            documentIDAnnotation.addToIndexes();
+        }
+    }
+
+    /**
+     * Copies the document text of the processed view into the {@link #GOLD_VIEW_NAME} view.
+     */
+    public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
+        @Override
+        public void process( JCas jCas ) throws AnalysisEngineProcessException {
+            try {
+                JCas goldView = jCas.getView( GOLD_VIEW_NAME );
+                goldView.setDocumentText( jCas.getDocumentText() );
+            } catch ( CASException e ) {
+                throw new AnalysisEngineProcessException( e );
+            }
+        }
+    }
+}
+
Added: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/DeepPheXMI.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/DeepPheXMI.java?rev=1893521&view=auto
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/DeepPheXMI.java (added)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/DeepPheXMI.java Wed Sep 22 19:00:05 2021
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.relationextractor.eval;
+
+import com.google.common.collect.Sets;
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetectorAnnotatorBIO;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.util.doc.DocIdUtil;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.*;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.component.ViewCreatorAnnotator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.JCasIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+import org.xml.sax.ContentHandler;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Locates the text files of the DeepPhe gold corpus, split by patient into train/dev/test, and
+ * writes one XMI file per document containing the preprocessing output plus the gold annotations
+ * read from the accompanying Anafora XML.
+ *
+ * Created by tmill on 1/24/17.
+ */
+public class DeepPheXMI extends CorpusXMI {
+
+    // Per-document directory name; group 1 is the patient number used to pick the split.
+    private static final Pattern DIR_PATTERN = Pattern.compile("patient(\\d+)_report(\\d+)_(.*)");
+
+    // These are the splits for the breast cancer patient set.
+    // See here: https://healthnlp.hms.harvard.edu/cancer/wiki/index.php/Main_Page#DeepPhe_Gold_Set
+    public final static Set<Integer> trainPatients = Sets.newHashSet(3, 11, 92, 93);
+    public final static Set<Integer> devPatients = Sets.newHashSet(2, 21);
+    public final static Set<Integer> testPatients = Sets.newHashSet(1, 16);
+
+    /**
+     * Runs the preprocessing pipeline and the Anafora gold reader over every train, dev and test
+     * document and serializes each resulting CAS to {@code xmiDirectory}.
+     *
+     * @param xmiDirectory          output directory; created if it does not exist
+     * @param anaforaInputDirectory root of the Anafora corpus, one sub-directory per document
+     * @throws IOException              if the output directory cannot be created or a file cannot be written
+     * @throws IllegalArgumentException if a processed CAS carries no document ID
+     */
+    // TODO - much of this can be encapsulated in the parent class and just pass it the description for the corpus reader.
+    public static void generateXMI(File xmiDirectory, File anaforaInputDirectory) throws Exception {
+        // if necessary, create the output directory first; fail early instead of on the first write
+        if (!xmiDirectory.isDirectory() && !xmiDirectory.mkdirs()) {
+            throw new IOException("Could not create XMI output directory: " + xmiDirectory);
+        }
+
+        List<File> files = new ArrayList<>();
+        files.addAll(getTrainTextFiles(anaforaInputDirectory));
+        files.addAll(getDevTextFiles(anaforaInputDirectory));
+        files.addAll(getTestTextFiles(anaforaInputDirectory));
+
+        CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+        AggregateBuilder builder = new AggregateBuilder();
+        builder.add( UriToDocumentTextAnnotator.getDescription() );
+
+        builder.add( getDeepPhePreprocessingPipeline() );
+        builder.add( AnalysisEngineFactory.createEngineDescription(
+                ViewCreatorAnnotator.class,
+                ViewCreatorAnnotator.PARAM_VIEW_NAME,
+                GOLD_VIEW_NAME ) );
+        builder.add( AnalysisEngineFactory.createEngineDescription( CopyDocumentTextToGoldView.class ) );
+        // The next two engines run with the gold view mapped in as their default view.
+        builder.add(
+                AnalysisEngineFactory.createEngineDescription( DocumentIDAnnotator.class ),
+                CAS.NAME_DEFAULT_SOFA,
+                GOLD_VIEW_NAME );
+        builder.add(
+                AnalysisEngineFactory.createEngineDescription( DeepPheAnaforaXMLReader.getDescription(anaforaInputDirectory) ),
+                CAS.NAME_DEFAULT_SOFA,
+                GOLD_VIEW_NAME );
+
+        // write out an XMI for each file
+        for (Iterator<JCas> casIter = new JCasIterator( reader, builder.createAggregate() ); casIter.hasNext(); ) {
+            JCas jCas = casIter.next();
+            JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+            String documentID = DocIdUtil.getDocumentID(goldView);
+            if (documentID == null) {
+                throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
+            }
+            File outFile = toXMIFile(xmiDirectory, new File(documentID));
+            // try-with-resources: the stream is closed even when serialization fails
+            try (FileOutputStream stream = new FileOutputStream(outFile)) {
+                ContentHandler handler = new XMLSerializer(stream).getContentHandler();
+                new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
+            }
+        }
+    }
+
+    /** @return the text files of all documents belonging to {@link #trainPatients} */
+    public static List<File> getTrainTextFiles(File anaforaDirectory) {
+        return getSetTextFiles(anaforaDirectory, trainPatients);
+    }
+
+    /** @return the text files of all documents belonging to {@link #devPatients} */
+    public static List<File> getDevTextFiles(File anaforaDirectory){
+        return getSetTextFiles(anaforaDirectory, devPatients);
+    }
+
+    /** @return the text files of all documents belonging to {@link #testPatients} */
+    public static List<File> getTestTextFiles(File anaforaDirectory){
+        return getSetTextFiles(anaforaDirectory, testPatients);
+    }
+
+    /**
+     * Scans the immediate sub-directories of {@code anaforaDirectory} and returns the text file of
+     * every document whose patient number is in {@code setToUse}.
+     *
+     * @throws IllegalArgumentException if {@code anaforaDirectory} cannot be listed
+     */
+    private static List<File> getSetTextFiles(File anaforaDirectory, Set<Integer> setToUse){
+        File[] children = anaforaDirectory.listFiles();
+        if (children == null) {
+            // listFiles() returns null (not an empty array) for a missing or unreadable directory
+            throw new IllegalArgumentException("Cannot list Anafora directory: " + anaforaDirectory);
+        }
+
+        List<File> files = new ArrayList<>();
+        for (File child : children) {
+            // Anafora files are organized into directories per annotation file.
+            if (!child.isDirectory()) {
+                continue;
+            }
+            // A fresh matcher per directory; nothing is shared between calls.
+            Matcher dirMatcher = DIR_PATTERN.matcher(child.getName());
+            if (dirMatcher.matches() && setToUse.contains(Integer.parseInt(dirMatcher.group(1)))) {
+                // The text file just replicates the last level of the directory path:
+                files.add(new File(child, child.getName()));
+            }
+        }
+        return files;
+    }
+
+    /** Assembles the annotators that are run on the document text before the gold annotations are added. */
+    private static AnalysisEngineDescription getDeepPhePreprocessingPipeline() throws ResourceInitializationException, MalformedURLException {
+        AggregateBuilder builder = new AggregateBuilder();
+        builder.add(SimpleSegmentAnnotator.createAnnotatorDescription());
+        builder.add(SentenceDetectorAnnotatorBIO.getDescription());
+        builder.add(TokenizerAnnotatorPTB.createAnnotatorDescription());
+        builder.add(LvgAnnotator.createAnnotatorDescription());
+        builder.add(ContextDependentTokenizerAnnotator.createAnnotatorDescription());
+        builder.add(POSTagger.createAnnotatorDescription());
+        builder.add(ConstituencyParser.createAnnotatorDescription());
+        builder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
+        builder.add(Chunker.createAnnotatorDescription());
+        builder.add(ChunkAdjuster.createAnnotatorDescription(new String[]{"NP", "NP"}, 1));
+        builder.add(ChunkAdjuster.createAnnotatorDescription(new String[]{"NP", "PP", "NP"}, 2));
+        builder.add(DefaultJCasTermAnnotator.createAnnotatorDescription());
+
+        return builder.createAggregateDescription();
+    }
+
+    /**
+     * Reads the Anafora XML file that sits next to a document's text file and adds the entities
+     * and location_of relations found there to the CAS.
+     *
+     * Created by tmill on 2/7/17.
+     */
+    public static class DeepPheAnaforaXMLReader extends JCasAnnotator_ImplBase {
+        private static final Logger LOGGER = Logger.getLogger(DeepPheAnaforaXMLReader.class);
+
+        public static final String PARAM_ANAFORA_DIRECTORY = "anaforaDirectory";
+
+        @ConfigurationParameter(
+                name = PARAM_ANAFORA_DIRECTORY,
+                description = "root directory of the Anafora-annotated files, with one subdirectory for "
+                        + "each annotated file")
+        private File anaforaDirectory;
+
+        public static final String PARAM_ANAFORA_XML_SUFFIXES = "anaforaSuffixes";
+        @ConfigurationParameter(
+                name = PARAM_ANAFORA_XML_SUFFIXES,
+                mandatory = false,
+                description = "list of suffixes that might be added to a file name to identify the Anafora "
+                        + "XML annotations file; only the first suffix corresponding to a file will be used")
+        private String[] anaforaXMLSuffixes = new String[]{".UmlsDeepPhe.dave.completed.xml"};
+
+        public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
+            return AnalysisEngineFactory.createEngineDescription(DeepPheAnaforaXMLReader.class);
+        }
+
+        public static AnalysisEngineDescription getDescription(File anaforaDirectory)
+                throws ResourceInitializationException {
+            return AnalysisEngineFactory.createEngineDescription(
+                    DeepPheAnaforaXMLReader.class,
+                    DeepPheAnaforaXMLReader.PARAM_ANAFORA_DIRECTORY,
+                    anaforaDirectory);
+        }
+
+        /**
+         * Finds the first existing {@code <text file path><suffix>} file and loads its annotations.
+         *
+         * @throws IllegalArgumentException if suffixes are configured but none yields an existing file
+         */
+        @Override
+        public void process(JCas jCas) throws AnalysisEngineProcessException {
+            File textFile = new File(ViewUriUtil.getURI(jCas));
+            LOGGER.info("processing " + textFile);
+
+            // The XML always sits next to the text file, whether or not anaforaDirectory is set.
+            List<File> possibleXMLFiles = new ArrayList<>();
+            File xmlFile = null;
+            for (String anaforaXMLSuffix : this.anaforaXMLSuffixes) {
+                File candidate = new File(textFile.getPath() + anaforaXMLSuffix);
+                possibleXMLFiles.add(candidate);
+                // only the first suffix corresponding to an existing file is used
+                if (xmlFile == null && candidate.exists()) {
+                    xmlFile = candidate;
+                }
+            }
+            if (this.anaforaXMLSuffixes.length > 0 && xmlFile == null) {
+                throw new IllegalArgumentException("no Anafora XML file found from " + possibleXMLFiles);
+            }
+
+            if (xmlFile != null) {
+                processXmlFile(jCas, xmlFile);
+            }
+        }
+
+        /**
+         * Parses {@code xmlFile} and adds one mention per supported entity plus one
+         * {@link LocationOfTextRelation} per resolvable {@code body_location} property.
+         * Entities that are malformed, out of bounds or of an unsupported type are logged and skipped.
+         */
+        private static void processXmlFile(JCas jCas, File xmlFile) throws AnalysisEngineProcessException {
+            // load the XML
+            Element dataElem;
+            try {
+                dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
+            } catch (JDOMException | IOException e) {
+                throw new AnalysisEngineProcessException(e);
+            }
+
+            int docLen = jCas.getDocumentText().length();
+
+            for (Element annotationsElem : dataElem.getChildren("annotations")) {
+                // Both collections are scoped to one <annotations> element so that pending
+                // relations are only ever resolved against the ids they were collected with.
+                Map<String, Annotation> idToAnnotation = new HashMap<>();
+                List<String[]> delayedLocationRelations = new ArrayList<>();
+
+                for (Element entityElem : annotationsElem.getChildren("entity")) {
+                    String id = removeSingleChildText(entityElem, "id", null);
+                    Element spanElem = removeSingleChild(entityElem, "span", id);
+                    String type = removeSingleChildText(entityElem, "type", id);
+                    Element propertiesElem = removeSingleChild(entityElem, "properties", id);
+                    if (spanElem == null || type == null) {
+                        // the missing child has already been reported by the helpers
+                        continue;
+                    }
+
+                    int[] span = parseEnclosingSpan(spanElem.getText(), id);
+                    if (span == null) {
+                        continue;
+                    }
+                    int begin = span[0];
+                    int end = span[1];
+                    // UIMA end offsets are exclusive, so end == docLen is a legal annotation that
+                    // reaches the last character of the text.
+                    if (begin < 0 || end > docLen || begin > end) {
+                        error("Illegal begin or end boundary", id);
+                        continue;
+                    }
+
+                    Annotation annotation = createMention(jCas, type, begin, end);
+                    if (annotation == null) {
+                        LOGGER.info("This entity type is not being extracted yet!");
+                        continue;
+                    }
+
+                    if (propertiesElem != null) {
+                        if ("Anatomical_site".equals(type)) {
+                            // called for its check/logging of the associatedCode child; the value is not used yet
+                            removeSingleChildText(propertiesElem, "associatedCode", id);
+                            extractAttributeValues(propertiesElem, (IdentifiedAnnotation) annotation, id);
+                        } else {
+                            // The location may be defined later in the file, so only remember the ids here.
+                            String bodyLocation = removeSingleChildText(propertiesElem, "body_location", id);
+                            if (bodyLocation != null && !bodyLocation.isEmpty()) {
+                                delayedLocationRelations.add(new String[]{id, bodyLocation});
+                            }
+                        }
+                    }
+
+                    // match the annotation to its ID for later use
+                    annotation.addToIndexes();
+                    idToAnnotation.put(id, annotation);
+                }
+
+                for (String[] args : delayedLocationRelations) {
+                    Annotation located = idToAnnotation.get(args[0]);
+                    Annotation location = idToAnnotation.get(args[1]);
+                    if (located == null || location == null) {
+                        // the referenced entity was skipped above or never existed
+                        error(String.format("a body_location reference to unavailable entity '%s'", args[1]), args[0]);
+                        continue;
+                    }
+                    LocationOfTextRelation rel = new LocationOfTextRelation(jCas);
+                    rel.setCategory("location_of");
+                    RelationArgument arg1 = new RelationArgument(jCas);
+                    arg1.setArgument(located);
+                    rel.setArg1(arg1);
+                    RelationArgument arg2 = new RelationArgument(jCas);
+                    arg2.setArgument(location);
+                    rel.setArg2(arg2);
+                    rel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
+                    rel.addToIndexes();
+                }
+            }
+        }
+
+        /**
+         * Parses a span string of the form {@code b1,e1;b2,e2;...}.
+         *
+         * @return {@code {min begin, max end}} (UIMA doesn't support disjoint spans, so the span
+         *         enclosing everything is used), or {@code null} if any part is malformed
+         */
+        private static int[] parseEnclosingSpan(String spanText, String id) {
+            int begin = Integer.MAX_VALUE;
+            int end = Integer.MIN_VALUE;
+            for (String spanString : spanText.split(";")) {
+                String[] beginEndStrings = spanString.split(",");
+                if (beginEndStrings.length != 2) {
+                    error("span not of the format 'number,number'", id);
+                    return null;
+                }
+                try {
+                    begin = Math.min(begin, Integer.parseInt(beginEndStrings[0].trim()));
+                    end = Math.max(end, Integer.parseInt(beginEndStrings[1].trim()));
+                } catch (NumberFormatException e) {
+                    error("span not of the format 'number,number'", id);
+                    return null;
+                }
+            }
+            return new int[]{begin, end};
+        }
+
+        /**
+         * @return a new, not yet indexed mention for the Anafora entity type, or {@code null} if
+         *         the type is not extracted
+         */
+        private static Annotation createMention(JCas jCas, String type, int begin, int end) {
+            switch (type) {
+                case "Disease_Disorder":
+                    return new DiseaseDisorderMention(jCas, begin, end);
+                case "Procedure":
+                    return new ProcedureMention(jCas, begin, end);
+                case "Sign_symptom":
+                    return new SignSymptomMention(jCas, begin, end);
+                case "Metastasis":
+                    return new EventMention(jCas, begin, end);
+                case "Anatomical_site":
+                    return new AnatomicalSiteMention(jCas, begin, end);
+                default:
+                    return null;
+            }
+        }
+
+        /** Placeholder: currently does nothing with the properties of the annotation. */
+        private static void extractAttributeValues(Element propertiesElem, IdentifiedAnnotation annotation, String id) {
+
+        }
+
+        /**
+         * Logs an error unless {@code elem} has exactly one child named {@code elemName}.
+         *
+         * @return the first such child, or {@code null} if there is none
+         */
+        private static Element getSingleChild(Element elem, String elemName, String causeID) {
+            List<Element> children = elem.getChildren(elemName);
+            if (children.size() != 1) {
+                error(String.format("not exactly one '%s' child", elemName), causeID);
+            }
+            return children.isEmpty() ? null : children.get(0);
+        }
+
+        /** Like {@link #getSingleChild} but also detaches all children of that name from {@code elem}. */
+        private static Element removeSingleChild(Element elem, String elemName, String causeID) {
+            Element child = getSingleChild(elem, elemName, causeID);
+            elem.removeChildren(elemName);
+            return child;
+        }
+
+        /**
+         * Detaches the children named {@code elemName} and returns the text of the first one.
+         *
+         * @return the text, or {@code null} (after logging) if the child is missing or its text is empty
+         */
+        private static String removeSingleChildText(Element elem, String elemName, String causeID) {
+            Element child = getSingleChild(elem, elemName, causeID);
+            if (child == null) {
+                // absence was already logged by getSingleChild; there is nothing to detach
+                return null;
+            }
+            String text = child.getText();
+            if (text.isEmpty()) {
+                error(String.format("an empty '%s' child", elemName), causeID);
+                text = null;
+            }
+            elem.removeChildren(elemName);
+            return text;
+        }
+
+        /** Logs a problem found in the annotation with the given id; processing continues. */
+        private static void error(String found, String id) {
+            LOGGER.error(String.format("found %s in annotation with ID %s", found, id));
+        }
+    }
+}
Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/ModifierExtractorEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/ModifierExtractorEvaluation.java?rev=1893521&r1=1893520&r2=1893521&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/ModifierExtractorEvaluation.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/ModifierExtractorEvaluation.java Wed Sep 22 19:00:05 2021
@@ -18,15 +18,10 @@
*/
package org.apache.ctakes.relationextractor.eval;
-import java.io.File;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-
-import javax.annotation.Nullable;
-
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+import com.lexicalscope.jewel.cli.CliFactory;
import org.apache.ctakes.relationextractor.ae.ModifierExtractorAnnotator;
-import org.apache.ctakes.relationextractor.eval.SHARPXMI.EvaluationOptions;
import org.apache.ctakes.typesystem.type.textsem.Modifier;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -45,11 +40,11 @@ import org.cleartk.ml.jar.GenericJarClas
import org.cleartk.ml.jar.JarClassifierBuilder;
import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
-import com.google.common.base.Function;
-import com.google.common.collect.Lists;
-import com.lexicalscope.jewel.cli.CliFactory;
+import javax.annotation.Nullable;
+import java.io.File;
+import java.util.*;
-public class ModifierExtractorEvaluation extends SHARPXMI.Evaluation_ImplBase {
+public class ModifierExtractorEvaluation extends RelationEvaluation_ImplBase {
public static final ParameterSettings BEST_PARAMETERS = new ParameterSettings(
LibLinearStringOutcomeDataWriter.class,
@@ -57,9 +52,30 @@ public class ModifierExtractorEvaluation
public static void main(String[] args) throws Exception {
// parse the options, validate them, and generate XMI if necessary
- final EvaluationOptions options = CliFactory.parseArguments(EvaluationOptions.class, args);
- SHARPXMI.validate(options);
- SHARPXMI.generateXMI(options);
+ final RelationExtractorEvaluation.Options options = CliFactory.parseArguments(RelationExtractorEvaluation.Options.class, args);
+ CorpusXMI.validate(options);
+ if(options.getGenerateXMI()) {
+ boolean generateSharp = false, generateDeepPhe = false;
+ if (options.getTestCorpus() == CorpusXMI.Corpus.SHARP || options.getTestCorpus() == CorpusXMI.Corpus.SHARP_RELEASE) {
+ generateSharp = true;
+ } else if (options.getTestCorpus() == CorpusXMI.Corpus.DeepPhe) {
+ generateDeepPhe = true;
+ }
+ for(CorpusXMI.Corpus corpus : options.getTrainCorpus()){
+ if(corpus == CorpusXMI.Corpus.SHARP_RELEASE || corpus == CorpusXMI.Corpus.SHARP){
+ generateSharp = true;
+ }else if(corpus == CorpusXMI.Corpus.DeepPhe){
+ generateDeepPhe = true;
+ }
+ }
+
+ if(generateSharp){
+ SHARPXMI.generateXMI(options.getXMIDirectory(), options.getSharpCorpusDirectory(), options.getSharpBatchesDirectory());
+ }
+ if(generateDeepPhe){
+ DeepPheXMI.generateXMI(options.getXMIDirectory(), options.getDeepPheAnaforaDirectory());
+ }
+ }
// determine the grid of parameters to search through
// for the full set of LibLinear parameters, see:
@@ -73,17 +89,36 @@ public class ModifierExtractorEvaluation
}
}
- // run the evaluation
- SHARPXMI.evaluate(
- options,
- BEST_PARAMETERS,
- gridOfSettings,
- new Function<ParameterSettings, ModifierExtractorEvaluation>() {
- @Override
- public ModifierExtractorEvaluation apply(@Nullable ParameterSettings params) {
- return new ModifierExtractorEvaluation(new File("target/models/modifier"), params);
- }
- });
+ List<File> trainFiles = new ArrayList<>();
+ for(CorpusXMI.Corpus corpus : options.getTrainCorpus()){
+ File trainCorpusDirectory;
+ if(corpus == CorpusXMI.Corpus.SHARP) trainCorpusDirectory = options.getSharpBatchesDirectory();
+ else if(corpus == CorpusXMI.Corpus.SHARP_RELEASE) trainCorpusDirectory = options.getSharpCorpusDirectory();
+ else if(corpus == CorpusXMI.Corpus.DeepPhe) trainCorpusDirectory = options.getDeepPheAnaforaDirectory();
+ else{
+ throw new Exception("Train corpus not recognized: " + corpus);
+ }
+ trainFiles.addAll(CorpusXMI.toXMIFiles(options.getXMIDirectory(), CorpusXMI.getTrainTextFiles(corpus, options.getEvaluateOn(), trainCorpusDirectory)));
+ }
+
+ File testCorpusDirectory=null;
+ if(options.getTestCorpus() == CorpusXMI.Corpus.SHARP) testCorpusDirectory = options.getSharpBatchesDirectory();
+ else if(options.getTestCorpus() == CorpusXMI.Corpus.SHARP_RELEASE) testCorpusDirectory = options.getSharpCorpusDirectory();
+ else if(options.getTestCorpus() == CorpusXMI.Corpus.DeepPhe) testCorpusDirectory = options.getDeepPheAnaforaDirectory();
+
+ List<File> testFiles = CorpusXMI.getTestTextFiles(options.getTestCorpus(), options.getEvaluateOn(), testCorpusDirectory);
+
+ if(options.getGridSearch()){
+ Map<ParameterSettings, Double> scoredParams = new HashMap<>();
+ for(ParameterSettings params : gridOfSettings){
+ ModifierExtractorEvaluation eval = new ModifierExtractorEvaluation(new File("target/models/modifier"), params);
+ params.stats = eval.trainAndTest(trainFiles, testFiles);
+ scoredParams.put(params, params.stats.f1());
+ }
+ }else {
+ ModifierExtractorEvaluation eval = new ModifierExtractorEvaluation(new File("target/models/modifier"), BEST_PARAMETERS);
+ System.err.println(eval.trainAndTest(trainFiles, testFiles));
+ }
}
private ParameterSettings parameterSettings;
Added: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationEvaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationEvaluation_ImplBase.java?rev=1893521&view=auto
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationEvaluation_ImplBase.java (added)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationEvaluation_ImplBase.java Wed Sep 22 19:00:05 2021
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.relationextractor.eval;
+
+import com.lexicalscope.jewel.cli.Option;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
+import org.cleartk.eval.AnnotationStatistics;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Created by tmill on 1/31/17.
+ *
+ * Common base for the relation evaluations in this package: a ClearTK
+ * {@code Evaluation_ImplBase} whose items are {@link File}s and whose result is an
+ * {@code AnnotationStatistics<String>}. It contributes the collection reader and the
+ * command-line options shared by those evaluations.
+ */
+public abstract class RelationEvaluation_ImplBase extends org.cleartk.eval.Evaluation_ImplBase<File, AnnotationStatistics<String>> {
+
+  /**
+   * @param baseDirectory handed unchanged to the ClearTK base class
+   */
+  public RelationEvaluation_ImplBase( File baseDirectory ) {
+    super( baseDirectory );
+  }
+
+  /**
+   * Creates an {@link XMIReader} configured with the given files.
+   *
+   * @param items the files passed to the reader as {@code XMIReader.PARAM_FILES}
+   * @return the reader created by {@code CollectionReaderFactory}
+   * @throws Exception whatever reader or type-system creation throws
+   */
+  @Override
+  public CollectionReader getCollectionReader( List<File> items ) throws Exception {
+    return CollectionReaderFactory.createReader(
+        XMIReader.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        XMIReader.PARAM_FILES,
+        items );
+  }
+
+  /**
+   * Command-line options shared by the relation evaluations. Nested interfaces are
+   * implicitly static and their methods implicitly public, so neither modifier is
+   * spelled out.
+   */
+  public interface EvaluationOptions {
+
+    /** Which data split to evaluate on; DEV unless given on the command line. */
+    @Option(
+        longName = "evaluate-on",
+        defaultValue = "DEV",
+        description = "perform evaluation using the training (TRAIN), development (DEV) or test "
+            + "(TEST) data.")
+    CorpusXMI.EvaluateOn getEvaluateOn();
+
+    /** Flag requesting a grid search over parameter settings. */
+    @Option(
+        longName = "grid-search",
+        description = "run a grid search to select the best parameters")
+    boolean getGridSearch();
+
+    /** Directory of training XMI files; null when the option is absent. */
+    @Option(
+        longName = "train-xmi-dir",
+        defaultToNull = true,
+        description = "use these XMI files for training; they must contain the necessary preprocessing "
+            + "in system view and gold annotation in gold view")
+    File getTrainXmiDir();
+
+    /** Directory of test XMI files; the declared default value is the empty string. */
+    @Option(
+        longName = "test-xmi-dir",
+        defaultValue = "",
+        description = "evaluate on these XMI files; they must contain the necessary preprocessing "
+            + "in system view and gold annotation in gold view")
+    File getTestXmiDir();
+
+    /** SHARP batches directory; null when the option is absent. */
+    @Option(
+        longName = "batches-dir",
+        defaultToNull = true,
+        description = "directory containing ssN_batchNN directories, each of which should contain "
+            + "a Knowtator directory and a Knowtator_XML directory")
+    File getSharpBatchesDirectory();
+
+    /** SHARP corpus release directory; null when the option is absent. */
+    @Option(
+        longName = "corpus-dir",
+        defaultToNull = true,
+        description = "Path to the SHARP corpus release (version 2 would end in /v2/SHARP)")
+    File getSharpCorpusDirectory();
+
+    /** DeepPhe anafora directory; null when the option is absent. */
+    @Option(
+        longName = "deepphe-anafora-dir",
+        shortName = "d",
+        defaultToNull = true,
+        description = "Path to the anafora directory containing DeepPhe data")
+    File getDeepPheAnaforaDirectory();
+
+    /** Directory where XMI files are stored and loaded; defaults to target/xmi. */
+    @Option(
+        longName = "xmi-dir",
+        defaultValue = "target/xmi",
+        description = "directory to store and load XMI serialization of annotations")
+    File getXMIDirectory();
+
+    /** Flag requesting that gold annotations be read and serialized as XMI. */
+    @Option(
+        longName = "generate-xmi",
+        description = "read in the gold annotations and serialize them as XMI")
+    boolean getGenerateXMI();
+  }
+}
Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationExtractorEvaluation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationExtractorEvaluation.java?rev=1893521&r1=1893520&r2=1893521&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationExtractorEvaluation.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/RelationExtractorEvaluation.java Wed Sep 22 19:00:05 2021
@@ -18,17 +18,51 @@
*/
package org.apache.ctakes.relationextractor.eval;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.relationextractor.ae.CausesBringsAboutRelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.DegreeOfRelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.LocationOfRelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.ManagesTreatsRelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.ManifestationOfRelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CausesBringsAboutTextRelation;
+import org.apache.ctakes.typesystem.type.relation.DegreeOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ManagesTreatsTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ManifestationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Modifier;
import com.google.common.base.Function;
+import com.google.common.base.Functions;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
+import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.lexicalscope.jewel.cli.CliFactory;
import com.lexicalscope.jewel.cli.Option;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
-import org.apache.ctakes.relationextractor.ae.*;
-import org.apache.ctakes.typesystem.type.relation.*;
-import org.apache.ctakes.typesystem.type.textsem.*;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -57,13 +91,11 @@ import org.cleartk.ml.jar.JarClassifierB
import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
import org.cleartk.util.ViewUriUtil;
-import javax.annotation.Nullable;
-import java.io.*;
-import java.util.*;
+public class RelationExtractorEvaluation extends RelationEvaluation_ImplBase {
-public class RelationExtractorEvaluation extends SHARPXMI.Evaluation_ImplBase {
- public static interface Options extends SHARPXMI.EvaluationOptions {
+
+ public static interface Options extends RelationEvaluation_ImplBase.EvaluationOptions {
@Option(
longName = "relations",
@@ -104,6 +136,15 @@ public class RelationExtractorEvaluation
description = "expand events to their covering or covered events")
public boolean getExpandEvents();
+ @Option(
+ longName = "train-corpus",
+ description = "Corpora to use for training (space-separated if more than one)")
+ public List<CorpusXMI.Corpus> getTrainCorpus();
+
+ @Option(
+ longName = "test-corpus",
+ description = "Corpus to use for testing")
+ public CorpusXMI.Corpus getTestCorpus();
}
public static final Map<String, Class<? extends BinaryTextRelation>> RELATION_CLASSES =
@@ -158,55 +199,140 @@ public class RelationExtractorEvaluation
public static void main(String[] args) throws Exception {
// parse the options, validate them, and generate XMI if necessary
final Options options = CliFactory.parseArguments(Options.class, args);
- SHARPXMI.validate(options);
- SHARPXMI.generateXMI(options);
+ CorpusXMI.validate(options);
+ if(options.getGenerateXMI()) {
+ boolean generateSharp = false, generateDeepPhe = false;
+ if (options.getTestCorpus() == CorpusXMI.Corpus.SHARP || options.getTestCorpus() == CorpusXMI.Corpus.SHARP_RELEASE) {
+ generateSharp = true;
+ } else if (options.getTestCorpus() == CorpusXMI.Corpus.DeepPhe) {
+ generateDeepPhe = true;
+ }
+ for(CorpusXMI.Corpus corpus : options.getTrainCorpus()){
+ if(corpus == CorpusXMI.Corpus.SHARP_RELEASE || corpus == CorpusXMI.Corpus.SHARP){
+ generateSharp = true;
+ }else if(corpus == CorpusXMI.Corpus.DeepPhe){
+ generateDeepPhe = true;
+ }
+ }
+
+ if(generateSharp){
+ SHARPXMI.generateXMI(options.getXMIDirectory(), options.getSharpCorpusDirectory(), options.getSharpBatchesDirectory());
+ }
+ if(generateDeepPhe){
+ DeepPheXMI.generateXMI(options.getXMIDirectory(), options.getDeepPheAnaforaDirectory());
+ }
+ }
+
// determine the grid of parameters to search through
// for the full set of LibLinear parameters, see:
// https://github.com/bwaldvogel/liblinear-java/blob/master/src/main/java/de/bwaldvogel/liblinear/Train.java
- List<ParameterSettings> gridOfSettings = Lists.newArrayList();
- for (float probabilityOfKeepingANegativeExample : new float[] { 1.0f }) {//0.5f,
- for (int solver : new int[] { 0 /* logistic regression */, 1 /* SVM */}) {
- for (double svmCost : new double[] { 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100 }) {
- gridOfSettings.add(new ParameterSettings(
- LibLinearStringOutcomeDataWriter.class,
- new Object[] {
- RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
- probabilityOfKeepingANegativeExample },
- new String[] { "-s", String.valueOf(solver), "-c", String.valueOf(svmCost) }));
+ List<ParameterSettings> gridOfSettings = null;
+ if(options.getGridSearch()) {
+ gridOfSettings = new ArrayList<>();
+ for (float probabilityOfKeepingANegativeExample : new float[]{1.0f}) {//0.5f,
+ for (int solver : new int[]{0 /* logistic regression */, 1 /* SVM */}) {
+ for (double svmCost : new double[]{0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100}) {
+ gridOfSettings.add(new ParameterSettings(
+ LibLinearStringOutcomeDataWriter.class,
+ new Object[]{
+ RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ probabilityOfKeepingANegativeExample},
+ new String[]{"-s", String.valueOf(solver), "-c", String.valueOf(svmCost)}));
+ }
}
}
}
// run an evaluation for each selected relation
for (final String relationCategory : options.getRelations()) {
-
// get the best parameters for the relation
final Class<? extends BinaryTextRelation> relationClass =
RELATION_CLASSES.get(relationCategory);
- ParameterSettings bestSettings = BEST_PARAMETERS.get(relationClass);
- // run the evaluation
- SHARPXMI.evaluate(
- options,
- bestSettings,
- gridOfSettings,
- new Function<ParameterSettings, RelationExtractorEvaluation>() {
- @Override
- public RelationExtractorEvaluation apply(@Nullable ParameterSettings params) {
- return new RelationExtractorEvaluation(
- new File("target/models/" + relationCategory),
- relationClass,
- ANNOTATOR_CLASSES.get(relationClass),
- params,
- options.getTestOnCTakes(),
- options.getAllowSmallerSystemArguments(),
- options.getIgnoreImpossibleGoldRelations(),
- options.getPrintErrors(),
- options.getClassWeights(),
- options.getExpandEvents());
- }
- });
+ List<File> trainFiles = new ArrayList<>();
+ for(CorpusXMI.Corpus corpus : options.getTrainCorpus()){
+ File trainCorpusDirectory;
+ if(corpus == CorpusXMI.Corpus.SHARP) trainCorpusDirectory = options.getSharpBatchesDirectory();
+ else if(corpus == CorpusXMI.Corpus.SHARP_RELEASE) trainCorpusDirectory = options.getSharpCorpusDirectory();
+ else if(corpus == CorpusXMI.Corpus.DeepPhe) trainCorpusDirectory = options.getDeepPheAnaforaDirectory();
+ else{
+ throw new Exception("Train corpus not recognized: " + corpus);
+ }
+ trainFiles.addAll(CorpusXMI.toXMIFiles(options.getXMIDirectory(), CorpusXMI.getTrainTextFiles(corpus, options.getEvaluateOn(), trainCorpusDirectory)));
+ }
+
+ File testCorpusDirectory=null;
+
+ if(options.getTestCorpus() == CorpusXMI.Corpus.SHARP) testCorpusDirectory = options.getSharpBatchesDirectory();
+ else if(options.getTestCorpus() == CorpusXMI.Corpus.SHARP_RELEASE) testCorpusDirectory = options.getSharpCorpusDirectory();
+ else if(options.getTestCorpus() == CorpusXMI.Corpus.DeepPhe) testCorpusDirectory = options.getDeepPheAnaforaDirectory();
+
+ List<File> testFiles = CorpusXMI.toXMIFiles(options.getXMIDirectory(), CorpusXMI.getTestTextFiles(options.getTestCorpus(), options.getEvaluateOn(), testCorpusDirectory));
+
+ if(gridOfSettings != null){
+ // grid search:
+ Map<ParameterSettings, Double> scoredParams = new HashMap<>();
+ for(ParameterSettings params : gridOfSettings) {
+ RelationExtractorEvaluation eval = new RelationExtractorEvaluation(
+ new File("target/models/" + relationCategory),
+ relationClass,
+ ANNOTATOR_CLASSES.get(relationClass),
+ params,
+ options.getTestOnCTakes(),
+ options.getAllowSmallerSystemArguments(),
+ options.getIgnoreImpossibleGoldRelations(),
+ options.getPrintErrors(),
+ options.getClassWeights(),
+ options.getExpandEvents());
+ params.stats = eval.trainAndTest(trainFiles, testFiles);
+ scoredParams.put(params, params.stats.f1());
+ }
+ // print parameters sorted by F1
+ List<ParameterSettings> list = new ArrayList<>( scoredParams.keySet() );
+ Function<ParameterSettings, Double> getCount = Functions.forMap( scoredParams );
+ Collections.sort( list, Ordering.natural().onResultOf( getCount ) );
+
+ // print performance of each set of parameters
+ if ( list.size() > 1 ) {
+ System.err.println( "Summary" );
+ for ( ParameterSettings params : list ) {
+ System.err.printf(
+ "F1=%.3f P=%.3f R=%.3f %s\n",
+ params.stats.f1(),
+ params.stats.precision(),
+ params.stats.recall(),
+ params );
+ }
+ System.err.println();
+ }
+ // print best settings:
+ if ( !list.isEmpty() ) {
+ ParameterSettings lastParams = list.get( list.size() - 1 );
+ System.err.println( "Best model:" );
+ System.err.print( lastParams.stats );
+ System.err.println( lastParams );
+ System.err.println( lastParams.stats.confusions() );
+ System.err.println();
+ }
+ }else {
+ ParameterSettings bestSettings = BEST_PARAMETERS.get(relationClass);
+ RelationExtractorEvaluation eval = new RelationExtractorEvaluation(new File("target/models/" + relationCategory),
+ relationClass,
+ ANNOTATOR_CLASSES.get(relationClass),
+ bestSettings,
+ options.getTestOnCTakes(),
+ options.getAllowSmallerSystemArguments(),
+ options.getIgnoreImpossibleGoldRelations(),
+ options.getPrintErrors(),
+ options.getClassWeights(),
+ options.getExpandEvents());
+ bestSettings.stats = eval.trainAndTest(trainFiles, testFiles);
+ System.err.println( bestSettings.stats);
+ System.err.println(bestSettings);
+ System.err.println(bestSettings.stats.confusions());
+ System.err.println();
+ }
}
}
@@ -250,7 +376,7 @@ public class RelationExtractorEvaluation
* @param ignoreImpossibleGoldRelations
* During testing, ignore gold relations that would be impossible to
* find because there are no corresponding system mentions
- // * @param expandEvent
+ * @param expandEventParameter
*/
public RelationExtractorEvaluation(
File baseDirectory,
Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java?rev=1893521&r1=1893520&r2=1893521&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java Wed Sep 22 19:00:05 2021
@@ -22,10 +22,11 @@ import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
+import com.lexicalscope.jewel.cli.CliFactory;
import com.lexicalscope.jewel.cli.Option;
import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
-import org.apache.ctakes.core.util.doc.DocIdUtil;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -55,7 +56,9 @@ import java.io.FileOutputStream;
import java.util.*;
import java.util.regex.Pattern;
-public class SHARPXMI {
+public class SHARPXMI extends CorpusXMI {
+
+ private static String BATCH_TEXT_SUBDIR = "Knowtator/text";
public static List<File> getTrainTextFiles( File batchesDirectory ) {
// seed_set1: batches 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 18, 19
@@ -66,7 +69,8 @@ public class SHARPXMI {
batchesDirectory,
Pattern.compile( "^(ss[1234]_batch0[2-9]|ss[1234]_batch1[56]"
+ "|ss[1234]_batch1[89]|ss[123]_batch01"
- + "|ss[12]_batch1[34]|ss[34]_batch1[12])$" ) );
+ + "|ss[12]_batch1[34]|ss[34]_batch1[12])$" ),
+ BATCH_TEXT_SUBDIR);
}
public static List<File> getDevTextFiles( File batchesDirectory ) {
@@ -74,7 +78,7 @@ public class SHARPXMI {
// seed_set2: batches 10, 17
// seed_set3: batches 10, 17
// seed_set4: batches 10, 17
- return getTextFilesFor( batchesDirectory, Pattern.compile( "^(ss[1234]_batch1[07])$" ) );
+ return getTextFilesFor( batchesDirectory, Pattern.compile( "^(ss[1234]_batch1[07])$" ), BATCH_TEXT_SUBDIR );
}
public static List<File> getTestTextFiles( File batchesDirectory ) {
@@ -84,19 +88,20 @@ public class SHARPXMI {
// seed_set4: batches 13, 14
return getTextFilesFor(
batchesDirectory,
- Pattern.compile( "^(ss[12]_batch1[12]|ss[34]_batch1[34])$" ) );
+ Pattern.compile( "^(ss[12]_batch1[12]|ss[34]_batch1[34])$" ),
+ BATCH_TEXT_SUBDIR);
}
public static List<File> getAllTextFiles( File batchesDirectory ) {
- return getTextFilesFor( batchesDirectory, Pattern.compile( "" ) );
+ // the empty pattern is found in every directory name, so no batch is filtered out
+ final Pattern anyBatchName = Pattern.compile( "" );
+ return getTextFilesFor( batchesDirectory, anyBatchName, BATCH_TEXT_SUBDIR );
}
- private static List<File> getTextFilesFor( File batchesDirectory, Pattern pattern ) {
+ private static List<File> getTextFilesFor( File batchesDirectory, Pattern pattern, String textSubdir ) {
List<File> files = Lists.newArrayList();
for ( File batchDir : batchesDirectory.listFiles() ) {
if ( batchDir.isDirectory() && !batchDir.isHidden() ) {
if ( pattern.matcher( batchDir.getName() ).find() ) {
- File textDirectory = new File( batchDir, "Knowtator/text" );
+ File textDirectory = new File( batchDir, textSubdir );
for ( File textFile : textDirectory.listFiles() ) {
if ( textFile.isFile() && !textFile.isHidden() ) {
files.add( textFile );
@@ -108,151 +113,81 @@ public class SHARPXMI {
return files;
}
- public static List<File> toXMIFiles( Options options, List<File> textFiles ) {
- List<File> xmiFiles = Lists.newArrayList();
- for ( File textFile : textFiles ) {
- xmiFiles.add( toXMIFile( options, textFile ) );
- }
- return xmiFiles;
+ /**
+  * Collects the training text files of a SHARP corpus release: the files under the
+  * "text" subdirectory of every batch directory in SeedSet1/by-batch/umls whose name
+  * starts with 02-09, 13-16, 18 or 19.
+  *
+  * @param corpusDirectory root of the SHARP corpus release
+  * @return the matching text files
+  */
+ public static List<File> getTrainTextFilesFromCorpus(File corpusDirectory) {
+   // The alternatives are grouped so that '^' anchors both of them (ungrouped, the
+   // second branch could match anywhere in the name), and the character class lists
+   // digits only: inside [...] a comma is a literal comma, not a separator.
+   return getTextFilesFor(new File(corpusDirectory, "SeedSet1/by-batch/umls"), Pattern.compile("^(?:0[2-9]|1[3-689])"), "text");
}
- private static File toXMIFile( Options options, File textFile ) {
- return new File( options.getXMIDirectory(), textFile.getName() + ".xmi" );
+ /**
+  * Collects the development text files of a SHARP corpus release: the files under the
+  * "text" subdirectory of every batch directory in SeedSet1/by-batch/umls whose name
+  * starts with 10 or 17.
+  *
+  * @param corpusDirectory root of the SHARP corpus release
+  * @return the matching text files
+  */
+ public static List<File> getDevTextFilesFromCorpus(File corpusDirectory) {
+   // [07] rather than [0,7]: inside a character class the comma is a literal comma,
+   // so the old pattern also accepted a name starting with "1,".
+   return getTextFilesFor(new File(corpusDirectory, "SeedSet1/by-batch/umls"), Pattern.compile("^1[07]"), "text");
}
- public static interface Options {
- @Option(
- longName = "batches-dir",
- description = "directory containing ssN_batchNN directories, each of which should contain "
- + "a Knowtator directory and a Knowtator_XML directory")
- public File getBatchesDirectory();
-
- @Option(
- longName = "xmi-dir",
- defaultValue = "target/xmi",
- description = "directory to store and load XMI serialization of annotations")
- public File getXMIDirectory();
-
- @Option(
- longName = "generate-xmi",
- description = "read in the gold annotations and serialize them as XMI")
- public boolean getGenerateXMI();
+ /**
+  * Collects the test text files of a SHARP corpus release: the files under the "text"
+  * subdirectory of every batch directory in SeedSet1/by-batch/umls whose name starts
+  * with 11 or 12.
+  *
+  * @param corpusDirectory root of the SHARP corpus release
+  * @return the matching text files
+  */
+ public static List<File> getTestTextFilesFromCorpus(File corpusDirectory) {
+   final File batchRoot = new File(corpusDirectory, "SeedSet1/by-batch/umls");
+   final Pattern testBatchNames = Pattern.compile("^1[1-2]");
+   return getTextFilesFor(batchRoot, testBatchNames, "text");
}
- public static final String GOLD_VIEW_NAME = "GoldView";
-
- public static void generateXMI( Options options ) throws Exception {
+ /**
+  * Serializes the SHARP train, dev and test documents as XMI files.
+  *
+  * The text files are taken from the corpus release directory when it is non-null,
+  * otherwise from the batches directory. Each document is run through the preprocessor
+  * descriptor and the Knowtator XML reader (gold annotations go to the gold view), and
+  * one XMI file per document is written into the XMI directory.
+  *
+  * @param xmiDirectory output directory; created if it does not exist
+  * @param corpusDirectory SHARP corpus release directory, or null
+  * @param batchesDirectory SHARP batches directory, used only when corpusDirectory is null
+  * @throws RuntimeException if both corpusDirectory and batchesDirectory are null
+  * @throws IllegalArgumentException if a document has no document ID
+  * @throws Exception on pipeline construction, processing or I/O failures
+  */
+ public static void generateXMI( File xmiDirectory, File corpusDirectory, File batchesDirectory ) throws Exception {
// if necessary, write the XMIs first
- if ( options.getGenerateXMI() ) {
- if ( !options.getXMIDirectory().exists() ) {
- options.getXMIDirectory().mkdirs();
- }
-
- // create a collection reader that loads URIs for all Knowtator text files
- List<File> files = Lists.newArrayList();
- files.addAll( getTrainTextFiles( options.getBatchesDirectory() ) );
- files.addAll( getDevTextFiles( options.getBatchesDirectory() ) );
- files.addAll( getTestTextFiles( options.getBatchesDirectory() ) );
- CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles( files );
-
- // load the text from the URI, run the preprocessor, then run the
- // Knowtator XML reader
- AggregateBuilder builder = new AggregateBuilder();
- builder.add( UriToDocumentTextAnnotator.getDescription() );
- File preprocessDescFile = new File( "desc/analysis_engine/RelationExtractorPreprocessor.xml" );
- XMLParser parser = UIMAFramework.getXMLParser();
- XMLInputSource source = new XMLInputSource( preprocessDescFile );
- builder.add( parser.parseAnalysisEngineDescription( source ) );
- builder.add( AnalysisEngineFactory.createEngineDescription(
- ViewCreatorAnnotator.class,
- ViewCreatorAnnotator.PARAM_VIEW_NAME,
- GOLD_VIEW_NAME ) );
- builder.add( AnalysisEngineFactory.createEngineDescription( CopyDocumentTextToGoldView.class ) );
- builder.add(
- AnalysisEngineFactory.createEngineDescription( DocumentIDAnnotator.class ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
- builder.add(
- AnalysisEngineFactory.createEngineDescription( SHARPKnowtatorXMLReader.class,
- SHARPKnowtatorXMLReader.PARAM_SET_DEFAULTS,
- true ),
- CAS.NAME_DEFAULT_SOFA,
- GOLD_VIEW_NAME );
-
- // write out an XMI for each file
- for ( Iterator<JCas> casIter = new JCasIterator( reader, builder.createAggregate() ); casIter.hasNext(); ) {
- JCas jCas = casIter.next();
- JCas goldView = jCas.getView( GOLD_VIEW_NAME );
- String documentID = DocIdUtil.getDocumentID( goldView );
- if ( documentID == null ){//|| documentID.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
- throw new IllegalArgumentException( "No documentID for CAS:\n" + jCas );
- }
- File outFile = toXMIFile( options, new File( documentID ) );
- FileOutputStream stream = new FileOutputStream( outFile );
- ContentHandler handler = new XMLSerializer( stream ).getContentHandler();
- new XmiCasSerializer( jCas.getTypeSystem() ).serialize( jCas.getCas(), handler );
- stream.close();
- }
+ if ( !xmiDirectory.exists() ) {
+   xmiDirectory.mkdirs();
}
- }
- public enum EvaluateOn {
- TRAIN, DEV, TEST, OTHER
- }
-
- public static interface EvaluationOptions extends Options {
- @Option(
- longName = "evaluate-on",
- defaultValue = "DEV",
- description = "perform evaluation using the training (TRAIN), development (DEV) or test "
- + "(TEST) data.")
- public EvaluateOn getEvaluteOn();
-
- @Option(
- longName = "grid-search",
- description = "run a grid search to select the best parameters")
- public boolean getGridSearch();
-
- @Option(
- defaultToNull=true,
- longName = "train-xmi-dir",
- description = "use these XMI files for training; they must contain the necessary preprocessing "
- + "in system view and gold annotation in gold view")
- public File getTrainXmiDir();
-
- @Option(
- longName = "test-xmi-dir",
- defaultValue = "",
- description = "evaluate on these XMI files; they must contain the necessary preprocessing "
- + "in system view and gold annotation in gold view")
- public File getTestXmiDir();
- }
-
- public static abstract class Evaluation_ImplBase
- extends org.cleartk.eval.Evaluation_ImplBase<File, AnnotationStatistics<String>> {
-
- public Evaluation_ImplBase( File baseDirectory ) {
- super( baseDirectory );
- }
-
- @Override
- public CollectionReader getCollectionReader( List<File> items ) throws Exception {
- return CollectionReaderFactory.createReader(
- XMIReader.class,
- TypeSystemDescriptionFactory.createTypeSystemDescription(),
- XMIReader.PARAM_FILES,
- items );
- }
- }
-
- public static void validate( EvaluationOptions options ) throws Exception {
- // error on invalid option combinations
- if ( options.getEvaluteOn().equals( EvaluateOn.TEST ) && options.getGridSearch() ) {
- throw new IllegalArgumentException( "grid search can only be run on the train or dev sets" );
+ // create a collection reader that loads URIs for all Knowtator text files;
+ // the corpus release directory takes precedence over the batches directory
+ List<File> files = new ArrayList<>();
+ if(corpusDirectory != null){
+   files.addAll(getTrainTextFilesFromCorpus(corpusDirectory));
+   files.addAll(getDevTextFilesFromCorpus(corpusDirectory));
+   files.addAll(getTestTextFilesFromCorpus(corpusDirectory));
+ }else if(batchesDirectory != null) {
+   files.addAll(getTrainTextFiles(batchesDirectory));
+   files.addAll(getDevTextFiles(batchesDirectory));
+   files.addAll(getTestTextFiles(batchesDirectory));
+ }else{
+   throw new RuntimeException("Either the corpus-dir or batches-dir option must be set.");
+ }
+
+ CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+ // load the text from the URI, run the preprocessor, then run the
+ // Knowtator XML reader
+ AggregateBuilder builder = new AggregateBuilder();
+ builder.add( UriToDocumentTextAnnotator.getDescription() );
+ File preprocessDescFile = new File( "desc/analysis_engine/RelationExtractorPreprocessor.xml" );
+ XMLParser parser = UIMAFramework.getXMLParser();
+ XMLInputSource source = new XMLInputSource( preprocessDescFile );
+ builder.add( parser.parseAnalysisEngineDescription( source ) );
+ builder.add( AnalysisEngineFactory.createEngineDescription(
+     ViewCreatorAnnotator.class,
+     ViewCreatorAnnotator.PARAM_VIEW_NAME,
+     GOLD_VIEW_NAME ) );
+ builder.add( AnalysisEngineFactory.createEngineDescription( CopyDocumentTextToGoldView.class ) );
+ builder.add(
+     AnalysisEngineFactory.createEngineDescription( DocumentIDAnnotator.class ),
+     CAS.NAME_DEFAULT_SOFA,
+     GOLD_VIEW_NAME );
+ builder.add(
+     AnalysisEngineFactory.createEngineDescription( SHARPKnowtatorXMLReader.class,
+         SHARPKnowtatorXMLReader.PARAM_SET_DEFAULTS,
+         true ),
+     CAS.NAME_DEFAULT_SOFA,
+     GOLD_VIEW_NAME );
+
+ // write out an XMI for each file
+ for ( Iterator<JCas> casIter = new JCasIterator( reader, builder.createAggregate() ); casIter.hasNext(); ) {
+   JCas jCas = casIter.next();
+   JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+   String documentID = DocumentIDAnnotationUtil.getDocumentID(goldView);
+   if (documentID == null) {//|| documentID.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+     throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
+   }
+   File outFile = toXMIFile(xmiDirectory, new File(documentID));
+   // try-with-resources closes the stream even when serialization throws;
+   // a plain close() after the serialize call would be skipped in that case
+   try (FileOutputStream stream = new FileOutputStream(outFile)) {
+     ContentHandler handler = new XMLSerializer(stream).getContentHandler();
+     new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
+   }
}
}
- public static <T extends Evaluation_ImplBase> void evaluate(
+ /*
+ public static <T extends RelationEvaluation_ImplBase> void evaluate(
EvaluationOptions options,
ParameterSettings bestSettings,
List<ParameterSettings> gridOfSettings,
@@ -268,30 +203,50 @@ public class SHARPXMI {
// run an evaluation for each set of parameters
Map<ParameterSettings, Double> scoredParams = new HashMap<>();
for ( ParameterSettings params : possibleParams ) {
- Evaluation_ImplBase evaluation = getEvaluation.apply( params );
+ RelationEvaluation_ImplBase evaluation = getEvaluation.apply( params );
List<File> trainFiles, devFiles, testFiles;
switch ( options.getEvaluteOn() ) {
case TRAIN:
// run n-fold cross-validation on the training set
- trainFiles = getTrainTextFiles( options.getBatchesDirectory() );
+ if(options.getCorpusDirectory() != null){
+ trainFiles = getTrainTextFilesFromCorpus(options.getCorpusDirectory());
+ }else if(options.getBatchesDirectory() != null) {
+ trainFiles = getTrainTextFiles(options.getBatchesDirectory());
+ }else{
+ throw new RuntimeException("Either corpus-dir or batch-dir must have an argument.");
+ }
trainFiles = toXMIFiles( options, trainFiles );
List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation( trainFiles, 2 );
params.stats = AnnotationStatistics.addAll( foldStats );
break;
case DEV:
// train on the training set and evaluate on the dev set
- trainFiles = getTrainTextFiles( options.getBatchesDirectory() );
+ if(options.getCorpusDirectory() != null){
+ trainFiles = getTrainTextFilesFromCorpus(options.getCorpusDirectory());
+ devFiles = getDevTextFilesFromCorpus(options.getCorpusDirectory());
+ }else if(options.getBatchesDirectory() != null) {
+ trainFiles = getTrainTextFiles(options.getBatchesDirectory());
+ devFiles = getDevTextFiles( options.getBatchesDirectory() );
+ }else{
+ throw new RuntimeException("Either corpus-dir or batch-dir must have an argument.");
+ }
trainFiles = toXMIFiles( options, trainFiles );
- devFiles = getDevTextFiles( options.getBatchesDirectory() );
devFiles = toXMIFiles( options, devFiles );
params.stats = evaluation.trainAndTest( trainFiles, devFiles );
break;
case TEST:
// train on the training set + dev set and evaluate on the test set
List<File> allTrainFiles = new ArrayList<>();
- allTrainFiles.addAll( getTrainTextFiles( options.getBatchesDirectory() ) );
- allTrainFiles.addAll( getDevTextFiles( options.getBatchesDirectory() ) );
+ if(options.getCorpusDirectory() != null){
+ allTrainFiles.addAll( getTrainTextFilesFromCorpus(options.getCorpusDirectory()));
+ allTrainFiles.addAll( getDevTextFilesFromCorpus(options.getCorpusDirectory()));
+ }else if(options.getBatchesDirectory() != null) {
+ allTrainFiles.addAll(getTrainTextFiles(options.getBatchesDirectory()));
+ allTrainFiles.addAll(getDevTextFiles(options.getBatchesDirectory()));
+ }else{
+ throw new RuntimeException("Either corpus-dir or batch-dir must have an argument.");
+ }
allTrainFiles = toXMIFiles( options, allTrainFiles );
testFiles = getTestTextFiles( options.getBatchesDirectory() );
testFiles = toXMIFiles( options, testFiles );
@@ -356,6 +311,7 @@ public class SHARPXMI {
System.err.println();
}
}
+*/
public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {
@@ -369,9 +325,9 @@ public class SHARPXMI {
}
@PipeBitInfo(
- name = "Text to Gold Copier",
- description = "Copies Text from the System view to the Gold view.",
- role = PipeBitInfo.Role.SPECIAL
+ name = "Text to Gold Copier",
+ description = "Copies Text from the System view to the Gold view.",
+ role = PipeBitInfo.Role.SPECIAL
)
public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
@Override
Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/metastasis/MetastasisXmiGenerationPipeline.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/metastasis/MetastasisXmiGenerationPipeline.java?rev=1893521&r1=1893520&r2=1893521&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/metastasis/MetastasisXmiGenerationPipeline.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/metastasis/MetastasisXmiGenerationPipeline.java Wed Sep 22 19:00:05 2021
@@ -1,8 +1,8 @@
package org.apache.ctakes.relationextractor.metastasis;
import com.google.common.io.CharStreams;
-import org.apache.ctakes.relationextractor.eval.SHARPXMI.CopyDocumentTextToGoldView;
-import org.apache.ctakes.relationextractor.eval.SHARPXMI.DocumentIDAnnotator;
+import org.apache.ctakes.relationextractor.eval.CorpusXMI.CopyDocumentTextToGoldView;
+import org.apache.ctakes.relationextractor.eval.CorpusXMI.DocumentIDAnnotator;
import org.apache.uima.UIMAFramework;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngine;
Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/RelationExtractorTrain.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/RelationExtractorTrain.java?rev=1893521&r1=1893520&r2=1893521&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/RelationExtractorTrain.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/RelationExtractorTrain.java Wed Sep 22 19:00:05 2021
@@ -94,8 +94,8 @@ public class RelationExtractorTrain {
+ preprocessDescFile.getCanonicalPath());
}
- List<File> trainFiles = SHARPXMI.getAllTextFiles(options.getBatchesDirectory());
- trainFiles = SHARPXMI.toXMIFiles(options, trainFiles);
+ List<File> trainFiles = SHARPXMI.getAllTextFiles(options.getSharpBatchesDirectory());
+ trainFiles = SHARPXMI.toXMIFiles(options.getXMIDirectory(), trainFiles);
// Initialize model directories
String modelPathPrefix = "org/apache/ctakes/relationextractor/models/";