You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ma...@apache.org on 2013/01/25 19:48:41 UTC

svn commit: r1438643 - /incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt

Author: mattcoarr
Date: Fri Jan 25 18:48:41 2013
New Revision: 1438643

URL: http://svn.apache.org/viewvc?rev=1438643&view=rev
Log:
This is currently unused, but I didn't want to lose track of it.  (This is a class to train all the models at the same time.)

Currently this work is just done as part of the evaluation system.

Added:
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt

Added: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt?rev=1438643&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt (added)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAllAssertionModels.txt Fri Jan 25 18:48:41 2013
@@ -0,0 +1,331 @@
+package org.apache.ctakes.assertion.medfacts.cleartk;
+
+import java.util.Locale;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.cleartk.classifier.CleartkAnnotatorDescriptionFactory;
+import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
+import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.cleartk.classifier.opennlp.MaxentDataWriter;
+import org.cleartk.classifier.opennlp.MaxentStringOutcomeDataWriter;
+import org.cleartk.util.cr.FilesCollectionReader;
+import org.cleartk.util.cr.XReader;
+import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.testing.util.HideOutput;
+//import org.junit.Test;
+import org.apache.ctakes.assertion.medfacts.AssertionAnalysisEngine;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+//import edu.mayo.bmi.uima.core.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.examples.pos.ExamplePOSPlainTextWriter;
+
+
+public class TrainAllAssertionModels {
+	
+	/** Name of the parameter for the directory that receives decoding output. */
+	public static final String PARAM_NAME_DECODING_OUTPUT_DIRECTORY = "decoding-output-directory";
+	/** Name of the parameter for the directory that holds decoding input. */
+	public static final String PARAM_NAME_DECODING_INPUT_DIRECTORY = "decoding-input-directory";
+	/** Name of the parameter for the directory that holds training input. */
+	public static final String PARAM_NAME_TRAINING_INPUT_DIRECTORY = "training-input-directory";
+	/** Name of the parameter for the model directory. */
+	public static final String PARAM_NAME_MODEL_DIRECTORY = "model-directory";
+	protected static final Logger logger = Logger.getLogger(TrainAllAssertionModels.class.getName());
+
+
+	/**
+	 * Command-line entry point. Parses the four required directory options, builds
+	 * one data-writer description per assertion attribute and passes them to
+	 * {@link #testClassifierPipeline}. On invalid arguments the usage text is printed
+	 * and the method returns; pipeline exceptions are logged rather than rethrown.
+	 *
+	 * @param args command-line arguments holding the four directory options
+	 */
+	public static void main(String[] args) {
+	    Options options = new Options();
+
+	    Option modelDirectoryOption =
+	        OptionBuilder
+	          .withLongOpt(PARAM_NAME_MODEL_DIRECTORY)
+	          .withArgName("DIR")
+	          .hasArg()
+	          .isRequired()
+	          .withDescription("the directory where the model is written to for training, or read from for decoding")
+	          .create();
+	    options.addOption(modelDirectoryOption);
+
+	    Option trainingInputDirectoryOption =
+	        OptionBuilder
+	          .withLongOpt(PARAM_NAME_TRAINING_INPUT_DIRECTORY)
+	          .withArgName("DIR")
+	          .hasArg()
+	          .isRequired()
+	          .withDescription("directory where input training xmi files are located")
+	          .create();
+	    options.addOption(trainingInputDirectoryOption);
+
+	    Option decodingInputDirectoryOption =
+	        OptionBuilder
+	          .withLongOpt(PARAM_NAME_DECODING_INPUT_DIRECTORY)
+	          .withArgName("DIR")
+	          .hasArg()
+	          .isRequired()
+	          .withDescription("directory where input xmi files are located for decoding")
+	          .create();
+	    options.addOption(decodingInputDirectoryOption);
+
+	    Option decodingOutputDirectoryOption =
+	        OptionBuilder
+	          .withLongOpt(PARAM_NAME_DECODING_OUTPUT_DIRECTORY)
+	          .withArgName("DIR")
+	          .hasArg()
+	          .isRequired()
+	          .withDescription("directory where output xmi files that are generated in decoding are placed")
+	          .create();
+	    options.addOption(decodingOutputDirectoryOption);
+
+	    CommandLineParser parser = new GnuParser();
+	    boolean invalidInput = false;
+	    CommandLine commandLine = null;
+	    String modelDirectory = null;
+	    String trainingInputDirectory = null;
+	    String decodingInputDirectory = null;
+	    String decodingOutputDirectory = null;
+	    try
+	    {
+	      commandLine = parser.parse(options, args);
+	      modelDirectory = commandLine.getOptionValue(PARAM_NAME_MODEL_DIRECTORY);
+	      trainingInputDirectory = commandLine.getOptionValue(PARAM_NAME_TRAINING_INPUT_DIRECTORY);
+	      decodingInputDirectory = commandLine.getOptionValue(PARAM_NAME_DECODING_INPUT_DIRECTORY);
+	      decodingOutputDirectory = commandLine.getOptionValue(PARAM_NAME_DECODING_OUTPUT_DIRECTORY);
+	    } catch (ParseException e)
+	    {
+	      invalidInput = true;
+	      logger.error("unable to parse command-line arguments", e);
+	    }
+	    // After a failed parse the values are still null, so this check fires too; it also rejects empty values.
+	    if (modelDirectory == null || modelDirectory.isEmpty() ||
+	        trainingInputDirectory == null || trainingInputDirectory.isEmpty() ||
+	        decodingInputDirectory == null || decodingInputDirectory.isEmpty() ||
+	        decodingOutputDirectory == null || decodingOutputDirectory.isEmpty()
+	        )
+	    {
+	      logger.error("required parameters not supplied");
+	      invalidInput = true;
+	    }
+
+	    if (invalidInput)
+	    {
+	      HelpFormatter formatter = new HelpFormatter();
+	      formatter.printHelp(TrainAllAssertionModels.class.getName(), options, true);
+	      return;
+	    }
+
+	    logger.info(String.format(
+	        "%n" +
+	        "model dir:           \"%s\"%n" +
+	        "training input dir:  \"%s\"%n" +
+	        "decoding input dir:  \"%s\"%n" +
+	        "decoding output dir: \"%s\"%n",
+	        modelDirectory,
+	        trainingInputDirectory,
+	        decodingInputDirectory,
+	        decodingOutputDirectory));
+
+	    String polarityModelOutputDirectory = modelDirectory + "/maxent-polarity";
+	    String uncertaintyModelOutputDirectory = modelDirectory + "/maxent-uncertainty";
+	    String conditionalModelOutputDirectory = modelDirectory + "/maxent-conditional";
+	    String subjectModelOutputDirectory = modelDirectory + "/maxent-subject";
+	    // Each attribute gets its own data-writer description pointed at its own model sub-directory.
+	    try
+	    {
+	      AnalysisEngineDescription uncertaintyDataWriter = AnalysisEngineFactory.createPrimitiveDescription(
+	          UncertaintyCleartkAnalysisEngine.class,
+	          AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION,
+	          DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+	          MaxentStringOutcomeDataWriter.class.getName(),
+	          DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+	          uncertaintyModelOutputDirectory);
+	      AnalysisEngineDescription polarityDataWriter = AnalysisEngineFactory.createPrimitiveDescription(
+	          PolarityCleartkAnalysisEngine.class,
+	          AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION,
+	          DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+	          MaxentStringOutcomeDataWriter.class.getName(),
+	          DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+	          polarityModelOutputDirectory);
+
+	      AnalysisEngineDescription conditionalDataWriter = AnalysisEngineFactory.createPrimitiveDescription(
+	          ConditionalCleartkAnalysisEngine.class,
+	          AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION,
+	          DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+	          MaxentStringOutcomeDataWriter.class.getName(),
+	          DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+	          conditionalModelOutputDirectory);
+
+	      AnalysisEngineDescription subjectDataWriter = AnalysisEngineFactory.createPrimitiveDescription(
+	          SubjectCleartkAnalysisEngine.class,
+	          AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION,
+	          DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+	          MaxentStringOutcomeDataWriter.class.getName(),
+	          DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+	          subjectModelOutputDirectory);
+
+	      testClassifierPipeline(
+	          polarityDataWriter,
+	          polarityModelOutputDirectory,
+	          uncertaintyDataWriter,
+	          uncertaintyModelOutputDirectory,
+	          conditionalDataWriter,
+	          conditionalModelOutputDirectory,
+	          subjectDataWriter,
+	          subjectModelOutputDirectory,
+	          trainingInputDirectory,
+	          decodingInputDirectory,
+	          decodingOutputDirectory
+	      );
+	    } catch (Exception e)
+	    {
+	      logger.error("Some exception happened while training or decoding...", e);
+	    }
+	 }
+	
+	/**
+	 * Generates training features for the polarity, uncertainty, conditional and
+	 * subject attributes, trains one model per attribute, then decodes the
+	 * evaluation documents with all four trained models and writes them out as xmi.
+	 *
+	 * @param polarityDataWriter engine description run over the training data for polarity
+	 * @param polarityModelOutputDirectory directory handed to the trainer; model.jar is read from it
+	 * @param uncertaintyDataWriter same role as polarityDataWriter, for uncertainty
+	 * @param uncertaintyModelOutputDirectory same role as polarityModelOutputDirectory, for uncertainty
+	 * @param conditionalDataWriter same role as polarityDataWriter, for conditional
+	 * @param conditionalModelOutputDirectory same role as polarityModelOutputDirectory, for conditional
+	 * @param subjectDataWriter same role as polarityDataWriter, for subject
+	 * @param subjectModelOutputDirectory same role as polarityModelOutputDirectory, for subject
+	 * @param trainingDataInputDirectory root passed to the reader used for feature generation
+	 * @param decodingInputDirectory root passed to the reader used for decoding
+	 * @param decodingOutputDirectory directory the XWriter is configured to write to
+	 * @throws Exception if creating a reader or engine, running a pipeline, or training fails
+	 */
+	public static void testClassifierPipeline(
+			AnalysisEngineDescription polarityDataWriter,
+			String polarityModelOutputDirectory,
+			AnalysisEngineDescription uncertaintyDataWriter,
+			String uncertaintyModelOutputDirectory,
+			AnalysisEngineDescription conditionalDataWriter,
+			String conditionalModelOutputDirectory,
+			AnalysisEngineDescription subjectDataWriter,
+			String subjectModelOutputDirectory,
+			String trainingDataInputDirectory,
+			String decodingInputDirectory,
+			String decodingOutputDirectory) throws Exception {
+
+		CollectionReader trainingCollectionReader = CollectionReaderFactory.createCollectionReader(
+				XReader.class, XReader.PARAM_ROOT_FILE, trainingDataInputDirectory,
+				XReader.PARAM_XML_SCHEME, XReader.XMI);
+
+		CollectionReader evaluationCollectionReader = CollectionReaderFactory.createCollectionReader(
+				XReader.class, XReader.PARAM_ROOT_FILE, decodingInputDirectory,
+				XReader.PARAM_XML_SCHEME, XReader.XMI);
+
+		logger.info("starting feature generation... POLARITY");
+		SimplePipeline.runPipeline(trainingCollectionReader, polarityDataWriter);
+		logger.info("finished feature generation... POLARITY");
+
+		// reconfigure() before each reuse of the reader; presumably this rewinds it to the first file (confirm against UIMA docs)
+		trainingCollectionReader.reconfigure();
+		logger.info("starting feature generation... UNCERTAINTY");
+		SimplePipeline.runPipeline(trainingCollectionReader, uncertaintyDataWriter);
+		logger.info("finished feature generation... UNCERTAINTY.");
+
+		trainingCollectionReader.reconfigure();
+		logger.info("starting feature generation... CONDITIONAL");
+		SimplePipeline.runPipeline(trainingCollectionReader, conditionalDataWriter);
+		logger.info("finished feature generation... CONDITIONAL.");
+
+		trainingCollectionReader.reconfigure();
+		logger.info("starting feature generation... SUBJECT");
+		SimplePipeline.runPipeline(trainingCollectionReader, subjectDataWriter);
+		logger.info("finished feature generation... SUBJECT.");
+
+		// Output is hidden while the trainers run; restore it in finally so a failed training run cannot leave it hidden.
+		HideOutput hider = new HideOutput();
+		try {
+			logger.info("starting training POLARITY...");
+			org.cleartk.classifier.jar.Train.main(new String[] {polarityModelOutputDirectory});
+			logger.info("finished training POLARITY .");
+
+			logger.info("starting training UNCERTAINTY...");
+			org.cleartk.classifier.jar.Train.main(new String[] {uncertaintyModelOutputDirectory});
+			logger.info("finished training UNCERTAINTY .");
+
+			logger.info("starting training CONDITIONAL...");
+			org.cleartk.classifier.jar.Train.main(new String[] {conditionalModelOutputDirectory});
+			logger.info("finished training CONDITIONAL .");
+
+			logger.info("starting training SUBJECT...");
+			org.cleartk.classifier.jar.Train.main(new String[] {subjectModelOutputDirectory});
+			logger.info("finished training SUBJECT .");
+		} finally {
+			hider.restoreOutput();
+		}
+
+		AnalysisEngineDescription polarityTaggerDescription = AnalysisEngineFactory.createPrimitiveDescription(
+				PolarityCleartkAnalysisEngine.class,
+				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+				polarityModelOutputDirectory + "/model.jar");
+
+		AnalysisEngineDescription uncertaintyTaggerDescription = AnalysisEngineFactory.createPrimitiveDescription(
+				UncertaintyCleartkAnalysisEngine.class,
+				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+				uncertaintyModelOutputDirectory + "/model.jar");
+
+		AnalysisEngineDescription conditionalTaggerDescription = AnalysisEngineFactory.createPrimitiveDescription(
+				ConditionalCleartkAnalysisEngine.class,
+				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+				conditionalModelOutputDirectory + "/model.jar");
+
+		AnalysisEngineDescription subjectTaggerDescription = AnalysisEngineFactory.createPrimitiveDescription(
+				SubjectCleartkAnalysisEngine.class,
+				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+				subjectModelOutputDirectory + "/model.jar");
+
+		logger.info("starting decoding...");
+		SimplePipeline.runPipeline(
+				evaluationCollectionReader,
+				polarityTaggerDescription,
+				uncertaintyTaggerDescription,
+				conditionalTaggerDescription,
+				subjectTaggerDescription,
+				AnalysisEngineFactory.createPrimitiveDescription(
+						XWriter.class,
+						AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION,
+						XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+						decodingOutputDirectory,
+						XWriter.PARAM_XML_SCHEME_NAME,
+						XWriter.XMI));
+		logger.info("finished decoding.");
+	}
+
+		
+		// TODO Auto-generated method stub
+
+}