You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/19 22:49:47 UTC
svn commit: r1424157 [3/3] - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/ ae/feature/selection/ eval/

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Wed Dec 19 21:49:46 2012
@@ -1,132 +1,132 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.eval;
-
-import java.io.File;
-import java.util.Collection;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.logging.Level;
-
-import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
-import org.apache.ctakes.typesystem.type.textsem.EventMention;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.TOP;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.feature.transform.InstanceDataWriter;
-import org.cleartk.classifier.jar.JarClassifierBuilder;
-import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
-import org.cleartk.eval.AnnotationStatistics;
-import org.uimafit.util.JCasUtil;
-
-import com.lexicalscope.jewel.cli.CliFactory;
-
-public class EvaluationOfEventSpans extends EvaluationOfAnnotationSpans_ImplBase {
-
-  public static void main(String[] args) throws Exception {
-    Options options = CliFactory.parseArguments(Options.class, args);
-    EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
-        new File("target/eval"),
-        options.getRawTextDirectory(),
-        options.getKnowtatorXMLDirectory(),
-        options.getPatients().getList(),
-        options.getDownSampleRatio(),
-    	options.getFeatureSelect()); //control apply feature selection or not
-    evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
-    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(4);
-    for (AnnotationStatistics<String> stats : foldStats) {
-      System.err.println(stats);
-    }
-    System.err.println("OVERALL");
-    System.err.println(AnnotationStatistics.addAll(foldStats));
-  }
-  
-  private float downratio;
-  private float featureTrim;
-
-  public EvaluationOfEventSpans(
-      File baseDirectory,
-      File rawTextDirectory,
-      File knowtatorXMLDirectory,
-      List<Integer> patientSets,
-      float downratio, float featureSelect) {
-    super(
-        baseDirectory,
-        rawTextDirectory,
-        knowtatorXMLDirectory,
-        patientSets,
-        EnumSet.of(AnnotatorType.PART_OF_SPEECH_TAGS,
-        //AnnotatorType.UMLS_NAMED_ENTITIES,
-//        AnnotatorType.LEXICAL_VARIANTS,
-        AnnotatorType.DEPENDENCIES,
-        AnnotatorType.SEMANTIC_ROLES));
-    this.downratio = downratio;
-    this.featureTrim = featureSelect;
-  }
-
-  @Override
-  protected AnalysisEngineDescription getDataWriterDescription(File directory)
-      throws ResourceInitializationException {
-	if(this.featureTrim > 0){
-		return EventAnnotator.createDataWriterDescription(
-		    	InstanceDataWriter.class.getName(),
-		        directory,
-		        this.downratio,
-		        this.featureTrim);
-	}
-	return EventAnnotator.createDataWriterDescription(
-	        LIBSVMStringOutcomeDataWriter.class.getName(),
-	        directory,
-	        this.downratio,
-	        this.featureTrim);
-	
-    
-  }
-
-  @Override
-  protected void trainAndPackage(File directory) throws Exception {
-    JarClassifierBuilder.trainAndPackage(directory, "-c", "10000");
-  }
-
-  @Override
-  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
-    List<Class<? extends TOP>> result = super.getAnnotationClassesThatShouldBeGoldAtTestTime();
-    result.add(EntityMention.class);
-    return result;
-  }
-
-  @Override
-  protected AnalysisEngineDescription getAnnotatorDescription(File directory)
-      throws ResourceInitializationException {
-    return EventAnnotator.createAnnotatorDescription(directory);
-  }
-
-  @Override
-  protected Collection<? extends Annotation> getGoldAnnotations(JCas jCas) {
-    return JCasUtil.select(jCas, EventMention.class);
-  }
-
-  @Override
-  protected Collection<? extends Annotation> getSystemAnnotations(JCas jCas) {
-    return JCasUtil.select(jCas, EventMention.class);
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.logging.Level;
+
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.jar.JarClassifierBuilder;
+import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
+import org.cleartk.eval.AnnotationStatistics;
+import org.uimafit.util.JCasUtil;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+
+public class EvaluationOfEventSpans extends EvaluationOfAnnotationSpans_ImplBase {
+
+  public static void main(String[] args) throws Exception {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
+        new File("target/eval"),
+        options.getRawTextDirectory(),
+        options.getKnowtatorXMLDirectory(),
+        options.getPatients().getList(),
+        options.getDownSampleRatio(),
+    	options.getFeatureSelect()); //control apply feature selection or not
+    evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
+    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(4);
+    for (AnnotationStatistics<String> stats : foldStats) {
+      System.err.println(stats);
+    }
+    System.err.println("OVERALL");
+    System.err.println(AnnotationStatistics.addAll(foldStats));
+  }
+  
+  private float downratio;
+  private float featureTrim;
+
+  public EvaluationOfEventSpans(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      float downratio, float featureSelect) {
+    super(
+        baseDirectory,
+        rawTextDirectory,
+        knowtatorXMLDirectory,
+        patientSets,
+        EnumSet.of(AnnotatorType.PART_OF_SPEECH_TAGS,
+        //AnnotatorType.UMLS_NAMED_ENTITIES,
+//        AnnotatorType.LEXICAL_VARIANTS,
+        AnnotatorType.DEPENDENCIES,
+        AnnotatorType.SEMANTIC_ROLES));
+    this.downratio = downratio;
+    this.featureTrim = featureSelect;
+  }
+
+  @Override
+  protected AnalysisEngineDescription getDataWriterDescription(File directory)
+      throws ResourceInitializationException {
+	if(this.featureTrim > 0){
+		return EventAnnotator.createDataWriterDescription(
+		    	InstanceDataWriter.class.getName(),
+		        directory,
+		        this.downratio,
+		        this.featureTrim);
+	}
+	return EventAnnotator.createDataWriterDescription(
+	        LIBSVMStringOutcomeDataWriter.class.getName(),
+	        directory,
+	        this.downratio,
+	        this.featureTrim);
+	
+    
+  }
+
+  @Override
+  protected void trainAndPackage(File directory) throws Exception {
+    JarClassifierBuilder.trainAndPackage(directory, "-c", "10000");
+  }
+
+  @Override
+  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
+    List<Class<? extends TOP>> result = super.getAnnotationClassesThatShouldBeGoldAtTestTime();
+    result.add(EntityMention.class);
+    return result;
+  }
+
+  @Override
+  protected AnalysisEngineDescription getAnnotatorDescription(File directory)
+      throws ResourceInitializationException {
+    return EventAnnotator.createAnnotatorDescription(directory);
+  }
+
+  @Override
+  protected Collection<? extends Annotation> getGoldAnnotations(JCas jCas) {
+    return JCasUtil.select(jCas, EventMention.class);
+  }
+
+  @Override
+  protected Collection<? extends Annotation> getSystemAnnotations(JCas jCas) {
+    return JCasUtil.select(jCas, EventMention.class);
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Wed Dec 19 21:49:46 2012
@@ -1,399 +1,399 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.eval;
-
-import java.io.File;
-import java.net.URISyntaxException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.ctakes.chunker.ae.Chunker;
-import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
-import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
-import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
-import org.apache.ctakes.core.ae.OverlapAnnotator;
-import org.apache.ctakes.core.ae.SentenceDetector;
-import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
-import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
-import org.apache.ctakes.core.resource.FileResourceImpl;
-import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
-import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
-import org.apache.ctakes.core.resource.SuffixMaxentModelResourceImpl;
-import org.apache.ctakes.dependency.parser.ae.ClearParserDependencyParserAE;
-import org.apache.ctakes.dependency.parser.ae.ClearParserSemanticRoleLabelerAE;
-import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
-import org.apache.ctakes.lvg.ae.LvgAnnotator;
-import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
-import org.apache.ctakes.postagger.POSTagger;
-import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
-import org.apache.ctakes.typesystem.type.syntax.Chunk;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
-import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CollectionReader;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.TOP;
-import org.cleartk.util.ae.UriToDocumentTextAnnotator;
-import org.cleartk.util.cr.UriCollectionReader;
-import org.uimafit.component.JCasAnnotator_ImplBase;
-import org.uimafit.component.ViewCreatorAnnotator;
-import org.uimafit.component.ViewTextCopierAnnotator;
-import org.uimafit.factory.AggregateBuilder;
-import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.factory.ExternalResourceFactory;
-import org.uimafit.util.JCasUtil;
-
-import com.google.common.collect.Lists;
-import com.lexicalscope.jewel.cli.Option;
-
-public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
-    org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
-
-  public enum AnnotatorType {
-    PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, DEPENDENCIES, SEMANTIC_ROLES
-  }
-
-  protected final String GOLD_VIEW_NAME = "GoldView";
-
-  static interface Options {
-
-    @Option(longName = "text")
-    public File getRawTextDirectory();
-
-    @Option(longName = "xml")
-    public File getKnowtatorXMLDirectory();
-
-    @Option(longName = "patients")
-    public CommandLine.IntegerRanges getPatients();
-    
-    @Option(longName = "downratio")
-	public float getDownSampleRatio();
-
-    @Option(longName = "featureSelect")
-    public float getFeatureSelect(); //get feature selection cut off threshold is it is > 0. apply no FS if featureSelect == 0 
-  }
-
-  protected File rawTextDirectory;
-
-  protected File knowtatorXMLDirectory;
-
-  protected List<Integer> patientSets;
-
-  private Set<AnnotatorType> annotatorFlags;
-
-  public Evaluation_ImplBase(
-      File baseDirectory,
-      File rawTextDirectory,
-      File knowtatorXMLDirectory,
-      List<Integer> patientSets,
-      Set<AnnotatorType> annotatorFlags) {
-    super(baseDirectory);
-    this.rawTextDirectory = rawTextDirectory;
-    this.knowtatorXMLDirectory = knowtatorXMLDirectory;
-    this.patientSets = patientSets;
-    this.annotatorFlags = annotatorFlags;
-  }
-
-  public List<STATISTICS_TYPE> crossValidation(int nFolds) throws Exception {
-    return this.crossValidation(this.patientSets, nFolds);
-  }
-
-  @Override
-  protected CollectionReader getCollectionReader(List<Integer> selectedPatientSets) throws Exception {
-    List<File> files = new ArrayList<File>();
-    for (Integer set : selectedPatientSets) {
-      File setTextDirectory = new File(this.rawTextDirectory, "doc" + set);
-      for (File file : setTextDirectory.listFiles()) {
-        files.add(file);
-      }
-    }
-    return UriCollectionReader.getCollectionReaderFromFiles(files);
-  }
-
-  protected AnalysisEngineDescription getPreprocessorTrainDescription() throws Exception {
-    return this.getPreprocessorDescription(PipelineType.TRAIN);
-  }
-
-  protected AnalysisEngineDescription getPreprocessorTestDescription() throws Exception {
-    return this.getPreprocessorDescription(PipelineType.TEST);
-  }
-
-  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
-    return new ArrayList<Class<? extends TOP>>();
-  }
-
-  private static enum PipelineType {
-    TRAIN, TEST
-  }
-
-  private AnalysisEngineDescription getPreprocessorDescription(PipelineType pipelineType)
-      throws Exception {
-    AggregateBuilder aggregateBuilder = new AggregateBuilder();
-    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
-    switch (pipelineType) {
-      case TRAIN:
-        aggregateBuilder.add(THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory));
-        break;
-      case TEST:
-        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-            ViewCreatorAnnotator.class,
-            ViewCreatorAnnotator.PARAM_VIEW_NAME,
-            GOLD_VIEW_NAME));
-        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-            ViewTextCopierAnnotator.class,
-            ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
-            CAS.NAME_DEFAULT_SOFA,
-            ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
-            GOLD_VIEW_NAME));
-        aggregateBuilder.add(
-            THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory),
-            CAS.NAME_DEFAULT_SOFA,
-            GOLD_VIEW_NAME);
-        for (Class<? extends TOP> annotationClass : this.getAnnotationClassesThatShouldBeGoldAtTestTime()) {
-          aggregateBuilder.add(AnnotationCopier.getDescription(
-              GOLD_VIEW_NAME,
-              CAS.NAME_DEFAULT_SOFA,
-              annotationClass));
-        }
-        break;
-    }
-    // identify segments
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
-    // identify sentences
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        SentenceDetector.class,
-        "MaxentModel",
-        ExternalResourceFactory.createExternalResourceDescription(
-            SuffixMaxentModelResourceImpl.class,
-            SentenceDetector.class.getResource("../sentdetect/sdmed.mod"))));
-    // identify tokens
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
-    // merge some tokens
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
-
-    // identify part-of-speech tags if requested
-    if (this.annotatorFlags.contains(AnnotatorType.PART_OF_SPEECH_TAGS)) {
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          POSTagger.class,
-          POSTagger.POS_MODEL_FILE_PARAM,
-          "org/apache/ctakes/postagger/models/mayo-pos.zip",
-          POSTagger.TAG_DICTIONARY_PARAM,
-          "org/apache/ctakes/postagger/models/tag.dictionary.txt",
-          POSTagger.CASE_SENSITIVE_PARAM,
-          true));
-    }
-
-    // identify UMLS named entities if requested
-    if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
-      // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
-      // identify chunks
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          Chunker.class,
-          Chunker.CHUNKER_MODEL_FILE_PARAM,
-          Chunker.class.getResource("../models/chunk-model.claims-1.5.zip").toURI().getPath(),
-          Chunker.CHUNKER_CREATOR_CLASS_PARAM,
-          DefaultChunkCreator.class));
-      // adjust NP in NP NP to span both
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          ChunkAdjuster.class,
-          ChunkAdjuster.PARAM_CHUNK_PATTERN,
-          new String[] { "NP", "NP" },
-          ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-          1));
-      // adjust NP in NP PP NP to span all three
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          ChunkAdjuster.class,
-          ChunkAdjuster.PARAM_CHUNK_PATTERN,
-          new String[] { "NP", "PP", "NP" },
-          ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-          2));
-      // add lookup windows for each NP
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
-      // maximize lookup windows
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          OverlapAnnotator.class,
-          "A_ObjectClass",
-          LookupWindowAnnotation.class,
-          "B_ObjectClass",
-          LookupWindowAnnotation.class,
-          "OverlapType",
-          "A_ENV_B",
-          "ActionType",
-          "DELETE",
-          "DeleteAction",
-          new String[] { "selector=B" }));
-      // add UMLS on top of lookup windows
-      String umlsUser = System.getProperty("umls.user");
-      String umlsPassword = System.getProperty("umls.password");
-      if (umlsUser == null || umlsPassword == null) {
-        throw new IllegalArgumentException(
-            "The properties umls.user and umls.password must be set to use the "
-                + "UmlsDictionaryLookupAnnotator. You can set them by provding java with the "
-                + "arguments -Dumls.user=... and -Dumls.password=...");
-      }
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-          UmlsDictionaryLookupAnnotator.class,
-          "UMLSAddr",
-          "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
-          "UMLSVendor",
-          "NLM-6515182895",
-          "UMLSUser",
-          umlsUser,
-          "UMLSPW",
-          umlsPassword,
-          "LookupDescriptor",
-          ExternalResourceFactory.createExternalResourceDescription(
-              FileResourceImpl.class,
-              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../LookupDesc_Db.xml")),
-          "DbConnection",
-          ExternalResourceFactory.createExternalResourceDescription(
-              JdbcConnectionResourceImpl.class,
-              "",
-              JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
-              "org.hsqldb.jdbcDriver",
-              JdbcConnectionResourceImpl.PARAM_URL,
-              "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
-          "RxnormIndexReader",
-          ExternalResourceFactory.createExternalResourceDescription(
-              LuceneIndexReaderResourceImpl.class,
-              "",
-              "UseMemoryIndex",
-              true,
-              "IndexDirectory",
-              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../rxnorm_index")),
-          "OrangeBookIndexReader",
-          ExternalResourceFactory.createExternalResourceDescription(
-              LuceneIndexReaderResourceImpl.class,
-              "",
-              "UseMemoryIndex",
-              true,
-              "IndexDirectory",
-              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../OrangeBook"))));
-    }
-
-    // add lvg annotator
-    if (this.annotatorFlags.contains(AnnotatorType.LEXICAL_VARIANTS)) {
-      String[] XeroxTreebankMap = {
-          "adj|JJ",
-          "adv|RB",
-          "aux|AUX",
-          "compl|CS",
-          "conj|CC",
-          "det|DET",
-          "modal|MD",
-          "noun|NN",
-          "prep|IN",
-          "pron|PRP",
-          "verb|VB" };
-      String[] ExclusionSet = {
-          "and",
-          "And",
-          "by",
-          "By",
-          "for",
-          "For",
-          "in",
-          "In",
-          "of",
-          "Of",
-          "on",
-          "On",
-          "the",
-          "The",
-          "to",
-          "To",
-          "with",
-          "With" };
-      AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
-          LvgAnnotator.class,
-          "UseSegments",
-          false,
-          "SegmentsToSkip",
-          new String[0],
-          "UseCmdCache",
-          false,
-          "CmdCacheFileLocation",
-          "/org/apache/ctakes/lvg/2005_norm.voc",
-          "CmdCacheFrequencyCutoff",
-          20,
-          "ExclusionSet",
-          ExclusionSet,
-          "XeroxTreebankMap",
-          XeroxTreebankMap,
-          "LemmaCacheFileLocation",
-          "/org/apache/ctakes/lvg/2005_lemma.voc",
-          "UseLemmaCache",
-          false,
-          "LemmaCacheFrequencyCutoff",
-          20,
-          "PostLemmas",
-          true,
-          "LvgCmdApi",
-          ExternalResourceFactory.createExternalResourceDescription(
-              LvgCmdApiResourceImpl.class,
-              getResourceAsFile(LvgAnnotator.class, "../data/config/lvg.properties")));
-      aggregateBuilder.add(lvgAnnotator);
-    }
-
-    // add dependency parser
-    if (this.annotatorFlags.contains(AnnotatorType.DEPENDENCIES)) {
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserDependencyParserAE.class));
-    }
-
-    // add semantic role labeler
-    if (this.annotatorFlags.contains(AnnotatorType.SEMANTIC_ROLES)) {
-      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserSemanticRoleLabelerAE.class));
-    }
-    return aggregateBuilder.createAggregateDescription();
-  }
-
-  /**
-   * This is hack to deal with classes that don't handle resources correctly
-   */
-  private static File getResourceAsFile(Class<?> cls, String path) throws URISyntaxException {
-    // this will fail if the resource is not a real File, but the UMLS code assumes that
-    return new File(cls.getResource(path).toURI());
-  }
-
-  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
-
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
-        if (chunk.getChunkType().equals("NP")) {
-          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
-        }
-      }
-    }
-  }
-
-  public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
-
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
-        mention.removeFromIndexes();
-      }
-    }
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.core.resource.SuffixMaxentModelResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearParserDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearParserSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.TOP;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.component.ViewTextCopierAnnotator;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ExternalResourceFactory;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.collect.Lists;
+import com.lexicalscope.jewel.cli.Option;
+
+public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
+    org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
+
+  public enum AnnotatorType {
+    PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, DEPENDENCIES, SEMANTIC_ROLES
+  }
+
+  protected final String GOLD_VIEW_NAME = "GoldView";
+
+  static interface Options {
+
+    @Option(longName = "text")
+    public File getRawTextDirectory();
+
+    @Option(longName = "xml")
+    public File getKnowtatorXMLDirectory();
+
+    @Option(longName = "patients")
+    public CommandLine.IntegerRanges getPatients();
+    
+    @Option(longName = "downratio")
+	public float getDownSampleRatio();
+
+    @Option(longName = "featureSelect")
+    public float getFeatureSelect(); //get feature selection cut off threshold is it is > 0. apply no FS if featureSelect == 0 
+  }
+
+  protected File rawTextDirectory;
+
+  protected File knowtatorXMLDirectory;
+
+  protected List<Integer> patientSets;
+
+  private Set<AnnotatorType> annotatorFlags;
+
+  public Evaluation_ImplBase(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      Set<AnnotatorType> annotatorFlags) {
+    super(baseDirectory);
+    this.rawTextDirectory = rawTextDirectory;
+    this.knowtatorXMLDirectory = knowtatorXMLDirectory;
+    this.patientSets = patientSets;
+    this.annotatorFlags = annotatorFlags;
+  }
+
+  /**
+   * Runs cross-validation over the patient sets this evaluation was constructed with.
+   *
+   * @param nFolds
+   *          the number of folds
+   * @return the statistics returned by the two-argument {@code crossValidation}
+   */
+  public List<STATISTICS_TYPE> crossValidation(int nFolds) throws Exception {
+    List<Integer> allPatientSets = this.patientSets;
+    return this.crossValidation(allPatientSets, nFolds);
+  }
+
+  /**
+   * Creates a reader over every file in the {@code doc<N>} sub-directory of the raw text
+   * directory, for each selected patient set N.
+   *
+   * @param selectedPatientSets
+   *          the patient sets whose files should be read
+   * @return a collection reader over the files of those patient sets
+   * @throws IllegalArgumentException
+   *           if the directory of a patient set does not exist or cannot be listed
+   */
+  @Override
+  protected CollectionReader getCollectionReader(List<Integer> selectedPatientSets) throws Exception {
+    List<File> files = new ArrayList<File>();
+    for (Integer set : selectedPatientSets) {
+      File setTextDirectory = new File(this.rawTextDirectory, "doc" + set);
+      // listFiles() returns null (not an empty array) when the path is not a directory or an
+      // I/O error occurs; report that clearly instead of failing with a NullPointerException
+      File[] setFiles = setTextDirectory.listFiles();
+      if (setFiles == null) {
+        throw new IllegalArgumentException("Cannot list the files of patient set " + set + ": "
+            + setTextDirectory + " is not a readable directory");
+      }
+      for (File file : setFiles) {
+        files.add(file);
+      }
+    }
+    return UriCollectionReader.getCollectionReaderFromFiles(files);
+  }
+
+  /**
+   * Builds the preprocessing pipeline in its training configuration.
+   *
+   * @return the aggregate description for {@link PipelineType#TRAIN}
+   */
+  protected AnalysisEngineDescription getPreprocessorTrainDescription() throws Exception {
+    AnalysisEngineDescription trainDescription = this.getPreprocessorDescription(PipelineType.TRAIN);
+    return trainDescription;
+  }
+
+  /**
+   * Builds the preprocessing pipeline in its test configuration.
+   *
+   * @return the aggregate description for {@link PipelineType#TEST}
+   */
+  protected AnalysisEngineDescription getPreprocessorTestDescription() throws Exception {
+    AnalysisEngineDescription testDescription = this.getPreprocessorDescription(PipelineType.TEST);
+    return testDescription;
+  }
+
+  /**
+   * Lists the annotation classes that the test pipeline copies from the gold view into the
+   * default view. This implementation returns a new, empty list.
+   *
+   * @return the annotation classes to copy at test time
+   */
+  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
+    List<Class<? extends TOP>> goldClasses = new ArrayList<Class<? extends TOP>>();
+    return goldClasses;
+  }
+
+  /** Selects which variant of the preprocessing pipeline is built. */
+  private static enum PipelineType {
+    /** The Knowtator XML reader writes into the default view. */
+    TRAIN,
+    /** The Knowtator XML reader writes into a separate gold view. */
+    TEST
+  }
+
+  /**
+   * Builds the aggregate preprocessing pipeline. In order, it contains: the URI-to-text
+   * annotator, the Knowtator XML reader (set up according to {@code pipelineType}), the segment,
+   * sentence and token annotators, and then each optional annotator whose
+   * {@link AnnotatorType} flag was passed to the constructor.
+   *
+   * @param pipelineType
+   *          whether the training or the test variant of the pipeline is wanted
+   * @return the description of the aggregate analysis engine
+   * @throws IllegalArgumentException
+   *           if UMLS named entities are requested but the {@code umls.user} or
+   *           {@code umls.password} system property is not set
+   */
+  private AnalysisEngineDescription getPreprocessorDescription(PipelineType pipelineType)
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+    this.addGoldAnnotationReaders(aggregateBuilder, pipelineType);
+    addBasicAnnotators(aggregateBuilder);
+
+    // identify part-of-speech tags if requested
+    if (this.annotatorFlags.contains(AnnotatorType.PART_OF_SPEECH_TAGS)) {
+      addPartOfSpeechTagger(aggregateBuilder);
+    }
+
+    // identify UMLS named entities if requested
+    if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
+      addUmlsNamedEntityAnnotators(aggregateBuilder);
+    }
+
+    // add lvg annotator
+    if (this.annotatorFlags.contains(AnnotatorType.LEXICAL_VARIANTS)) {
+      addLexicalVariantAnnotator(aggregateBuilder);
+    }
+
+    // add dependency parser
+    if (this.annotatorFlags.contains(AnnotatorType.DEPENDENCIES)) {
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserDependencyParserAE.class));
+    }
+
+    // add semantic role labeler
+    if (this.annotatorFlags.contains(AnnotatorType.SEMANTIC_ROLES)) {
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserSemanticRoleLabelerAE.class));
+    }
+    return aggregateBuilder.createAggregateDescription();
+  }
+
+  /**
+   * Adds the Knowtator XML reader: writing to the default view for TRAIN, and to a newly
+   * created gold view (plus the requested gold-to-default copiers) for TEST.
+   */
+  private void addGoldAnnotationReaders(AggregateBuilder aggregateBuilder, PipelineType pipelineType)
+      throws Exception {
+    switch (pipelineType) {
+      case TRAIN:
+        aggregateBuilder.add(THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory));
+        break;
+      case TEST:
+        // create the gold view and give it the same text as the default view
+        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+            ViewCreatorAnnotator.class,
+            ViewCreatorAnnotator.PARAM_VIEW_NAME,
+            GOLD_VIEW_NAME));
+        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+            ViewTextCopierAnnotator.class,
+            ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+            CAS.NAME_DEFAULT_SOFA,
+            ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+            GOLD_VIEW_NAME));
+        // run the reader with its default view mapped onto the gold view
+        aggregateBuilder.add(
+            THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory),
+            CAS.NAME_DEFAULT_SOFA,
+            GOLD_VIEW_NAME);
+        for (Class<? extends TOP> annotationClass : this.getAnnotationClassesThatShouldBeGoldAtTestTime()) {
+          aggregateBuilder.add(AnnotationCopier.getDescription(
+              GOLD_VIEW_NAME,
+              CAS.NAME_DEFAULT_SOFA,
+              annotationClass));
+        }
+        break;
+    }
+  }
+
+  /** Adds the annotators that are always run: segments, sentences, tokens and token merging. */
+  private static void addBasicAnnotators(AggregateBuilder aggregateBuilder) throws Exception {
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        SentenceDetector.class,
+        "MaxentModel",
+        ExternalResourceFactory.createExternalResourceDescription(
+            SuffixMaxentModelResourceImpl.class,
+            SentenceDetector.class.getResource("../sentdetect/sdmed.mod"))));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
+  }
+
+  /** Adds the part-of-speech tagger with its model and tag dictionary. */
+  private static void addPartOfSpeechTagger(AggregateBuilder aggregateBuilder) throws Exception {
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        POSTagger.class,
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip",
+        POSTagger.TAG_DICTIONARY_PARAM,
+        "org/apache/ctakes/postagger/models/tag.dictionary.txt",
+        POSTagger.CASE_SENSITIVE_PARAM,
+        true));
+  }
+
+  /**
+   * Adds the UMLS named entity annotators: mention removal, chunking, chunk adjustment, lookup
+   * windows and finally the UMLS dictionary lookup.
+   */
+  private static void addUmlsNamedEntityAnnotators(AggregateBuilder aggregateBuilder)
+      throws Exception {
+    // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        Chunker.class.getResource("../models/chunk-model.claims-1.5.zip").toURI().getPath(),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows
+    String umlsUser = System.getProperty("umls.user");
+    String umlsPassword = System.getProperty("umls.password");
+    if (umlsUser == null || umlsPassword == null) {
+      throw new IllegalArgumentException(
+          "The properties umls.user and umls.password must be set to use the "
+              + "UmlsDictionaryLookupAnnotator. You can set them by providing java with the "
+              + "arguments -Dumls.user=... and -Dumls.password=...");
+    }
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        UmlsDictionaryLookupAnnotator.class,
+        "UMLSAddr",
+        "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
+        "UMLSVendor",
+        "NLM-6515182895",
+        "UMLSUser",
+        umlsUser,
+        "UMLSPW",
+        umlsPassword,
+        "LookupDescriptor",
+        ExternalResourceFactory.createExternalResourceDescription(
+            FileResourceImpl.class,
+            getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../LookupDesc_Db.xml")),
+        "DbConnection",
+        ExternalResourceFactory.createExternalResourceDescription(
+            JdbcConnectionResourceImpl.class,
+            "",
+            JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
+            "org.hsqldb.jdbcDriver",
+            JdbcConnectionResourceImpl.PARAM_URL,
+            "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+        "RxnormIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../rxnorm_index")),
+        "OrangeBookIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../OrangeBook"))));
+  }
+
+  /** Adds the LVG annotator together with its tag map, exclusion set and cache settings. */
+  private static void addLexicalVariantAnnotator(AggregateBuilder aggregateBuilder)
+      throws Exception {
+    String[] xeroxTreebankMap = {
+        "adj|JJ",
+        "adv|RB",
+        "aux|AUX",
+        "compl|CS",
+        "conj|CC",
+        "det|DET",
+        "modal|MD",
+        "noun|NN",
+        "prep|IN",
+        "pron|PRP",
+        "verb|VB" };
+    String[] exclusionSet = {
+        "and",
+        "And",
+        "by",
+        "By",
+        "for",
+        "For",
+        "in",
+        "In",
+        "of",
+        "Of",
+        "on",
+        "On",
+        "the",
+        "The",
+        "to",
+        "To",
+        "with",
+        "With" };
+    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+        LvgAnnotator.class,
+        "UseSegments",
+        false,
+        "SegmentsToSkip",
+        new String[0],
+        "UseCmdCache",
+        false,
+        "CmdCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_norm.voc",
+        "CmdCacheFrequencyCutoff",
+        20,
+        "ExclusionSet",
+        exclusionSet,
+        "XeroxTreebankMap",
+        xeroxTreebankMap,
+        "LemmaCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_lemma.voc",
+        "UseLemmaCache",
+        false,
+        "LemmaCacheFrequencyCutoff",
+        20,
+        "PostLemmas",
+        true,
+        "LvgCmdApi",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LvgCmdApiResourceImpl.class,
+            getResourceAsFile(LvgAnnotator.class, "../data/config/lvg.properties")));
+    aggregateBuilder.add(lvgAnnotator);
+  }
+
+  /**
+   * Resolves a classpath resource to a {@link File}. This is a hack to deal with classes that
+   * don't handle resources correctly.
+   *
+   * @param cls
+   *          the class relative to which {@code path} is resolved
+   * @param path
+   *          the resource path, as accepted by {@link Class#getResource(String)}
+   * @return the file that backs the resource
+   * @throws IllegalArgumentException
+   *           if the resource cannot be found, or if it is not a real file
+   * @throws URISyntaxException
+   *           if the resource URL cannot be converted to a URI
+   */
+  private static File getResourceAsFile(Class<?> cls, String path) throws URISyntaxException {
+    // getResource() returns null for a missing resource; name the resource in the error
+    // instead of failing with a bare NullPointerException
+    java.net.URL resource = cls.getResource(path);
+    if (resource == null) {
+      throw new IllegalArgumentException("Resource " + path + " not found relative to "
+          + cls.getName());
+    }
+    // this will fail if the resource is not a real File, but the UMLS code assumes that
+    return new File(resource.toURI());
+  }
+
+  /**
+   * Adds a {@link LookupWindowAnnotation} over the span of every {@link Chunk} whose chunk type
+   * is "NP".
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // constant-first comparison: a chunk with no chunk type is skipped instead of
+        // aborting the whole document with a NullPointerException
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
+  /** Removes every {@link EntityMention} in the CAS from the indexes. */
+  public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      // copy the mentions into a list first, then remove them while walking the copy
+      List<EntityMention> mentions = Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class));
+      for (EntityMention entityMention : mentions) {
+        entityMention.removeFromIndexes();
+      }
+    }
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
------------------------------------------------------------------------------
    svn:eol-style = native