You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2018/02/03 18:23:27 UTC
svn commit: r1823049 - /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java

Author: seanfinan
Date: Sat Feb  3 18:23:27 2018
New Revision: 1823049

URL: http://svn.apache.org/viewvc?rev=1823049&view=rev
Log:
CTAKES-449 : increased speed in cleartk attribute engines on larger files

Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java?rev=1823049&r1=1823048&r2=1823049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/AssertionCleartkAnalysisEngine.java Sat Feb  3 18:23:27 2018
@@ -18,17 +18,6 @@
  */
 package org.apache.ctakes.assertion.medfacts.cleartk;
 
-import java.io.File;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-
 import org.apache.commons.io.FilenameUtils;
 import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
 import org.apache.ctakes.assertion.medfacts.cleartk.extractors.FedaFeatureFunction;
@@ -50,6 +39,7 @@ import org.apache.uima.fit.factory.Analy
 import org.apache.uima.fit.factory.ConfigurationParameterFactory;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.ml.CleartkAnnotator;
 import org.cleartk.ml.Feature;
@@ -60,95 +50,102 @@ import org.cleartk.ml.feature.extractor.
 import org.cleartk.ml.feature.extractor.FeatureExtractor1;
 import org.cleartk.ml.feature.extractor.TypePathExtractor;
 import org.cleartk.ml.feature.function.FeatureFunctionExtractor;
+
+import java.io.File;
+import java.net.URI;
+import java.util.*;
 //import org.chboston.cnlp.ctakes.relationextractor.ae.ModifierExtractorAnnotator;
 
 /**
  * @author swu
- *
  */
 public abstract class AssertionCleartkAnalysisEngine extends
-    CleartkAnnotator<String>
-{
-  Logger logger = Logger.getLogger(AssertionCleartkAnalysisEngine.class);
-
-  public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
-  public enum FEATURE_CONFIG {NO_SEM, NO_SYN, STK, STK_FRAGS, PTK, PTK_FRAGS, DEP_REGEX, DEP_REGEX_FRAGS, ALL_SYN, VECTORS, NO_TOK}
-	
-  public static int relationId; // counter for error logging
-
-  // additional parameter for domain adaptation
-  public static final String FILE_TO_DOMAIN_MAP = "mapTrainFileToDomain";
-
-
-  @ConfigurationParameter(
-      name = PARAM_GOLD_VIEW_NAME,
-      mandatory = false,
-      description = "view containing the manual identified annotations (especially EntityMention and EventMention annotations); needed for training")
-  protected String goldViewName;
-
-  public static final String PARAM_PRINT_ERRORS = "PrintErrors";
-  
-  @ConfigurationParameter(
-     name = PARAM_PRINT_ERRORS,
-     mandatory = false,
-     description = "Print errors true/false",
-     defaultValue = "false")
-  boolean printErrors;
-  
-  public static final String PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE = "ProbabilityOfKeepingADefaultExample";
-
-  @ConfigurationParameter(
-      name = PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE,
-      mandatory = false,
-      description = "probability that a default example should be retained for training")
-  protected double probabilityOfKeepingADefaultExample = 1.0;
-
-  public static final String PARAM_PORTION_OF_DATA_TO_USE = "PortionOfDataToUse";
-  @ConfigurationParameter(
-      name = PARAM_PORTION_OF_DATA_TO_USE,
-      mandatory = false,
-      description = "How much data to actually use during training (e.g. for building learning curves)"
-      )
-  protected double portionOfDataToUse=1.0;
-  
-  public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection"; // Accurate name? Actually uses the threshold, right?
-
-  @ConfigurationParameter(
-		  name = PARAM_FEATURE_SELECTION_THRESHOLD,
-		  mandatory = false,
-		  description = "the Chi-squared threshold at which features should be removed")
-  protected Float featureSelectionThreshold = 0f;
-
-  public static final String PARAM_FEATURE_CONFIG = "FEATURE_CONFIG";
-  @ConfigurationParameter(
-      name = PARAM_FEATURE_CONFIG,
-      description = "Feature configuration to use (for experiments)",
-      mandatory = false
-  )protected FEATURE_CONFIG featConfig = FEATURE_CONFIG.ALL_SYN;
-
-  public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
-
-  @ConfigurationParameter(
-      mandatory = false,
-      name = PARAM_FEATURE_SELECTION_URI,
-      description = "provides a URI where the feature selection data will be written")
-  protected URI featureSelectionURI;
-  
-  protected static Random coin = new Random(0);
-
-  protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
-
-  @ConfigurationParameter(
-		  name = FILE_TO_DOMAIN_MAP,
-		  mandatory = false,
-		  description = "a map of filenames to their respective domains (i.e., directories that contain them)")
-  protected String fileDomainMap;
-  protected Map<String,String> fileToDomain = new HashMap<>();
-  
-  protected String lastLabel;
-  
-  
-/* DEPRECATED: STW 2013/03/28.  Use DependencyUtility:getNominalHeadNode(jCas,annotation) instead */
+                                                     CleartkAnnotator<String> {
+   Logger logger = Logger.getLogger( AssertionCleartkAnalysisEngine.class );
+
+   public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
+
+   public enum FEATURE_CONFIG {
+      NO_SEM, NO_SYN, STK, STK_FRAGS, PTK, PTK_FRAGS, DEP_REGEX, DEP_REGEX_FRAGS, ALL_SYN, VECTORS, NO_TOK
+   }
+
+   public static int relationId; // counter for error logging
+
+   // additional parameter for domain adaptation
+   public static final String FILE_TO_DOMAIN_MAP = "mapTrainFileToDomain";
+
+
+   @ConfigurationParameter(
+         name = PARAM_GOLD_VIEW_NAME,
+         mandatory = false,
+         description = "view containing the manual identified annotations (especially EntityMention and EventMention annotations); needed for training" )
+   protected String goldViewName;
+
+   public static final String PARAM_PRINT_ERRORS = "PrintErrors";
+
+   @ConfigurationParameter(
+         name = PARAM_PRINT_ERRORS,
+         mandatory = false,
+         description = "Print errors true/false",
+         defaultValue = "false" )
+   boolean printErrors;
+
+   public static final String PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE = "ProbabilityOfKeepingADefaultExample";
+
+   @ConfigurationParameter(
+         name = PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE,
+         mandatory = false,
+         description = "probability that a default example should be retained for training" )
+   protected double probabilityOfKeepingADefaultExample = 1.0;
+
+   public static final String PARAM_PORTION_OF_DATA_TO_USE = "PortionOfDataToUse";
+   @ConfigurationParameter(
+         name = PARAM_PORTION_OF_DATA_TO_USE,
+         mandatory = false,
+         description = "How much data to actually use during training (e.g. for building learning curves)"
+   )
+   protected double portionOfDataToUse = 1.0;
+
+   public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection";
+   // Accurate name? Actually uses the threshold, right?
+
+   @ConfigurationParameter(
+         name = PARAM_FEATURE_SELECTION_THRESHOLD,
+         mandatory = false,
+         description = "the Chi-squared threshold at which features should be removed" )
+   protected Float featureSelectionThreshold = 0f;
+
+   public static final String PARAM_FEATURE_CONFIG = "FEATURE_CONFIG";
+   @ConfigurationParameter(
+         name = PARAM_FEATURE_CONFIG,
+         description = "Feature configuration to use (for experiments)",
+         mandatory = false
+   )
+   protected FEATURE_CONFIG featConfig = FEATURE_CONFIG.ALL_SYN;
+
+   public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+   @ConfigurationParameter(
+         mandatory = false,
+         name = PARAM_FEATURE_SELECTION_URI,
+         description = "provides a URI where the feature selection data will be written" )
+   protected URI featureSelectionURI;
+
+   protected static Random coin = new Random( 0 );
+
+   protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
+   @ConfigurationParameter(
+         name = FILE_TO_DOMAIN_MAP,
+         mandatory = false,
+         description = "a map of filenames to their respective domains (i.e., directories that contain them)" )
+   protected String fileDomainMap;
+   protected Map<String, String> fileToDomain = new HashMap<>();
+
+   protected String lastLabel;
+
+
+   /* DEPRECATED: STW 2013/03/28.  Use DependencyUtility:getNominalHeadNode(jCas,annotation) instead */
 //  public ConllDependencyNode findAnnotationHead(JCas jcas, Annotation annotation) {
 //		
 //	    for (ConllDependencyNode depNode : JCasUtil.selectCovered(jcas, ConllDependencyNode.class, annotation)) {
@@ -163,110 +160,120 @@ public abstract class AssertionCleartkAn
 //	    return null;
 //	}
 
-  
-	
-	
-//private FeatureExtractor1 tokenFeatureExtractor;
+
+   //private FeatureExtractor1 tokenFeatureExtractor;
 //  protected List<ContextExtractor<IdentifiedAnnotation>> contextFeatureExtractors;
 //  protected List<ContextExtractor<BaseToken>> tokenContextFeatureExtractors;
-  protected List<CleartkExtractor<IdentifiedAnnotation,BaseToken>> contextFeatureExtractors;
-  protected List<CleartkExtractor<IdentifiedAnnotation,BaseToken>> tokenContextFeatureExtractors;
-  protected List<CleartkExtractor<IdentifiedAnnotation,BaseToken>> tokenCleartkExtractors;
-  protected List<FeatureExtractor1<IdentifiedAnnotation>> entityFeatureExtractors;
-  protected List<FeatureExtractor1<IdentifiedAnnotation>> entityTreeExtractors;
-  protected CleartkExtractor<IdentifiedAnnotation,BaseToken> cuePhraseInWindowExtractor;
-  
-
-  protected List<FeatureFunctionExtractor<IdentifiedAnnotation>> featureFunctionExtractors = new ArrayList<>();
-  protected FedaFeatureFunction ffDomainAdaptor=null;
-  
-  protected FeatureSelection<String> featureSelection;
-  
-  public abstract void setClassLabel(IdentifiedAnnotation entityMention, Instance<String> instance) throws AnalysisEngineProcessException;
+   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> contextFeatureExtractors;
+   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenContextFeatureExtractors;
+   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenCleartkExtractors;
+   protected List<FeatureExtractor1<IdentifiedAnnotation>> entityFeatureExtractors;
+   protected List<FeatureExtractor1<IdentifiedAnnotation>> entityTreeExtractors;
+   protected CleartkExtractor<IdentifiedAnnotation, BaseToken> cuePhraseInWindowExtractor;
+
+
+   protected List<FeatureFunctionExtractor<IdentifiedAnnotation>> featureFunctionExtractors = new ArrayList<>();
+   protected FedaFeatureFunction ffDomainAdaptor = null;
 
-  protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
+   protected FeatureSelection<String> featureSelection;
+
+   public abstract void setClassLabel( IdentifiedAnnotation entityMention, Instance<String> instance )
+         throws AnalysisEngineProcessException;
+
+   protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
 //  public abstract FeatureSelection<String> createFeatureSelection(double threshold);
 //  public abstract URI createFeatureSelectionURI(File outputDirectoryName);
 
-  @Override
-  @SuppressWarnings("deprecation")
-  public void initialize(UimaContext context) throws ResourceInitializationException {
-    super.initialize(context);
-
-    // Re-process the "directory" string for domains that were used in the data
-    if (null != fileDomainMap) {
-    	String[] dirs = fileDomainMap.split("[;:]");
-    	for (String dir : dirs) {
-    		
-    		// TODO: normalize dir to real domainId
-    		String domainId = normalizeToDomain(dir);
-    		
-    		File dataDir = new File(dir);
-    		if (dataDir.listFiles()!=null) {
-    			for (File f : dataDir.listFiles()) {
-    				fileToDomain.put( FilenameUtils.removeExtension(f.getName()), domainId );
-    			}
-        		//    	System.out.println(trainFiles.toString());
-    		}
-    	}
-    }
-    
-    if (this.isTraining() && this.goldViewName == null) {
-      throw new IllegalArgumentException(PARAM_GOLD_VIEW_NAME + " must be defined during training");
-    }
-    
-    // alias for NGram feature parameters
+   private JCas getAnnotationView( final JCas jCas ) throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         try {
+            return jCas.getView( this.goldViewName );
+         } catch ( CASException e ) {
+            throw new AnalysisEngineProcessException( e );
+         }
+      }
+      return jCas;
+   }
+
+   @Override
+   @SuppressWarnings( "deprecation" )
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+
+      // Re-process the "directory" string for domains that were used in the data
+      if ( null != fileDomainMap ) {
+         String[] dirs = fileDomainMap.split( "[;:]" );
+         for ( String dir : dirs ) {
+
+            // TODO: normalize dir to real domainId
+            String domainId = normalizeToDomain( dir );
+
+            File dataDir = new File( dir );
+            if ( dataDir.listFiles() != null ) {
+               for ( File f : dataDir.listFiles() ) {
+                  fileToDomain.put( FilenameUtils.removeExtension( f.getName() ), domainId );
+               }
+               //    	System.out.println(trainFiles.toString());
+            }
+         }
+      }
+
+      if ( this.isTraining() && this.goldViewName == null ) {
+         throw new IllegalArgumentException( PARAM_GOLD_VIEW_NAME + " must be defined during training" );
+      }
+
+      // alias for NGram feature parameters
 //    int fromRight = CharacterNGramProliferator.RIGHT_TO_LEFT;
 
-    // a list of feature extractors that require only the token:
-    // the stem of the word, the text of the word itself, plus
-    // features created from the word text like character ngrams
-    this.entityFeatureExtractors = new ArrayList<>();
-    
-    // a list of feature extractors that require the token and the sentence
+      // a list of feature extractors that require only the token:
+      // the stem of the word, the text of the word itself, plus
+      // features created from the word text like character ngrams
+      this.entityFeatureExtractors = new ArrayList<>();
+
+      // a list of feature extractors that require the token and the sentence
 //    this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
-    
-    this.tokenCleartkExtractors = new ArrayList<>();
 
-    CleartkExtractor<IdentifiedAnnotation,BaseToken> tokenExtraction1 = 
-    		new CleartkExtractor<>(
-    				BaseToken.class, 
+      this.tokenCleartkExtractors = new ArrayList<>();
+
+      CleartkExtractor<IdentifiedAnnotation, BaseToken> tokenExtraction1 =
+            new CleartkExtractor<>(
+                  BaseToken.class,
 //    				new FeatureFunctionExtractor(new CoveredTextExtractor(), new LowerCaseFeatureFunction()),
 //            new FeatureFunctionExtractor(new CoveredTextExtractor(), new BrownClusterFeatureFunction()),
-    				new CoveredTextExtractor<BaseToken>(),
-    				//new CleartkExtractor.Covered(),
-    				new CleartkExtractor.LastCovered(2),
-    				new CleartkExtractor.Preceding(5),
-    				new CleartkExtractor.Following(4),
-    				new CleartkExtractor.Bag(new CleartkExtractor.Preceding(3)),
-    				new CleartkExtractor.Bag(new CleartkExtractor.Following(3)),
-            new CleartkExtractor.Bag(new CleartkExtractor.Preceding(5)),
-            new CleartkExtractor.Bag(new CleartkExtractor.Following(5)),
-            new CleartkExtractor.Bag(new CleartkExtractor.Preceding(10)),
-            new CleartkExtractor.Bag(new CleartkExtractor.Following(10))
-    				);
-    
-    CleartkExtractor<IdentifiedAnnotation,BaseToken> posExtraction1 = 
-    		new CleartkExtractor<>(
-    				BaseToken.class,
-    				new TypePathExtractor<>(BaseToken.class, "partOfSpeech"),
-    				new CleartkExtractor.LastCovered(2),
-    				new CleartkExtractor.Preceding(3),
-    				new CleartkExtractor.Following(2)
-    				);
+                  new CoveredTextExtractor<BaseToken>(),
+                  //new CleartkExtractor.Covered(),
+                  new CleartkExtractor.LastCovered( 2 ),
+                  new CleartkExtractor.Preceding( 5 ),
+                  new CleartkExtractor.Following( 4 ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 3 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 3 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 5 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 5 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 10 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 10 ) )
+            );
+
+      CleartkExtractor<IdentifiedAnnotation, BaseToken> posExtraction1 =
+            new CleartkExtractor<>(
+                  BaseToken.class,
+                  new TypePathExtractor<>( BaseToken.class, "partOfSpeech" ),
+                  new CleartkExtractor.LastCovered( 2 ),
+                  new CleartkExtractor.Preceding( 3 ),
+                  new CleartkExtractor.Following( 2 )
+            );
 
-    this.tokenCleartkExtractors.add(tokenExtraction1);
+      this.tokenCleartkExtractors.add( tokenExtraction1 );
 //    this.tokenCleartkExtractors.add(posExtraction1);
-    
+
 //    this.contextFeatureExtractors.add(new CleartkExtractor(IdentifiedAnnotation.class,
 //        new CoveredTextExtractor(),
 //        //new TypePathExtractor(IdentifiedAnnotation.class, "stem"),
 //        new Preceding(2),
 //        new Following(2)));
-    
-    // stab at dependency-based features
-    //List<Feature> features = new ArrayList<Feature>();
-    //ConllDependencyNode node1 = findAnnotationHead(jCas, arg1);
+
+      // stab at dependency-based features
+      //List<Feature> features = new ArrayList<Feature>();
+      //ConllDependencyNode node1 = findAnnotationHead(jCas, arg1);
 
 //    CombinedExtractor1 baseExtractorCuePhraseCategory =
 //        new CombinedExtractor1
@@ -276,8 +283,8 @@ public abstract class AssertionCleartkAn
 //           new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseCategory"),
 //           new TypePathExtractor(AssertionCuePhraseAnnotation.class, "cuePhraseAssertionFamily")
 //          );
-    
-    // Commented out by TM because it is never actually used:
+
+      // Commented out by TM because it is never actually used:
 /*    
     cuePhraseInWindowExtractor = new CleartkExtractor<>(
         BaseToken.class,
@@ -293,44 +300,39 @@ public abstract class AssertionCleartkAn
 //          new CleartkExtractor.Bag(new CleartkExtractor.Following(10))
           );
 */
-    if (!fileToDomain.isEmpty()) {
-    	// set up FeatureFunction for all the laggard, non-Extractor features
-    	ffDomainAdaptor = new FedaFeatureFunction( new ArrayList<>(new HashSet<>(fileToDomain.values())) );
-    }
-    entityTreeExtractors =  new ArrayList<>();
-  }
-
-  @Override
-  public void process(JCas jCas) throws AnalysisEngineProcessException
-  {
-    String documentId = DocumentIDAnnotationUtil.getDocumentID(jCas);
-    String domainId = "";
-    String domainFeature = null;
-
-    if(this.featureFunctionExtractors.size() <= 0){
-      this.ffDomainAdaptor = null;
-    }
-
-    if (documentId != null)
-    {
-      logger.debug("processing next doc: " + documentId);
-
-      // set the domain to be FeatureFunction'ed into all extractors
-      if (!fileToDomain.isEmpty() && ffDomainAdaptor != null) {
-    	  domainId = fileToDomain.get(documentId);
-    	  ffDomainAdaptor.setDomain(domainId); // if domain is not found, no warning -- just considers general domain
-      }else if(!fileToDomain.isEmpty()){
-        domainFeature = fileToDomain.get(documentId);
+      if ( !fileToDomain.isEmpty() ) {
+         // set up FeatureFunction for all the laggard, non-Extractor features
+         ffDomainAdaptor = new FedaFeatureFunction( new ArrayList<>( new HashSet<>( fileToDomain.values() ) ) );
+      }
+      entityTreeExtractors = new ArrayList<>();
+   }
+
+   @Override
+   public void process( JCas jCas ) throws AnalysisEngineProcessException {
+      String documentId = DocumentIDAnnotationUtil.getDocumentID( jCas );
+      String domainId = "";
+      String domainFeature = null;
+
+      if ( this.featureFunctionExtractors.size() <= 0 ) {
+         this.ffDomainAdaptor = null;
+      }
+
+      if ( documentId != null ) {
+         logger.debug( "processing next doc: " + documentId );
+         // set the domain to be FeatureFunction'ed into all extractors
+         if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+            domainId = fileToDomain.get( documentId );
+            // if domain is not found, no warning -- just considers general domain
+            ffDomainAdaptor.setDomain( domainId );
+         } else if ( !fileToDomain.isEmpty() ) {
+            domainFeature = fileToDomain.get( documentId );
+         }
+      } else {
+         logger.debug( "processing next doc (doc id is null)" );
       }
-      
-    } else
-    {
-      logger.debug("processing next doc (doc id is null)");
-    }
-    
 
-    this.lastLabel = "<BEGIN>";
-    
+      this.lastLabel = "<BEGIN>";
+
 //    // get gold standard relation instances during testing for error analysis
 //    if (! this.isTraining() && printErrors) {
 //      JCas goldView;
@@ -342,62 +344,68 @@ public abstract class AssertionCleartkAn
 //      
 //      //categoryLookup = createCategoryLookup(goldView); 
 //    }
-    
-    JCas identifiedAnnotationView;
-    if (this.isTraining()) {
-      try {
-        identifiedAnnotationView = jCas.getView(this.goldViewName);
-      } catch (CASException e) {
-        throw new AnalysisEngineProcessException(e);
-      }
-    } else {
-      identifiedAnnotationView = jCas;
-    }
 
+      final JCas annotationView = getAnnotationView( jCas );
 
-//    Map<IdentifiedAnnotation, Collection<Sentence>> coveringSentenceMap = JCasUtil.indexCovering(identifiedAnnotationView, IdentifiedAnnotation.class, Sentence.class);
-//    Map<Sentence, Collection<BaseToken>> tokensCoveredInSentenceMap = JCasUtil.indexCovered(identifiedAnnotationView, Sentence.class, BaseToken.class);
+//    Map<IdentifiedAnnotation, Collection<Sentence>> coveringSentenceMap = JCasUtil.indexCovering(annotationView, IdentifiedAnnotation.class, Sentence.class);
+//    Map<Sentence, Collection<BaseToken>> tokensCoveredInSentenceMap = JCasUtil.indexCovered(annotationView, Sentence.class, BaseToken.class);
 
 //    Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap =
 //        JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Zone.class);
 //    Map<IdentifiedAnnotation, Collection<Sentence>> coveringSents =
 //        JCasUtil.indexCovering(jCas, IdentifiedAnnotation.class, Sentence.class);
-    
+
 //    List<Instance<String>> instances = new ArrayList<Instance<String>>();
-    // generate a list of training instances for each sentence in the document
-    Collection<IdentifiedAnnotation> entities = JCasUtil.select(identifiedAnnotationView, IdentifiedAnnotation.class);
-    for (IdentifiedAnnotation identifiedAnnotation : entities)
-    {
-      if (!(identifiedAnnotation instanceof EntityMention || identifiedAnnotation instanceof EventMention))
-      {
-        continue;
-      }
-      IdentifiedAnnotation entityOrEventMention = identifiedAnnotation;
-      if (entityOrEventMention.getPolarity() == -1)
-      {
-        logger.debug(String.format(" - identified annotation: [%d-%d] polarity %d (%s)",
-            entityOrEventMention.getBegin(),
-            entityOrEventMention.getEnd(),
-            entityOrEventMention.getPolarity(),
-            entityOrEventMention.getClass().getName()));
-      }
-      Instance<String> instance = new Instance<>();
-      
-      if(domainFeature != null){
-        instance.add(new Feature("Domain", domainFeature));
-      }
+      // generate a list of training instances for each sentence in the document
+      // Use an indexed map.  This is faster than calling select and then selectCovering within a loop.
+      final Map<Sentence, Collection<Annotation>> sentenceAnnotationMap
+            = JCasUtil.indexCovered( jCas, Sentence.class, Annotation.class );
+      // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+      final Collection<IdentifiedAnnotation> entities = new ArrayList<>();
+      final Collection<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
+      final Collection<BaseToken> baseTokens = new ArrayList<>();
+      for ( Map.Entry<Sentence, Collection<Annotation>> sentenceAnnotations : sentenceAnnotationMap.entrySet() ) {
+         final Sentence coveringSent = sentenceAnnotations.getKey();
+         // Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
+         // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+         entities.clear();
+         cues.clear();
+         baseTokens.clear();
+         for ( Annotation annotation : sentenceAnnotations.getValue() ) {
+            if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
+               entities.add( (IdentifiedAnnotation)annotation );
+            } else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
+               cues.add( (AssertionCuePhraseAnnotation)annotation );
+            } else if ( annotation instanceof BaseToken ) {
+               baseTokens.add( (BaseToken)annotation );
+            }
+         }
+
+         for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
+            if ( identifiedAnnotation.getPolarity() == -1 ) {
+               logger.debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
+                     identifiedAnnotation.getBegin(),
+                     identifiedAnnotation.getEnd(),
+                     identifiedAnnotation.getPolarity(),
+                     identifiedAnnotation.getClass().getName() ) );
+            }
+            Instance<String> instance = new Instance<>();
+
+            if ( domainFeature != null ) {
+               instance.add( new Feature( "Domain", domainFeature ) );
+            }
 //      // extract all features that require only the entity mention annotation
 //      instance.addAll(tokenFeatureExtractor.extract(jCas, entityMention));
 
-      // extract all features that require the token and sentence annotations
+            // extract all features that require the token and sentence annotations
 
-      //Sentence sentence = sentenceList.iterator().next();
+            //Sentence sentence = sentenceList.iterator().next();
       
       /*
       if (sentence != null)
       {
         for (ContextExtractor<IdentifiedAnnotation> extractor : this.contextFeatureExtractors) {
-          instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence));
+          instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
         }
       } else
       {
@@ -408,92 +416,88 @@ public abstract class AssertionCleartkAn
       
       /*
       for (ContextExtractor<BaseToken> extractor : this.tokenContextFeatureExtractors) {
-          instance.addAll(extractor.extract(identifiedAnnotationView, entityMention));
+          instance.addAll(extractor.extract(annotationView, entityMention));
         }
         */
-      List<Sentence> sents = new ArrayList<>(JCasUtil.selectCovering(jCas, Sentence.class, entityOrEventMention.getBegin(), entityOrEventMention.getEnd()));
-      Sentence coveringSent = null;
-      if(sents.size() > 0){
-        coveringSent = sents.get(0);
-      }
-      
-      // only use extract this version if not doing domain adaptation 
-      if (ffDomainAdaptor==null) {
-    	  for (CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors) {
-//    		  instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityMention, sentence));
-    		  if(coveringSent != null){
-    			  instance.addAll(extractor.extractWithin(identifiedAnnotationView, entityOrEventMention, coveringSent));
-    		  }else{
-    			  instance.addAll(extractor.extract(identifiedAnnotationView, entityOrEventMention));
-    		  }
-    	  }
-      }
-      
-      if(coveringSent != null){
-//      List<Feature> cuePhraseFeatures = null;
-//          cuePhraseInWindowExtractor.extract(jCas, entityOrEventMention);
-          //cuePhraseInWindowExtractor.extractWithin(jCas, entityMention, firstCoveringSentence);
-//      List<Sentence> sents = new ArrayList<Sentence>(coveringSents.get(entityOrEventMention));
-        List<AssertionCuePhraseAnnotation> cues = JCasUtil.selectCovered(AssertionCuePhraseAnnotation.class, coveringSent);
-        int closest = Integer.MAX_VALUE;
-        AssertionCuePhraseAnnotation closestCue = null;
-        for(AssertionCuePhraseAnnotation cue : cues){
-          List<BaseToken> tokens = JCasUtil.selectBetween(BaseToken.class, cue, entityOrEventMention);
-          if(tokens.size() < closest){
-            closestCue = cue;
-            closest = tokens.size();
-          }
+
+            // only use extract this version if not doing domain adaptation
+            if ( ffDomainAdaptor == null ) {
+               for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
+//    		  instance.addAll(extractor.extractWithin(annotationView, entityMention, sentence));
+//                  if ( coveringSent != null ) {
+                  instance.addAll( extractor
+                        .extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
+//                  } else {
+//                     instance.addAll( extractor.extract( annotationView, identifiedAnnotation ) );
+//                  }
+               }
+            }
+
+            int closest = Integer.MAX_VALUE;
+            AssertionCuePhraseAnnotation closestCue = null;
+            for ( AssertionCuePhraseAnnotation cue : cues ) {
+               // It is much faster to count between BaseTokens already isolated within the same sentence.
+               final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
+               if ( betweenCount < closest ) {
+                  closestCue = cue;
+                  closest = betweenCount;
+               }
+
 //          instance.addAll(cuePhraseInWindowExtractor.extractBetween(jCas, cue, entityOrEventMention));
-        }
-        if(closestCue != null && closest < 21){
-          instance.add(new Feature("ClosestCue_Word", closestCue.getCoveredText()));
+            }
+            if ( closestCue != null && closest < 21 ) {
+               instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
 //          instance.add(new Feature("ClosestCue_Phrase", closestCue.getCuePhrase()));
-          instance.add(new Feature("ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily()));
-          instance.add(new Feature("ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory()));
-          
-          // add hack-ey domain adaptation to these hacked-in features
-          if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) {
-        	  instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_Word", closestCue.getCoveredText())));
-        	  instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily())));
-              instance.addAll(ffDomainAdaptor.apply(new Feature("ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory())));
-          }
-          
-        }
-      }
+               instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
+               instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
+
+               // add hack-ey domain adaptation to these hacked-in features
+               if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_PhraseFamily", closestCue
+                              .getCuePhraseAssertionFamily() ) ) );
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
+               }
+
+            }
+//            }
 //      if (cuePhraseFeatures != null && !cuePhraseFeatures.isEmpty())
 //      {
 //        instance.addAll(cuePhraseFeatures);
 //      }
 
 
-      // 7/9/13 SRH trying to make it work just for anatomical site
-      int eemTypeId = entityOrEventMention.getTypeID(); 
-      if (eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE) {
-          // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
-          //instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID()));
-          instance.add(new Feature("ENTITY_TYPE_ANAT_SITE"));
-          // add hack-ey domain adaptation to these hacked-in features
-          if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) {
-        	  instance.addAll(ffDomainAdaptor.apply(new Feature("ENTITY_TYPE_ANAT_SITE")));
-          }
-      }
+            // 7/9/13 SRH trying to make it work just for anatomical site
+            int eemTypeId = identifiedAnnotation.getTypeID();
+            if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
+               // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
+               //instance.add(new Feature("ENTITY_TYPE_" + entityOrEventMention.getTypeID()));
+               instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
+               // add hack-ey domain adaptation to these hacked-in features
+               if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+                  instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
+               }
+            }
       /* This hurts recall more than it helps precision
       else if (eemTypeId == CONST.NE_TYPE_ID_DRUG) {
     	  // 7/10 adding drug
     	  instance.add(new Feature("ENTITY_TYPE_DRUG"));
       }
       */
-      
-      // only extract these features if not doing domain adaptation
-      if (ffDomainAdaptor==null) {
-    	  for (FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors) {
-    		  instance.addAll(extractor.extract(jCas, entityOrEventMention));
-    	  }
-      }
 
-      for (FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors) {
-        instance.addAll(extractor.extract(jCas, entityOrEventMention));
-      }
+            // only extract these features if not doing domain adaptation
+            if ( ffDomainAdaptor == null ) {
+               for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
+                  instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+               }
+            }
+
+            for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
+               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+            }
 
 //      List<Feature> zoneFeatures = extractZoneFeatures(coveringZoneMap, entityOrEventMention);
 //      if (zoneFeatures != null && !zoneFeatures.isEmpty())
@@ -501,44 +505,53 @@ public abstract class AssertionCleartkAn
 //        instance.addAll(zoneFeatures);
 //      }
 
-      List<Feature> feats = instance.getFeatures();
+            List<Feature> feats = instance.getFeatures();
 //      List<Feature> lcFeats = new ArrayList<Feature>();
-      
-      for(Feature feat : feats){
-    	  if(feat instanceof TreeFeature || (feat.getName() != null && (feat.getName().startsWith("TreeFrag") || feat.getName().startsWith("WORD") || feat.getName().startsWith("NEG")))) continue;
-    	  if(feat.getName() != null && (feat.getName().contains("_TreeFrag") || feat.getName().contains("_WORD") || feat.getName().contains("_NEG"))) continue;
-    	  if(feat.getValue() instanceof String){
-    		  feat.setValue(((String)feat.getValue()).toLowerCase());
-    	  }
-      }
 
-      if (!fileToDomain.isEmpty() && ffDomainAdaptor!=null) {
-    	  for (FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors) {
-    		  // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
-    		  instance.addAll(extractor.extract(jCas, entityOrEventMention));
-    	  }
+            for ( Feature feat : feats ) {
+               if ( feat instanceof TreeFeature ||
+                    (feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
+                                                feat.getName().startsWith( "WORD" ) ||
+                                                feat.getName().startsWith( "NEG" ))) ) {
+                  continue;
+               }
+               if ( feat.getName() != null &&
+                    (feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
+                     feat.getName().contains( "_NEG" )) ) {
+                  continue;
+               }
+               if ( feat.getValue() instanceof String ) {
+                  feat.setValue( ((String)feat.getValue()).toLowerCase() );
+               }
+            }
+
+            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+               for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
+                  // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
+                  instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+               }
+            }
+
+
+            // grab the output label
+            setClassLabel( identifiedAnnotation, instance );
+
+            if ( this.isTraining() ) {
+               // apply feature selection, if necessary
+               if ( this.featureSelection != null ) {
+                  feats = this.featureSelection.transform( feats );
+               }
+
+               // ensures that the (possibly) transformed feats are used
+               if ( instance.getOutcome() != null ) {
+                  if ( coin.nextDouble() < this.portionOfDataToUse ) {
+                     this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
+                  }
+               }
+            }
+         }
       }
-      
-      
-      // grab the output label
-      setClassLabel(entityOrEventMention, instance);
-
-      if (this.isTraining()) {
-    	  // apply feature selection, if necessary
-    	  if (this.featureSelection != null) {
-    		  feats = this.featureSelection.transform(feats);
-    	  }
-
-    	  // ensures that the (possibly) transformed feats are used
-    	  if (instance.getOutcome()!=null) {
-    	    if(coin.nextDouble() < this.portionOfDataToUse){
-    	      this.dataWriter.write(new Instance<>(instance.getOutcome(),feats));
-    	    }
-    	  }
-      }
-    }
-    
-  }
+   }
 
   /*
   public List<Feature> extractZoneFeatures(Map<IdentifiedAnnotation, Collection<Zone>> coveringZoneMap, IdentifiedAnnotation entityOrEventMention)
@@ -567,42 +580,67 @@ public abstract class AssertionCleartkAn
   }
   */
 
-  public static AnalysisEngineDescription getDescription(Object... additionalConfiguration)
-	      throws ResourceInitializationException {
-	    AnalysisEngineDescription desc = AnalysisEngineFactory.createEngineDescription(AssertionCleartkAnalysisEngine.class);
-	    if (additionalConfiguration.length > 0) {
-	      ConfigurationParameterFactory.addConfigurationParameters(desc, additionalConfiguration);
-	    }
-	    return desc;
-	  }
-
-public Map<String, String> getTrainFileToDomain() {
-	return fileToDomain;
-}
-
-public void setTrainFileToDomain(Map<String, String> trainFileToDomain) {
-	this.fileToDomain = trainFileToDomain;
-}
-
-/** Looks in the domain string (path) for meaningful corpus names 
- * @param dir
- * @return
- */
-public static String normalizeToDomain(String dir) {
-	  // TODO: real normalization
-	  String[] p = dir.split("/");
-	  List<String> parts = new ArrayList<>();
-	  Collections.addAll(parts, p);
-	  Collections.reverse(parts);
-	  for (String part : parts) {
-		  if ( part.toLowerCase().startsWith("test") || part.toLowerCase().startsWith("train") || part.toLowerCase().startsWith("dev") ) {
-			  continue;
-		  }
-		  return part;
-	  }
-	  return dir;
-}
-  
+   /** Creates a description of this engine class, then adds any additional configuration that was passed in. */
+   public static AnalysisEngineDescription getDescription( Object... additionalConfiguration )
+         throws ResourceInitializationException {
+      final AnalysisEngineDescription description = AnalysisEngineFactory.createEngineDescription( AssertionCleartkAnalysisEngine.class );
+      if ( additionalConfiguration.length != 0 ) {
+         ConfigurationParameterFactory.addConfigurationParameters( description, additionalConfiguration );
+      }
+      return description;
+   }
+
+   public Map<String, String> getTrainFileToDomain() {
+      return this.fileToDomain; // hands back the field itself, not a copy
+   }
+
+   public void setTrainFileToDomain( Map<String, String> trainFileToDomain ) {
+      fileToDomain = trainFileToDomain; // keeps the caller's map reference; no copy is made
+   }
+
+   /**
+    * Looks in the domain string (path) for meaningful corpus names: walks the "/"-separated parts of dir from last
+    * to first and skips parts starting with "test", "train" or "dev" (case-insensitive, locale-independent).
+    * @param dir path-like domain string; it is split on "/"
+    * @return the last part that is not skipped, in its original case, or dir itself if every part is skipped
+    */
+   public static String normalizeToDomain( String dir ) {
+      // TODO: real normalization
+      String[] p = dir.split( "/" );
+      List<String> parts = new ArrayList<>();
+      Collections.addAll( parts, p );
+      Collections.reverse( parts );
+      for ( String part : parts ) {
+         final String lower = part.toLowerCase( java.util.Locale.ROOT ); // default-locale casing can break "TRAIN"
+         if ( lower.startsWith( "test" ) || lower.startsWith( "train" ) || lower.startsWith( "dev" ) ) {
+            continue;
+         }
+         return part;
+      }
+      return dir;
+   }
+
+
+   /** Counts baseTokens between two annotations; tokens touching a border count, as with JCasUtil.selectBetween.
+    * @param annotation1 one annotation; the order of the two does not matter because min/max offsets are used
+    * @param annotation2 the other annotation
+    * @param baseTokens  baseTokens within window
+    * @return number of basetokens that begin at or after the lower end and end at or before the higher begin
+    */
+   static private int countBetween( final Annotation annotation1,
+                                    final Annotation annotation2,
+                                    final Collection<BaseToken> baseTokens ) {
+      final int lowEnd = Math.min( annotation1.getEnd(), annotation2.getEnd() );
+      final int highBegin = Math.max( annotation1.getBegin(), annotation2.getBegin() );
+      int between = 0;
+      for ( BaseToken baseToken : baseTokens ) {
+         if ( lowEnd <= baseToken.getBegin() && baseToken.getEnd() <= highBegin ) {
+            between++;
+         }
+      }
+      return between;
+   }
+
   /*
   public static AnalysisEngineDescription getClassifierDescription(String modelFileName)
       throws ResourceInitializationException {