You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2019/01/08 03:45:51 UTC
svn commit: r1850705 [1/2] - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed: ./ classifier/ context/ context/feature/ context/feature/extractor/

Author: seanfinan
Date: Tue Jan  8 03:45:51 2019
New Revision: 1850705

URL: http://svn.apache.org/viewvc?rev=1850705&view=rev
Log:
CTAKES-449 : Partial rewrite using forwarded Annotations for speed.  Already faster, but Bag needs to be finished in WindowedAssertionCleartkAnalysisEngine.

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedHistoryAttributeClassifier.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractLeftToRightContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractRightToLeftContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractWindowedContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/FollowingContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/LastCoveredContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/PrecedingContext.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/WindowedBag.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/WindowedContextFeature.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/AbstractTreeFragmentFeatureExtractor1.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/AbstractWindowedFeatureExtractor1.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedAssertionDependencyTreeExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedContextWordWindowExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedDependencyWordsFragmentExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedGenericFeaturesExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedHistoryFeatureExtractor.java
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedNegationDependencyFeatureExtractor.java

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+
+/**
+ * Windowed ClearTK annotator for the "conditional" attribute of identified annotations.
+ * While training, the annotation's conditional flag becomes the instance outcome; otherwise
+ * the classifier's label is parsed and written back onto the annotation.
+ */
+@PipeBitInfo(
+      name = "ClearTK Conditional Annotator",
+      description = "Determines whether or not Identified Annotations are conditional.",
+      dependencies = { PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION },
+      usables = { PipeBitInfo.TypeProduct.DOCUMENT_ID }
+)
+public class ConditionalCleartkAnalysisEngineWindowed extends
+                                                      WindowedAssertionCleartkAnalysisEngine {
+
+   /**
+    * Runs the parent initialization, sets the keep probability for default examples to 1.0
+    * (no downsampling) and sets up feature selection.
+    *
+    * @param context UIMA context forwarded to the parent initializer
+    * @throws ResourceInitializationException if the parent initializer or feature-selection setup fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 1.0;
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Training: stores the annotation's conditional flag as the outcome, unless a non-conditional
+    * example is dropped by downsampling.  Classification: sets the annotation's conditional flag
+    * from the classifier label, using {@code false} when the label is {@code null}.
+    *
+    * @param entityOrEventMention annotation that is read (training) or updated (classification)
+    * @param instance             instance whose features are classified or whose outcome is set
+    * @throws AnalysisEngineProcessException if the classifier call raises one
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( !this.isTraining() ) {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         final boolean predicted = label != null && Boolean.parseBoolean( label );
+         entityOrEventMention.setConditional( predicted );
+         return;
+      }
+      final boolean conditional = entityOrEventMention.getConditional();
+      // Only non-conditional (default) examples are candidates for downsampling.
+      final boolean dropped = !conditional
+                              && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample;
+      if ( !dropped ) {
+         instance.setOutcome( String.valueOf( conditional ) );
+      }
+   }
+
+   /**
+    * Creates the chi-squared feature selection used by this annotator.
+    *
+    * @param threshold value passed through to the {@link Chi2FeatureSelection} constructor
+    * @return a new selection named by {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Resolves the URI of the chi-squared extractor data file inside the given directory.
+    *
+    * @param outputDirectoryName directory in which the data file is located
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final File selectionData = new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" );
+      return selectionData.toURI();
+   }
+
+   /**
+    * Clears the feature selection when the threshold is zero; otherwise creates a chi-squared one.
+    *
+    * @throws ResourceInitializationException declared by the signature; this body does not throw it itself
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      this.featureSelection = ( featureSelectionThreshold == 0 )
+                              ? null
+                              : createFeatureSelection( this.featureSelectionThreshold );
+   }
+
+   /**
+    * Builds an engine description for this annotator with an explicit classifier jar.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            ConditionalCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Builds an engine description that uses the default conditional model path.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModel = "/org/apache/ctakes/assertion/models/conditional/model.jar";
+      return createAnnotatorDescription( defaultModel );
+   }
+
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,166 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedGenericFeaturesExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+/**
+ * Windowed ClearTK annotator for the "generic" attribute of identified annotations.
+ * Adds a context word-window extractor and a generic-features extractor to the entity extractors.
+ */
+@PipeBitInfo(
+      name = "Generic Status ClearTK Annotator",
+      description = "Annotates the Generic status for Identified Annotations.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class GenericCleartkAnalysisEngineWindowed extends
+                                                  WindowedAssertionCleartkAnalysisEngine {
+
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the parent initialization, sets the keep probability for default examples to 0.5,
+    * registers the generic extractors and sets up feature selection.
+    *
+    * @param context UIMA context forwarded to the parent initializer
+    * @throws ResourceInitializationException if the parent initializer or feature-selection setup fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 0.5;
+      addGenericExtractors();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Appends the generic word-window and generic-features extractors, creating the extractor
+    * list first if it is still unset.
+    */
+   private void addGenericExtractors() {
+      if ( this.entityFeatureExtractors == null ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+      final String resourcePath = "org/apache/ctakes/assertion/models/generic.txt";
+      this.entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( resourcePath ) );
+      this.entityFeatureExtractors.add( new WindowedGenericFeaturesExtractor() );
+   }
+
+   /**
+    * Training: stores the annotation's generic flag as the outcome, unless a non-generic example
+    * is dropped by downsampling.  Classification: parses the classifier label as a boolean and
+    * sets it on the annotation.
+    *
+    * @param entityOrEventMention annotation that is read (training) or updated (classification)
+    * @param instance             instance whose features are classified or whose outcome is set
+    * @throws AnalysisEngineProcessException if the classifier call raises one
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( !this.isTraining() ) {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         entityOrEventMention.setGeneric( Boolean.parseBoolean( label ) );
+         return;
+      }
+      final boolean generic = entityOrEventMention.getGeneric();
+      // Only non-generic (default) examples are candidates for downsampling.
+      final boolean dropped = !generic
+                              && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample;
+      if ( !dropped ) {
+         instance.setOutcome( String.valueOf( generic ) );
+      }
+   }
+
+   /**
+    * Creates the chi-squared feature selection used by this annotator.
+    *
+    * @param threshold value passed through to the {@link Chi2FeatureSelection} constructor
+    * @return a new selection named by {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Resolves the URI of the chi-squared extractor data file inside the given directory.
+    *
+    * @param outputDirectoryName directory in which the data file is located
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final File selectionData = new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" );
+      return selectionData.toURI();
+   }
+
+   /**
+    * Clears the feature selection when the threshold is zero; otherwise creates a chi-squared one.
+    *
+    * @throws ResourceInitializationException declared by the signature; this body does not throw it itself
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      this.featureSelection = ( featureSelectionThreshold == 0 )
+                              ? null
+                              : createFeatureSelection( this.featureSelectionThreshold );
+   }
+
+   /**
+    * Builds an engine description for this annotator with an explicit classifier jar.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            GenericCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Builds an engine description that uses the default generic model path.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModel = "/org/apache/ctakes/assertion/models/generic/model.jar";
+      return createAnnotatorDescription( defaultModel );
+   }
+
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedHistoryFeatureExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+/**
+ * Windowed ClearTK annotator for the "history of" attribute of identified annotations.
+ * Adds a context word-window extractor and a history-features extractor to the entity extractors.
+ */
+@PipeBitInfo(
+      name = "History of ClearTK Annotator",
+      description = "Annotate History of property.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class HistoryCleartkAnalysisEngineWindowed extends
+                                                  WindowedAssertionCleartkAnalysisEngine {
+
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the parent initialization, sets the keep probability for default examples to 0.5,
+    * registers the history extractors and sets up feature selection.
+    *
+    * @param context UIMA context forwarded to the parent initializer
+    * @throws ResourceInitializationException if the parent initializer or feature-selection setup fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 0.5;
+      addHistoryExtractors();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Appends the history word-window and history-features extractors, creating the extractor
+    * list first if it is still unset.
+    */
+   private void addHistoryExtractors() {
+      if ( this.entityFeatureExtractors == null ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+      final String resourcePath = "org/apache/ctakes/assertion/models/history.txt";
+      this.entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( resourcePath ) );
+      this.entityFeatureExtractors.add( new WindowedHistoryFeatureExtractor() );
+   }
+
+   /**
+    * Training: stores the annotation's history-of value as the outcome, unless an example whose
+    * value is {@code CONST.NE_HISTORY_OF_ABSENT} is dropped by downsampling.  Classification:
+    * parses the classifier label as an integer and sets it on the annotation.  A {@code null}
+    * label falls back to {@code CONST.NE_HISTORY_OF_ABSENT}; a non-numeric label is reported as
+    * an {@link AnalysisEngineProcessException} instead of an unchecked parse failure.
+    *
+    * @param entityOrEventMention annotation that is read (training) or updated (classification)
+    * @param instance             instance whose features are classified or whose outcome is set
+    * @throws AnalysisEngineProcessException if the classifier call raises one, or if the label is
+    *                                        not a parsable integer
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         final int history = entityOrEventMention.getHistoryOf();
+         // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+         if ( history == CONST.NE_HISTORY_OF_ABSENT
+              && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( String.valueOf( history ) );
+         return;
+      }
+      final String label = this.classifier.classify( instance.getFeatures() );
+      if ( label == null ) {
+         // No prediction available: fall back to the default (absent) value.
+         entityOrEventMention.setHistoryOf( CONST.NE_HISTORY_OF_ABSENT );
+         return;
+      }
+      try {
+         entityOrEventMention.setHistoryOf( Integer.parseInt( label ) );
+      } catch ( NumberFormatException nfE ) {
+         throw new AnalysisEngineProcessException( nfE );
+      }
+   }
+
+   /**
+    * Creates the chi-squared feature selection used by this annotator.
+    *
+    * @param threshold value passed through to the {@link Chi2FeatureSelection} constructor
+    * @return a new selection named by {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Resolves the URI of the chi-squared extractor data file inside the given directory.
+    *
+    * @param outputDirectoryName directory in which the data file is located
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final File selectionData = new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" );
+      return selectionData.toURI();
+   }
+
+   /**
+    * Clears the feature selection when the threshold is zero; otherwise creates a chi-squared one.
+    *
+    * @throws ResourceInitializationException declared by the signature; this body does not throw it itself
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      this.featureSelection = ( featureSelectionThreshold == 0 )
+                              ? null
+                              : createFeatureSelection( this.featureSelectionThreshold );
+   }
+
+   /**
+    * Builds an engine description for this annotator with an explicit classifier jar.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            HistoryCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Builds an engine description that uses the default history-of model path.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModel = "/org/apache/ctakes/assertion/models/historyOf/model.jar";
+      return createAnnotatorDescription( defaultModel );
+   }
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,214 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AssertionAboveLeftTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedAssertionDependencyTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedNegationDependencyFeatureExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+
+/**
+ * Windowed ClearTK annotator for the polarity (negation) attribute of identified annotations.
+ * Which feature extractors are registered depends on the configured {@code featConfig}.
+ */
+@PipeBitInfo(
+      name = "Negation Annotator (ClearTK)",
+      description = "Annotates negation property.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class PolarityCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   public static final String NEGATED = "NEGATED";
+   public static final String NOT_NEGATED = "NOT_NEGATED";
+
+   /**
+    * Runs the parent initialization, sets the keep probability for default examples to 1.0
+    * (no downsampling), registers the extractors selected by {@code featConfig} and sets up
+    * feature selection.  Entity feature extractors are added in a fixed order: polarity word
+    * window, above-left fragments, negation dependency features; tree extractors follow.
+    *
+    * @param context UIMA context forwarded to the parent initializer
+    * @throws ResourceInitializationException if the parent initializer or feature-selection setup fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 1.0;
+
+      if ( this.entityFeatureExtractors == null ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+
+      if ( featConfig == FEATURE_CONFIG.NO_TOK ) {
+         // NO_TOK replaces the token extractors with an empty list.
+         this.tokenCleartkExtractors = new ArrayList<>();
+      }
+
+      // polarity keyword list:
+      if ( featConfig != FEATURE_CONFIG.NO_SEM ) {
+         this.entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( "org/apache/ctakes/assertion/models/polarity.txt" ) );
+      }
+
+      // stk frags feature:
+      if ( featConfig == FEATURE_CONFIG.STK_FRAGS || featConfig == FEATURE_CONFIG.ALL_SYN ||
+           featConfig == FEATURE_CONFIG.NO_TOK ) {
+         this.entityFeatureExtractors.add( new AboveLeftFragmentExtractor( "AL_Polarity", "org/apache/ctakes/assertion/models/sharpPolarityFrags.txt" ) );
+      }
+
+      // NOTE: the ptk frags (dependency-words fragment) feature is not registered for any
+      // configuration; PTK_FRAGS therefore adds no extractor of its own here.
+
+      // dep regex feature:
+      if ( featConfig == FEATURE_CONFIG.DEP_REGEX || featConfig == FEATURE_CONFIG.DEP_REGEX_FRAGS ||
+           featConfig == FEATURE_CONFIG.ALL_SYN || featConfig == FEATURE_CONFIG.NO_TOK ) {
+         this.entityFeatureExtractors.add( new WindowedNegationDependencyFeatureExtractor() );
+      }
+
+      // stk constituency feature:
+      if ( featConfig == FEATURE_CONFIG.STK ) {
+         this.entityTreeExtractors.add( new AssertionAboveLeftTreeExtractor() );
+      }
+
+      // ptk dependency feature:
+      if ( featConfig == FEATURE_CONFIG.PTK ) {
+         this.entityTreeExtractors.add( new WindowedAssertionDependencyTreeExtractor() );
+      }
+
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Training: derives NEGATED / NOT_NEGATED from the annotation's polarity, records it as the
+    * last label and as the instance outcome, unless a NOT_NEGATED example is dropped by
+    * downsampling.  Classification: records the classifier label as the last label and sets
+    * negation-present polarity only for a NEGATED label; any other label yields negation-absent.
+    *
+    * @param entityOrEventMention annotation that is read (training) or updated (classification)
+    * @param instance             instance whose features are classified or whose outcome is set
+    * @throws AnalysisEngineProcessException if the classifier call raises one
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention, Instance<String> instance )
+         throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         final boolean negated = entityOrEventMention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT;
+         final String polarity = negated ? NEGATED : NOT_NEGATED;
+         this.lastLabel = polarity;
+         if ( negated ) {
+            logger.debug( "TRAINING: " + polarity );
+         }
+         // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+         if ( !negated && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( polarity );
+         return;
+      }
+      final String label = this.classifier.classify( instance.getFeatures() );
+      this.lastLabel = label;
+      // Anything other than an explicit NEGATED label is treated as negation absent.
+      int polarity = CONST.NE_POLARITY_NEGATION_ABSENT;
+      if ( NEGATED.equals( label ) ) {
+         polarity = CONST.NE_POLARITY_NEGATION_PRESENT;
+         logger.debug( String.format( "DECODING/EVAL: %s//%s [%d-%d] (%s)", label, polarity,
+               entityOrEventMention.getBegin(), entityOrEventMention.getEnd(),
+               entityOrEventMention.getClass().getName() ) );
+      }
+      entityOrEventMention.setPolarity( polarity );
+   }
+
+   /**
+    * Creates the chi-squared feature selection used by this annotator.
+    *
+    * @param threshold value passed through to the {@link Chi2FeatureSelection} constructor
+    * @return a new selection named by {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Resolves the URI of the chi-squared extractor data file inside the given directory.
+    *
+    * @param outputDirectoryName directory in which the data file is located
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final File selectionData = new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" );
+      return selectionData.toURI();
+   }
+
+   /**
+    * Clears the feature selection when the threshold is zero; otherwise creates a chi-squared one.
+    * A previously saved selection is not loaded from {@code featureSelectionURI} here.
+    *
+    * @throws ResourceInitializationException declared by the signature; this body does not throw it itself
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      this.featureSelection = ( featureSelectionThreshold == 0 )
+                              ? null
+                              : createFeatureSelection( this.featureSelectionThreshold );
+   }
+
+   /**
+    * Builds an engine description with the ALL_SYN feature configuration and an explicit classifier jar.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            PolarityCleartkAnalysisEngineWindowed.class,
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG, FEATURE_CONFIG.ALL_SYN,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Builds an engine description that uses the default polarity model path.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the factory cannot create the description
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModel = "/org/apache/ctakes/assertion/models/polarity/sharpi2b2mipacqnegex/model.jar";
+      return createAnnotatorDescription( defaultModel );
+   }
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.SubjectFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.log4j.Level;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+
+/**
+ * Windowed assertion annotator for the subject attribute of an {@link IdentifiedAnnotation}.
+ * <p>
+ * While training, the annotation's subject becomes the instance outcome; otherwise the classifier's
+ * label is written back to the annotation with {@code setSubject}.
+ */
+public class SubjectCleartkAnalysisEngineWindowed extends
+                                                  WindowedAssertionCleartkAnalysisEngine {
+
+   // Not read anywhere in this class; kept so that any existing reference to it still compiles.
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the base initialization, then registers the subject feature extractor and sets up feature selection.
+    *
+    * @param context UIMA context handed on to the superclass
+    * @throws ResourceInitializationException propagated from the superclass or from feature selection setup
+    * @throws IllegalArgumentException        if training is requested without a gold view name
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      // Assigned after super.initialize(), so it replaces whatever value was set before.
+      // 1.0 means no "patient" example is ever dropped in setClassLabel.
+      probabilityOfKeepingADefaultExample = 1.0;
+
+      if ( this.isTraining() && this.goldViewName == null ) {
+         throw new IllegalArgumentException( PARAM_GOLD_VIEW_NAME + " must be defined during training" );
+      }
+
+      initializeSubjectExtractor();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Adds the {@link SubjectFeaturesExtractor} to the entity feature extractors.
+    */
+   private void initializeSubjectExtractor() {
+      this.entityFeatureExtractors.add( new SubjectFeaturesExtractor() );
+   }
+
+   /**
+    * Training: stores the annotation's subject as the outcome, unless a "patient" example is dropped by
+    * downsampling. Classification: sets the classifier's label as the annotation's subject.
+    *
+    * @param entityOrEventMention annotation whose subject is read (training) or written (classification)
+    * @param instance             instance carrying the extracted features
+    * @throws AnalysisEngineProcessException declared by the overridden method
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         final String subj = entityOrEventMention.getSubject();
+
+         // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+         if ( "patient".equals( subj )
+              && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( subj );
+         // Only build the message (which renders the whole instance) when it will actually be logged.
+         if ( logger.isDebugEnabled() ) {
+            logger.log( Level.DEBUG, String.format( "[%s] expected: ''; actual: ''; features: %s",
+                  this.getClass().getSimpleName(),
+                  instance.toString()
+            ) );
+         }
+      } else {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         entityOrEventMention.setSubject( label );
+         if ( logger.isDebugEnabled() ) {
+            logger.log( Level.DEBUG,
+                  "SUBJECT is being set on an IdentifiedAnnotation: " + label + " " + entityOrEventMention.getSubject() );
+         }
+      }
+   }
+
+   /**
+    * Creates the Chi-squared feature selector used by this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new selector named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>( WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection data file inside the given directory.
+    *
+    * @param outputDirectoryName directory that holds the file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      return new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" ).toURI();
+   }
+
+   /**
+    * Creates the feature selector, or clears it when the threshold is exactly zero.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold == 0 ) {
+         this.featureSelection = null;
+      } else {
+         this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+      }
+   }
+
+   /**
+    * Builds an engine description for this annotator.
+    *
+    * @param modelPath value supplied for the classifier jar path parameter
+    * @return a description using the DEP_REGEX feature configuration
+    * @throws ResourceInitializationException propagated from {@code AnalysisEngineFactory}
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( SubjectCleartkAnalysisEngineWindowed.class,
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG,
+            FEATURE_CONFIG.DEP_REGEX,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+            modelPath );
+   }
+
+   /**
+    * Builds an engine description that uses the default subject model path hard-coded in this method.
+    *
+    * @return the description produced by {@link #createAnnotatorDescription(String)} for that path
+    * @throws ResourceInitializationException propagated from the delegate
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/subject/model.jar" );
+   }
+
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AssertionAboveLeftTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.UncertaintyFeatureExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedAssertionDependencyTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedDependencyWordsFragmentExtractor;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+import static org.apache.ctakes.assertion.medfacts.cleartk.windowed.WindowedAssertionCleartkAnalysisEngine.FEATURE_CONFIG.*;
+
+/**
+ * Windowed assertion annotator for the uncertainty attribute of an {@link IdentifiedAnnotation}.
+ * The outcome labels are the strings "uncertain" and "certain".
+ */
+public class UncertaintyCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   /**
+    * Runs the base initialization, registers the uncertainty feature extractors, adds the one extractor
+    * that matches the feature configuration (if any) and sets up feature selection.
+    *
+    * @param context UIMA context handed on to the superclass
+    * @throws ResourceInitializationException propagated from the superclass or from feature selection setup
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      // "certain" examples are kept with this probability while training; see setClassLabel.
+      probabilityOfKeepingADefaultExample = 0.25;
+      if ( null == this.entityFeatureExtractors ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+      this.entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( "org/apache/ctakes/assertion/models/uncertainty.txt" ) );
+      this.entityFeatureExtractors.add( new UncertaintyFeatureExtractor() );
+      // TODO: once good features are found, also add a DependencyPathRegexpFeatureExtractor here and
+      //  rethrow its FileNotFoundException wrapped in a ResourceInitializationException.
+
+      // featConfig holds a single value, so at most one of these branches applies.
+      if ( featConfig == STK_FRAGS ) {
+         this.entityFeatureExtractors.add( new AboveLeftFragmentExtractor( "AL_Unc", "org/apache/ctakes/assertion/models/jbi_paper_unc_seed_frags.txt" ) );
+      } else if ( featConfig == PTK_FRAGS ) {
+         this.entityFeatureExtractors.add( new WindowedDependencyWordsFragmentExtractor( "DW_Uncertainty", "org/apache/ctakes/assertion/models/jbi_paper_uncertainty_dw_frags.txt" ) );
+      } else if ( featConfig == STK ) {
+         this.entityTreeExtractors.add( new AssertionAboveLeftTreeExtractor() );
+      } else if ( featConfig == PTK ) {
+         this.entityTreeExtractors.add( new WindowedAssertionDependencyTreeExtractor() );
+      }
+
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Training: stores "uncertain" or "certain" as the outcome, unless a "certain" example is dropped by
+    * downsampling. Classification: maps the classifier's label onto the annotation's uncertainty value.
+    *
+    * @param entityOrEventMention annotation whose uncertainty is read (training) or written (classification)
+    * @param instance             instance carrying the extracted features
+    * @throws AnalysisEngineProcessException declared by the overridden method
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention, Instance<String> instance )
+         throws AnalysisEngineProcessException {
+      if ( !this.isTraining() ) {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         final int uncertainty;
+         if ( "uncertain".equals( label ) ) {
+            uncertainty = CONST.NE_UNCERTAINTY_PRESENT;
+         } else if ( "certain".equals( label ) ) {
+            uncertainty = CONST.NE_UNCERTAINTY_ABSENT;
+         } else {
+            // Null or unrecognised label.
+            uncertainty = 0;
+         }
+         entityOrEventMention.setUncertainty( uncertainty );
+         return;
+      }
+
+      final boolean isUncertain = entityOrEventMention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT;
+      // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+      if ( !isUncertain && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+         return;
+      }
+      instance.setOutcome( isUncertain ? "uncertain" : "certain" );
+   }
+
+   /**
+    * Creates the Chi-squared feature selector used by this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new selector named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>( FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection data file inside the given directory.
+    *
+    * @param outputDirectoryName directory that holds the file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final String dataFileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+      return new File( outputDirectoryName, dataFileName ).toURI();
+   }
+
+   /**
+    * Creates the feature selector, or clears it when the threshold is exactly zero.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold == 0 ) {
+         this.featureSelection = null;
+         return;
+      }
+      this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+   }
+
+   /**
+    * Builds an engine description for this annotator.
+    *
+    * @param modelPath value supplied for the classifier jar path parameter
+    * @return a description using the ALL_SYN feature configuration
+    * @throws ResourceInitializationException propagated from {@code AnalysisEngineFactory}
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      // Name/value pairs, in the order the factory expects them.
+      final Object[] configuration = {
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG, FEATURE_CONFIG.ALL_SYN,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath
+      };
+      return AnalysisEngineFactory.createEngineDescription( UncertaintyCleartkAnalysisEngineWindowed.class,
+            configuration );
+   }
+
+   /**
+    * Builds an engine description that uses the default uncertainty model path hard-coded in this method.
+    *
+    * @return the description produced by {@link #createAnnotatorDescription(String)} for that path
+    * @throws ResourceInitializationException propagated from the delegate
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModelPath = "/org/apache/ctakes/assertion/models/uncertainty/model.jar";
+      return createAnnotatorDescription( defaultModelPath );
+   }
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,544 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.FedaFeatureFunction;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.AbstractWindowedContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.FollowingContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.LastCoveredContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.PrecedingContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.AbstractWindowedFeatureExtractor1;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.temporary.assertion.AssertionCuePhraseAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ConfigurationParameterFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.TreeFeature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.function.FeatureFunctionExtractor;
+
+import java.io.File;
+import java.net.URI;
+import java.util.*;
+
+/**
+ * @author swu
+ */
+public abstract class WindowedAssertionCleartkAnalysisEngine extends
+                                                             CleartkAnnotator<String> {
+   Logger logger = Logger.getLogger( WindowedAssertionCleartkAnalysisEngine.class ); // package-private instance logger, also used by subclasses in this package
+
+   public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
+
+   public enum FEATURE_CONFIG { // values accepted by PARAM_FEATURE_CONFIG
+      NO_SEM, NO_SYN, STK, STK_FRAGS, PTK, PTK_FRAGS, DEP_REGEX, DEP_REGEX_FRAGS, ALL_SYN, VECTORS, NO_TOK
+   }
+
+   public static int relationId; // counter for error logging; static and mutable, so shared by every instance
+
+   // Name of the optional configuration parameter used for domain adaptation (see fileDomainMap).
+   public static final String FILE_TO_DOMAIN_MAP = "mapTrainFileToDomain";
+
+
+   @ConfigurationParameter(
+         name = PARAM_GOLD_VIEW_NAME,
+         mandatory = false,
+         description = "view containing the manual identified annotations (especially EntityMention and EventMention annotations); needed for training" )
+   protected String goldViewName; // initialize() rejects null while training
+
+   public static final String PARAM_PRINT_ERRORS = "PrintErrors";
+
+   @ConfigurationParameter(
+         name = PARAM_PRINT_ERRORS,
+         mandatory = false,
+         description = "Print errors true/false",
+         defaultValue = "false" )
+   boolean printErrors;
+
+   public static final String PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE = "ProbabilityOfKeepingADefaultExample";
+
+   @ConfigurationParameter(
+         name = PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE,
+         mandatory = false,
+         description = "probability that a default example should be retained for training" )
+   protected double probabilityOfKeepingADefaultExample = 1.0; // NOTE: the Subject and Uncertainty engines overwrite this in initialize()
+
+   public static final String PARAM_PORTION_OF_DATA_TO_USE = "PortionOfDataToUse";
+   @ConfigurationParameter(
+         name = PARAM_PORTION_OF_DATA_TO_USE,
+         mandatory = false,
+         description = "How much data to actually use during training (e.g. for building learning curves)"
+   )
+   protected double portionOfDataToUse = 1.0;
+
+   public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection";
+   // NOTE(review): the parameter name reads like a boolean switch, but the value is the Chi-squared threshold itself.
+
+   @ConfigurationParameter(
+         name = PARAM_FEATURE_SELECTION_THRESHOLD,
+         mandatory = false,
+         description = "the Chi-squared threshold at which features should be removed" )
+   protected Float featureSelectionThreshold = 0f; // boxed Float; the Subject and Uncertainty engines treat 0 as "no feature selection"
+
+   public static final String PARAM_FEATURE_CONFIG = "FEATURE_CONFIG";
+   @ConfigurationParameter(
+         name = PARAM_FEATURE_CONFIG,
+         description = "Feature configuration to use (for experiments)",
+         mandatory = false
+   )
+   protected FEATURE_CONFIG featConfig = FEATURE_CONFIG.ALL_SYN;
+
+   public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+   @ConfigurationParameter(
+         mandatory = false,
+         name = PARAM_FEATURE_SELECTION_URI,
+         description = "provides a URI where the feature selection data will be written" )
+   protected URI featureSelectionURI;
+
+   protected static Random coin = new Random( 0 ); // fixed seed 0; one static generator used for the downsampling draws in setClassLabel
+
+   protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
+   @ConfigurationParameter(
+         name = FILE_TO_DOMAIN_MAP,
+         mandatory = false,
+         description = "a map of filenames to their respective domains (i.e., directories that contain them)" )
+   protected String fileDomainMap; // split on ';' or ':' in initialize(), one directory per piece
+   protected Map<String, String> fileToDomain = new HashMap<>(); // file name without extension -> domain id; filled in initialize()
+
+   protected String lastLabel; // reset to "<BEGIN>" in process()
+
+   //   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> contextFeatureExtractors;
+//   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenContextFeatureExtractors;
+   protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenCleartkExtractors; // created in initialize()
+   protected List<FeatureExtractor1<IdentifiedAnnotation>> entityFeatureExtractors; // created empty in initialize(); subclasses add to it
+   protected List<FeatureExtractor1<IdentifiedAnnotation>> entityTreeExtractors; // created empty at the end of initialize()
+//   protected CleartkExtractor<IdentifiedAnnotation, BaseToken> cuePhraseInWindowExtractor;
+
+
+   protected List<FeatureFunctionExtractor<IdentifiedAnnotation>> featureFunctionExtractors = new ArrayList<>();
+   protected FedaFeatureFunction ffDomainAdaptor = null; // created in initialize() only when fileToDomain is not empty
+
+   protected FeatureSelection<String> featureSelection; // assigned by initializeFeatureSelection() in subclasses
+
+
+   protected List<AbstractWindowedContext> _windowedContexts = new ArrayList<>(); // filled in initialize(); process() calls setWindow on each with a sentence's BaseTokens
+
+
+   public abstract void setClassLabel( IdentifiedAnnotation entityMention, Instance<String> instance ) // training: set the outcome; otherwise: write the classifier label to the annotation
+         throws AnalysisEngineProcessException;
+
+   protected abstract void initializeFeatureSelection() throws ResourceInitializationException; // subclasses set featureSelection here
+
+   /**
+    * Chooses the view whose annotations are processed.
+    *
+    * @param jCas the cas handed to {@code process}
+    * @return the view named by {@code goldViewName} while training, otherwise {@code jCas} itself
+    * @throws AnalysisEngineProcessException wrapping the {@link CASException} raised when fetching the view
+    */
+   private JCas getAnnotationView( final JCas jCas ) throws AnalysisEngineProcessException {
+      if ( !this.isTraining() ) {
+         return jCas;
+      }
+      try {
+         return jCas.getView( this.goldViewName );
+      } catch ( CASException casE ) {
+         throw new AnalysisEngineProcessException( casE );
+      }
+   }
+
+   /**
+    * Initializes the state shared by all windowed assertion annotators: the file-to-domain map, the
+    * windowed contexts, the token context extractor, the optional domain adaptor and the (empty)
+    * entity extractor lists that subclasses fill afterwards.
+    *
+    * @param context UIMA context handed on to the superclass
+    * @throws ResourceInitializationException propagated from the superclass
+    * @throws IllegalArgumentException        if training is requested without a gold view name
+    */
+   @Override
+   @SuppressWarnings( "deprecation" )
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+
+      // Re-process the "directory" string for domains that were used in the data
+      if ( null != fileDomainMap ) {
+         // NOTE(review): ':' is a separator here, so a path that itself contains ':' is split apart.
+         for ( String dir : fileDomainMap.split( "[;:]" ) ) {
+
+            // TODO: normalize dir to real domainId
+            final String domainId = normalizeToDomain( dir );
+
+            // List the directory exactly once. listFiles() returns null when the path is not a directory
+            // or an I/O error occurs, so a null check on one call says nothing about a second call.
+            final File[] dataFiles = new File( dir ).listFiles();
+            if ( dataFiles == null ) {
+               continue;
+            }
+            for ( File f : dataFiles ) {
+               fileToDomain.put( FilenameUtils.removeExtension( f.getName() ), domainId );
+            }
+         }
+      }
+
+      if ( this.isTraining() && this.goldViewName == null ) {
+         throw new IllegalArgumentException( PARAM_GOLD_VIEW_NAME + " must be defined during training" );
+      }
+
+      // a list of feature extractors that require only the token:
+      // the stem of the word, the text of the word itself, plus
+      // features created from the word text like character ngrams
+      this.entityFeatureExtractors = new ArrayList<>();
+
+      this.tokenCleartkExtractors = new ArrayList<>();
+
+
+      // Windowed contexts. They are registered so that process() can hand each one the current
+      // sentence's BaseTokens, but they are not yet wired into the token extractor below.
+      final LastCoveredContext lastExtraction1 = new LastCoveredContext( 2 );
+      final PrecedingContext precedingExtraction1 = new PrecedingContext( 5 );
+      final FollowingContext followingExtraction1 = new FollowingContext( 4 );
+      final PrecedingContext bagPreceding3 = new PrecedingContext( 3 );
+      final PrecedingContext bagPreceding5 = new PrecedingContext( 5 );
+      final PrecedingContext bagPreceding10 = new PrecedingContext( 10 );
+      final FollowingContext bagFollowing3 = new FollowingContext( 3 );
+      final FollowingContext bagFollowing5 = new FollowingContext( 5 );
+      final FollowingContext bagFollowing10 = new FollowingContext( 10 );
+      _windowedContexts.add( lastExtraction1 );
+      _windowedContexts.add( precedingExtraction1 );
+      _windowedContexts.add( followingExtraction1 );
+      _windowedContexts.add( bagPreceding3 );
+      _windowedContexts.add( bagPreceding5 );
+      _windowedContexts.add( bagPreceding10 );
+      _windowedContexts.add( bagFollowing3 );
+      _windowedContexts.add( bagFollowing5 );
+      _windowedContexts.add( bagFollowing10 );
+
+      // Intended replacement for the extractor below, built on the windowed contexts (not yet enabled):
+//      CleartkExtractor<IdentifiedAnnotation, BaseToken> tokenExtraction1 =
+//            new CleartkExtractor<>(
+//                  BaseToken.class,
+//                  new CoveredTextExtractor<>(),
+//                  lastExtraction1,
+//                  precedingExtraction1,
+//                  followingExtraction1,
+//                  new WindowedBag( bagPreceding3 ),
+//                  new WindowedBag( bagFollowing3 ),
+//                  new WindowedBag( bagPreceding5 ),
+//                  new WindowedBag( bagFollowing5 ),
+//                  new WindowedBag( bagPreceding10 ),
+//                  new WindowedBag( bagFollowing10 )
+//            );
+      CleartkExtractor<IdentifiedAnnotation, BaseToken> tokenExtraction1 =
+            new CleartkExtractor<>(
+                  BaseToken.class,
+                  new CoveredTextExtractor<>(),
+                  new CleartkExtractor.LastCovered( 2 ),     // Worked fine
+//                  lastExtraction1,                             // Does not work here yet, although it is meant to do the same thing
+                  new CleartkExtractor.Preceding( 5 ),
+                  new CleartkExtractor.Following( 4 ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 3 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 3 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 5 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 5 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 10 ) ),
+                  new CleartkExtractor.Bag( new CleartkExtractor.Following( 10 ) )
+            );
+
+      this.tokenCleartkExtractors.add( tokenExtraction1 );
+      if ( !fileToDomain.isEmpty() ) {
+         // set up FeatureFunction for all the laggard, non-Extractor features
+         // The HashSet removes duplicate domain ids before they are handed to the adaptor.
+         ffDomainAdaptor = new FedaFeatureFunction( new ArrayList<>( new HashSet<>( fileToDomain.values() ) ) );
+      }
+      entityTreeExtractors = new ArrayList<>();
+   }
+
+   @Override
+   public void process( JCas jCas ) throws AnalysisEngineProcessException {
+      String documentId = DocumentIDAnnotationUtil.getDocumentID( jCas );
+      String domainId = "";
+      String domainFeature = null;
+
+      if ( this.featureFunctionExtractors.size() <= 0 ) {
+         this.ffDomainAdaptor = null;
+      }
+
+      if ( documentId != null ) {
+         logger.debug( "processing next doc: " + documentId );
+         // set the domain to be FeatureFunction'ed into all extractors
+         if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+            domainId = fileToDomain.get( documentId );
+            // if domain is not found, no warning -- just considers general domain
+            ffDomainAdaptor.setDomain( domainId );
+         } else if ( !fileToDomain.isEmpty() ) {
+            domainFeature = fileToDomain.get( documentId );
+         }
+      } else {
+         logger.debug( "processing next doc (doc id is null)" );
+      }
+
+      this.lastLabel = "<BEGIN>";
+
+      final JCas annotationView = getAnnotationView( jCas );
+
+      // generate a list of training instances for each sentence in the document
+      // Use an indexed map.  This is faster than calling select and then selectCovering within a loop.
+      final Map<Sentence, Collection<Annotation>> sentenceAnnotationMap
+            = JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
+      // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+      final List<IdentifiedAnnotation> entities = new ArrayList<>();
+      final List<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
+      final List<BaseToken> baseTokens = new ArrayList<>();
+//          25 Dec 2018 10:51:49  INFO CleartkAnalysisEngine - Assigning Attributes ...
+//          25 Dec 2018 14:35:45  INFO CleartkAnalysisEngine - Finished Assigning Attributes
+      // Rather than iterate through all features again, just sort the sentences that have already been fetched.
+      // As far as I can tell, order should be unnecessary.
+      // Using a treemap that is sorted during putAll prevents the need to run a Map.get(..) - fast, but not that fast.
+//          25 Dec 2018 14:52:37  INFO CleartkAnalysisEngine - Assigning Attributes ...
+//          25 Dec 2018 18:32:24  INFO CleartkAnalysisEngine - Finished Assigning Attributes
+      //
+      //  TODO : Windowed Assertion:
+//      26 Dec 2018 16:21:30  INFO CleartkAnalysisEngine - Assigning Attributes ...
+//      26 Dec 2018 17:38:11  INFO CleartkAnalysisEngine - Finished Assigning Attributes
+      final TreeMap<Sentence, Collection<Annotation>> sentenceTreeMap
+            = new TreeMap<>( Comparator.comparingInt( Sentence::getBegin ) );
+      sentenceTreeMap.putAll( sentenceAnnotationMap );
+      // History needs full list of sentences
+      for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
+         if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+            ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
+         }
+      }
+      for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
+         if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+            ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
+         }
+      }
+
+      int sentenceIndex = -1;
+      for ( Map.Entry<Sentence, Collection<Annotation>> sortedEntry : sentenceTreeMap.entrySet() ) {
+         sentenceIndex++;
+         final Sentence coveringSent = sortedEntry.getKey();
+         final List<Annotation> coveredAnnotations = new ArrayList<>( sortedEntry.getValue() );
+         coveredAnnotations.sort( Comparator.comparingInt( Annotation::getBegin ) );
+//         _windowedContexts.forEach( c -> c.setWindow( coveredAnnotations ) );
+         // Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
+         // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+         entities.clear();
+         cues.clear();
+         baseTokens.clear();
+         for ( Annotation annotation : coveredAnnotations ) {
+            if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
+               entities.add( (IdentifiedAnnotation)annotation );
+            } else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
+               cues.add( (AssertionCuePhraseAnnotation)annotation );
+            } else if ( annotation instanceof BaseToken ) {
+               baseTokens.add( (BaseToken)annotation );
+            }
+         }
+         _windowedContexts.forEach( c -> c.setWindow( baseTokens ) );
+
+         for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
+            if ( identifiedAnnotation.getPolarity() == -1 ) {
+               logger.debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
+                     identifiedAnnotation.getBegin(),
+                     identifiedAnnotation.getEnd(),
+                     identifiedAnnotation.getPolarity(),
+                     identifiedAnnotation.getClass().getName() ) );
+            }
+            Instance<String> instance = new Instance<>();
+
+            if ( domainFeature != null ) {
+               instance.add( new Feature( "Domain", domainFeature ) );
+            }
+            // only use extract this version if not doing domain adaptation
+            if ( ffDomainAdaptor == null ) {
+               for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
+                  instance.addAll( extractor
+                        .extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
+               }
+            }
+
+            int closest = Integer.MAX_VALUE;
+            AssertionCuePhraseAnnotation closestCue = null;
+            for ( AssertionCuePhraseAnnotation cue : cues ) {
+               // It is much faster to count between BaseTokens already isolated within the same sentence.
+               final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
+               if ( betweenCount < closest ) {
+                  closestCue = cue;
+                  closest = betweenCount;
+               }
+            }
+            if ( closestCue != null && closest < 21 ) {
+               instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
+               instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
+               instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
+
+               // add hack-ey domain adaptation to these hacked-in features
+               if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_PhraseFamily", closestCue
+                              .getCuePhraseAssertionFamily() ) ) );
+                  instance.addAll( ffDomainAdaptor
+                        .apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
+               }
+
+            }
+
+            // 7/9/13 SRH trying to make it work just for anatomical site
+            int eemTypeId = identifiedAnnotation.getTypeID();
+            if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
+               // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
+               instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
+               // add hack-ey domain adaptation to these hacked-in features
+               if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+                  instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
+               }
+            }
+
+            // only extract these features if not doing domain adaptation
+            if ( ffDomainAdaptor == null ) {
+               for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
+                  if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+                     ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
+                  }
+                  instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+               }
+            }
+
+            for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
+               if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+                  ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
+               }
+               instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+            }
+
+            List<Feature> feats = instance.getFeatures();
+
+            for ( Feature feat : feats ) {
+               if ( feat instanceof TreeFeature ||
+                    (feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
+                                                feat.getName().startsWith( "WORD" ) ||
+                                                feat.getName().startsWith( "NEG" ))) ) {
+                  continue;
+               }
+               if ( feat.getName() != null &&
+                    (feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
+                     feat.getName().contains( "_NEG" )) ) {
+                  continue;
+               }
+               if ( feat.getValue() instanceof String ) {
+                  feat.setValue( ((String)feat.getValue()).toLowerCase() );
+               }
+            }
+
+            if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+               for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
+                  // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
+                  instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+               }
+            }
+
+
+            // grab the output label
+            setClassLabel( identifiedAnnotation, instance );
+
+            if ( this.isTraining() ) {
+               // apply feature selection, if necessary
+               if ( this.featureSelection != null ) {
+                  feats = this.featureSelection.transform( feats );
+               }
+
+               // ensures that the (possibly) transformed feats are used
+               if ( instance.getOutcome() != null ) {
+                  if ( coin.nextDouble() < this.portionOfDataToUse ) {
+                     this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   /**
+    * Creates an engine description for {@code WindowedAssertionCleartkAnalysisEngine}.
+    *
+    * @param additionalConfiguration optional configuration data; it is handed to
+    *                                {@code ConfigurationParameterFactory.addConfigurationParameters}
+    *                                only when at least one element is supplied
+    * @return the engine description, with the additional configuration applied if any was given
+    * @throws ResourceInitializationException propagated from the description factory calls
+    */
+   public static AnalysisEngineDescription getDescription( Object... additionalConfiguration )
+         throws ResourceInitializationException {
+      final AnalysisEngineDescription description
+            = AnalysisEngineFactory.createEngineDescription( WindowedAssertionCleartkAnalysisEngine.class );
+      if ( additionalConfiguration.length == 0 ) {
+         // Nothing extra to configure.
+         return description;
+      }
+      ConfigurationParameterFactory.addConfigurationParameters( description, additionalConfiguration );
+      return description;
+   }
+
+//   public Map<String, String> getTrainFileToDomain() {
+//      return fileToDomain;
+//   }
+//
+//   public void setTrainFileToDomain( Map<String, String> trainFileToDomain ) {
+//      this.fileToDomain = trainFileToDomain;
+//   }
+
+   /**
+    * Looks in the domain string (path) for a meaningful corpus name.
+    * The path is split on '/' and its segments are examined from last to first.
+    * Segments that are empty, or whose lower-cased form starts with "test", "train" or "dev", are skipped;
+    * the first remaining segment is returned.
+    *
+    * @param dir path of a corpus directory, using '/' as the separator
+    * @return the last path segment that is neither empty nor a test/train/dev segment,
+    * or {@code dir} itself when no such segment exists
+    */
+   public static String normalizeToDomain( String dir ) {
+      // TODO: real normalization
+      final String[] parts = dir.split( "/" );
+      // Walk backwards instead of copying and reversing the segments.
+      for ( int i = parts.length - 1; i >= 0; i-- ) {
+         final String part = parts[ i ];
+         if ( part.isEmpty() ) {
+            // A leading or doubled '/' produces an empty segment, which cannot name a corpus.
+            continue;
+         }
+         // Locale.ROOT keeps the prefix test independent of the default locale (e.g. Turkish dotless i).
+         final String lower = part.toLowerCase( java.util.Locale.ROOT );
+         if ( lower.startsWith( "test" ) || lower.startsWith( "train" ) || lower.startsWith( "dev" ) ) {
+            continue;
+         }
+         return part;
+      }
+      return dir;
+   }
+
+
+   /**
+    * Counts the base tokens that lie in the gap between two annotations, in either order.
+    * The gap runs from the smaller of the two end offsets to the larger of the two begin offsets.
+    * Annotation offsets are half-open, so a token that starts exactly where the first annotation ends
+    * (or ends exactly where the second one begins) does not overlap either annotation and is counted.
+    * This is the same inclusive boundary handling as uimaFIT's {@code JCasUtil.selectBetween}.
+    * When the annotations overlap there is no gap and the result is 0.
+    *
+    * @param annotation1 one annotation
+    * @param annotation2 the other annotation
+    * @param baseTokens  baseTokens within window
+    * @return number of basetokens that lie between annotation1 and annotation2
+    */
+   static private int countBetween( final Annotation annotation1,
+                                    final Annotation annotation2,
+                                    final Collection<BaseToken> baseTokens ) {
+      final int lowEnd = Math.min( annotation1.getEnd(), annotation2.getEnd() );
+      final int highBegin = Math.max( annotation1.getBegin(), annotation2.getBegin() );
+      int between = 0;
+      for ( BaseToken baseToken : baseTokens ) {
+         // Inclusive bounds: tokens directly adjacent to either annotation (e.g. punctuation) are in the gap.
+         if ( lowEnd <= baseToken.getBegin() && baseToken.getEnd() <= highBegin ) {
+            between++;
+         }
+      }
+      return between;
+   }
+
+}

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java Tue Jan  8 03:45:51 2019
@@ -0,0 +1,162 @@
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed.classifier;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+
+/**
+ * Rule-based classifier deciding whether a mention is "generic".
+ * Four boolean features are extracted for a mention ({@link #extract}) and combined by fixed logic
+ * ({@link #classifyWithLogic}).  The sentence covering the mention is supplied by the caller.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 12/26/2018
+ */
+public class WindowedGenericAttributeClassifier {
+
+   // Feature keys.  The string values are the keys stored in the feature map and are kept unchanged.
+   private static final String POSTCOORD_NMOD = "donor_srlarg";
+   private static final String DISCUSSION_DEPPATH = "discussion_deppath";
+   private static final String SUBSUMED_CHUNK = "other_token";
+   private static final String SUBSUMED_ANNOT = "other_deppath";
+
+   // Patterns are compiled once instead of on every String.matches call.
+   private static final Pattern MODIFIER_DEPREL
+         = Pattern.compile( "(NMOD|amod|nmod|det|predet|nn|poss|possessive|infmod|partmod|rcmod)" );
+   private static final Pattern DONOR_TERM = Pattern.compile( "(donor).*" );
+   private static final Pattern DISCUSSION_TERM
+         = Pattern.compile( "(discuss|ask|understand|understood|tell|told|mention|talk|speak|spoke|address).*" );
+
+   /**
+    * Names of all features produced by {@link #extract}.
+    */
+   public static ArrayList<String> FeatureIndex = new ArrayList<>();
+
+   static {
+      FeatureIndex.add( POSTCOORD_NMOD );
+      FeatureIndex.add( DISCUSSION_DEPPATH );
+      FeatureIndex.add( SUBSUMED_CHUNK );
+      FeatureIndex.add( SUBSUMED_ANNOT );
+   }
+
+   /**
+    * Extracts the features for a mention and classifies them.
+    *
+    * @param jCas     ye olde ...
+    * @param sentence sentence covering the mention; may be null, in which case the subsumption features stay false
+    * @param mention  mention to classify
+    * @return true if the mention is classified as generic
+    */
+   public static Boolean getGeneric( JCas jCas, final Sentence sentence, IdentifiedAnnotation mention ) {
+      return classifyWithLogic( extract( jCas, sentence, mention ) );
+   }
+
+
+   /**
+    * Logic to identify cases, may be replaced by learned classification.
+    * The result is true when the discussion-path, subsumed-annotation or modifier-head feature is set.
+    * The subsumed-chunk feature is extracted but intentionally does not contribute.
+    * A feature that is absent from the map (or mapped to null) is treated as false.
+    *
+    * @param vfeat feature values keyed by feature name
+    * @return true if the features indicate a generic mention
+    */
+   public static Boolean classifyWithLogic( HashMap<String, Boolean> vfeat ) {
+      return isSet( vfeat, DISCUSSION_DEPPATH )
+             || isSet( vfeat, SUBSUMED_ANNOT )
+             || isSet( vfeat, POSTCOORD_NMOD );
+   }
+
+
+   /**
+    * @param vfeat feature values keyed by feature name
+    * @param key   feature name
+    * @return true only if the feature is present and true; avoids unboxing a missing (null) value
+    */
+   private static boolean isSet( final HashMap<String, Boolean> vfeat, final String key ) {
+      return Boolean.TRUE.equals( vfeat.get( key ) );
+   }
+
+
+   /**
+    * Computes the boolean features for an annotation.  Every name in {@link #FeatureIndex} is present
+    * in the result, defaulting to false.
+    *
+    * @param jCas     ye olde ...
+    * @param sentence sentence covering the annotation; when null the two subsumption features are not computed
+    * @param arg      annotation for which features are wanted
+    * @return feature values keyed by feature name
+    */
+   public static HashMap<String, Boolean> extract( JCas jCas,
+                                                   final Sentence sentence,
+                                                   Annotation arg ) {
+      final HashMap<String, Boolean> vfeat = new HashMap<>();
+      for ( String feat : FeatureIndex ) {
+         vfeat.put( feat, false );
+      }
+
+      if ( sentence != null ) {
+         // 2) some other identified annotation subsumes this one?
+         final List<IdentifiedAnnotation> mentions
+               = JCasUtil.selectPreceding( jCas, IdentifiedAnnotation.class, arg, 5 );
+         mentions.addAll( JCasUtil.selectFollowing( jCas, IdentifiedAnnotation.class, arg, 5 ) );
+         if ( isSubsumed( jCas, mentions, arg ) ) {
+            vfeat.put( SUBSUMED_ANNOT, true );
+         }
+
+         // 3) some chunk subsumes this?
+         final List<Chunk> chunks = JCasUtil.selectPreceding( jCas, Chunk.class, arg, 5 );
+         chunks.addAll( JCasUtil.selectFollowing( jCas, Chunk.class, arg, 5 ) );
+         if ( isSubsumed( jCas, chunks, arg ) ) {
+            vfeat.put( SUBSUMED_CHUNK, true );
+         }
+      }
+
+      final List<ConllDependencyNode> depnodes = JCasUtil.selectCovered( jCas, ConllDependencyNode.class, arg );
+      if ( depnodes.isEmpty() ) {
+         return vfeat;
+      }
+      final ConllDependencyNode depnode = DependencyUtility.getNominalHeadNode( depnodes );
+      if ( depnode == null ) {
+         // Guard: without a head node neither dependency feature can be computed.
+         return vfeat;
+      }
+
+      // 1) check if the head node of the entity mention is really just part of a larger noun phrase
+      final String deprel = depnode.getDeprel();
+      if ( deprel != null && MODIFIER_DEPREL.matcher( deprel ).matches() ) {
+         vfeat.put( POSTCOORD_NMOD, true );
+      }
+
+      // 4) search dependency paths for discussion context
+      for ( ConllDependencyNode dn : DependencyUtility.getPathToTop( jCas, depnode ) ) {
+         if ( isDiscussionContext( dn ) ) {
+            vfeat.put( DISCUSSION_DEPPATH, true );
+            // One hit is enough; the feature cannot be unset again.
+            break;
+         }
+      }
+      return vfeat;
+   }
+
+
+   /**
+    * Checks the candidates in order, stopping at the first one that begins after {@code arg} begins.
+    * A candidate that does not end before {@code arg} ends, and whose nominal head node does not have
+    * the same coverage as that of {@code arg}, is taken to be a superset of {@code arg}.
+    *
+    * @param jCas       ye olde ...
+    * @param candidates annotations near {@code arg}
+    * @param arg        annotation that may be subsumed
+    * @return true if some candidate subsumes {@code arg}
+    */
+   private static boolean isSubsumed( final JCas jCas,
+                                      final List<? extends Annotation> candidates,
+                                      final Annotation arg ) {
+      for ( Annotation candidate : candidates ) {
+         if ( candidate.getBegin() > arg.getBegin() ) {
+            break;
+         }
+         if ( candidate.getEnd() < arg.getEnd() ) {
+            continue;
+         }
+         if ( !DependencyUtility.equalCoverage(
+               DependencyUtility.getNominalHeadNode( jCas, candidate ),
+               DependencyUtility.getNominalHeadNode( jCas, arg ) ) ) {
+            // the case that candidate is a superset
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * @param arg annotation to test
+    * @return true if the lower-cased covered text starts with "donor"
+    */
+   private static boolean isDonorTerm( Annotation arg ) {
+      final String text = arg.getCoveredText();
+      return text != null && DONOR_TERM.matcher( text.toLowerCase( Locale.ROOT ) ).matches();
+   }
+
+
+   /**
+    * @param arg annotation to test
+    * @return true if the lower-cased covered text starts with one of the discussion verbs
+    */
+   private static boolean isDiscussionContext( Annotation arg ) {
+      final String text = arg.getCoveredText();
+      // Locale.ROOT keeps lower-casing independent of the default locale.
+      return text != null && DISCUSSION_TERM.matcher( text.toLowerCase( Locale.ROOT ) ).matches();
+   }
+
+
+   // a main method for regex testing
+   public static void main( String[] args ) {
+      final String s = "steps";
+      final boolean matched = s.toLowerCase( Locale.ROOT )
+                               .matches( ".*(in-law|stepc|stepd|stepso|stepf|stepm|step-).*" );
+      System.out.println( matched ? "match" : "no match" );
+   }
+
+
+}