You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2019/01/08 03:45:51 UTC
svn commit: r1850705 [1/2] - in
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed:
./ classifier/ context/ context/feature/ context/feature/extractor/
Author: seanfinan
Date: Tue Jan 8 03:45:51 2019
New Revision: 1850705
URL: http://svn.apache.org/viewvc?rev=1850705&view=rev
Log:
CTAKES-449 : Partial rewrite using forwarded Annotations for speed. Already faster, but Bag needs to be finished in WindowedAssertionCleartkAnalysisEngine.
Added:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedHistoryAttributeClassifier.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractLeftToRightContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractRightToLeftContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/AbstractWindowedContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/FollowingContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/LastCoveredContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/PrecedingContext.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/WindowedBag.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/WindowedContextFeature.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/AbstractTreeFragmentFeatureExtractor1.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/AbstractWindowedFeatureExtractor1.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedAssertionDependencyTreeExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedContextWordWindowExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedDependencyWordsFragmentExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedGenericFeaturesExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedHistoryFeatureExtractor.java
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/context/feature/extractor/WindowedNegationDependencyFeatureExtractor.java
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/ConditionalCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+
+/**
+ * Assertion annotator that decides whether an {@link IdentifiedAnnotation} is conditional.
+ * <p>
+ * In training mode the annotation's conditional flag becomes the instance outcome; otherwise the
+ * classifier's label is parsed as a boolean and written to the annotation.
+ */
+@PipeBitInfo(
+      name = "ClearTK Conditional Annotator",
+      description = "Determines whether or not Identified Annotations are conditional.",
+      dependencies = { PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION },
+      usables = { PipeBitInfo.TypeProduct.DOCUMENT_ID }
+)
+public class ConditionalCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   /**
+    * Runs the parent initialization, sets the keep-probability for default examples to 1.0
+    * and sets up feature selection.
+    *
+    * @param context UIMA context passed on to the parent initializer
+    * @throws ResourceInitializationException if initialization fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      // 1.0 is the setting used for "no downsampling" by the check in setClassLabel.
+      probabilityOfKeepingADefaultExample = 1.0;
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Labels the instance for training, or classifies it and stores the result on the annotation.
+    *
+    * @param entityOrEventMention annotation whose conditional flag is read (training) or set (classification)
+    * @param instance             instance holding the features of the annotation
+    * @throws AnalysisEngineProcessException declared for failures while classifying
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( !isTraining() ) {
+         final String label = classifier.classify( instance.getFeatures() );
+         // Boolean.parseBoolean returns false for a null label, so a missing label yields false.
+         entityOrEventMention.setConditional( Boolean.parseBoolean( label ) );
+         return;
+      }
+      final boolean conditional = entityOrEventMention.getConditional();
+      // Downsampling: the coin is tossed only for non-conditional examples, and the instance is
+      // left without an outcome when the toss is at or above the keep-probability.
+      if ( !conditional && coin.nextDouble() >= probabilityOfKeepingADefaultExample ) {
+         return;
+      }
+      instance.setOutcome( String.valueOf( conditional ) );
+   }
+
+   /**
+    * Creates the Chi-squared feature selection for this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new feature selection named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection file inside the given directory.
+    *
+    * @param outputDirectoryName directory containing the feature selection file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final String fileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+      return new File( outputDirectoryName, fileName ).toURI();
+   }
+
+   /**
+    * Clears feature selection when the threshold is zero; otherwise creates it from the threshold.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      featureSelection = featureSelectionThreshold == 0
+                         ? null
+                         : createFeatureSelection( featureSelectionThreshold );
+   }
+
+   /**
+    * Creates an engine description whose classifier jar path is the given model path.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            ConditionalCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Creates an engine description that uses the default conditional model jar.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/conditional/model.jar" );
+   }
+
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/GenericCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedGenericFeaturesExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+/**
+ * Assertion annotator that assigns the generic status of an {@link IdentifiedAnnotation}.
+ * <p>
+ * Registers a context-word window extractor constructed with {@code generic.txt} and a generic
+ * features extractor, in addition to whatever the parent class configures.
+ */
+@PipeBitInfo(
+      name = "Generic Status ClearTK Annotator",
+      description = "Annotates the Generic status for Identified Annotations.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class GenericCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   // Not read anywhere within this class.
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the parent initialization, sets the keep-probability for default examples to 0.5,
+    * registers the generic extractors and sets up feature selection.
+    *
+    * @param context UIMA context passed on to the parent initializer
+    * @throws ResourceInitializationException if initialization fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 0.5;
+
+      addGenericExtractors();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Appends the generic-specific extractors, creating the extractor list first if it is null.
+    */
+   private void addGenericExtractors() {
+      if ( entityFeatureExtractors == null ) {
+         entityFeatureExtractors = new ArrayList<>();
+      }
+      final String resourcePath = "org/apache/ctakes/assertion/models/generic.txt";
+      entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( resourcePath ) );
+      entityFeatureExtractors.add( new WindowedGenericFeaturesExtractor() );
+   }
+
+   /**
+    * Labels the instance for training, or classifies it and stores the result on the annotation.
+    *
+    * @param entityOrEventMention annotation whose generic flag is read (training) or set (classification)
+    * @param instance             instance holding the features of the annotation
+    * @throws AnalysisEngineProcessException declared for failures while classifying
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( !isTraining() ) {
+         final String label = classifier.classify( instance.getFeatures() );
+         entityOrEventMention.setGeneric( Boolean.parseBoolean( label ) );
+         return;
+      }
+      final boolean generic = entityOrEventMention.getGeneric();
+      // Downsampling: the coin is tossed only for non-generic examples, and the instance is
+      // left without an outcome when the toss is at or above the keep-probability.
+      if ( !generic && coin.nextDouble() >= probabilityOfKeepingADefaultExample ) {
+         return;
+      }
+      instance.setOutcome( String.valueOf( generic ) );
+   }
+
+   /**
+    * Creates the Chi-squared feature selection for this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new feature selection named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection file inside the given directory.
+    *
+    * @param outputDirectoryName directory containing the feature selection file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final String fileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+      return new File( outputDirectoryName, fileName ).toURI();
+   }
+
+   /**
+    * Clears feature selection when the threshold is zero; otherwise creates it from the threshold.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      featureSelection = featureSelectionThreshold == 0
+                         ? null
+                         : createFeatureSelection( featureSelectionThreshold );
+   }
+
+   /**
+    * Creates an engine description whose classifier jar path is the given model path.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            GenericCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Creates an engine description that uses the default generic model jar.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/generic/model.jar" );
+   }
+
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/HistoryCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedHistoryFeatureExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+/**
+ * Assertion annotator that assigns the history-of property of an {@link IdentifiedAnnotation}.
+ * <p>
+ * Registers a context-word window extractor constructed with {@code history.txt} and a history
+ * feature extractor, in addition to whatever the parent class configures.
+ */
+@PipeBitInfo(
+      name = "History of ClearTK Annotator",
+      description = "Annotate History of property.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class HistoryCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   // Not read anywhere within this class.
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the parent initialization, sets the keep-probability for default examples to 0.5,
+    * registers the history extractors and sets up feature selection.
+    *
+    * @param context UIMA context passed on to the parent initializer
+    * @throws ResourceInitializationException if initialization fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 0.5;
+
+      addHistoryExtractors();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Appends the history-specific extractors, creating the extractor list first if it is null.
+    */
+   private void addHistoryExtractors() {
+      if ( this.entityFeatureExtractors == null ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+      final String resourcePath = "org/apache/ctakes/assertion/models/history.txt";
+      this.entityFeatureExtractors.add( new WindowedContextWordWindowExtractor( resourcePath ) );
+      this.entityFeatureExtractors.add( new WindowedHistoryFeatureExtractor() );
+   }
+
+   /**
+    * Labels the instance for training, or classifies it and stores the result on the annotation.
+    *
+    * @param entityOrEventMention annotation whose history-of value is read (training) or set (classification)
+    * @param instance             instance holding the features of the annotation
+    * @throws AnalysisEngineProcessException if classifying fails or the classifier's label is not an integer
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         final int history = entityOrEventMention.getHistoryOf();
+         // Downsampling: the coin is tossed only for NE_HISTORY_OF_ABSENT examples, and the instance
+         // is left without an outcome when the toss is at or above the keep-probability.
+         if ( history == CONST.NE_HISTORY_OF_ABSENT && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( String.valueOf( history ) );
+      } else {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         entityOrEventMention.setHistoryOf( parseHistoryLabel( label ) );
+      }
+   }
+
+   /**
+    * Converts a classifier label to the integer history-of value.
+    * <p>
+    * A null or non-numeric label is reported as an {@link AnalysisEngineProcessException} so that the
+    * failure surfaces through the exception type this engine declares instead of an unchecked
+    * {@link NumberFormatException}.
+    *
+    * @param label label returned by the classifier
+    * @return the label parsed as an int
+    * @throws AnalysisEngineProcessException if the label is null or not a valid integer
+    */
+   private static int parseHistoryLabel( String label ) throws AnalysisEngineProcessException {
+      try {
+         return Integer.parseInt( label );
+      } catch ( NumberFormatException nfE ) {
+         // The cause-only constructor is used; the wrapped exception carries the offending label.
+         throw new AnalysisEngineProcessException(
+               new IllegalStateException( "History classifier returned a non-integer label: " + label, nfE ) );
+      }
+   }
+
+   /**
+    * Creates the Chi-squared feature selection for this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new feature selection named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection file inside the given directory.
+    *
+    * @param outputDirectoryName directory containing the feature selection file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final String fileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+      return new File( outputDirectoryName, fileName ).toURI();
+   }
+
+   /**
+    * Clears feature selection when the threshold is zero; otherwise creates it from the threshold.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold == 0 ) {
+         this.featureSelection = null;
+      } else {
+         this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+      }
+   }
+
+   /**
+    * Creates an engine description whose classifier jar path is the given model path.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            HistoryCleartkAnalysisEngineWindowed.class,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Creates an engine description that uses the default history-of model jar.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/historyOf/model.jar" );
+   }
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/PolarityCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,227 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AssertionAboveLeftTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedAssertionDependencyTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedNegationDependencyFeatureExtractor;
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+
+/**
+ * Assertion annotator that assigns the polarity (negation) of an {@link IdentifiedAnnotation}.
+ * <p>
+ * Which extractors are registered depends on the configured {@code FEATURE_CONFIG}.
+ */
+@PipeBitInfo(
+      name = "Negation Annotator (ClearTK)",
+      description = "Annotates negation property.",
+      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.IDENTIFIED_ANNOTATION }
+)
+public class PolarityCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   /** Outcome label used for annotations whose polarity is {@code CONST.NE_POLARITY_NEGATION_PRESENT}. */
+   public static final String NEGATED = "NEGATED";
+   /** Outcome label used for every other annotation. */
+   public static final String NOT_NEGATED = "NOT_NEGATED";
+
+   /**
+    * Runs the parent initialization, registers the extractors selected by the feature
+    * configuration and sets up feature selection.
+    *
+    * @param context UIMA context passed on to the parent initializer
+    * @throws ResourceInitializationException if initialization fails
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      // 1.0 is the setting used for "no downsampling" by the check in setClassLabel.
+      probabilityOfKeepingADefaultExample = 1.0;
+
+      if ( this.entityFeatureExtractors == null ) {
+         this.entityFeatureExtractors = new ArrayList<>();
+      }
+      if ( featConfig == FEATURE_CONFIG.NO_TOK ) {
+         this.tokenCleartkExtractors = new ArrayList<>();
+      }
+
+      addEntityFeatureExtractors();
+      addEntityTreeExtractors();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Adds the entity feature extractors that apply to the current feature configuration.
+    * The order of additions is kept fixed: keyword list, stk fragments, dependency regex.
+    */
+   private void addEntityFeatureExtractors() {
+      // polarity keyword list
+      if ( featConfig != FEATURE_CONFIG.NO_SEM ) {
+         this.entityFeatureExtractors.add(
+               new WindowedContextWordWindowExtractor( "org/apache/ctakes/assertion/models/polarity.txt" ) );
+      }
+
+      // stk frags feature
+      if ( featConfig == FEATURE_CONFIG.STK_FRAGS || featConfig == FEATURE_CONFIG.ALL_SYN
+           || featConfig == FEATURE_CONFIG.NO_TOK ) {
+         this.entityFeatureExtractors.add( new AboveLeftFragmentExtractor(
+               "AL_Polarity", "org/apache/ctakes/assertion/models/sharpPolarityFrags.txt" ) );
+      }
+
+      // The ptk frags feature (PTK_FRAGS, DEP_REGEX_FRAGS, ALL_SYN) has its extractor disabled,
+      // so nothing is added for it.
+
+      // dep regex feature
+      if ( featConfig == FEATURE_CONFIG.DEP_REGEX || featConfig == FEATURE_CONFIG.DEP_REGEX_FRAGS
+           || featConfig == FEATURE_CONFIG.ALL_SYN || featConfig == FEATURE_CONFIG.NO_TOK ) {
+         this.entityFeatureExtractors.add( new WindowedNegationDependencyFeatureExtractor() );
+      }
+   }
+
+   /**
+    * Adds the tree extractor for the STK or PTK feature configuration, if one of them is selected.
+    */
+   private void addEntityTreeExtractors() {
+      if ( featConfig == FEATURE_CONFIG.STK ) {
+         // stk constituency feature
+         this.entityTreeExtractors.add( new AssertionAboveLeftTreeExtractor() );
+      } else if ( featConfig == FEATURE_CONFIG.PTK ) {
+         // ptk dependency feature
+         this.entityTreeExtractors.add( new WindowedAssertionDependencyTreeExtractor() );
+      }
+   }
+
+   /**
+    * Labels the instance for training, or classifies it and stores the result on the annotation.
+    *
+    * @param entityOrEventMention annotation whose polarity is read (training) or set (classification)
+    * @param instance             instance holding the features of the annotation
+    * @throws AnalysisEngineProcessException declared for failures while classifying
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention, Instance<String> instance )
+         throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         final boolean negated = entityOrEventMention.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT;
+         final String polarity = negated ? NEGATED : NOT_NEGATED;
+         this.lastLabel = polarity;
+         if ( negated ) {
+            logger.debug( "TRAINING: " + polarity );
+         }
+         // Downsampling: the coin is tossed only for NOT_NEGATED examples, and the instance is
+         // left without an outcome when the toss is at or above the keep-probability.
+         if ( !negated && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( polarity );
+         return;
+      }
+
+      final String label = this.classifier.classify( instance.getFeatures() );
+      this.lastLabel = label;
+      // Any label other than NEGATED, including null, is treated as negation absent.
+      final boolean negated = NEGATED.equals( label );
+      final int polarity = negated ? CONST.NE_POLARITY_NEGATION_PRESENT : CONST.NE_POLARITY_NEGATION_ABSENT;
+      // The message is only formatted when debug logging is enabled.
+      if ( negated && logger.isDebugEnabled() ) {
+         logger.debug( String.format( "DECODING/EVAL: %s//%s [%d-%d] (%s)", label, polarity,
+               entityOrEventMention.getBegin(), entityOrEventMention.getEnd(),
+               entityOrEventMention.getClass().getName() ) );
+      }
+      entityOrEventMention.setPolarity( polarity );
+   }
+
+   /**
+    * Creates the Chi-squared feature selection for this annotator.
+    *
+    * @param threshold threshold handed to {@link Chi2FeatureSelection}
+    * @return a new feature selection named with {@code FEATURE_SELECTION_NAME}
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>(
+            WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the URI of the feature selection file inside the given directory.
+    *
+    * @param outputDirectoryName directory containing the feature selection file
+    * @return URI of {@code FEATURE_SELECTION_NAME + "_Chi2_extractor.dat"} under that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final String fileName = FEATURE_SELECTION_NAME + "_Chi2_extractor.dat";
+      return new File( outputDirectoryName, fileName ).toURI();
+   }
+
+   /**
+    * Clears feature selection when the threshold is zero; otherwise creates it from the threshold.
+    *
+    * @throws ResourceInitializationException declared by the overridden method
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold == 0 ) {
+         this.featureSelection = null;
+      } else {
+         this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+      }
+   }
+
+   /**
+    * Creates an engine description with the ALL_SYN feature configuration and the given model path.
+    *
+    * @param modelPath value for {@link GenericJarClassifierFactory#PARAM_CLASSIFIER_JAR_PATH}
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription(
+            PolarityCleartkAnalysisEngineWindowed.class,
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG, FEATURE_CONFIG.ALL_SYN,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath );
+   }
+
+   /**
+    * Creates an engine description that uses the default polarity model jar.
+    *
+    * @return description of this analysis engine
+    * @throws ResourceInitializationException if the description cannot be created
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/polarity/sharpi2b2mipacqnegex/model.jar" );
+   }
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/SubjectCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,115 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.SubjectFeaturesExtractor;
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.log4j.Level;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+
+/**
+ * Windowed assertion engine for the subject attribute of an IdentifiedAnnotation.
+ * While training, the subject stored on the annotation becomes the instance outcome;
+ * otherwise the classifier's label is written to the annotation.
+ */
+public class SubjectCleartkAnalysisEngineWindowed extends
+      WindowedAssertionCleartkAnalysisEngine {
+
+   boolean USE_DEFAULT_EXTRACTORS = false;
+
+   /**
+    * Runs the parent initialization, disables downsampling (keep probability 1.0),
+    * registers the subject feature extractor and sets up feature selection.
+    *
+    * @param context uima context handed to the parent initializer
+    * @throws ResourceInitializationException propagated from the parent initializer or feature selection setup
+    * @throws IllegalArgumentException if training is requested without a gold view name
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 1.0;
+
+      if ( this.isTraining() && this.goldViewName == null ) {
+         throw new IllegalArgumentException( PARAM_GOLD_VIEW_NAME + " must be defined during training" );
+      }
+
+      initialize_subject_extractor();
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Adds the subject feature extractor to the entity feature extractors.
+    */
+   private void initialize_subject_extractor() {
+      this.entityFeatureExtractors.add( new SubjectFeaturesExtractor() );
+   }
+
+   /**
+    * Training: copies the annotation's subject into the instance outcome, possibly dropping "patient" examples.
+    * Classification: classifies the instance features and stores the label as the annotation's subject.
+    *
+    * @param entityOrEventMention annotation whose subject is read (training) or set (classification)
+    * @param instance             instance holding the extracted features
+    * @throws AnalysisEngineProcessException declared by the parent contract
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention,
+                              Instance<String> instance ) throws AnalysisEngineProcessException {
+      if ( this.isTraining() ) {
+         String subj = entityOrEventMention.getSubject();
+
+         // downsampling. initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+         if ( "patient".equals( subj )
+              && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+            return;
+         }
+         instance.setOutcome( subj );
+         // Only build the message when it will be logged: formatting every instance's features is costly.
+         if ( logger.isDebugEnabled() ) {
+            logger.log( Level.DEBUG, String.format( "[%s] expected: ''; actual: ''; features: %s",
+                  this.getClass().getSimpleName(),
+                  instance.toString()
+            ) );
+         }
+      } else {
+         String label = this.classifier.classify( instance.getFeatures() );
+         entityOrEventMention.setSubject( label );
+         // Same guard as above: avoid the string concatenation when DEBUG is off.
+         if ( logger.isDebugEnabled() ) {
+            logger.log( Level.DEBUG,
+                  "SUBJECT is being set on an IdentifiedAnnotation: " + label + " " + entityOrEventMention.getSubject() );
+         }
+      }
+   }
+
+   /**
+    * Creates a Chi-squared feature selector named FEATURE_SELECTION_NAME.
+    *
+    * @param threshold threshold passed to the Chi2FeatureSelection constructor
+    * @return new feature selector
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      return new Chi2FeatureSelection<>( WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME, threshold, false );
+   }
+
+   /**
+    * Builds the location of the feature selection data file inside the given directory.
+    *
+    * @param outputDirectoryName directory that contains the feature selection data file
+    * @return URI of the file named FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" within that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      return new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" ).toURI();
+   }
+
+   /**
+    * Creates the feature selector unless the configured threshold is 0, in which case selection is disabled.
+    *
+    * @throws ResourceInitializationException declared by the parent contract
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold == 0 ) {
+         this.featureSelection = null;
+      } else {
+         this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+      }
+   }
+
+   /**
+    * Creates a description of this engine that uses the classifier jar at the given path.
+    * The feature configuration is fixed to DEP_REGEX.
+    *
+    * @param modelPath path of the classifier model jar
+    * @return description of a SubjectCleartkAnalysisEngineWindowed engine
+    * @throws ResourceInitializationException propagated from the engine factory
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( SubjectCleartkAnalysisEngineWindowed.class,
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG,
+            FEATURE_CONFIG.DEP_REGEX,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+            modelPath );
+   }
+
+   /**
+    * Creates a description of this engine that uses the default subject model path.
+    *
+    * @return description of the engine configured with the default subject model
+    * @throws ResourceInitializationException propagated from {@link #createAnnotatorDescription(String)}
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      return createAnnotatorDescription( "/org/apache/ctakes/assertion/models/subject/model.jar" );
+   }
+
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/UncertaintyCleartkAnalysisEngineWindowed.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,137 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.ctakes.assertion.attributes.features.selection.Chi2FeatureSelection;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AboveLeftFragmentExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.AssertionAboveLeftTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.UncertaintyFeatureExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedAssertionDependencyTreeExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedContextWordWindowExtractor;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.WindowedDependencyWordsFragmentExtractor;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+
+import java.io.File;
+import java.net.URI;
+import java.util.ArrayList;
+
+import static org.apache.ctakes.assertion.medfacts.cleartk.windowed.WindowedAssertionCleartkAnalysisEngine.FEATURE_CONFIG.*;
+
+/**
+ * Windowed assertion engine for the uncertainty attribute of an IdentifiedAnnotation.
+ * Outcomes are the labels "uncertain" and "certain".
+ */
+public class UncertaintyCleartkAnalysisEngineWindowed extends WindowedAssertionCleartkAnalysisEngine {
+
+   /**
+    * Runs the parent initialization, sets the keep probability for default examples to 0.25,
+    * registers the uncertainty feature extractors and the extractor selected by the feature configuration,
+    * then sets up feature selection.
+    *
+    * @param context uima context handed to the parent initializer
+    * @throws ResourceInitializationException propagated from the parent initializer or feature selection setup
+    */
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      probabilityOfKeepingADefaultExample = 0.25;
+      if ( entityFeatureExtractors == null ) {
+         entityFeatureExtractors = new ArrayList<>();
+      }
+      entityFeatureExtractors.add(
+            new WindowedContextWordWindowExtractor( "org/apache/ctakes/assertion/models/uncertainty.txt" ) );
+      entityFeatureExtractors.add( new UncertaintyFeatureExtractor() );
+      // TODO: also add a DependencyPathRegexpFeatureExtractor here when good features are found.
+      //  Its construction can fail with FileNotFoundException, which should be rethrown
+      //  as a ResourceInitializationException.
+
+      // featConfig holds a single value, so at most one of these extractors is registered.
+      if ( featConfig == STK_FRAGS ) {
+         entityFeatureExtractors.add(
+               new AboveLeftFragmentExtractor( "AL_Unc", "org/apache/ctakes/assertion/models/jbi_paper_unc_seed_frags.txt" ) );
+      } else if ( featConfig == PTK_FRAGS ) {
+         entityFeatureExtractors.add(
+               new WindowedDependencyWordsFragmentExtractor( "DW_Uncertainty", "org/apache/ctakes/assertion/models/jbi_paper_uncertainty_dw_frags.txt" ) );
+      } else if ( featConfig == STK ) {
+         entityTreeExtractors.add( new AssertionAboveLeftTreeExtractor() );
+      } else if ( featConfig == PTK ) {
+         entityTreeExtractors.add( new WindowedAssertionDependencyTreeExtractor() );
+      }
+
+      initializeFeatureSelection();
+   }
+
+   /**
+    * Training: sets the outcome to "uncertain" or "certain" from the annotation, possibly dropping "certain" examples.
+    * Classification: maps the classifier label back to the uncertainty constant and stores it on the annotation;
+    * any other label (including null) stores 0.
+    *
+    * @param entityOrEventMention annotation whose uncertainty is read (training) or set (classification)
+    * @param instance             instance holding the extracted features
+    * @throws AnalysisEngineProcessException declared by the parent contract
+    */
+   @Override
+   public void setClassLabel( IdentifiedAnnotation entityOrEventMention, Instance<String> instance )
+         throws AnalysisEngineProcessException {
+      if ( !this.isTraining() ) {
+         final String label = this.classifier.classify( instance.getFeatures() );
+         final int uncertainty;
+         if ( "uncertain".equals( label ) ) {
+            uncertainty = CONST.NE_UNCERTAINTY_PRESENT;
+         } else if ( "certain".equals( label ) ) {
+            uncertainty = CONST.NE_UNCERTAINTY_ABSENT;
+         } else {
+            uncertainty = 0;
+         }
+         entityOrEventMention.setUncertainty( uncertainty );
+         return;
+      }
+      final boolean isUncertain = entityOrEventMention.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT;
+      // downsampling of the default ("certain") class.
+      // initialize probabilityOfKeepingADefaultExample to 1.0 for no downsampling
+      if ( !isUncertain && coin.nextDouble() >= this.probabilityOfKeepingADefaultExample ) {
+         return;
+      }
+      instance.setOutcome( isUncertain ? "uncertain" : "certain" );
+   }
+
+   /**
+    * Creates a Chi-squared feature selector named FEATURE_SELECTION_NAME.
+    *
+    * @param threshold threshold passed to the Chi2FeatureSelection constructor
+    * @return new feature selector
+    */
+   public static FeatureSelection<String> createFeatureSelection( double threshold ) {
+      final String selectionName = WindowedAssertionCleartkAnalysisEngine.FEATURE_SELECTION_NAME;
+      return new Chi2FeatureSelection<>( selectionName, threshold, false );
+   }
+
+   /**
+    * Builds the location of the feature selection data file inside the given directory.
+    *
+    * @param outputDirectoryName directory that contains the feature selection data file
+    * @return URI of the file named FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" within that directory
+    */
+   public static URI createFeatureSelectionURI( File outputDirectoryName ) {
+      final File dataFile = new File( outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat" );
+      return dataFile.toURI();
+   }
+
+   /**
+    * Creates the feature selector unless the configured threshold is 0, in which case selection is disabled.
+    *
+    * @throws ResourceInitializationException declared by the parent contract
+    */
+   @Override
+   protected void initializeFeatureSelection() throws ResourceInitializationException {
+      if ( featureSelectionThreshold != 0 ) {
+         this.featureSelection = createFeatureSelection( this.featureSelectionThreshold );
+         return;
+      }
+      this.featureSelection = null;
+   }
+
+   /**
+    * Creates a description of this engine that uses the classifier jar at the given path.
+    * The feature configuration is fixed to ALL_SYN.
+    *
+    * @param modelPath path of the classifier model jar
+    * @return description of an UncertaintyCleartkAnalysisEngineWindowed engine
+    * @throws ResourceInitializationException propagated from the engine factory
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription( String modelPath )
+         throws ResourceInitializationException {
+      final Object[] configuration = {
+            WindowedAssertionCleartkAnalysisEngine.PARAM_FEATURE_CONFIG, FEATURE_CONFIG.ALL_SYN,
+            GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH, modelPath
+      };
+      return AnalysisEngineFactory.createEngineDescription( UncertaintyCleartkAnalysisEngineWindowed.class,
+            configuration );
+   }
+
+   /**
+    * Creates a description of this engine that uses the default uncertainty model path.
+    *
+    * @return description of the engine configured with the default uncertainty model
+    * @throws ResourceInitializationException propagated from {@link #createAnnotatorDescription(String)}
+    */
+   public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
+      final String defaultModelPath = "/org/apache/ctakes/assertion/models/uncertainty/model.jar";
+      return createAnnotatorDescription( defaultModelPath );
+   }
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/WindowedAssertionCleartkAnalysisEngine.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,544 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.ctakes.assertion.attributes.features.selection.FeatureSelection;
+import org.apache.ctakes.assertion.medfacts.cleartk.extractors.FedaFeatureFunction;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.AbstractWindowedContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.FollowingContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.LastCoveredContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.PrecedingContext;
+import org.apache.ctakes.assertion.medfacts.cleartk.windowed.context.feature.extractor.AbstractWindowedFeatureExtractor1;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.temporary.assertion.AssertionCuePhraseAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ConfigurationParameterFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.Instance;
+import org.cleartk.ml.TreeFeature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.function.FeatureFunctionExtractor;
+
+import java.io.File;
+import java.net.URI;
+import java.util.*;
+
+/**
+ * @author swu
+ */
+public abstract class WindowedAssertionCleartkAnalysisEngine extends
+ CleartkAnnotator<String> {
+ Logger logger = Logger.getLogger( WindowedAssertionCleartkAnalysisEngine.class );
+
+ public static final String PARAM_GOLD_VIEW_NAME = "GoldViewName";
+
+ public enum FEATURE_CONFIG {
+ NO_SEM, NO_SYN, STK, STK_FRAGS, PTK, PTK_FRAGS, DEP_REGEX, DEP_REGEX_FRAGS, ALL_SYN, VECTORS, NO_TOK
+ }
+
+ public static int relationId; // counter for error logging
+
+ // additional parameter for domain adaptation
+ public static final String FILE_TO_DOMAIN_MAP = "mapTrainFileToDomain";
+
+
+ @ConfigurationParameter(
+ name = PARAM_GOLD_VIEW_NAME,
+ mandatory = false,
+ description = "view containing the manual identified annotations (especially EntityMention and EventMention annotations); needed for training" )
+ protected String goldViewName;
+
+ public static final String PARAM_PRINT_ERRORS = "PrintErrors";
+
+ @ConfigurationParameter(
+ name = PARAM_PRINT_ERRORS,
+ mandatory = false,
+ description = "Print errors true/false",
+ defaultValue = "false" )
+ boolean printErrors;
+
+ public static final String PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE = "ProbabilityOfKeepingADefaultExample";
+
+ @ConfigurationParameter(
+ name = PARAM_PROBABILITY_OF_KEEPING_DEFAULT_EXAMPLE,
+ mandatory = false,
+ description = "probability that a default example should be retained for training" )
+ protected double probabilityOfKeepingADefaultExample = 1.0;
+
+ public static final String PARAM_PORTION_OF_DATA_TO_USE = "PortionOfDataToUse";
+ @ConfigurationParameter(
+ name = PARAM_PORTION_OF_DATA_TO_USE,
+ mandatory = false,
+ description = "How much data to actually use during training (e.g. for building learning curves)"
+ )
+ protected double portionOfDataToUse = 1.0;
+
+ public static final String PARAM_FEATURE_SELECTION_THRESHOLD = "WhetherToDoFeatureSelection";
+ // Accurate name? Actually uses the threshold, right?
+
+ @ConfigurationParameter(
+ name = PARAM_FEATURE_SELECTION_THRESHOLD,
+ mandatory = false,
+ description = "the Chi-squared threshold at which features should be removed" )
+ protected Float featureSelectionThreshold = 0f;
+
+ public static final String PARAM_FEATURE_CONFIG = "FEATURE_CONFIG";
+ @ConfigurationParameter(
+ name = PARAM_FEATURE_CONFIG,
+ description = "Feature configuration to use (for experiments)",
+ mandatory = false
+ )
+ protected FEATURE_CONFIG featConfig = FEATURE_CONFIG.ALL_SYN;
+
+ public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
+
+ @ConfigurationParameter(
+ mandatory = false,
+ name = PARAM_FEATURE_SELECTION_URI,
+ description = "provides a URI where the feature selection data will be written" )
+ protected URI featureSelectionURI;
+
+ protected static Random coin = new Random( 0 );
+
+ protected static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+
+ @ConfigurationParameter(
+ name = FILE_TO_DOMAIN_MAP,
+ mandatory = false,
+ description = "a map of filenames to their respective domains (i.e., directories that contain them)" )
+ protected String fileDomainMap;
+ protected Map<String, String> fileToDomain = new HashMap<>();
+
+ protected String lastLabel;
+
+ // protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> contextFeatureExtractors;
+// protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenContextFeatureExtractors;
+ protected List<CleartkExtractor<IdentifiedAnnotation, BaseToken>> tokenCleartkExtractors;
+ protected List<FeatureExtractor1<IdentifiedAnnotation>> entityFeatureExtractors;
+ protected List<FeatureExtractor1<IdentifiedAnnotation>> entityTreeExtractors;
+// protected CleartkExtractor<IdentifiedAnnotation, BaseToken> cuePhraseInWindowExtractor;
+
+
+ protected List<FeatureFunctionExtractor<IdentifiedAnnotation>> featureFunctionExtractors = new ArrayList<>();
+ protected FedaFeatureFunction ffDomainAdaptor = null;
+
+ protected FeatureSelection<String> featureSelection;
+
+
+ protected List<AbstractWindowedContext> _windowedContexts = new ArrayList<>();
+
+
+ public abstract void setClassLabel( IdentifiedAnnotation entityMention, Instance<String> instance )
+ throws AnalysisEngineProcessException;
+
+ protected abstract void initializeFeatureSelection() throws ResourceInitializationException;
+
+ /**
+  * Chooses the view whose annotations are processed.
+  *
+  * @param jCas cas handed to process
+  * @return the view named by goldViewName while training, otherwise the given cas itself
+  * @throws AnalysisEngineProcessException wrapping the CASException thrown when the gold view cannot be fetched
+  */
+ private JCas getAnnotationView( final JCas jCas ) throws AnalysisEngineProcessException {
+    if ( !this.isTraining() ) {
+       return jCas;
+    }
+    try {
+       return jCas.getView( this.goldViewName );
+    } catch ( CASException casE ) {
+       throw new AnalysisEngineProcessException( casE );
+    }
+ }
+
+ /**
+  * Reads the optional file-to-domain map, creates the windowed contexts and registers the token extractor.
+  * <p>
+  * Each directory named in fileDomainMap (separated by ';' or ':') is listed, and every file in it is mapped,
+  * by its name without extension, to the domain derived from the directory path.
+  *
+  * @param context uima context handed to the parent initializer
+  * @throws ResourceInitializationException propagated from the parent initializer
+  * @throws IllegalArgumentException        if training is requested without a gold view name
+  */
+ @Override
+ @SuppressWarnings( "deprecation" )
+ public void initialize( UimaContext context ) throws ResourceInitializationException {
+    super.initialize( context );
+
+    // Re-process the "directory" string for domains that were used in the data
+    if ( null != fileDomainMap ) {
+       for ( String dir : fileDomainMap.split( "[;:]" ) ) {
+          // TODO: normalize dir to real domainId
+          final String domainId = normalizeToDomain( dir );
+
+          // List the directory exactly once. listFiles() returns null when dir is not a readable directory,
+          // and a second call is not guaranteed to return the same result as the first.
+          final File[] dataFiles = new File( dir ).listFiles();
+          if ( dataFiles == null ) {
+             continue;
+          }
+          for ( File f : dataFiles ) {
+             fileToDomain.put( FilenameUtils.removeExtension( f.getName() ), domainId );
+          }
+       }
+    }
+
+    if ( this.isTraining() && this.goldViewName == null ) {
+       throw new IllegalArgumentException( PARAM_GOLD_VIEW_NAME + " must be defined during training" );
+    }
+
+    // a list of feature extractors that require only the token:
+    // the stem of the word, the text of the word itself, plus
+    // features created from the word text like character ngrams
+    this.entityFeatureExtractors = new ArrayList<>();
+
+    this.tokenCleartkExtractors = new ArrayList<>();
+
+    // Windowed contexts. They are registered so that process can hand them the tokens of the current sentence.
+    final LastCoveredContext lastExtraction1 = new LastCoveredContext( 2 );
+    final PrecedingContext precedingExtraction1 = new PrecedingContext( 5 );
+    final FollowingContext followingExtraction1 = new FollowingContext( 4 );
+    final PrecedingContext bagPreceding3 = new PrecedingContext( 3 );
+    final PrecedingContext bagPreceding5 = new PrecedingContext( 5 );
+    final PrecedingContext bagPreceding10 = new PrecedingContext( 10 );
+    final FollowingContext bagFollowing3 = new FollowingContext( 3 );
+    final FollowingContext bagFollowing5 = new FollowingContext( 5 );
+    final FollowingContext bagFollowing10 = new FollowingContext( 10 );
+    _windowedContexts.add( lastExtraction1 );
+    _windowedContexts.add( precedingExtraction1 );
+    _windowedContexts.add( followingExtraction1 );
+    _windowedContexts.add( bagPreceding3 );
+    _windowedContexts.add( bagPreceding5 );
+    _windowedContexts.add( bagPreceding10 );
+    _windowedContexts.add( bagFollowing3 );
+    _windowedContexts.add( bagFollowing5 );
+    _windowedContexts.add( bagFollowing10 );
+
+    // Intended replacement for the extractor below, built on the windowed contexts (not yet in use):
+//    CleartkExtractor<IdentifiedAnnotation, BaseToken> tokenExtraction1 =
+//          new CleartkExtractor<>(
+//                BaseToken.class,
+//                new CoveredTextExtractor<>(),
+//                lastExtraction1,
+//                precedingExtraction1,
+//                followingExtraction1,
+//                new WindowedBag( bagPreceding3 ),
+//                new WindowedBag( bagFollowing3 ),
+//                new WindowedBag( bagPreceding5 ),
+//                new WindowedBag( bagFollowing5 ),
+//                new WindowedBag( bagPreceding10 ),
+//                new WindowedBag( bagFollowing10 )
+//          );
+    CleartkExtractor<IdentifiedAnnotation, BaseToken> tokenExtraction1 =
+          new CleartkExtractor<>(
+                BaseToken.class,
+                new CoveredTextExtractor<>(),
+                new CleartkExtractor.LastCovered( 2 ), // Worked fine
+//                lastExtraction1,  // Doesn't work. Does same thing - wtf ?
+                new CleartkExtractor.Preceding( 5 ),
+                new CleartkExtractor.Following( 4 ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 3 ) ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Following( 3 ) ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 5 ) ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Following( 5 ) ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Preceding( 10 ) ),
+                new CleartkExtractor.Bag( new CleartkExtractor.Following( 10 ) )
+          );
+
+    this.tokenCleartkExtractors.add( tokenExtraction1 );
+    if ( !fileToDomain.isEmpty() ) {
+       // set up FeatureFunction for all the laggard, non-Extractor features
+       ffDomainAdaptor = new FedaFeatureFunction( new ArrayList<>( new HashSet<>( fileToDomain.values() ) ) );
+    }
+    entityTreeExtractors = new ArrayList<>();
+ }
+
+ /**
+  * Builds one Instance for every EventMention or EntityMention covered by each Sentence of the annotation view.
+  * Each instance receives token context, closest assertion cue, anatomical-site, entity and tree features,
+  * after which setClassLabel is called for it.
+  * When training, instances that have an outcome are written to the data writer,
+  * subject to random sampling by portionOfDataToUse.
+  *
+  * @param jCas cas to process; while training the annotations are read from its gold view
+  * @throws AnalysisEngineProcessException propagated from view lookup, feature extraction, labeling or writing
+  */
+ @Override
+ public void process( JCas jCas ) throws AnalysisEngineProcessException {
+ String documentId = DocumentIDAnnotationUtil.getDocumentID( jCas );
+ String domainId = "";
+ String domainFeature = null;
+
+ // Without feature function extractors there is nothing to adapt, so the domain adaptor is dropped.
+ if ( this.featureFunctionExtractors.size() <= 0 ) {
+ this.ffDomainAdaptor = null;
+ }
+
+ if ( documentId != null ) {
+ logger.debug( "processing next doc: " + documentId );
+ // set the domain to be FeatureFunction'ed into all extractors
+ if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+ domainId = fileToDomain.get( documentId );
+ // if domain is not found, no warning -- just considers general domain
+ ffDomainAdaptor.setDomain( domainId );
+ } else if ( !fileToDomain.isEmpty() ) {
+ // No adaptor: the domain is added to each instance as a plain "Domain" feature instead.
+ domainFeature = fileToDomain.get( documentId );
+ }
+ } else {
+ logger.debug( "processing next doc (doc id is null)" );
+ }
+
+ this.lastLabel = "<BEGIN>";
+
+ final JCas annotationView = getAnnotationView( jCas );
+
+ // generate a list of training instances for each sentence in the document
+ // Use an indexed map. This is faster than calling select and then selectCovering within a loop.
+ final Map<Sentence, Collection<Annotation>> sentenceAnnotationMap
+ = JCasUtil.indexCovered( annotationView, Sentence.class, Annotation.class );
+ // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+ final List<IdentifiedAnnotation> entities = new ArrayList<>();
+ final List<AssertionCuePhraseAnnotation> cues = new ArrayList<>();
+ final List<BaseToken> baseTokens = new ArrayList<>();
+// 25 Dec 2018 10:51:49 INFO CleartkAnalysisEngine - Assigning Attributes ...
+// 25 Dec 2018 14:35:45 INFO CleartkAnalysisEngine - Finished Assigning Attributes
+ // Rather than iterate through all features again, just sort the sentences that have already been fetched.
+ // As far as I can tell, order should be unnecessary.
+ // Using a treemap that is sorted during putAll prevents the need to run a Map.get(..) - fast, but not that fast.
+// 25 Dec 2018 14:52:37 INFO CleartkAnalysisEngine - Assigning Attributes ...
+// 25 Dec 2018 18:32:24 INFO CleartkAnalysisEngine - Finished Assigning Attributes
+ //
+ // TODO : Windowed Assertion:
+// 26 Dec 2018 16:21:30 INFO CleartkAnalysisEngine - Assigning Attributes ...
+// 26 Dec 2018 17:38:11 INFO CleartkAnalysisEngine - Finished Assigning Attributes
+ // NOTE(review): the comparator uses only the begin offset, so two sentences with the same begin
+ // would share one map entry - confirm that cannot happen.
+ final TreeMap<Sentence, Collection<Annotation>> sentenceTreeMap
+ = new TreeMap<>( Comparator.comparingInt( Sentence::getBegin ) );
+ sentenceTreeMap.putAll( sentenceAnnotationMap );
+ // History needs full list of sentences
+ for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
+ if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+ ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
+ }
+ }
+ for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
+ if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+ ((AbstractWindowedFeatureExtractor1)extractor).setSentences( new ArrayList<>( sentenceTreeMap.keySet() ) );
+ }
+ }
+
+ // Index of the current sentence in begin-offset order; handed to the windowed extractors.
+ int sentenceIndex = -1;
+ for ( Map.Entry<Sentence, Collection<Annotation>> sortedEntry : sentenceTreeMap.entrySet() ) {
+ sentenceIndex++;
+ final Sentence coveringSent = sortedEntry.getKey();
+ final List<Annotation> coveredAnnotations = new ArrayList<>( sortedEntry.getValue() );
+ coveredAnnotations.sort( Comparator.comparingInt( Annotation::getBegin ) );
+// _windowedContexts.forEach( c -> c.setWindow( coveredAnnotations ) );
+ // Sort Annotations into *Mention, assertion cues and BaseTokens in one loop.
+ // Faster than calling JCasUtil methods for each which has to iterate through the full cas each time.
+ entities.clear();
+ cues.clear();
+ baseTokens.clear();
+ for ( Annotation annotation : coveredAnnotations ) {
+ if ( annotation instanceof EventMention || annotation instanceof EntityMention ) {
+ entities.add( (IdentifiedAnnotation)annotation );
+ } else if ( annotation instanceof AssertionCuePhraseAnnotation ) {
+ cues.add( (AssertionCuePhraseAnnotation)annotation );
+ } else if ( annotation instanceof BaseToken ) {
+ baseTokens.add( (BaseToken)annotation );
+ }
+ }
+ _windowedContexts.forEach( c -> c.setWindow( baseTokens ) );
+
+ for ( IdentifiedAnnotation identifiedAnnotation : entities ) {
+ if ( identifiedAnnotation.getPolarity() == -1 ) {
+ logger.debug( String.format( " - identified annotation: [%d-%d] polarity %d (%s)",
+ identifiedAnnotation.getBegin(),
+ identifiedAnnotation.getEnd(),
+ identifiedAnnotation.getPolarity(),
+ identifiedAnnotation.getClass().getName() ) );
+ }
+ Instance<String> instance = new Instance<>();
+
+ if ( domainFeature != null ) {
+ instance.add( new Feature( "Domain", domainFeature ) );
+ }
+ // only use extract this version if not doing domain adaptation
+ if ( ffDomainAdaptor == null ) {
+ for ( CleartkExtractor<IdentifiedAnnotation, BaseToken> extractor : this.tokenCleartkExtractors ) {
+ instance.addAll( extractor
+ .extractWithin( annotationView, identifiedAnnotation, coveringSent ) );
+ }
+ }
+
+ // Find the cue of this sentence with the fewest BaseTokens between it and the annotation.
+ int closest = Integer.MAX_VALUE;
+ AssertionCuePhraseAnnotation closestCue = null;
+ for ( AssertionCuePhraseAnnotation cue : cues ) {
+ // It is much faster to count between BaseTokens already isolated within the same sentence.
+ final int betweenCount = countBetween( cue, identifiedAnnotation, baseTokens );
+ if ( betweenCount < closest ) {
+ closestCue = cue;
+ closest = betweenCount;
+ }
+ }
+ // The closest cue only contributes features when fewer than 21 BaseTokens separate it from the annotation.
+ if ( closestCue != null && closest < 21 ) {
+ instance.add( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) );
+ instance.add( new Feature( "ClosestCue_PhraseFamily", closestCue.getCuePhraseAssertionFamily() ) );
+ instance.add( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) );
+
+ // add hack-ey domain adaptation to these hacked-in features
+ if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+ instance.addAll( ffDomainAdaptor
+ .apply( new Feature( "ClosestCue_Word", closestCue.getCoveredText() ) ) );
+ instance.addAll( ffDomainAdaptor
+ .apply( new Feature( "ClosestCue_PhraseFamily", closestCue
+ .getCuePhraseAssertionFamily() ) ) );
+ instance.addAll( ffDomainAdaptor
+ .apply( new Feature( "ClosestCue_PhraseCategory", closestCue.getCuePhraseCategory() ) ) );
+ }
+
+ }
+
+ // 7/9/13 SRH trying to make it work just for anatomical site
+ int eemTypeId = identifiedAnnotation.getTypeID();
+ if ( eemTypeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
+ // 7/9/13 srh modified per tmiller so it's binary but not numeric feature
+ instance.add( new Feature( "ENTITY_TYPE_ANAT_SITE" ) );
+ // add hack-ey domain adaptation to these hacked-in features
+ if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+ instance.addAll( ffDomainAdaptor.apply( new Feature( "ENTITY_TYPE_ANAT_SITE" ) ) );
+ }
+ }
+
+ // only extract these features if not doing domain adaptation
+ if ( ffDomainAdaptor == null ) {
+ for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityFeatureExtractors ) {
+ if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+ ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
+ }
+ instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+ }
+ }
+
+ // Tree extractors always run, with or without domain adaptation.
+ for ( FeatureExtractor1<IdentifiedAnnotation> extractor : this.entityTreeExtractors ) {
+ if ( extractor instanceof AbstractWindowedFeatureExtractor1 ) {
+ ((AbstractWindowedFeatureExtractor1)extractor).setWindow( coveringSent, sentenceIndex, baseTokens );
+ }
+ instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+ }
+
+ List<Feature> feats = instance.getFeatures();
+
+ // Lower-case String feature values, except for TreeFeatures and features whose name
+ // starts with TreeFrag, WORD or NEG or contains _TreeFrag, _WORD or _NEG.
+ // NOTE(review): toLowerCase() here uses the JVM default locale - confirm that is intended.
+ for ( Feature feat : feats ) {
+ if ( feat instanceof TreeFeature ||
+ (feat.getName() != null && (feat.getName().startsWith( "TreeFrag" ) ||
+ feat.getName().startsWith( "WORD" ) ||
+ feat.getName().startsWith( "NEG" ))) ) {
+ continue;
+ }
+ if ( feat.getName() != null &&
+ (feat.getName().contains( "_TreeFrag" ) || feat.getName().contains( "_WORD" ) ||
+ feat.getName().contains( "_NEG" )) ) {
+ continue;
+ }
+ if ( feat.getValue() instanceof String ) {
+ feat.setValue( ((String)feat.getValue()).toLowerCase() );
+ }
+ }
+
+ if ( !fileToDomain.isEmpty() && ffDomainAdaptor != null ) {
+ for ( FeatureFunctionExtractor<IdentifiedAnnotation> extractor : this.featureFunctionExtractors ) {
+ // TODO: extend to the case where the extractors take a different argument besides entityOrEventMention
+ instance.addAll( extractor.extract( jCas, identifiedAnnotation ) );
+ }
+ }
+
+
+ // grab the output label
+ setClassLabel( identifiedAnnotation, instance );
+
+ if ( this.isTraining() ) {
+ // apply feature selection, if necessary
+ if ( this.featureSelection != null ) {
+ feats = this.featureSelection.transform( feats );
+ }
+
+ // ensures that the (possibly) transformed feats are used
+ // An instance without an outcome (e.g. one dropped by downsampling in setClassLabel) is not written.
+ if ( instance.getOutcome() != null ) {
+ if ( coin.nextDouble() < this.portionOfDataToUse ) {
+ this.dataWriter.write( new Instance<>( instance.getOutcome(), feats ) );
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /**
+  * Creates an engine description for this class and applies any extra configuration to it.
+  *
+  * @param additionalConfiguration configuration parameter data added to the description; may be empty
+  * @return the engine description
+  * @throws ResourceInitializationException propagated from the engine factory
+  */
+ public static AnalysisEngineDescription getDescription( Object... additionalConfiguration )
+       throws ResourceInitializationException {
+    final AnalysisEngineDescription description
+          = AnalysisEngineFactory.createEngineDescription( WindowedAssertionCleartkAnalysisEngine.class );
+    if ( additionalConfiguration.length == 0 ) {
+       return description;
+    }
+    ConfigurationParameterFactory.addConfigurationParameters( description, additionalConfiguration );
+    return description;
+ }
+
+// public Map<String, String> getTrainFileToDomain() {
+// return fileToDomain;
+// }
+//
+// public void setTrainFileToDomain( Map<String, String> trainFileToDomain ) {
+// this.fileToDomain = trainFileToDomain;
+// }
+
+ /**
+  * Looks in the domain string (path) for a meaningful corpus name.
+  * The path is split on "/" and scanned from its last segment to its first; the first segment that does not
+  * start with "test", "train" or "dev" (ignoring case) is returned.
+  *
+  * @param dir path of a data directory
+  * @return the last path segment that is not a test/train/dev name, or dir itself if there is no such segment
+  */
+ public static String normalizeToDomain( String dir ) {
+    // TODO: real normalization
+    final String[] parts = dir.split( "/" );
+    for ( int i = parts.length - 1; i >= 0; i-- ) {
+       // Locale.ROOT keeps the comparison independent of the JVM default locale
+       // (in a Turkish locale "TRAIN".toLowerCase() does not start with "train").
+       final String lowerPart = parts[ i ].toLowerCase( Locale.ROOT );
+       if ( lowerPart.startsWith( "test" ) || lowerPart.startsWith( "train" ) || lowerPart.startsWith( "dev" ) ) {
+          continue;
+       }
+       return parts[ i ];
+    }
+    return dir;
+ }
+
+
+ /**
+  * Counts the BaseTokens that lie strictly inside the gap between two annotations.
+  * The gap runs from the smaller of the two end offsets to the larger of the two begin offsets;
+  * a token is counted when it begins after the gap start and ends before the gap end.
+  *
+  * @param annotation1 -
+  * @param annotation2 -
+  * @param baseTokens  baseTokens within window
+  * @return number of basetokens that lie between annotation1 and annotation2
+  */
+ static private int countBetween( final Annotation annotation1,
+                                  final Annotation annotation2,
+                                  final Collection<BaseToken> baseTokens ) {
+    final int gapBegin = Math.min( annotation1.getEnd(), annotation2.getEnd() );
+    final int gapEnd = Math.max( annotation1.getBegin(), annotation2.getBegin() );
+    return (int)baseTokens.stream()
+                          .filter( token -> gapBegin < token.getBegin() && token.getEnd() < gapEnd )
+                          .count();
+ }
+
+}
Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java?rev=1850705&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/windowed/classifier/WindowedGenericAttributeClassifier.java Tue Jan 8 03:45:51 2019
@@ -0,0 +1,162 @@
+package org.apache.ctakes.assertion.medfacts.cleartk.windowed.classifier;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Pattern;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 12/26/2018
+ */
+public class WindowedGenericAttributeClassifier {
+
+ private static final String POSTCOORD_NMOD = "donor_srlarg";
+ private static final String DISCUSSION_DEPPATH = "discussion_deppath";
+ private static final String SUBSUMED_CHUNK = "other_token";
+ private static final String SUBSUMED_ANNOT = "other_deppath";
+ public static ArrayList<String> FeatureIndex = new ArrayList<String>();
+
+ static {
+ FeatureIndex.add( POSTCOORD_NMOD );
+ FeatureIndex.add( DISCUSSION_DEPPATH );
+ FeatureIndex.add( SUBSUMED_CHUNK );
+ FeatureIndex.add( SUBSUMED_ANNOT );
+ }
+
+ // currently goes from entityMention to Sentence to SemanticArgument
+ public static Boolean getGeneric( JCas jCas, final Sentence sentence, IdentifiedAnnotation mention ) {
+
+ HashMap<String, Boolean> vfeat = extract( jCas, sentence, mention );
+
+ return classifyWithLogic( vfeat );
+
+ }
+
+
+ public static Boolean classifyWithLogic( HashMap<String, Boolean> vfeat ) {
+ // Logic to identify cases, may be replaced by learned classification
+ int subsumectr = 0;
+ if ( vfeat.get( SUBSUMED_CHUNK ) ) {
+ } //subsumectr++; }
+ if ( vfeat.get( SUBSUMED_ANNOT ) ) {
+ subsumectr++;
+ }
+ if ( vfeat.get( POSTCOORD_NMOD ) ) {
+ subsumectr++;
+ }
+ Boolean subsume_summary = (subsumectr > 0);
+ if ( vfeat.get( DISCUSSION_DEPPATH ) || subsume_summary ) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+
+ public static HashMap<String, Boolean> extract( JCas jCas,
+ final Sentence sentence,
+ Annotation arg ) {
+ HashMap<String, Boolean> vfeat = new HashMap<String, Boolean>();
+ for ( String feat : FeatureIndex ) {
+ vfeat.put( feat, false );
+ }
+
+ // find the sentence that entityMention is in
+ Sentence sEntity = sentence;
+
+ if ( sEntity != null ) {
+
+
+ // 2) some other identified annotation subsumes this one?
+ List<IdentifiedAnnotation> lsmentions = JCasUtil.selectPreceding( jCas, IdentifiedAnnotation.class, arg, 5 );
+ lsmentions.addAll( JCasUtil.selectFollowing( jCas, IdentifiedAnnotation.class, arg, 5 ) );
+ for ( IdentifiedAnnotation annot : lsmentions ) {
+ if ( annot.getBegin() > arg.getBegin() ) {
+ break;
+ } else {
+ if ( annot.getEnd() < arg.getEnd() ) {
+ continue;
+ } else if ( !DependencyUtility.equalCoverage(
+ DependencyUtility.getNominalHeadNode( jCas, annot ),
+ DependencyUtility.getNominalHeadNode( jCas, arg ) ) ) {
+ // the case that annot is a superset
+ vfeat.put( SUBSUMED_ANNOT, true );
+ }
+ }
+ }
+
+ // 3) some chunk subsumes this?
+ List<Chunk> lschunks = JCasUtil.selectPreceding( jCas, Chunk.class, arg, 5 );
+ lschunks.addAll( JCasUtil.selectFollowing( jCas, Chunk.class, arg, 5 ) );
+ for ( Chunk chunk : lschunks ) {
+ if ( chunk.getBegin() > arg.getBegin() ) {
+ break;
+ } else {
+ if ( chunk.getEnd() < arg.getEnd() ) {
+ continue;
+ } else if ( !DependencyUtility.equalCoverage(
+ DependencyUtility.getNominalHeadNode( jCas, chunk ),
+ DependencyUtility.getNominalHeadNode( jCas, arg ) ) ) {
+ // the case that annot is a superset
+ vfeat.put( SUBSUMED_CHUNK, true );
+ }
+ }
+ }
+ }
+
+
+ List<ConllDependencyNode> depnodes = JCasUtil.selectCovered( jCas, ConllDependencyNode.class, arg );
+ if ( !depnodes.isEmpty() ) {
+ ConllDependencyNode depnode = DependencyUtility.getNominalHeadNode( depnodes );
+
+ // 1) check if the head node of the entity mention is really just part of a larger noun phrase
+ if ( depnode.getDeprel().matches( "(NMOD|amod|nmod|det|predet|nn|poss|possessive|infmod|partmod|rcmod)" ) ) {
+ vfeat.put( POSTCOORD_NMOD, true );
+ }
+
+ // 4) search dependency paths for discussion context
+ for ( ConllDependencyNode dn : DependencyUtility.getPathToTop( jCas, depnode ) ) {
+ if ( isDiscussionContext( dn ) ) {
+ vfeat.put( DISCUSSION_DEPPATH, true );
+ }
+ }
+ }
+ return vfeat;
+ }
+
+
+ private static boolean isDonorTerm( Annotation arg ) {
+ return arg.getCoveredText().toLowerCase()
+ .matches( "(donor).*" );
+ }
+
+
+ private static boolean isDiscussionContext( Annotation arg ) {
+ return arg.getCoveredText().toLowerCase()
+ .matches( "(discuss|ask|understand|understood|tell|told|mention|talk|speak|spoke|address).*" );
+ }
+
+
+ // a main method for regex testing
+ public static void main( String[] args ) {
+ String s = "steps";
+ if ( s.toLowerCase().matches( ".*(in-law|stepc|stepd|stepso|stepf|stepm|step-).*" ) ) {
+ System.out.println( "match" );
+ } else {
+ System.out.println( "no match" );
+ }
+ }
+
+
+}