You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by st...@apache.org on 2012/12/19 22:49:47 UTC
svn commit: r1424157 [1/3] - in
/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal:
ae/ ae/feature/ ae/feature/selection/ eval/
Author: stevenbethard
Date: Wed Dec 19 21:49:46 2012
New Revision: 1424157
URL: http://svn.apache.org/viewvc?rev=1424157&view=rev
Log:
Fixes svn:eol-style for .java files in ctakes-temporal
Modified:
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (contents, props changed)
incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (contents, props changed)
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Wed Dec 19 21:49:46 2012
@@ -1,380 +1,380 @@
-package org.apache.ctakes.temporal.ae;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-//import java.net.URL;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-//import java.util.logging.Logger;
-
-//import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
-import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
-import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
-import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
-import org.apache.ctakes.typesystem.type.constants.CONST;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
-import org.apache.ctakes.typesystem.type.textsem.EventMention;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
-import org.apache.uima.UimaContext;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.CleartkAnnotator;
-//import org.cleartk.classifier.DataWriter;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.Instance;
-//import org.cleartk.classifier.feature.transform.InstanceDataWriter;
-import org.cleartk.classifier.chunking.BIOChunking;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Preceding;
-import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
-import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
-import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
-import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
-import org.cleartk.classifier.jar.DefaultDataWriterFactory;
-import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
-import org.cleartk.classifier.jar.GenericJarClassifierFactory;
-import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.factory.ConfigurationParameterFactory;
-import org.uimafit.descriptor.ConfigurationParameter;
-import org.uimafit.util.JCasUtil;
-
-//import com.google.common.base.Charsets;
-import com.google.common.base.Predicate;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-//import com.google.common.io.LineProcessor;
-//import com.google.common.io.Resources;
-
-public class EventAnnotator extends CleartkAnnotator<String> {
-
- public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
-
- @ConfigurationParameter(
- name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
- mandatory = false,
- description = "probability that a negative example should be retained for training")
- protected Float probabilityOfKeepingANegativeExample = 0.8f;
-
- public static final String PARAM_FEATURE_TRIM_ORNOT = "WhetherToDoFeatureSelection";
-
- @ConfigurationParameter(
- name = PARAM_FEATURE_TRIM_ORNOT,
- mandatory = false,
- description = "set whether feature selection is used or not")
- public static Float featureTrim = 0f;
-
- public static AnalysisEngineDescription createDataWriterDescription(
- String dataWriterName,
- File outputDirectory, float downratio, float featureSelect) throws ResourceInitializationException {
- return AnalysisEngineFactory.createPrimitiveDescription(
- EventAnnotator.class,
- CleartkAnnotator.PARAM_IS_TRAINING,
- true,
- DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
- dataWriterName,
- DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
- outputDirectory,
- EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
- downratio,
- EventAnnotator.PARAM_FEATURE_TRIM_ORNOT,
- featureSelect);
- }
-
- public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
- throws ResourceInitializationException {
- AnalysisEngineDescription fsEventAnnotator =AnalysisEngineFactory.createPrimitiveDescription(
- EventAnnotator.class,
- CleartkAnnotator.PARAM_IS_TRAINING,
- false,
- GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
- new File(modelDirectory, "model.jar"));
- ConfigurationParameterFactory.addConfigurationParameter(
- fsEventAnnotator,
- EventAnnotator.PARAM_NB_FS_URI,
- EventAnnotator.createNbFSURI(modelDirectory) );
-
- return(fsEventAnnotator);
- }
-
- protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
-
- protected List<CleartkExtractor> contextFeatureExtractors;
-
- private BIOChunking<BaseToken, EntityMention> entityChunking;
-
- private BIOChunking<BaseToken, EventMention> eventChunking;
-
- public static final String PARAM_NB_FS_URI = ConfigurationParameterFactory.createConfigurationParameterName(
- EventAnnotator.class,
- "neighborFsUri");
-
- @ConfigurationParameter(
- mandatory = false,
- description = "provides a URI where the neighbor annotation's feature selection data will be written")
- protected URI neighborFsUri;
-
- public static final String FS_NEIGHBOR_EXTRACTOR_KEY = "SelectNeighborFeatures";
-
- private Chi2NeighborFSExtractor<String> chi2NeighborFsExtractor;
-
-
- public static URI createNbFSURI(File outputDirectoryName) {
- File f = new File(outputDirectoryName, FS_NEIGHBOR_EXTRACTOR_KEY + "_Chi2_extractor.dat");
- return f.toURI();
- }
- //*****feature selection related parameters
-
- @Override
- public void initialize(UimaContext context) throws ResourceInitializationException {
- super.initialize(context);
-
- // define chunkings
- this.entityChunking = new BIOChunking<BaseToken, EntityMention>(
- BaseToken.class,
- EntityMention.class,
- "typeID");
- this.eventChunking = new BIOChunking<BaseToken, EventMention>(
- BaseToken.class,
- EventMention.class);
-
- //configure FS extractor:
- if (featureTrim > 0){//if feature selection
- CombinedExtractor forneighbors = new CombinedExtractor(
- new CoveredTextExtractor(),
- new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
- new TypePathExtractor(BaseToken.class, "partOfSpeech"),
- new SurfaceFormFeatureExtractor(),
- new PhraseExtractor(),
- new SRLExtractor());
-
- try {
- this.chi2NeighborFsExtractor = initNbFSExtractor(forneighbors);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }else{//if no feature selection
- // add features: word, stem, pos and more
- this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
- // try {
- this.tokenFeatureExtractors.addAll(Arrays.asList(
- new CoveredTextExtractor(),
-// new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
-// new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
-// new SurfaceFormFeatureExtractor(),
-// new PhraseExtractor(),
- new SRLExtractor()));
- // new CoveredTextToValuesExtractor("ACF", StringToDoublesProcessor.parse("/word_freq.lst")),
- // new CoveredTextToValuesExtractor("PCA", StringToDoublesProcessor.parse("/word_pca.lst")),
- // new CoveredTextToValuesExtractor("TimPCA", StringToDoublesProcessor.parse("/tim_word_pca.txt"))));
-
- //add window of features before and after
- CombinedExtractor subExtractor = new CombinedExtractor(
- new CoveredTextExtractor(),
- new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
- new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
- new SurfaceFormFeatureExtractor(),
- new SRLExtractor());
-
- this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
- this.contextFeatureExtractors.add(new CleartkExtractor(
- BaseToken.class,
- subExtractor,
- new Preceding(3),
- new Following(3)));
- }
-
-
- }
-
-
-private Chi2NeighborFSExtractor<String> initNbFSExtractor(
- CombinedExtractor subextractor) throws IOException{
-
- Chi2NeighborFSExtractor<String> chi2NbFSExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, BaseToken.class, subextractor, featureTrim, new Preceding(4),
- new Following(4)); //the 3rd last parameter is used to control chi2 threshold, the last two are used to control window size
-
- if (this.neighborFsUri != null) {
- chi2NbFSExtractor.load(this.neighborFsUri);
- }
- return chi2NbFSExtractor;
-}
-
-
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
-
- Random rand = new Random();
- // classify tokens within each sentence
- for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
- List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
-
- // during training, the list of all outcomes for the tokens
- List<String> outcomes;
- if (this.isTraining()) {
- List<EventMention> events = JCasUtil.selectCovered(jCas, EventMention.class, sentence);
- outcomes = this.eventChunking.createOutcomes(jCas, tokens, events);
- }
- // during prediction, the list of outcomes predicted so far
- else {
- outcomes = new ArrayList<String>();
- }
-
- // get BIO entity tags for each entity type
- int[] entityTypeIDs = new int[] {
- CONST.NE_TYPE_ID_ANATOMICAL_SITE,
- CONST.NE_TYPE_ID_DISORDER,
- CONST.NE_TYPE_ID_DRUG,
- CONST.NE_TYPE_ID_FINDING,
- CONST.NE_TYPE_ID_PROCEDURE,
- CONST.NE_TYPE_ID_UNKNOWN };
- List<EntityMention> entities = JCasUtil.selectCovered(jCas, EntityMention.class, sentence);
- Map<Integer, List<String>> entityTagsByType = new HashMap<Integer, List<String>>();
- for (int typeID : entityTypeIDs) {
- Predicate<EntityMention> hasTypeID = hasEntityType(typeID);
- List<EntityMention> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
- entityTagsByType.put(typeID, this.entityChunking.createOutcomes(jCas, tokens, subEntities));
- }
-
- // extract features for all tokens
- int tokenIndex = -1;
- int window = 2;
- int nPreviousClassifications = 2;
-
- for (BaseToken token : tokens) {
- ++tokenIndex;
-
- List<Feature> features = new ArrayList<Feature>();
-
- if (featureTrim >0 ){//if feature selection
- features.addAll(this.chi2NeighborFsExtractor.extract(jCas, token)); //base features
- features.addAll(this.chi2NeighborFsExtractor.extractWithin(jCas, token, sentence)); //neighbor features
- features.addAll(this.chi2NeighborFsExtractor.extract(entityTypeIDs, entityTagsByType,tokenIndex, window)); // features from surrounding entities
- features.addAll(this.chi2NeighborFsExtractor.extract(nPreviousClassifications, tokenIndex, outcomes)); //features from previous classifications
- }else{ //if no feature selection
- // features from token attributes
- for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
- features.addAll(extractor.extract(jCas, token));
- }
- // features from surrounding tokens
- for (CleartkExtractor extractor : this.contextFeatureExtractors) {
- features.addAll(extractor.extractWithin(jCas, token, sentence));
- }
- // features from surrounding entities
- for (int typeID : entityTypeIDs) {
- List<String> tokenEntityTags = entityTagsByType.get(typeID);
- int begin = Math.max(tokenIndex - window, 0);
- int end = Math.min(tokenIndex + window, tokenEntityTags.size());
- for (int i = begin; i < end; ++i) {
- String name = String.format("EntityTag_%d_%d", typeID, i - begin);
- features.add(new Feature(name, tokenEntityTags.get(i)));
- }
- }
- // features from previous classifications
- for (int i = nPreviousClassifications; i > 0; --i) {
- int index = tokenIndex - i;
- String previousOutcome = index < 0 ? "O" : outcomes.get(index);
- features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
- }
- }
-
- // if training, write to data file
- if (this.isTraining()) {
- String outcome = outcomes.get(tokenIndex);
- if(outcome.equals("O")){ //if it is an "O". downsample it
- if (rand.nextDouble()<=probabilityOfKeepingANegativeExample)
- this.dataWriter.write(new Instance<String>(outcome, features));
- }else {
- this.dataWriter.write(new Instance<String>(outcome, features));
- }
- }
-
- // if predicting, add prediction to outcomes
- else {
- outcomes.add(this.classifier.classify(features));
- }
- }
-
- // during prediction, convert chunk labels to events and add them to the CAS
- if (!this.isTraining()) {
- this.eventChunking.createChunks(jCas, tokens, outcomes);
- }
- }
- }
-
- private static Predicate<EntityMention> hasEntityType(final int typeID) {
- return new Predicate<EntityMention>() {
- public boolean apply(EntityMention mention) {
- return mention.getTypeID() == typeID;
- }
- };
- }
-
-// private static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
-// private Logger logger = Logger.getLogger(this.getClass().getName());
-//
-// private Map<String, double[]> result = new HashMap<String, double[]>();
-//
-// private int length = -1;
-//
-// @Override
-// public Map<String, double[]> getResult() {
-// return this.result;
-// }
-//
-// @Override
-// public boolean processLine(String line) throws IOException {
-// String[] parts = line.trim().split(",");
-// String key = parts[0];
-// int partsOffset = 0;
-// if (this.length == -1) {
-// this.length = parts.length;
-// } else if (parts.length != this.length) {
-// String message = "expected %d parts, found %d, skipping line '%s'";
-// this.logger.warning(String.format(message, this.length, parts.length, line));
-// return true;
-// }
-// double[] values = new double[parts.length - 1];
-// for (int i = 0; i < values.length; ++i) {
-// values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
-// }
-// this.result.put(key, values);
-// return true;
-// }
-// }
-
-
-public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
- return this.chi2NeighborFsExtractor;
-}
-}
+package org.apache.ctakes.temporal.ae;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+//import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+//import java.util.logging.Logger;
+
+//import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
+import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
+import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
+import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.CleartkAnnotator;
+//import org.cleartk.classifier.DataWriter;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+//import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.chunking.BIOChunking;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.jar.DirectoryDataWriterFactory;
+import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.util.JCasUtil;
+
+//import com.google.common.base.Charsets;
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+//import com.google.common.io.LineProcessor;
+//import com.google.common.io.Resources;
+
+public class EventAnnotator extends CleartkAnnotator<String> {
+
+ public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
+
+ @ConfigurationParameter(
+ name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ mandatory = false,
+ description = "probability that a negative example should be retained for training")
+ protected Float probabilityOfKeepingANegativeExample = 0.8f;
+
+ public static final String PARAM_FEATURE_TRIM_ORNOT = "WhetherToDoFeatureSelection";
+
+ @ConfigurationParameter(
+ name = PARAM_FEATURE_TRIM_ORNOT,
+ mandatory = false,
+ description = "set whether feature selection is used or not")
+ public static Float featureTrim = 0f;
+
+ public static AnalysisEngineDescription createDataWriterDescription(
+ String dataWriterName,
+ File outputDirectory, float downratio, float featureSelect) throws ResourceInitializationException {
+ return AnalysisEngineFactory.createPrimitiveDescription(
+ EventAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ true,
+ DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+ dataWriterName,
+ DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+ outputDirectory,
+ EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+ downratio,
+ EventAnnotator.PARAM_FEATURE_TRIM_ORNOT,
+ featureSelect);
+ }
+
+ public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
+ throws ResourceInitializationException {
+ AnalysisEngineDescription fsEventAnnotator =AnalysisEngineFactory.createPrimitiveDescription(
+ EventAnnotator.class,
+ CleartkAnnotator.PARAM_IS_TRAINING,
+ false,
+ GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+ new File(modelDirectory, "model.jar"));
+ ConfigurationParameterFactory.addConfigurationParameter(
+ fsEventAnnotator,
+ EventAnnotator.PARAM_NB_FS_URI,
+ EventAnnotator.createNbFSURI(modelDirectory) );
+
+ return(fsEventAnnotator);
+ }
+
+ protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
+
+ protected List<CleartkExtractor> contextFeatureExtractors;
+
+ private BIOChunking<BaseToken, EntityMention> entityChunking;
+
+ private BIOChunking<BaseToken, EventMention> eventChunking;
+
+ public static final String PARAM_NB_FS_URI = ConfigurationParameterFactory.createConfigurationParameterName(
+ EventAnnotator.class,
+ "neighborFsUri");
+
+ @ConfigurationParameter(
+ mandatory = false,
+ description = "provides a URI where the neighbor annotation's feature selection data will be written")
+ protected URI neighborFsUri;
+
+ public static final String FS_NEIGHBOR_EXTRACTOR_KEY = "SelectNeighborFeatures";
+
+ private Chi2NeighborFSExtractor<String> chi2NeighborFsExtractor;
+
+
+ public static URI createNbFSURI(File outputDirectoryName) {
+ File f = new File(outputDirectoryName, FS_NEIGHBOR_EXTRACTOR_KEY + "_Chi2_extractor.dat");
+ return f.toURI();
+ }
+ //*****feature selection related parameters
+
+ @Override
+ public void initialize(UimaContext context) throws ResourceInitializationException {
+ super.initialize(context);
+
+ // define chunkings
+ this.entityChunking = new BIOChunking<BaseToken, EntityMention>(
+ BaseToken.class,
+ EntityMention.class,
+ "typeID");
+ this.eventChunking = new BIOChunking<BaseToken, EventMention>(
+ BaseToken.class,
+ EventMention.class);
+
+ //configure FS extractor:
+ if (featureTrim > 0){//if feature selection
+ CombinedExtractor forneighbors = new CombinedExtractor(
+ new CoveredTextExtractor(),
+ new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+ new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+ new SurfaceFormFeatureExtractor(),
+ new PhraseExtractor(),
+ new SRLExtractor());
+
+ try {
+ this.chi2NeighborFsExtractor = initNbFSExtractor(forneighbors);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }else{//if no feature selection
+ // add features: word, stem, pos and more
+ this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
+ // try {
+ this.tokenFeatureExtractors.addAll(Arrays.asList(
+ new CoveredTextExtractor(),
+// new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+// new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
+// new SurfaceFormFeatureExtractor(),
+// new PhraseExtractor(),
+ new SRLExtractor()));
+ // new CoveredTextToValuesExtractor("ACF", StringToDoublesProcessor.parse("/word_freq.lst")),
+ // new CoveredTextToValuesExtractor("PCA", StringToDoublesProcessor.parse("/word_pca.lst")),
+ // new CoveredTextToValuesExtractor("TimPCA", StringToDoublesProcessor.parse("/tim_word_pca.txt"))));
+
+ //add window of features before and after
+ CombinedExtractor subExtractor = new CombinedExtractor(
+ new CoveredTextExtractor(),
+ new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+ new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
+ new SurfaceFormFeatureExtractor(),
+ new SRLExtractor());
+
+ this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
+ this.contextFeatureExtractors.add(new CleartkExtractor(
+ BaseToken.class,
+ subExtractor,
+ new Preceding(3),
+ new Following(3)));
+ }
+
+
+ }
+
+
+private Chi2NeighborFSExtractor<String> initNbFSExtractor(
+ CombinedExtractor subextractor) throws IOException{
+
+ Chi2NeighborFSExtractor<String> chi2NbFSExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, BaseToken.class, subextractor, featureTrim, new Preceding(4),
+ new Following(4)); //the 3rd last parameter is used to control chi2 threshold, the last two are used to control window size
+
+ if (this.neighborFsUri != null) {
+ chi2NbFSExtractor.load(this.neighborFsUri);
+ }
+ return chi2NbFSExtractor;
+}
+
+
+ @Override
+ public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+ Random rand = new Random();
+ // classify tokens within each sentence
+ for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
+ List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
+
+ // during training, the list of all outcomes for the tokens
+ List<String> outcomes;
+ if (this.isTraining()) {
+ List<EventMention> events = JCasUtil.selectCovered(jCas, EventMention.class, sentence);
+ outcomes = this.eventChunking.createOutcomes(jCas, tokens, events);
+ }
+ // during prediction, the list of outcomes predicted so far
+ else {
+ outcomes = new ArrayList<String>();
+ }
+
+ // get BIO entity tags for each entity type
+ int[] entityTypeIDs = new int[] {
+ CONST.NE_TYPE_ID_ANATOMICAL_SITE,
+ CONST.NE_TYPE_ID_DISORDER,
+ CONST.NE_TYPE_ID_DRUG,
+ CONST.NE_TYPE_ID_FINDING,
+ CONST.NE_TYPE_ID_PROCEDURE,
+ CONST.NE_TYPE_ID_UNKNOWN };
+ List<EntityMention> entities = JCasUtil.selectCovered(jCas, EntityMention.class, sentence);
+ Map<Integer, List<String>> entityTagsByType = new HashMap<Integer, List<String>>();
+ for (int typeID : entityTypeIDs) {
+ Predicate<EntityMention> hasTypeID = hasEntityType(typeID);
+ List<EntityMention> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
+ entityTagsByType.put(typeID, this.entityChunking.createOutcomes(jCas, tokens, subEntities));
+ }
+
+ // extract features for all tokens
+ int tokenIndex = -1;
+ int window = 2;
+ int nPreviousClassifications = 2;
+
+ for (BaseToken token : tokens) {
+ ++tokenIndex;
+
+ List<Feature> features = new ArrayList<Feature>();
+
+ if (featureTrim >0 ){//if feature selection
+ features.addAll(this.chi2NeighborFsExtractor.extract(jCas, token)); //base features
+ features.addAll(this.chi2NeighborFsExtractor.extractWithin(jCas, token, sentence)); //neighbor features
+ features.addAll(this.chi2NeighborFsExtractor.extract(entityTypeIDs, entityTagsByType,tokenIndex, window)); // features from surrounding entities
+ features.addAll(this.chi2NeighborFsExtractor.extract(nPreviousClassifications, tokenIndex, outcomes)); //features from previous classifications
+ }else{ //if no feature selection
+ // features from token attributes
+ for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
+ features.addAll(extractor.extract(jCas, token));
+ }
+ // features from surrounding tokens
+ for (CleartkExtractor extractor : this.contextFeatureExtractors) {
+ features.addAll(extractor.extractWithin(jCas, token, sentence));
+ }
+ // features from surrounding entities
+ for (int typeID : entityTypeIDs) {
+ List<String> tokenEntityTags = entityTagsByType.get(typeID);
+ int begin = Math.max(tokenIndex - window, 0);
+ int end = Math.min(tokenIndex + window, tokenEntityTags.size());
+ for (int i = begin; i < end; ++i) {
+ String name = String.format("EntityTag_%d_%d", typeID, i - begin);
+ features.add(new Feature(name, tokenEntityTags.get(i)));
+ }
+ }
+ // features from previous classifications
+ for (int i = nPreviousClassifications; i > 0; --i) {
+ int index = tokenIndex - i;
+ String previousOutcome = index < 0 ? "O" : outcomes.get(index);
+ features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+ }
+ }
+
+ // if training, write to data file
+ if (this.isTraining()) {
+ String outcome = outcomes.get(tokenIndex);
+ if(outcome.equals("O")){ //if it is an "O". downsample it
+ if (rand.nextDouble()<=probabilityOfKeepingANegativeExample)
+ this.dataWriter.write(new Instance<String>(outcome, features));
+ }else {
+ this.dataWriter.write(new Instance<String>(outcome, features));
+ }
+ }
+
+ // if predicting, add prediction to outcomes
+ else {
+ outcomes.add(this.classifier.classify(features));
+ }
+ }
+
+ // during prediction, convert chunk labels to events and add them to the CAS
+ if (!this.isTraining()) {
+ this.eventChunking.createChunks(jCas, tokens, outcomes);
+ }
+ }
+ }
+
+ private static Predicate<EntityMention> hasEntityType(final int typeID) {
+ return new Predicate<EntityMention>() {
+ public boolean apply(EntityMention mention) {
+ return mention.getTypeID() == typeID;
+ }
+ };
+ }
+
+// private static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
+// private Logger logger = Logger.getLogger(this.getClass().getName());
+//
+// private Map<String, double[]> result = new HashMap<String, double[]>();
+//
+// private int length = -1;
+//
+// @Override
+// public Map<String, double[]> getResult() {
+// return this.result;
+// }
+//
+// @Override
+// public boolean processLine(String line) throws IOException {
+// String[] parts = line.trim().split(",");
+// String key = parts[0];
+// int partsOffset = 0;
+// if (this.length == -1) {
+// this.length = parts.length;
+// } else if (parts.length != this.length) {
+// String message = "expected %d parts, found %d, skipping line '%s'";
+// this.logger.warning(String.format(message, this.length, parts.length, line));
+// return true;
+// }
+// double[] values = new double[parts.length - 1];
+// for (int i = 0; i < values.length; ++i) {
+// values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
+// }
+// this.result.put(key, values);
+// return true;
+// }
+// }
+
+
+public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
+ return this.chi2NeighborFsExtractor;
+}
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,73 +1,73 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.ae.feature;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-
-public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
-
- private String name;
-
- private Map<String, double[]> textDoublesMap;
-
- private double[] meanValues;
-
- public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
- super();
- this.name = name;
- this.textDoublesMap = textDoublesMap;
- int nMapEntries = this.textDoublesMap.size();
- if (nMapEntries == 0) {
- throw new IllegalArgumentException("textDoublesMap cannot be empty");
- }
- int nValues = textDoublesMap.entrySet().iterator().next().getValue().length;
- this.meanValues = new double[nValues];
- for (double[] values : textDoublesMap.values()) {
- for (int i = 0; i < values.length; ++i) {
- this.meanValues[i] += values[i];
- }
- }
- for (int i = 0; i < this.meanValues.length; ++i) {
- this.meanValues[i] /= nMapEntries;
- }
- }
-
- @Override
- public List<Feature> extract(JCas view, Annotation annotation) throws CleartkExtractorException {
- double[] values = this.textDoublesMap.get(annotation.getCoveredText());
- if (values == null) {
- values = this.meanValues;
- }
- ArrayList<Feature> features = new ArrayList<Feature>();
- for (int i = 0; i < values.length; ++i) {
- String featureName = Feature.createName(this.name, String.valueOf(i));
- features.add(new Feature(featureName, values[i]));
- }
- return features;
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Feature extractor that looks up an annotation's covered text in a map of
+ * numeric vectors and emits one numeric feature per vector component.
+ * <p>
+ * Text that is not a key of the map falls back to the component-wise mean of
+ * all vectors in the map, so every annotation yields the same number of
+ * features.
+ */
+public class CoveredTextToValuesExtractor implements SimpleFeatureExtractor {
+
+  /** Prefix used when building each feature name. */
+  private final String name;
+
+  /** Covered text to its vector of values. */
+  private final Map<String, double[]> textDoublesMap;
+
+  /** Component-wise mean over every vector in {@link #textDoublesMap}. */
+  private final double[] meanValues;
+
+  /**
+   * Creates an extractor backed by the given map and precomputes the mean
+   * vector used for text that has no entry.
+   *
+   * @param name prefix for the generated feature names
+   * @param textDoublesMap covered text to vector; must be non-empty and all
+   *        vectors must have the same length
+   * @throws IllegalArgumentException if the map is empty or its vectors differ
+   *         in length
+   */
+  public CoveredTextToValuesExtractor(String name, Map<String, double[]> textDoublesMap) {
+    super();
+    this.name = name;
+    this.textDoublesMap = textDoublesMap;
+    int nMapEntries = textDoublesMap.size();
+    if (nMapEntries == 0) {
+      throw new IllegalArgumentException("textDoublesMap cannot be empty");
+    }
+    int nValues = textDoublesMap.values().iterator().next().length;
+    double[] sums = new double[nValues];
+    for (Map.Entry<String, double[]> entry : textDoublesMap.entrySet()) {
+      double[] values = entry.getValue();
+      // A longer vector would overrun the accumulator and a shorter one would
+      // silently skew the mean, so reject ragged input up front.
+      if (values.length != nValues) {
+        throw new IllegalArgumentException(String.format(
+            "all vectors must have length %d, but the vector for '%s' has length %d",
+            nValues,
+            entry.getKey(),
+            values.length));
+      }
+      for (int i = 0; i < nValues; ++i) {
+        sums[i] += values[i];
+      }
+    }
+    for (int i = 0; i < nValues; ++i) {
+      sums[i] /= nMapEntries;
+    }
+    this.meanValues = sums;
+  }
+
+  /**
+   * Emits one feature per component of the vector mapped to the annotation's
+   * covered text, or of the mean vector when the text has no entry.
+   *
+   * @param view the CAS view; not read by this method
+   * @param annotation the annotation whose covered text is looked up
+   * @return features named from {@code name} and the component index, in
+   *         component order
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation annotation) throws CleartkExtractorException {
+    double[] values = this.textDoublesMap.get(annotation.getCoveredText());
+    if (values == null) {
+      values = this.meanValues;
+    }
+    List<Feature> features = new ArrayList<Feature>(values.length);
+    for (int i = 0; i < values.length; ++i) {
+      String featureName = Feature.createName(this.name, String.valueOf(i));
+      features.add(new Feature(featureName, values[i]));
+    }
+    return features;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/CoveredTextToValuesExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,50 +1,50 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.ae.feature;
-
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.ctakes.typesystem.type.syntax.Chunk;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-import org.uimafit.util.JCasUtil;
-
-public class PhraseExtractor implements SimpleFeatureExtractor {
-
- @Override
- public List<Feature> extract(JCas jCas, Annotation token) throws CleartkExtractorException {
- String featureValue = "NotNPVP";
- for (Chunk chunk : JCasUtil.selectCovered(jCas, Chunk.class, token)) {
- String chunkType = chunk.getChunkType();
- if (chunkType.equals("NP")) {
- featureValue = "NP";
- break;
- } else if (chunkType.equals("VP")) {
- featureValue = "VP";
- break;
- }
- }
- return Collections.singletonList(new Feature("PhraseType", featureValue));
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Feature extractor that emits a single {@code "PhraseType"} feature for an
+ * annotation: {@code "NP"} or {@code "VP"} when a {@link Chunk} of that type
+ * is returned by {@code JCasUtil.selectCovered} for the annotation, and
+ * {@code "NotNPVP"} otherwise.
+ */
+public class PhraseExtractor implements SimpleFeatureExtractor {
+
+  /**
+   * Determines the phrase type from the first NP or VP chunk found, in the
+   * iteration order of {@code JCasUtil.selectCovered}.
+   *
+   * @param jCas the CAS to search for chunks
+   * @param token the annotation whose span is searched
+   * @return a one-element list holding the {@code "PhraseType"} feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation token) throws CleartkExtractorException {
+    String featureValue = "NotNPVP";
+    for (Chunk chunk : JCasUtil.selectCovered(jCas, Chunk.class, token)) {
+      String chunkType = chunk.getChunkType();
+      // Constant-first comparison so that a chunk whose type was never set
+      // (null) is skipped instead of aborting extraction with an NPE.
+      if ("NP".equals(chunkType) || "VP".equals(chunkType)) {
+        featureValue = chunkType;
+        break;
+      }
+    }
+    return Collections.singletonList(new Feature("PhraseType", featureValue));
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/PhraseExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,88 +1,88 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.ae.feature;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.textsem.Predicate;
-import org.apache.ctakes.typesystem.type.textsem.SemanticArgument;
-import org.apache.ctakes.typesystem.type.textsem.SemanticRoleRelation;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-import org.uimafit.util.JCasUtil;
-
-public class SRLExtractor implements SimpleFeatureExtractor {
-
- @Override
- public List<Feature> extract(JCas jCas, Annotation focusAnnotation)
- throws CleartkExtractorException {
- // and cache the results so that we only do this once per CAS
- String jCasText = jCas.getDocumentText();
- String roleFeat = "SemanticRole";
- String roleVerbFeat = "RoleAndVerb";
- String verb = "noVerb";
- Feature role = new Feature(roleFeat, "NoRole");
- Feature roleVerb = new Feature(roleVerbFeat, "NoRole"+verb);
- ArrayList<Feature> features = new ArrayList<Feature>();
- for (Predicate predicate : JCasUtil.select(jCas, Predicate.class)) {
-
- for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, predicate)) {
- if (token.equals(focusAnnotation)) {// token.getBegin()==focusAnnotation.getBegin()){
- role = new Feature(roleFeat,"Predicate");
- verb = jCasText.substring(predicate.getBegin(), predicate.getEnd());
- roleVerb = new Feature(roleVerbFeat, "Predicate::"+verb);
-
- features.add(role);
- features.add(roleVerb);
- return features;
- }
- }
-
- for (SemanticRoleRelation relation : JCasUtil.select(
- predicate.getRelations(),
- SemanticRoleRelation.class)) {
- SemanticArgument arg = relation.getArgument();
- // System.out.format("\tArg: %s=%s \n", arg.getLabel(), arg.getCoveredText());
- for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, arg)) {
- if (token.equals(focusAnnotation)) {// token.getBegin()==focusAnnotation.getBegin()){
- String label = arg.getLabel();
- Predicate currentPred = relation.getPredicate();
- verb = jCasText.substring(currentPred.getBegin(), currentPred.getEnd());
- role = new Feature(roleFeat, label);
- roleVerb = new Feature(roleVerbFeat, label+"::"+verb);
-
- features.add(role);
- features.add(roleVerb);
- return features;
- }
- }
- }
- }
-
- features.add(role);
- features.add(roleVerb);
- return features;
- }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.Predicate;
+import org.apache.ctakes.typesystem.type.textsem.SemanticArgument;
+import org.apache.ctakes.typesystem.type.textsem.SemanticRoleRelation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Feature extractor that describes the semantic role of an annotation with
+ * two features: {@code "SemanticRole"} (the role alone) and
+ * {@code "RoleAndVerb"} (the role joined with the predicate's text).
+ * <p>
+ * Predicates are scanned in the order returned by {@code JCasUtil.select};
+ * the first predicate token or argument token equal to the focus annotation
+ * decides the result. When nothing matches, the role is {@code "NoRole"}.
+ */
+public class SRLExtractor implements SimpleFeatureExtractor {
+
+  private static final String ROLE_FEATURE = "SemanticRole";
+
+  private static final String ROLE_VERB_FEATURE = "RoleAndVerb";
+
+  private static final String NO_ROLE = "NoRole";
+
+  private static final String NO_VERB = "noVerb";
+
+  /**
+   * Finds the role of {@code focusAnnotation} relative to the predicates in
+   * the CAS.
+   *
+   * @param jCas the CAS holding predicates, semantic role relations and tokens
+   * @param focusAnnotation the annotation compared (via {@code equals}) with
+   *        each candidate token
+   * @return a two-element list: the role feature, then the role-and-verb feature
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    String documentText = jCas.getDocumentText();
+
+    for (Predicate predicate : JCasUtil.select(jCas, Predicate.class)) {
+
+      // First possibility: the focus annotation is a token of the predicate itself.
+      for (BaseToken predicateToken : JCasUtil.selectCovered(jCas, BaseToken.class, predicate)) {
+        if (!predicateToken.equals(focusAnnotation)) {
+          continue;
+        }
+        String verbText = documentText.substring(predicate.getBegin(), predicate.getEnd());
+        return rolePair("Predicate", "Predicate::" + verbText);
+      }
+
+      // Second possibility: the focus annotation is a token of one of the
+      // predicate's arguments; the role is then the argument's label.
+      for (SemanticRoleRelation relation : JCasUtil.select(
+          predicate.getRelations(),
+          SemanticRoleRelation.class)) {
+        SemanticArgument argument = relation.getArgument();
+        for (BaseToken argumentToken : JCasUtil.selectCovered(jCas, BaseToken.class, argument)) {
+          if (!argumentToken.equals(focusAnnotation)) {
+            continue;
+          }
+          String label = argument.getLabel();
+          Predicate owner = relation.getPredicate();
+          String verbText = documentText.substring(owner.getBegin(), owner.getEnd());
+          return rolePair(label, label + "::" + verbText);
+        }
+      }
+    }
+
+    // No predicate or argument token matched the focus annotation.
+    return rolePair(NO_ROLE, NO_ROLE + NO_VERB);
+  }
+
+  /** Builds the result list: the role feature followed by the role-and-verb feature. */
+  private static List<Feature> rolePair(String role, String roleAndVerb) {
+    ArrayList<Feature> pair = new ArrayList<Feature>();
+    pair.add(new Feature(ROLE_FEATURE, role));
+    pair.add(new Feature(ROLE_VERB_FEATURE, roleAndVerb));
+    return pair;
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,68 +1,68 @@
-package org.apache.ctakes.temporal.ae.feature;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-
-public class SurfaceFormFeatureExtractor implements SimpleFeatureExtractor {
-
- private final String SYMBOL = "Symbol";
- private final String SYMBOL_REG = "\\W+";
- private final String ALL_CAPITAL = "AllCapital";
- private final String ALL_CAPITAL_REG = "[A-Z][A-Z]+";
- private final String FIRST_CAPITAL = "FirstCapital";
- private final String FIRST_CAPITAL_REG = "^[A-Z][a-z]+";
- private final String SINGLE_CAPITAL = "SingelCapital";
- private final String SINGLE_CAPITAL_REG = "^[A-Z]{1}$";
- private final String SINGLE_LETTER ="SingleLetter";
- private final String SINGLE_LETTER_REG = "^[a-z]{1}$";
- private final String ALL_LOWER = "AllLower";
- private final String ALL_LOWER_REG = "[a-z][a-z]+";
- private final String NUMBER = "Number";
- private final String NUMBER_REG ="[\\d]*\\.?[\\d]+";
- private final String WORDNUMMIX ="WordNumberMix";
- private final String WORDNUMMIX_REG ="[\\w][\\w]+";
- private final String FEATURE_SURF = "Surface";
- private final String FEATURE_LENGTH = "Length";
-
- @Override
- public List<Feature> extract(JCas view, Annotation focusAnnotation)
- throws CleartkExtractorException {
- ArrayList<Feature> features = new ArrayList<Feature>();
- String jCasText = view.getDocumentText();
- int begin = focusAnnotation.getBegin();
- int end = focusAnnotation.getEnd();
- String text = jCasText == null ? null : jCasText.substring(begin, end);
- features.add(new Feature(this.FEATURE_SURF, getStrType(text)));
- int length = text == null ? 0 : text.length();
- if (length <=1) features.add(new Feature(this.FEATURE_LENGTH, "single"));
- else features.add(new Feature(this.FEATURE_LENGTH, "multiple"));
-
- // create a single feature from the text
- return features;
- }
-
- public static void main(String[] args) throws Exception {
- SurfaceFormFeatureExtractor se = new SurfaceFormFeatureExtractor();
- String test = "a";
- System.out.println("String type is :" + se.getStrType(test));
- }
-
- private String getStrType(String test) {
- if ( test.matches(this.ALL_CAPITAL_REG)) return this.ALL_CAPITAL;
- else if ( test.matches(ALL_LOWER_REG)) return this.ALL_LOWER;
- else if ( test.matches(FIRST_CAPITAL_REG)) return this.FIRST_CAPITAL;
- else if ( test.matches(NUMBER_REG)) return this.NUMBER;
- else if ( test.matches(SINGLE_CAPITAL_REG)) return this.SINGLE_CAPITAL;
- else if ( test.matches(SINGLE_LETTER_REG)) return this.SINGLE_LETTER;
- else if ( test.matches(SYMBOL_REG)) return this.SYMBOL;
- else if ( test.matches(WORDNUMMIX_REG)) return this.WORDNUMMIX;
- else return "Nomatch";
- }
-
-}
+package org.apache.ctakes.temporal.ae.feature;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+
+/**
+ * Feature extractor that describes the surface form of an annotation's text
+ * with two features: {@code "Surface"} (a coarse character-class category)
+ * and {@code "Length"} ({@code "single"} for at most one character,
+ * {@code "multiple"} otherwise).
+ */
+public class SurfaceFormFeatureExtractor implements SimpleFeatureExtractor {
+
+  private static final String SYMBOL = "Symbol";
+  private static final String SYMBOL_REG = "\\W+";
+  private static final String ALL_CAPITAL = "AllCapital";
+  private static final String ALL_CAPITAL_REG = "[A-Z][A-Z]+";
+  private static final String FIRST_CAPITAL = "FirstCapital";
+  private static final String FIRST_CAPITAL_REG = "^[A-Z][a-z]+";
+  // NOTE(review): "SingelCapital" is misspelled, but it is emitted as a
+  // feature value, so correcting it would change this extractor's output.
+  private static final String SINGLE_CAPITAL = "SingelCapital";
+  private static final String SINGLE_CAPITAL_REG = "^[A-Z]{1}$";
+  private static final String SINGLE_LETTER = "SingleLetter";
+  private static final String SINGLE_LETTER_REG = "^[a-z]{1}$";
+  private static final String ALL_LOWER = "AllLower";
+  private static final String ALL_LOWER_REG = "[a-z][a-z]+";
+  private static final String NUMBER = "Number";
+  private static final String NUMBER_REG = "[\\d]*\\.?[\\d]+";
+  private static final String WORDNUMMIX = "WordNumberMix";
+  private static final String WORDNUMMIX_REG = "[\\w][\\w]+";
+  private static final String NO_MATCH = "Nomatch";
+  private static final String FEATURE_SURF = "Surface";
+  private static final String FEATURE_LENGTH = "Length";
+
+  /**
+   * Emits the surface-category and length features for the text spanned by
+   * the annotation.
+   *
+   * @param view the CAS view whose document text is read; when that text is
+   *        null the annotation is treated as having no text
+   * @param focusAnnotation the annotation whose begin/end offsets select the text
+   * @return a two-element list: the {@code "Surface"} feature, then the
+   *         {@code "Length"} feature
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    List<Feature> features = new ArrayList<Feature>(2);
+    String jCasText = view.getDocumentText();
+    int begin = focusAnnotation.getBegin();
+    int end = focusAnnotation.getEnd();
+    String text = jCasText == null ? null : jCasText.substring(begin, end);
+    features.add(new Feature(FEATURE_SURF, getStrType(text)));
+    int length = text == null ? 0 : text.length();
+    if (length <= 1) {
+      features.add(new Feature(FEATURE_LENGTH, "single"));
+    } else {
+      features.add(new Feature(FEATURE_LENGTH, "multiple"));
+    }
+    return features;
+  }
+
+  public static void main(String[] args) throws Exception {
+    SurfaceFormFeatureExtractor se = new SurfaceFormFeatureExtractor();
+    String test = "a";
+    System.out.println("String type is :" + se.getStrType(test));
+  }
+
+  /**
+   * Classifies a string by the first pattern that matches the whole string.
+   * The checks run in a fixed order, so a string that satisfies several
+   * patterns gets the category of the earliest one.
+   *
+   * @param test the string to classify; may be null
+   * @return the category name, or {@code "Nomatch"} when the string is null
+   *         or matches none of the patterns
+   */
+  private String getStrType(String test) {
+    // extract() passes null when the document has no text; without this
+    // guard the first matches() call would throw a NullPointerException.
+    if (test == null) {
+      return NO_MATCH;
+    }
+    if (test.matches(ALL_CAPITAL_REG)) {
+      return ALL_CAPITAL;
+    } else if (test.matches(ALL_LOWER_REG)) {
+      return ALL_LOWER;
+    } else if (test.matches(FIRST_CAPITAL_REG)) {
+      return FIRST_CAPITAL;
+    } else if (test.matches(NUMBER_REG)) {
+      return NUMBER;
+    } else if (test.matches(SINGLE_CAPITAL_REG)) {
+      return SINGLE_CAPITAL;
+    } else if (test.matches(SINGLE_LETTER_REG)) {
+      return SINGLE_LETTER;
+    } else if (test.matches(SYMBOL_REG)) {
+      return SYMBOL;
+    } else if (test.matches(WORDNUMMIX_REG)) {
+      return WORDNUMMIX;
+    } else {
+      return NO_MATCH;
+    }
+  }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SurfaceFormFeatureExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native