You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2013/08/14 23:03:05 UTC

svn commit: r1514049 - in /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion: cr/NegExAnnotation.java cr/NegExCorpusReader.java util/AssertionConst.java

Author: james-masanz
Date: Wed Aug 14 21:03:05 2013
New Revision: 1514049

URL: http://svn.apache.org/r1514049
Log:
1st pass at a reader for negex gold standard.

Added:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java   (with props)
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java   (with props)
Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java?rev=1514049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java Wed Aug 14 21:03:05 2013
@@ -0,0 +1,135 @@
+package org.apache.ctakes.assertion.cr;
+
+import java.util.Date;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Parses one line of the NegEx gold standard into an entity span and its assertion values.
+ * Each line consists of the following tab-separated fields: line number, condition (entity text
+ * from within the sentence), sentence, and negation status (Negated, Affirmed or Possible).
+ * For the few instances where there is something wrong with the gold standard, corrects or rejects
+ * the line (e.g. where the entity is longer than the sentence).
+ */
+public class NegExAnnotation {
+	static final Logger LOGGER = Logger.getLogger(NegExAnnotation.class);
+
+	// Texts that are known to be wrong in the gold standard, paired with their corrections.
+	private static final String INCORRECT_LINE1 = "OSTEOCHONDRAL IRREGULARITY WITHIN THE 45 DEGREE FLEXION ZONE  OF THE LATERAL FEMORAL CONDYLE COMPATIBLE WITH OSTEOCHONDRAL  LESION. INCREASED SCLEROSIS WITHIN THIS REGION";
+	private static final String CORRECTED_LINE1 = "OSTEOCHONDRAL IRREGULARITY WITHIN THE 45 DEGREE FLEXION ZONE  OF THE LATERAL FEMORAL CONDYLE COMPATIBLE WITH OSTEOCHONDRAL  LESION.";
+	private static final String INCORRECT_LINE2 = "The patient states   that she was able to tolerate some p.o.";
+	private static final String CORRECTED_LINE2 = "The patient states   that she was able to tolerate some p.o. fluids";
+	private static final String INCORRECT_LINE3 = "RIGHT THYROID:  SATISFACTORY FOR INTERPRETATION.  NEGATIVE FOR MALIGNANT CELLS.  COLLOID NODULE";
+	private static final String CORRECTED_LINE3 = "RIGHT THYROID:  SATISFACTORY FOR INTERPRETATION.";
+
+	String lineNumber;
+	String entityCoveredText;
+	String sentenceText;
+	String polarity; // -1 means negated. 1 means not negated. Note, shares field with possible
+	String possible; // 1 means possible. 0 = either negated or affirmed. shares field with negated/polarity
+	String temporality; // not assigned by this class: lines that carry a temporality value are rejected
+	String experiencer; // not assigned by this class: lines that carry an experiencer value are rejected
+	String begin; // offset of the entity within sentenceText, as a decimal string
+	String end; // begin plus the length of entityCoveredText, as a decimal string
+
+	/**
+	 * Parses a single tab-separated line of the gold standard.
+	 *
+	 * @param lineWithAnnotation the raw line; needs at least the line number, entity, sentence
+	 *        and negation status fields
+	 * @throws RuntimeException if the line is blank, has too few fields, has an unrecognized
+	 *         negation status, has a non-empty temporality or experiencer field, is a known bad
+	 *         gold standard entry, or the entity cannot be found within the sentence
+	 */
+	public NegExAnnotation(String lineWithAnnotation) {
+		if (lineWithAnnotation.trim().length()==0) throw new RuntimeException("no annotation or sentence data found");
+
+		String [] fields = lineWithAnnotation.split("\t");
+		int numRequiredFields = 4;
+		if (fields.length < numRequiredFields) {
+			throw new RuntimeException("Not enough fields on line '" + lineWithAnnotation + "', need at least " + numRequiredFields + " fields.");
+		}
+
+		lineNumber = fields[0].trim();
+		entityCoveredText = fields[1].trim();
+		if (entityCoveredText.toLowerCase().equals(INCORRECT_LINE1.toLowerCase())) { // correct an error in the gold standard
+			entityCoveredText = CORRECTED_LINE1;
+		}
+		if (entityCoveredText.length()<1) throw new RuntimeException("Error parsing entityCoveredText from line '" + lineWithAnnotation + "'");
+
+		// Cut these entities back to their known leading text.
+		if (entityCoveredText.startsWith("Pharynx good.")) entityCoveredText = "Pharynx good.";
+		if (entityCoveredText.toLowerCase().startsWith("neck:  supple.")) entityCoveredText = entityCoveredText.substring(0,"NECK:  Supple.".length());
+		if (entityCoveredText.toLowerCase().equals(INCORRECT_LINE3.toLowerCase())) { // correct an error in the gold standard
+			entityCoveredText = CORRECTED_LINE3;
+		}
+
+		if (entityCoveredText.toLowerCase().equals("tolerating p.o. intake")) {
+			//1290	tolerating p.o. intake	intake and voiding without difficulty and ambulating   independently.	Affirmed
+			LOGGER.warn("Unable to handle at this time because gold standard is incorrect");
+			throw new RuntimeException("Skip this one as gold standard has a problem");
+		}
+
+		sentenceText = fields[2].trim();
+		if (sentenceText.equals(INCORRECT_LINE2)) sentenceText = CORRECTED_LINE2; // correct an error in the gold standard
+
+		// Locate the entity within the sentence, ignoring case.
+		String sentenceLowerCase = sentenceText.toLowerCase();
+		int position = sentenceLowerCase.indexOf(entityCoveredText.toLowerCase());
+		if (position<0) {
+			// Retry after stripping one leading and one trailing double quote from the entity.
+			if (entityCoveredText.startsWith("\"")) entityCoveredText = entityCoveredText.substring(1);
+			// endsWith, unlike charAt, is safe when stripping the leading quote left nothing behind
+			if (entityCoveredText.endsWith("\"")) entityCoveredText = entityCoveredText.substring(0, entityCoveredText.length()-1);
+			// an empty entity would "match" at offset 0, so treat it as not found
+			position = entityCoveredText.length()==0 ? -1 : sentenceLowerCase.indexOf(entityCoveredText.toLowerCase());
+			if (position<0) {
+				throw new RuntimeException("Did not find entity text '" + entityCoveredText + "' within sentence '" + sentenceText + "'");
+			}
+		}
+		// NOTE(review): this check is case-sensitive although the searches are not -- confirm that is intended.
+		if (sentenceText.substring(position+1).contains(entityCoveredText)) {
+			LOGGER.error("Assuming 2nd occurrence is correct occurenence of '" + entityCoveredText + "'.");
+			position = sentenceLowerCase.indexOf(entityCoveredText.toLowerCase(), position+1);
+		}
+		begin = position + "";
+		end = (position +  entityCoveredText.length()) + "";
+
+		String field3LowerCase = fields[3].trim().toLowerCase();
+		if (field3LowerCase.equals("possible")) {
+			polarity = "1";
+			possible = "1";
+		} else if (field3LowerCase.equals("affirmed")) {
+			polarity = "1";
+			possible = "0";
+		} else if (field3LowerCase.equals("negated")) {
+			polarity = "-1";
+			possible = "0";
+		} else {
+			// fail here rather than leaving polarity and possible null for callers to trip over
+			throw new RuntimeException("Unrecognized negation status '" + fields[3] + "' on line '" + lineWithAnnotation + "'");
+		}
+
+		// split never produces null elements, so only emptiness needs checking
+		if (fields.length > 4 && fields[4].length()>0) throw new RuntimeException("Does not support temporality yet");
+		if (fields.length > 5 && fields[5].length()>0) throw new RuntimeException("Does not support experiencer yet");
+	}
+
+	/** Returns the entity text with its offsets, polarity and possible values, then the sentence. */
+	@Override
+	public String toString() {
+		String s =  entityCoveredText + " (" + begin + ", " + end + ")  polarity=" + polarity + " possible=" + possible;
+		return s + "\n" + "in '" + sentenceText + "'";
+	}
+
+	/**
+	 * Tests parsing of a single hard-coded line.
+	 * @param args not used
+	 */
+	public static void main(String[] args) {
+		String line = "2	pulmonic regurgitation	There is trace PULMONIC REGURGITATION.	Affirmed";
+		NegExAnnotation anno = new NegExAnnotation(line);
+		System.out.println("Was able to create NegExAnnotation successfully at " + new Date());
+		System.out.println(anno.toString());
+	}
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java?rev=1514049&view=auto
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java (added)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java Wed Aug 14 21:03:05 2013
@@ -0,0 +1,339 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.assertion.cr;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.assertion.util.AssertionConst;
+import org.apache.ctakes.core.knowtator.KnowtatorAnnotation;
+import org.apache.ctakes.core.knowtator.KnowtatorXMLParser;
+import org.apache.ctakes.core.util.CtakesFileNamer;
+import org.apache.ctakes.core.util.SHARPKnowtatorXMLDefaults;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.BodyLaterality;
+import org.apache.ctakes.typesystem.type.refsem.BodySide;
+import org.apache.ctakes.typesystem.type.refsem.Course;
+import org.apache.ctakes.typesystem.type.refsem.Date;
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.refsem.LabReferenceRange;
+import org.apache.ctakes.typesystem.type.refsem.LabValue;
+import org.apache.ctakes.typesystem.type.refsem.MedicationDosage;
+import org.apache.ctakes.typesystem.type.refsem.MedicationDuration;
+import org.apache.ctakes.typesystem.type.refsem.MedicationForm;
+import org.apache.ctakes.typesystem.type.refsem.MedicationFrequency;
+import org.apache.ctakes.typesystem.type.refsem.MedicationRoute;
+import org.apache.ctakes.typesystem.type.refsem.MedicationStatusChange;
+import org.apache.ctakes.typesystem.type.refsem.MedicationStrength;
+import org.apache.ctakes.typesystem.type.refsem.OntologyConcept;
+import org.apache.ctakes.typesystem.type.refsem.ProcedureDevice;
+import org.apache.ctakes.typesystem.type.refsem.ProcedureMethod;
+import org.apache.ctakes.typesystem.type.refsem.Severity;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.relation.AffectsTextRelation;
+import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ComplicatesDisruptsTextRelation;
+import org.apache.ctakes.typesystem.type.relation.DegreeOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ManagesTreatsTextRelation;
+import org.apache.ctakes.typesystem.type.relation.ManifestationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.ResultOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.BodyLateralityModifier;
+import org.apache.ctakes.typesystem.type.textsem.BodySideModifier;
+import org.apache.ctakes.typesystem.type.textsem.ConditionalModifier;
+import org.apache.ctakes.typesystem.type.textsem.CourseModifier;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.GenericModifier;
+import org.apache.ctakes.typesystem.type.textsem.HistoryOfModifier;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.LabEstimatedModifier;
+import org.apache.ctakes.typesystem.type.textsem.LabInterpretationModifier;
+import org.apache.ctakes.typesystem.type.textsem.LabMention;
+import org.apache.ctakes.typesystem.type.textsem.LabReferenceRangeModifier;
+import org.apache.ctakes.typesystem.type.textsem.LabValueModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationAllergyModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationDosageModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationDurationModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationFormModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationFrequencyModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
+import org.apache.ctakes.typesystem.type.textsem.MedicationRouteModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationStatusChangeModifier;
+import org.apache.ctakes.typesystem.type.textsem.MedicationStrengthModifier;
+import org.apache.ctakes.typesystem.type.textsem.Modifier;
+import org.apache.ctakes.typesystem.type.textsem.PolarityModifier;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureDeviceModifier;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMethodModifier;
+import org.apache.ctakes.typesystem.type.textsem.SeverityModifier;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.ctakes.typesystem.type.textsem.SubjectModifier;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.ctakes.typesystem.type.textsem.UncertaintyModifier;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.jdom2.JDOMException;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.base.Charsets;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.google.common.io.Files;
+
+/**
+ * Converts the NegEx gold standard into one CAS per line of the corpus file.
+ * <p>
+ * All of the work is done by {@link #main(String[])}: each line is parsed with
+ * {@link NegExAnnotation}, turned into a document holding the sentence plus one
+ * {@link IdentifiedAnnotation} for the entity, and passed to an {@link XWriter}.
+ * <p>
+ * {@link #process(JCas)} itself does nothing, and {@link #getTextURI(JCas)} and the
+ * Knowtator-related helpers are not called from within this class.
+ */
+public class NegExCorpusReader extends JCasAnnotator_ImplBase {
+  static final Logger LOGGER = Logger.getLogger(NegExCorpusReader.class);
+
+  public static final String PARAM_TEXT_DIRECTORY = "TextDirectory";
+  @ConfigurationParameter(
+      name = PARAM_TEXT_DIRECTORY,
+      description = "directory containing the text files (if DocumentIDs are just filenames); "
+          + "defaults to assuming that DocumentIDs are full file paths")
+  private File textDirectory;
+
+  public static final String PARAM_SET_DEFAULTS = "SetDefaults";
+  @ConfigurationParameter(
+      name = PARAM_SET_DEFAULTS,
+      description = "whether or not to set default attribute values if no annotation is present")
+  private boolean setDefaults;
+
+  // Lookup from subject strings to CONST.ATTR_SUBJECT_* values; filled here but not read within this class.
+  private static final Map<String, String> SUBJECT_KNOWTATOR_TO_UIMA_MAP;
+  static {
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP = Maps.newHashMap();
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("C0030705", CONST.ATTR_SUBJECT_PATIENT);
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("patient", CONST.ATTR_SUBJECT_PATIENT);
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("family_member", CONST.ATTR_SUBJECT_FAMILY_MEMBER);
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_family_member", CONST.ATTR_SUBJECT_DONOR_FAMILY_MEMBER);
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_other", CONST.ATTR_SUBJECT_DONOR_OTHER);
+    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("other", CONST.ATTR_SUBJECT_OTHER);
+  }
+
+  /**
+   * Get the URI that the text in this class was loaded from.
+   * <p>
+   * The path is the document's {@link DocumentID}, prefixed with the configured text directory
+   * when there is one. The URI is built from that string directly instead of going through
+   * {@code new File(textPath).toURI()}, because that does not work with something like
+   * "file:/C:/usr/data/MiPACQ/1/xml/0054074073-0.xml".
+   *
+   * @throws AnalysisEngineProcessException if the path is not a syntactically valid URI
+   */
+  protected URI getTextURI(JCas jCas) throws AnalysisEngineProcessException {
+    String textPath = JCasUtil.selectSingle(jCas, DocumentID.class).getDocumentID();
+    if (this.textDirectory != null) {
+      textPath = this.textDirectory + File.separator +  textPath;
+    }
+
+    try {
+      return new URI(textPath);
+    } catch (URISyntaxException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+  }
+
+  /**
+   * Returns the names of the annotators in the Knowtator files that represent the gold standard
+   */
+  protected static String[] getAnnotatorNames() {
+    return new String[] { "cTAKES , Mayo Clinic", "CU annotator ,", "consensus set annotator team" , "cons annotator team", "cons team", "team" }; // these three are what are used by MiPACQ gold standard
+  }
+
+  /** Returns the Knowtator class names listed for disease/disorder annotations. */
+  private static List<String> getDiseaseDisorderKnowtatorClasses() {
+    return Arrays.asList("Disorders");
+  }
+
+  /** Returns the Knowtator class names listed for sign/symptom annotations. */
+  private static List<String> getSignSymptomKnowtatorClasses() {
+    return Arrays.asList("Sign_Symptom", "Finding");
+  }
+
+  /** Returns the Knowtator class names listed for procedure annotations. */
+  private static List<String> getProcedureKnowtatorClasses() {
+    return Arrays.asList(
+        "Diagnostic_procedure",
+        "Laboratory_procedure",
+        "Procedures",
+        "Therapeutic_or_preventive_procedure",
+        "Intervention",
+        "Health_care_activity",
+        "Research_activity");
+  }
+
+  /** Returns the Knowtator class names listed for medication annotations. */
+  private static List<String> getMedicationKnowtatorClasses() {
+    return Arrays.asList("Chemicals_and_drugs", "Pharmacologic_substance");
+  }
+
+  /** Returns the Knowtator class names listed for anatomy annotations. */
+  private static List<String> getAnatomyKnowtatorClasses() {
+    return Arrays.asList("Anatomy");
+  }
+
+  /** Does nothing; the corpus is converted by {@link #main(String[])} instead. */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    //
+  }
+
+  /**
+   * Formats an annotation as {@code SimpleClassName("covered text"[begin,end])}, or as
+   * {@code SimpleClassName(<no-spanned-text>)} when its end is {@link Integer#MIN_VALUE} or its
+   * begin is {@link Integer#MAX_VALUE}.
+   */
+  static String format(Annotation ann) {
+    String result;
+    if (ann.getEnd() == Integer.MIN_VALUE || ann.getBegin() == Integer.MAX_VALUE) {
+      result = "<no-spanned-text>";
+    } else {
+      result = String.format("\"%s\"[%d,%d]", ann.getCoveredText(), ann.getBegin(), ann.getEnd());
+    }
+    return String.format("%s(%s)", ann.getClass().getSimpleName(), result);
+  }
+
+  /**
+   * This main method is only for testing purposes. It runs the reader on a NegEx corpus file,
+   * treating each non-blank line as a document, and writes each one out through an XWriter to
+   * {@link AssertionConst#NEGEX_CORPUS_PREPROCESSED}. Lines that {@link NegExAnnotation} rejects
+   * are logged and skipped.
+   *
+   * @param args args[0] = path to the NegEx file; defaults to {@link AssertionConst#NEGEX_CORPUS}
+   */
+  public static void main(String[] args) throws Exception {
+    String filename;
+    if (args.length != 0) {
+      filename = args[0];
+    } else {
+      LOGGER.warn(String.format(
+          "usage: java %s path/to/negex/file ",
+          NegExCorpusReader.class.getName()));
+      LOGGER.warn("Going to continue with default values");
+      filename = AssertionConst.NEGEX_CORPUS;
+    }
+
+    AnalysisEngine negexReader = AnalysisEngineFactory.createPrimitive(NegExCorpusReader.class);
+
+    AnalysisEngine xWriter = AnalysisEngineFactory.createPrimitive(
+        XWriter.class,
+        XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+        AssertionConst.NEGEX_CORPUS_PREPROCESSED,
+        XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+        CtakesFileNamer.class.getName()
+        );
+
+    // For each line of data in the file that contains the negex corpus, parse the line and process the data.
+    String [] lines = readNonWhiteSpaceLines(filename);
+    LOGGER.info("Processing " + lines.length + " lines from the negex file, treating each line as a document.");
+
+    for (String data : lines) {
+      LOGGER.info("Processing line '" + data + "'.");
+      try {
+        NegExAnnotation a = new NegExAnnotation(data);
+        JCas jCas = negexReader.newJCas();
+        jCas.setDocumentText(a.sentenceText);
+        DocumentID documentID = new DocumentID(jCas);
+        documentID.setDocumentID("doc" + a.lineNumber);
+        documentID.addToIndexes();
+        IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas);
+        ia.setBegin(Integer.parseInt(a.begin));
+        ia.setEnd(Integer.parseInt(a.end));
+        ia.setPolarity(Integer.parseInt(a.polarity));
+        ia.addToIndexes();
+        xWriter.process(jCas);
+      } catch (RuntimeException e) {
+        // pass the exception along so the log shows why the line was skipped
+        LOGGER.warn("Skipping this one because of RuntimeException", e);
+      }
+    }
+  }
+
+  /**
+   * Reads the lines of a file, leaving out those that are empty or only white space.
+   * Reading is best effort: if the file cannot be opened or read, the problem is logged and
+   * whatever lines were read up to that point (possibly none) are returned.
+   *
+   * @param filename path of the file to read, decoded with the platform default charset
+   * @return the non-blank lines in file order, untrimmed; never null
+   */
+  private static String[] readNonWhiteSpaceLines(String filename) {
+    List<String> lines = new ArrayList<String>();
+    BufferedReader br  = null;
+    try {
+      br = new BufferedReader(new FileReader(filename));
+      String line;
+      while ((line=br.readLine())!=null) {
+        if (line.trim().length()>0) {
+          lines.add(line);
+        }
+      }
+    } catch (IOException e) {
+      // report the failure; otherwise an unreadable file is indistinguishable from an empty corpus
+      LOGGER.error("Unable to read '" + filename + "'", e);
+    } finally {
+      if (br!=null) {
+        try {
+          br.close();
+        } catch (IOException e1) {
+          LOGGER.warn("Unable to close '" + filename + "'", e1);
+        }
+      }
+    }
+    return lines.toArray(new String[0]);
+  }
+}

Propchange: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java?rev=1514049&r1=1514048&r2=1514049&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/util/AssertionConst.java Wed Aug 14 21:03:05 2013
@@ -34,10 +34,14 @@ public class AssertionConst {
 	// expects subdirectories called exported-xml and text
 	public static final String MiPACQ_CORPUS = DATA_DIR + "gold_standard/copies-of-just-clinical-knowtator-xml-and-text/";
 
+	public static final String NEGEX_CORPUS = DATA_DIR + "gold_standard/negex/Annotations-1-120-random.txt"; 
+	public static final String NEGEX_CORPUS_PREPROCESSED = DATA_DIR + "preprocessed_data/negex/"; 
+	
 	// Just plaintext files, which will be run through cTAKES, to generate XMI - attributes will then be judged
 	// This in input for cTAKES; the output (evalOutputDir) can then be the input of the judge step.
-	public static final String CORPUS_WO_GOLD_STD_TO_RUN_THROUGH_CTAKES = DATA_DIR + "ActiveLearning/plaintext"; 
-	
+	public static final String CORPUS_WO_GOLD_STD_TO_RUN_THROUGH_CTAKES = DATA_DIR + "ActiveLearning/plaintext";
+
+
 	// specify the model to write (train/crossvalidate) or read (test/crossvalidate).
 	//  please rename for different configurations of training data 
 	public static String modelDirectory = "../ctakes-assertion-res/resources/model/sharp-sprint-train";