You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2013/08/16 12:51:44 UTC

svn commit: r1514649 - /ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java

Author: james-masanz
Date: Fri Aug 16 10:51:43 2013
New Revision: 1514649

URL: http://svn.apache.org/r1514649
Log:
make a true UIMA collection reader

Modified:
    ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java

Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java?rev=1514649&r1=1514648&r2=1514649&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java Fri Aug 16 10:51:43 2013
@@ -23,52 +23,79 @@ import java.io.FileReader;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
 
+import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
 import org.apache.ctakes.assertion.util.AssertionConst;
 import org.apache.ctakes.core.util.CtakesFileNamer;
-import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.log4j.Logger;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.collection.CollectionReader_ImplBase;
 import org.apache.uima.jcas.JCas;
-import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
 import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.factory.AggregateBuilder;
 import org.uimafit.factory.AnalysisEngineFactory;
-
-import com.google.common.collect.Maps;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.SimplePipeline;
 
 /**
- * assumes knowtator xml files are in "exported-xml" subdirectory
- * and the original plaintext files are in "text" subdirectory
- *
+ * Reads lines from the file named by AssertionConst.NEGEX_CORPUS.
+ * If the main() method is used, it uses this collection reader to
+ * read the corpus and then uses the XMI writer to write
+ * the XMI to the directory given by AssertionConst.NEGEX_CORPUS_PREPROCESSED.
  */
-public class NegExCorpusReader extends JCasAnnotator_ImplBase {
+public class NegExCorpusReader extends CollectionReader_ImplBase {
   static Logger LOGGER = Logger.getLogger(NegExCorpusReader.class);
   
-
-  private static final Map<String, String> SUBJECT_KNOWTATOR_TO_UIMA_MAP;
-  static {
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP = Maps.newHashMap();
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("C0030705", CONST.ATTR_SUBJECT_PATIENT);
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("patient", CONST.ATTR_SUBJECT_PATIENT);
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("family_member", CONST.ATTR_SUBJECT_FAMILY_MEMBER);
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_family_member", CONST.ATTR_SUBJECT_DONOR_FAMILY_MEMBER);
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_other", CONST.ATTR_SUBJECT_DONOR_OTHER);
-    SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("other", CONST.ATTR_SUBJECT_OTHER);
-  }
+  private boolean skipReadingValuesJustReadText;
   
+  private static TypeSystemDescription typeSystemDescription = AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION; // TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath();//.createTypeSystemDescription();
+
 
-  @Override
-  public void process(JCas jCas) throws AnalysisEngineProcessException {
-	  //
+  public NegExCorpusReader() {
+	  this(true);
   }
   
+  public NegExCorpusReader(boolean skipReadingValuesJustReadText) {
+	  this.skipReadingValuesJustReadText = skipReadingValuesJustReadText;
+	  readAndParseAllLines(null);
+  }
   
-  
-  
+  private static List<NegExAnnotation> list;
+  private static List<NegExAnnotation> readAndParseAllLines(String filename) {
+	  if (filename == null || filename.length()==0) {
+		  Exception e = new RuntimeException("Going to continue with default values");
+		  LOGGER.warn(e.getLocalizedMessage());
+		  filename = AssertionConst.NEGEX_CORPUS;
+	  }
+
+	  // For each line of data in the file that contains the negex corpus, parse the line and add parsed data to list.
+	  String [] lines = readNonWhiteSpaceLines(filename);
+	  int n = lines.length;
+	  if (n==0) LOGGER.error(n + " lines found in " + filename);
+	  LOGGER.info("Processing " + n + " lines from the negex file, treating each line as a document.");
+
+	  list = new ArrayList<NegExAnnotation>();
+	  for (String data : lines) {
+		  LOGGER.info("Processing line '" + data + "'.");
+		  try {
+			  list.add(new NegExAnnotation(data));
+		  } catch (RuntimeException e) {
+			  LOGGER.warn("Skipping this one because of RuntimeException");
+		  } 
+	  }
+	  
+	  return list;
+  }
+
   /**
    * This main method is only for testing purposes. It runs the reader on Knowtator directories.
    * 	args[0] = "/usr/data/MiPACQ/copies-of-just-clinical-knowtator-xml-and-text/";
@@ -92,48 +119,38 @@ public class NegExCorpusReader extends J
 		  Exception e = new RuntimeException("Going to continue with default values");
 		  LOGGER.warn(e.getLocalizedMessage());
 		  filename = AssertionConst.NEGEX_CORPUS;
+		  LOGGER.warn("filename: " +  filename);
 	  }
 
-	  AnalysisEngine negexReader = AnalysisEngineFactory.createPrimitive(NegExCorpusReader.class);
+	  
+	  //CollectionReader negexReader = new NegExCorpusReader(false);
+	  //List<NegExAnnotation> list = readAndParseAllLines(filename);
+	  CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
+			  NegExCorpusReader.class,
+				typeSystemDescription
+		);
 
-	  AnalysisEngine xWriter = AnalysisEngineFactory.createPrimitive(
+	  //TypeSystemDescription typeSystemDescription = AssertionComponents.TYPE_SYSTEM_DESCRIPTION; // TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath();//.createTypeSystemDescription();
+	  AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
 			  XWriter.class,
+			  typeSystemDescription,
 			  XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
 			  AssertionConst.NEGEX_CORPUS_PREPROCESSED,
 			  XWriter.PARAM_FILE_NAMER_CLASS_NAME,
 			  CtakesFileNamer.class.getName()
 			  );
 
-	  // For each line of data in the file that contains the negex corpus, parse the line and process the data.
-	  String [] lines = readNonWhiteSpaceLines(filename);
-	  int n = lines.length;
-	  LOGGER.info("Processing " + n + " lines from the negex file, treating each line as a document.");
+	  AggregateBuilder aggregate = new AggregateBuilder();
+	  aggregate.add(xWriter);
+
+	  SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
 	  
-	  for (String data : lines) {
-		  LOGGER.info("Processing line '" + data + "'.");
-		  try {
-			  NegExAnnotation a = new NegExAnnotation(data);
-			  JCas jCas = negexReader.newJCas();
-			  jCas.setDocumentText(a.sentenceText);
-			  DocumentID documentID = new DocumentID(jCas);
-			  documentID.setDocumentID("doc" + a.lineNumber);
-			  documentID.addToIndexes();
-			  IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas);
-			  ia.setBegin(Integer.parseInt(a.begin));
-			  ia.setEnd(Integer.parseInt(a.end));
-			  ia.setPolarity(Integer.parseInt(a.polarity));
-			  ia.addToIndexes();
-			  xWriter.process(jCas);
-		  } catch (RuntimeException e) {
-			  LOGGER.warn("Skipping this one because of RuntimeException");
-		  } 
-	  }
 
   }
 
 
 
-  private static String[] readNonWhiteSpaceLines(String filename) {
+private static String[] readNonWhiteSpaceLines(String filename) {
 	  List<String> lines = new ArrayList<String>();
 	  BufferedReader br  = null;
 	  try {
@@ -157,4 +174,63 @@ public class NegExCorpusReader extends J
 	  return lines.toArray(new String[0]);
 	  
   }
+
+
+
+
+private static int i = 0;
+
+/**
+ * Does more than a typical reader - also creates an IdentifiedAnnotation
+ * and a DocumentID annotation
+ */
+@Override
+public void getNext(CAS aCAS) throws IOException, CollectionException {
+	JCas jCas;
+	try {
+		jCas = aCAS.getJCas();
+	} catch(CASException e){
+		throw new CollectionException(e);
+	}
+	
+	NegExAnnotation a = list.get(i);
+	i++;
+
+	jCas.setDocumentText(a.sentenceText);
+	DocumentID documentID = new DocumentID(jCas);
+	documentID.setDocumentID("doc" + a.lineNumber);
+	documentID.addToIndexes();
+	IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas);
+	ia.setBegin(Integer.parseInt(a.begin));
+	ia.setEnd(Integer.parseInt(a.end));
+	if (!skipReadingValuesJustReadText) ia.setPolarity(Integer.parseInt(a.polarity));
+	ia.addToIndexes();
+
+}
+
+@Override
+public boolean hasNext() throws IOException, CollectionException {
+
+	try {
+		return i < list.size();
+	} catch (Exception e) { // list == null for example
+		throw new CollectionException(e);
+	}
+			
+}
+
+
+
+@Override
+public void close() throws IOException {
+	// TODO Auto-generated method stub
+	
+}
+
+@Override
+public Progress[] getProgress() {
+	Progress p = new ProgressImpl(i, list.size(), Progress.ENTITIES);
+	return new Progress[]{ p};
+}
+
 }