You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2013/08/16 12:51:44 UTC
svn commit: r1514649 -
/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
Author: james-masanz
Date: Fri Aug 16 10:51:43 2013
New Revision: 1514649
URL: http://svn.apache.org/r1514649
Log:
make a true UIMA collection reader
Modified:
ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
Modified: ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java?rev=1514649&r1=1514648&r2=1514649&view=diff
==============================================================================
--- ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java (original)
+++ ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/cr/NegExCorpusReader.java Fri Aug 16 10:51:43 2013
@@ -23,52 +23,79 @@ import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import java.util.Map;
+import org.apache.ctakes.assertion.medfacts.cleartk.AssertionComponents;
import org.apache.ctakes.assertion.util.AssertionConst;
import org.apache.ctakes.core.util.CtakesFileNamer;
-import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.log4j.Logger;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
-import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.factory.AggregateBuilder;
import org.uimafit.factory.AnalysisEngineFactory;
-
-import com.google.common.collect.Maps;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.SimplePipeline;
/**
- * assumes knowtator xml files are in "exported-xml" subdirectory
- * and the original plaintext files are in "text" subdirectory
- *
+ * Reads lines from the file named by AssertionConst.NEGEX_CORPUS.
+ * If the main() method is used, it uses this collection reader to
+ * read the corpus and then uses the XMI writer to write
+ * the XMI to the directory given by AssertionConst.NEGEX_CORPUS_PREPROCESSED.
*/
-public class NegExCorpusReader extends JCasAnnotator_ImplBase {
+public class NegExCorpusReader extends CollectionReader_ImplBase {
static Logger LOGGER = Logger.getLogger(NegExCorpusReader.class);
-
- private static final Map<String, String> SUBJECT_KNOWTATOR_TO_UIMA_MAP;
- static {
- SUBJECT_KNOWTATOR_TO_UIMA_MAP = Maps.newHashMap();
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("C0030705", CONST.ATTR_SUBJECT_PATIENT);
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("patient", CONST.ATTR_SUBJECT_PATIENT);
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("family_member", CONST.ATTR_SUBJECT_FAMILY_MEMBER);
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_family_member", CONST.ATTR_SUBJECT_DONOR_FAMILY_MEMBER);
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("donor_other", CONST.ATTR_SUBJECT_DONOR_OTHER);
- SUBJECT_KNOWTATOR_TO_UIMA_MAP.put("other", CONST.ATTR_SUBJECT_OTHER);
- }
+ private boolean skipReadingValuesJustReadText;
+ private static TypeSystemDescription typeSystemDescription = AssertionComponents.CTAKES_CTS_TYPE_SYSTEM_DESCRIPTION; // TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath();//.createTypeSystemDescription();
+
- @Override
- public void process(JCas jCas) throws AnalysisEngineProcessException {
- //
+ public NegExCorpusReader() {
+ this(true);
}
+ public NegExCorpusReader(boolean skipReadingValuesJustReadText) {
+ this.skipReadingValuesJustReadText = skipReadingValuesJustReadText;
+ readAndParseAllLines(null);
+ }
-
-
+ private static List<NegExAnnotation> list;
+ private static List<NegExAnnotation> readAndParseAllLines(String filename) {
+ if (filename == null || filename.length()==0) {
+ Exception e = new RuntimeException("Going to continue with default values");
+ LOGGER.warn(e.getLocalizedMessage());
+ filename = AssertionConst.NEGEX_CORPUS;
+ }
+
+ // For each line of data in the file that contains the negex corpus, parse the line and add parsed data to list.
+ String [] lines = readNonWhiteSpaceLines(filename);
+ int n = lines.length;
+ if (n==0) LOGGER.error(n + " lines found in " + filename);
+ LOGGER.info("Processing " + n + " lines from the negex file, treating each line as a document.");
+
+ list = new ArrayList<NegExAnnotation>();
+ for (String data : lines) {
+ LOGGER.info("Processing line '" + data + "'.");
+ try {
+ list.add(new NegExAnnotation(data));
+ } catch (RuntimeException e) {
+ LOGGER.warn("Skipping this one because of RuntimeException");
+ }
+ }
+
+ return list;
+ }
+
/**
* This main method is only for testing purposes. It runs the reader on Knowtator directories.
* args[0] = "/usr/data/MiPACQ/copies-of-just-clinical-knowtator-xml-and-text/";
@@ -92,48 +119,38 @@ public class NegExCorpusReader extends J
Exception e = new RuntimeException("Going to continue with default values");
LOGGER.warn(e.getLocalizedMessage());
filename = AssertionConst.NEGEX_CORPUS;
+ LOGGER.warn("filename: " + filename);
}
- AnalysisEngine negexReader = AnalysisEngineFactory.createPrimitive(NegExCorpusReader.class);
+
+ //CollectionReader negexReader = new NegExCorpusReader(false);
+ //List<NegExAnnotation> list = readAndParseAllLines(filename);
+ CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
+ NegExCorpusReader.class,
+ typeSystemDescription
+ );
- AnalysisEngine xWriter = AnalysisEngineFactory.createPrimitive(
+ //TypeSystemDescription typeSystemDescription = AssertionComponents.TYPE_SYSTEM_DESCRIPTION; // TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath();//.createTypeSystemDescription();
+ AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
XWriter.class,
+ typeSystemDescription,
XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
AssertionConst.NEGEX_CORPUS_PREPROCESSED,
XWriter.PARAM_FILE_NAMER_CLASS_NAME,
CtakesFileNamer.class.getName()
);
- // For each line of data in the file that contains the negex corpus, parse the line and process the data.
- String [] lines = readNonWhiteSpaceLines(filename);
- int n = lines.length;
- LOGGER.info("Processing " + n + " lines from the negex file, treating each line as a document.");
+ AggregateBuilder aggregate = new AggregateBuilder();
+ aggregate.add(xWriter);
+
+ SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
- for (String data : lines) {
- LOGGER.info("Processing line '" + data + "'.");
- try {
- NegExAnnotation a = new NegExAnnotation(data);
- JCas jCas = negexReader.newJCas();
- jCas.setDocumentText(a.sentenceText);
- DocumentID documentID = new DocumentID(jCas);
- documentID.setDocumentID("doc" + a.lineNumber);
- documentID.addToIndexes();
- IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas);
- ia.setBegin(Integer.parseInt(a.begin));
- ia.setEnd(Integer.parseInt(a.end));
- ia.setPolarity(Integer.parseInt(a.polarity));
- ia.addToIndexes();
- xWriter.process(jCas);
- } catch (RuntimeException e) {
- LOGGER.warn("Skipping this one because of RuntimeException");
- }
- }
}
- private static String[] readNonWhiteSpaceLines(String filename) {
+private static String[] readNonWhiteSpaceLines(String filename) {
List<String> lines = new ArrayList<String>();
BufferedReader br = null;
try {
@@ -157,4 +174,63 @@ public class NegExCorpusReader extends J
return lines.toArray(new String[0]);
}
+
+
+
+
+private static int i = 0;
+
+/**
+ * Does more than a typical reader: it also creates an IdentifiedAnnotation
+ * and a DocumentID annotation.
+ */
+@Override
+public void getNext(CAS aCAS) throws IOException, CollectionException {
+ JCas jCas;
+ try {
+ jCas = aCAS.getJCas();
+ } catch(CASException e){
+ throw new CollectionException(e);
+ }
+
+ NegExAnnotation a = list.get(i);
+ i++;
+
+ jCas.setDocumentText(a.sentenceText);
+ DocumentID documentID = new DocumentID(jCas);
+ documentID.setDocumentID("doc" + a.lineNumber);
+ documentID.addToIndexes();
+ IdentifiedAnnotation ia = new IdentifiedAnnotation(jCas);
+ ia.setBegin(Integer.parseInt(a.begin));
+ ia.setEnd(Integer.parseInt(a.end));
+ if (!skipReadingValuesJustReadText) ia.setPolarity(Integer.parseInt(a.polarity));
+ ia.addToIndexes();
+
+}
+
+@Override
+public boolean hasNext() throws IOException, CollectionException {
+
+ try {
+ return i < list.size();
+ } catch (Exception e) { // list == null for example
+ throw new CollectionException(e);
+ }
+
+}
+
+
+
+@Override
+public void close() throws IOException {
+ // TODO Auto-generated method stub
+
+}
+
+@Override
+public Progress[] getProgress() {
+ Progress p = new ProgressImpl(i, list.size(), Progress.ENTITIES);
+ return new Progress[]{ p};
+}
+
}