You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:23:07 UTC
svn commit: r1500511 [4/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./ main/ main/java/ main/java/org/ main/java/org/apache/ main/java/org/apache/uima/ main/java/org/apache/uima/examples/ main/java/org/spin/ main/java/org/spin/scrubber/ main/java...

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,574 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.spin.scrubber.uima.consumer;
+
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.collection.base_cpm.CasObjectProcessor;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.util.ProcessTrace;
+import org.spin.scrubber.uima.type.Calculation;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Iterator;
+
+/**
+ * An example of CAS Consumer. <br>
+ * AnnotationPrinter prints to an output file all annotations in the CAS. <br>
+ * Parameters needed by the AnnotationPrinter are
+ * <ol>
+ * <li> "outputFile" : path of the file to which the output should be written.</li>
+ * </ol>
+ * <br>
+ * These parameters are set in the initialize method to the values specified in the descriptor file.
+ * <br>
+ * These may also be set by the application by using the setConfigParameterValue methods.
+ * 
+ * 
+ */
+
+@Deprecated
+public class AnnotationPrinter extends CasConsumer_ImplBase implements CasObjectProcessor
+{
+  File outFile;
+
+  FileWriter fileWriter;
+
+  public AnnotationPrinter() {
+  }
+
+  /**
+   * Initializes this CAS Consumer with the parameters specified in the descriptor.
+   * 
+   * @throws ResourceInitializationException
+   *           if there is error in initializing the resources
+   */
+  public void initialize() throws ResourceInitializationException {
+    // The output path comes from the "outputFile" descriptor parameter and is mandatory.
+    final String configuredPath = (String) getUimaContext().getConfigParameterValue("outputFile");
+    if (configuredPath == null) {
+      throw new ResourceInitializationException(
+              ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[] { "outputFile" });
+    }
+
+    outFile = new File(configuredPath.trim());
+
+    // Create the parent directory when it is missing; failing to do so is reported
+    // as an invalid "outputFile" value.
+    final File parentDir = outFile.getParentFile();
+    if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
+      throw new ResourceInitializationException(
+              ResourceInitializationException.RESOURCE_DATA_NOT_VALID, new Object[] { configuredPath,
+                  "outputFile" });
+    }
+
+    // Opening the writer creates (or truncates) the output file.
+    try {
+      fileWriter = new FileWriter(outFile);
+    } catch (IOException ioe) {
+      throw new ResourceInitializationException(ioe);
+    }
+  }
+
+  /**
+   * Processes the CasContainer which was populated by the TextAnalysisEngines. <br>
+   * In this case, the CAS index is iterated over selected annotations and printed out into an
+   * output file
+   * 
+   * @param aCAS
+   *          CasContainer which has been populated by the TAEs
+   * 
+   * @throws ResourceProcessException
+   *           if there is an error in processing the Resource
+   * 
+   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(CAS)
+   */
+  public synchronized void processCas(CAS aCAS) throws ResourceProcessException {
+    JCas jcas;
+    try {
+      jcas = aCAS.getJCas();
+    } catch (CASException e) {
+      throw new ResourceProcessException(e);
+    }
+
+        
+    boolean titleP = false;
+    String docUri = null;
+    Iterator it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
+    if (it.hasNext()) {
+      SourceDocumentInformation srcDocInfo = (SourceDocumentInformation) it.next();
+      docUri = srcDocInfo.getUri();
+    }
+
+    try
+	{
+		fileWriter.write("\n\n<++++NEW DOCUMENT++++>\t"+new Date(System.currentTimeMillis())+"\n");
+		if (docUri != null)
+            fileWriter.write("DOCUMENT URI:" + docUri + "\n");
+		fileWriter.write("\n");
+	} catch (IOException e1)
+	{
+		// TODO Auto-generated catch block
+		e1.printStackTrace();
+	}
+    
+	this.printNounPhraseAnnotations(fileWriter, jcas);
+	this.printWordTokenAnnotations(fileWriter, jcas);
+	this.printOntologyMatch(fileWriter, jcas);
+	this.printCalculationAnnotations(fileWriter, jcas);
+	
+
+//TODO: remove ???
+//    // iterate and print annotations
+//    Iterator annotationIter = jcas.getAnnotationIndex().iterator();
+//    while (annotationIter.hasNext()) {
+//      Annotation annot = (Annotation) annotationIter.next();
+//      if (titleP == false) {
+//        try {
+//          fileWriter.write("\n\n<++++NEW DOCUMENT++++>\t"+new Date(System.currentTimeMillis())+"\n");
+//          if (docUri != null)
+//            fileWriter.write("DOCUMENT URI:" + docUri + "\n");
+//          fileWriter.write("\n");
+//        } catch (IOException e) {
+//          throw new ResourceProcessException(e);
+//        }
+//        titleP = true;
+//      }
+//      // get the text that is enclosed within the annotation in the CAS
+//      String aText = annot.getCoveredText();
+//      aText = aText.replace('\n', ' ');
+//      aText = aText.replace('\r', ' ');
+//      
+//      //if aText is empty, continue;
+//      if (aText.trim().length()<1)
+//      {
+//    	  continue;
+//      }
+//      
+//      //System.out.println("TYPE: "+annot.getType().getShortName());
+//      
+//    //get Features... //TODO: there must be a better way to do this...
+//      Feature posFeat=null;
+//      String posTag=null;
+//      try
+//      {
+//    	  String featName = (String) getUimaContext().getConfigParameterValue("pos");
+//    	  posFeat = annot.getCAS().getTypeSystem().getFeatureByFullName(featName);
+//    	  if (posFeat!=null)
+//    		  posTag = annot.getFeatureValueAsString(posFeat);
+//      }
+//      catch(Exception e)
+//      {
+//    	  System.out.println("CONTINUABLE ERROR: unable to get pos feature value. may be null. continuing.");
+//    	  //e.printStackTrace();
+//      }
+////      //only print if pos = {noun, null}
+////      if( posTag!=null && !posTag.startsWith("nn") && !posTag.startsWith("np") )
+////      {
+////    	  continue;
+////      }
+//      
+//      //only print WordToken annots
+//      //TODO: fix this, there are probably other Types we want, numbers for example for an address. 
+//      //this if block is here to prevent the same token from being printed multiple times for each Type, ex. lookupwindow, wordtoken, segment, etc...
+//      if (!annot.getType().getShortName().equalsIgnoreCase("WordToken") && !annot.getType().getShortName().equalsIgnoreCase("OntologyMatch"))
+//      {
+//    	  continue;
+//      }
+//      
+//
+//      String ontArrTag=null;
+////      try
+////      {
+//    	  //1    	  
+////    	  NamedEntity ne = new NamedEntity(annot.getCAS().getJCas());
+////    	  FSArray fsArr = ne.getOntologyConceptArr();
+////    	  for (int i=0; i<fsArr.size(); i++)
+////    	  {
+////    		  FeatureStructure fsStruct = fsArr.get(i);
+////    		  ontArrTag = fsStruct.toString();
+////    	  }
+//    	  
+//    	  //2
+////    	  String ontArrName = (String) getUimaContext().getConfigParameterValue("ontologyConceptArr");
+////    	  Feature ontArrFeat=null;
+////    	  FeatureStructure ontArrFS=null;
+////    	  ontArrFeat = annot.getCAS().getTypeSystem().getFeatureByFullName(ontArrName);
+////    	  if (ontArrFeat!=null)
+////    	  {    		  
+//////    		  ontArrTag = annot.getFeatureValueAsString(ontArrFeat);
+////    		  ontArrFS = annot.getFeatureValue(ontArrFeat);
+////    		  ontArrTag += ontArrFS.toString();
+////    	  }
+//    	  
+////    	  //3 - this gets all the annotations of type NamedEntity from this jcas. 
+////    	  //which means it gives all the cui's for the entire doc on every word.
+////    	  //what we want is only the cuis for the current annot.
+////    	  Iterator neIt = annot.getCAS().getJCas().getAnnotationIndex(NamedEntity.type).iterator();
+////    	  while(neIt.hasNext())
+////    	  {
+////    		  NamedEntity ne = (NamedEntity)neIt.next();
+////    		  if (ne.getBegin()==annot.getBegin()) //only print NE annotations if they match the current annot start position.
+////    		  {
+////	    		  FSArray fsArr = ne.getOntologyConceptArr();
+////	        	  for (int i=0; i<fsArr.size(); i++)
+////	        	  {
+////	        		  FeatureStructure fsStruct = fsArr.get(i);
+////	        		  ontArrTag += "\n"+fsStruct.toString();
+////	//        		  System.out.println(fsStruct.getCAS().getJCas().get)
+////	        		  System.out.println("ontArrTag: " + ontArrTag);
+////	        	  }
+////    		  }
+////    	  }
+////      }
+////      catch(Exception e)
+////      {
+////    	  System.out.println("CONTINUABLE ERROR: unable to get ontologyConceptArr feature value. may be null. continuing.");
+////    	  e.printStackTrace();
+////      }
+//      
+//      try 
+//      {
+//        fileWriter.write("begin: " + annot.getBegin() 
+//        		+ "\t end: " + annot.getEnd() 
+//        		+ "\t num: " + annot.getType().getNumberOfFeatures()
+//        		+ "\t POS: " + posTag
+//        		+ "\t\t name: " + annot.getType().getShortName()
+//        		+ "\t token: " + aText
+//        		+ "\t ontArr: " + ontArrTag
+////        		+ "\t name: " + annot.getType().getName()
+//        		+ "\n");
+////        for (Feature f : annot.getType().getFeatures())
+////        {
+////        	fileWriter.write("feat: " + f.getName()+ "\n");
+////        }
+////        fileWriter.write("\n\n\n");
+//        
+//        fileWriter.flush();
+//      } 
+//      catch (IOException e) 
+//      {
+//        throw new ResourceProcessException(e);
+//      }
+//    }
+  }
+
+  /**
+   * Writes one line for every WordToken annotation in the CAS whose covered text is not blank.
+   * Each line holds the begin/end offsets, the part of speech, the capitalization value,
+   * the fixed ontology label "POS", the short type name and the covered text.
+   * The writer is flushed after every line.
+   *
+   * @param fileWriter
+   *          writer that receives the output
+   * @param jcas
+   *          CAS view whose WordToken index is iterated
+   * @throws ResourceProcessException
+   *           if writing to the output fails
+   */
+  private void printWordTokenAnnotations(FileWriter fileWriter, JCas jcas) throws ResourceProcessException
+  {
+    // iterate and print annotations
+    Iterator annotationIter = jcas.getAnnotationIndex(WordToken.type).iterator();
+    while (annotationIter.hasNext())
+    {
+      WordToken annot = (WordToken) annotationIter.next();
+
+      // line breaks inside the covered text would break the one-line-per-annotation layout
+      String aText = annot.getCoveredText();
+      aText = aText.replace('\n', ' ');
+      aText = aText.replace('\r', ' ');
+
+      // nothing to print for blank tokens
+      if (aText.trim().length()<1)
+      {
+        continue;
+      }
+
+      // The part of speech is read directly from the WordToken below. The former per-token
+      // lookup of the "pos" feature through the UIMA context was removed: its result was
+      // never used and any failure only produced a message on stdout.
+      try
+      {
+        fileWriter.write("begin: " + annot.getBegin()
+            + " end: " + annot.getEnd()
+            + "\t code: " + annot.getPartOfSpeech()
+            + "\t cap: " + annot.getCapitalization()
+            + "\t ont: " + "POS"
+            + "\t\t name: " + annot.getType().getShortName()
+            + "\t token: " + aText
+            + "\n");
+
+        fileWriter.flush();
+      }
+      catch (IOException e)
+      {
+        throw new ResourceProcessException(e);
+      }
+    }
+  }
+  
+  private void printNounPhraseAnnotations(FileWriter fileWriter, JCas jcas) throws ResourceProcessException
+  {
+	  	// iterate and print annotations
+	    Iterator annotationIter = jcas.getAnnotationIndex(Chunk.type).iterator();
+	    while (annotationIter.hasNext()) 
+	    {
+	    	Chunk annot = (Chunk) annotationIter.next();
+	      
+	      // get the text that is enclosed within the annotation in the CAS
+	      String aText = annot.getCoveredText();
+	      aText = aText.replace('\n', ' ');
+	      aText = aText.replace('\r', ' ');
+	      
+	      //if aText is empty, continue;
+	      if (aText.trim().length()<1)
+	      {
+	    	  continue;
+	      }
+	      
+//	      //only want NP chunks
+//	      if (!annot.getChunkType().equalsIgnoreCase("NP"))
+//	      {
+//	    	  continue;
+//	      }
+	            
+	      try 
+	      {
+	        fileWriter.write("begin: " + annot.getBegin() 
+	        		+ " end: " + annot.getEnd() 
+	        		+ "\t code: " + annot.getChunkType() 
+	        		+ "\t ont: " + "Chunk"
+	        		+ "\t\t name: " + annot.getType().getShortName()
+	        		+ "\t token: " + aText
+	        		+ "\n");
+	        
+	        fileWriter.flush();
+	      } 
+	      catch (IOException e) 
+	      {
+	        throw new ResourceProcessException(e);
+	      }
+	  }
+  }
+  
+  private void printOntologyMatch(FileWriter fileWriter, JCas jcas) throws ResourceProcessException
+  {
+	    // iterate and print annotations
+	    Iterator<Annotation> annotationIter = jcas.getAnnotationIndex(OntologyMatch.type).iterator();
+	    while (annotationIter.hasNext()) 
+	    {
+	    	OntologyMatch annot = (OntologyMatch) annotationIter.next();
+	      
+	      // get the text that is enclosed within the annotation in the CAS
+	      String aText = annot.getCoveredText();
+	      aText = aText.replace('\n', ' ');
+	      aText = aText.replace('\r', ' ');
+	      
+	      //if aText is empty, continue;
+	      if (aText.trim().length()<1)
+	      {
+	    	  continue;
+	      }
+	      	      
+	      try 
+	      {
+	        fileWriter.write("begin: " + annot.getBegin() 
+	        		+ " end: " + annot.getEnd() 
+//	        		+ "\t num: " + annot.getType().getNumberOfFeatures()
+	        		+ "\t code: " + annot.getCode()
+	        		+ "\t ont: " + annot.getOntology()
+	        		+ "\t\t name: " + annot.getType().getShortName()
+	        		+ "\t token: " + aText
+	        		+ "\n");
+//	        for (Feature f : annot.getType().getFeatures())
+//	        {
+//	        	fileWriter.write("feat: " + f.getName()+ "\n");
+//	        }
+//	        fileWriter.write("\n\n\n");
+	        
+	        fileWriter.flush();
+	      } 
+	      catch (IOException e) 
+	      {
+	        throw new ResourceProcessException(e);
+	      }
+	  }
+  }
+	    
+   private void printCalculationAnnotations(FileWriter fileWriter, JCas jcas) throws ResourceProcessException
+   {
+	   // iterate and print annotations
+  	   Iterator<Annotation> annotationIter = jcas.getAnnotationIndex(Calculation.type).iterator();
+  	   while (annotationIter.hasNext()) 
+  	   {
+  	    	Calculation annot = (Calculation) annotationIter.next();
+  	      
+  	      // get the text that is enclosed within the annotation in the CAS
+  	      String aText = annot.getCoveredText();
+  	      aText = aText.replace('\n', ' ');
+  	      aText = aText.replace('\r', ' ');
+  	      
+  	      //if aText is empty, continue;
+  	      if (aText.trim().length()<1)
+  	      {
+  	    	  continue;
+  	      }
+  	      	      
+  	      try 
+  	      {
+  	        fileWriter.write("begin: " + annot.getBegin() 
+  	        		+ " end: " + annot.getEnd() 
+  	        		+ "\t value: " + annot.getCalculationValue()
+  	        		+ "\t name: " + annot.getCalculationName()
+  	        		+ "\t name: " + annot.getType().getShortName()
+  	        		+ "\t token: " + aText
+  	        		+ "\n");
+  	        
+  	        fileWriter.flush();
+  	      } 
+  	      catch (IOException e) 
+  	      {
+  	        throw new ResourceProcessException(e);
+  	      }
+  	   }
+   }
+  
+    
+  /**
+   * Called when a batch of processing is completed.
+   * 
+   * @param aTrace
+   *          ProcessTrace object that will log events in this method.
+   * @throws ResourceProcessException
+   *           if there is an error in processing the Resource
+   * @throws IOException
+   *           if there is an IO Error
+   * 
+   * @see org.apache.uima.collection.CasConsumer#batchProcessComplete(ProcessTrace)
+   */
+  public void batchProcessComplete(ProcessTrace aTrace) throws ResourceProcessException,
+          IOException {
+    // Nothing to do here: AnnotationPrinter does not accumulate anything across a batch;
+    // the print methods flush the writer after every annotation line they write.
+  }
+
+  /**
+   * Called when the entire collection is completed.
+   * 
+   * @param aTrace
+   *          ProcessTrace object that will log events in this method.
+   * @throws ResourceProcessException
+   *           if there is an error in processing the Resource
+   * @throws IOException
+   *           if there is an IO Error
+   * @see org.apache.uima.collection.CasConsumer#collectionProcessComplete(ProcessTrace)
+   */
+  public void collectionProcessComplete(ProcessTrace aTrace) throws ResourceProcessException,
+          IOException {
+    // the writer only exists once initialize() or reconfigure() has opened it
+    if (fileWriter == null) {
+      return;
+    }
+    fileWriter.close();
+  }
+
+  /**
+   * Reconfigures the parameters of this Consumer. <br>
+   * This is used in conjunction with the setConfigurationParameterValue to set the configuration
+   * parameter values to values other than the ones specified in the descriptor.
+   * 
+   * @throws ResourceConfigurationException
+   *           if the configuration parameter settings are invalid
+   * 
+   * @see org.apache.uima.resource.ConfigurableResource#reconfigure()
+   */
+  public void reconfigure() throws ResourceConfigurationException {
+    super.reconfigure();
+    // extract configuration parameter settings
+    String oPath = (String) getUimaContext().getConfigParameterValue("outputFile");
+    File oFile = new File(oPath.trim());
+    // if output file has changed, close exiting file and open new
+    if (!oFile.equals(this.outFile)) {
+      this.outFile = oFile;
+      try {
+        fileWriter.close();
+
+        // If specified output directory does not exist, try to create it
+        if (oFile.getParentFile() != null && !oFile.getParentFile().exists()) {
+          if (!oFile.getParentFile().mkdirs())
+            throw new ResourceConfigurationException(
+                    ResourceInitializationException.RESOURCE_DATA_NOT_VALID, new Object[] { oPath,
+                        "outputFile" });
+        }
+        fileWriter = new FileWriter(oFile);
+      } catch (IOException e) {
+        throw new ResourceConfigurationException();
+      }
+    }
+  }
+
+  /**
+   * Called if clean up is needed in case of exit under error conditions.
+   * 
+   * @see org.apache.uima.resource.Resource#destroy()
+   */
+  public void destroy() {
+    // nothing to release when no writer was ever opened
+    if (fileWriter == null) {
+      return;
+    }
+    try {
+      fileWriter.close();
+    } catch (IOException ignored) {
+      // ignore IOException on destroy
+    }
+  }
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/AnnotationPrinter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,346 @@
+package org.spin.scrubber.uima.consumer;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
+import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.spin.scrubber.uima.type.Calculation;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * @author Andrew McMurry, MS
+ *         Created: 4/4/13
+ */
+public enum CSVAnnotation
+{
+    //UIMA Annotation
+    _Annotation(Annotation.type,
+                Header.annotation_type,
+                Header.filename_short,
+                Header.start_idx,
+                Header.end_idx),
+
+    _Sentence(Sentence.type,
+              Header.annotation_type,
+              Header.filename_short,
+              Header.start_idx,
+              Header.end_idx,
+              Header.sentence_seq),
+
+    _SentenceText(Sentence.type, Header.token),
+
+     //cTakes BaseToken ...*
+    _BaseToken(BaseToken.type),
+    _NewLineToken(NewlineToken.type),
+    _ContractionToken(ContractionToken.type),
+    _PunctuationToken(PunctuationToken.type),
+    _SymbolToken(SymbolToken.type),
+
+    //WordToken
+    _WordToken(WordToken.type,
+               Header.annotation_type,
+               Header.filename_short,
+               Header.start_idx,
+               Header.end_idx,
+               Header.token_seq,
+               Header.pos,
+               Header.cap,
+               Header.token),
+
+    //Ontology Match
+    _OntologyMatch(OntologyMatch.type,
+                   Header.annotation_type,
+                   Header.filename_short,
+                   Header.start_idx,
+                   Header.end_idx,
+                   Header.match_source,
+                   Header.match_value,
+                   Header.token),
+
+    //Term Frequency
+    _Calculation(Calculation.type,
+                 _OntologyMatch.headers);
+
+
+    //CSV Column Headers
+    public static enum Header
+    {
+        // Leading columns: getCSV(String, Annotation) emits the annotation's short type name,
+        // the file name and the begin/end offsets in exactly this order.
+        annotation_type,
+        filename_short,
+        start_idx,
+        end_idx,
+
+        // Sequence-number columns: sentence_seq is declared for _Sentence, token_seq for the
+        // token types (getCSVBaseToken fills it from BaseToken.getTokenNumber()).
+        sentence_seq,
+        token_seq,
+
+        // Token attributes: pos is filled from getPOS(...) in getCSVBaseToken; cap is
+        // presumably the WordToken capitalization -- TODO confirm against the consumer.
+        pos,
+        cap,
+
+        // Declared for _OntologyMatch and reused by _Calculation; presumably the matching
+        // source and the matched value -- TODO confirm against the consumer.
+        match_source,
+        match_value,
+
+        // Text column; presumably the (escaped) covered text -- TODO confirm.
+        token
+    }
+
+    // Member Variables
+    public  final int       type;
+    private final Header[]  headers;
+
+    private File       file;
+    private Writer     writer;
+    private boolean    isReady = false;
+
+    CSVAnnotation(int type, Header... headers)
+    {
+        this.type    = type;
+        this.headers = headers;
+    }
+
+    //cTakes BaseToken is default
+    CSVAnnotation(int type)
+    {
+        // same column set as before, handed to the varargs constructor
+        this(type,
+             Header.annotation_type,
+             Header.filename_short,
+             Header.start_idx,
+             Header.end_idx,
+             Header.token_seq,
+             Header.pos);
+    }
+
+    //File Writer Functions
+
+    /**
+     * Opens the CSV file of this entry inside OUTPUT_DIR, creating the directory first when
+     * it does not exist. The file name is the constant's name without underscores plus ".csv".
+     * An existing file of that name is overwritten.
+     *
+     * @throws IOException if OUTPUT_DIR cannot be created or the file cannot be opened
+     */
+    private void open() throws IOException
+    {
+        if(!OUTPUT_DIR.exists())
+        {
+            log.debug("Creating  OUTPUT_DIR: "+OUTPUT_DIR.getAbsolutePath());
+
+            // mkdirs() also creates missing parent directories; the isDirectory() re-check
+            // tolerates the directory having been created by someone else in the meantime
+            if(!OUTPUT_DIR.mkdirs() && !OUTPUT_DIR.isDirectory())
+            {
+                throw new IOException("Unable to create OUTPUT_DIR: "+OUTPUT_DIR.getAbsolutePath());
+            }
+        }
+
+        String csv  = this.name().replace("_","") + ".csv";
+
+        this.file    = new File(OUTPUT_DIR+ File.separator + csv);
+        this.writer  = new FileWriter(file); //Default IMPL
+        this.isReady = true;
+
+        log.info("Opened FileWriter "+this.file.getAbsolutePath());
+    }
+
+    /**
+     * Closes the writer of this entry. Does nothing when the entry is not open, i.e. when
+     * nothing was ever written through it or it has already been closed; this keeps
+     * closeAll() from failing on entries that were never used.
+     *
+     * @throws IOException if closing the underlying writer fails
+     */
+    public void close() throws IOException
+    {
+        // writer and file are only assigned by open(); without this guard an unused
+        // entry would fail with a NullPointerException
+        if(!isReady) return;
+
+        this.writer.close();
+        this.isReady = false;
+
+        log.debug("Closed FileWriter "+file.getAbsolutePath());
+    }
+
+    /**
+     * Calls close() on every constant of this enum, in declaration order.
+     * The first exception aborts the loop and is propagated to the caller.
+     *
+     * @throws IOException if closing one of the entries fails
+     */
+    public static void closeAll() throws IOException
+    {
+        final CSVAnnotation[] all = CSVAnnotation.values();
+        for(int i=0; i<all.length; i++)
+        {
+            all[i].close();
+        }
+    }
+
+    /**
+     * Deletes the CSV file of this entry if it exists. Does nothing when the entry was
+     * never opened, because the file is only known once open() has run.
+     */
+    public void delete()
+    {
+        // file is assigned by open(); guard against a NullPointerException on unused entries
+        if(file!=null && file.exists())
+            file.delete();
+    }
+
+    /**
+     * Flushes the writer of this entry; entries that are not open are left alone.
+     *
+     * @throws IOException if flushing the underlying writer fails
+     */
+    private void flush() throws IOException
+    {
+        if(!isReady)
+        {
+            return;
+        }
+        writer.flush();
+    }
+
+    /**
+     * Calls flush() on every constant of this enum, in declaration order.
+     * The first exception aborts the loop and is propagated to the caller.
+     *
+     * @throws IOException if flushing one of the entries fails
+     */
+    public static void flushAll() throws IOException
+    {
+        final CSVAnnotation[] all = CSVAnnotation.values();
+        for(int i=0; i<all.length; i++)
+        {
+            all[i].flush();
+        }
+    }
+
+    /**
+     * Writes the given text to the CSV file of this entry. On the first write (or the first
+     * write after close()) the file is opened and the header line is written first.
+     *
+     * @param contents text to append; written as is, without delimiter or line break
+     * @throws IOException if opening the file or writing to it fails
+     */
+    public void write(String contents) throws IOException
+    {
+        // writeHeaders() re-enters write(); this terminates because open() has already
+        // set isReady to true by then, so the order of these two calls must not change
+        if(!isReady) {open(); writeHeaders();}
+
+        writer.write(contents);
+    }
+
+    /**
+     * Writes the header line returned by getHeaders() through write().
+     * Called from write() right after open(), i.e. while the entry is already marked ready.
+     *
+     * @throws IOException if writing fails
+     */
+    private void writeHeaders() throws IOException
+    {
+        write(getHeaders());
+    }
+
+    /**
+     * Builds the header line of this entry: the names of its Header columns joined with
+     * DELIM and terminated with CRLF (see getLine).
+     *
+     * @return the header line including the line terminator
+     */
+    public String getHeaders()
+    {
+        final String[] names = new String[headers.length];
+
+        int idx = 0;
+        for(Header header : headers)
+        {
+            names[idx++] = header.name();
+        }
+        return getLine(names);
+    }
+
+    /**
+     * Writes the content of the builder as one line; a line terminator is appended
+     * (delegates to writeLine(String...) with a single value).
+     *
+     * @throws IOException if writing fails
+     */
+    public void writeLine(StringBuilder sb) throws IOException
+    {
+        writeLine(sb.toString());
+    }
+
+    /**
+     * Writes the given values as one delimited line, terminated with CRLF (see getLine).
+     *
+     * @throws IOException if writing fails
+     */
+    public void writeLine(String ... strings) throws IOException
+    {
+        write(getLine(strings));
+    }
+
+    /**
+     * Writes a line consisting of the file name, begin/end offsets, a sequence number and
+     * any further values.
+     *
+     * @throws IOException if writing fails
+     */
+    public void writeLine(String filename_short, int start_idx, int end_idx, int seq, String ... strings) throws IOException
+    {
+        writeLine(filename_short, start_idx, end_idx, seq, true, strings);
+    }
+
+    /**
+     * Writes a line consisting of the file name, begin/end offsets and any further values;
+     * no sequence number column is written.
+     *
+     * @throws IOException if writing fails
+     */
+    public void writeLine(String filename_short, int start_idx, int end_idx, String ... strings) throws IOException
+    {
+        // the seq argument (0) is ignored because hasSeq is false
+        writeLine(filename_short, start_idx, end_idx, 0, false, strings);
+    }
+
+    private void writeLine(String filename_short, int start_idx, int end_idx, int seq, boolean hasSeq, String ... strings) throws IOException
+    {
+        String[] indices = hasSeq?
+                asStrings(start_idx, end_idx, seq) :
+                asStrings(start_idx, end_idx);
+
+        write(getCSV(filename_short).append(DELIM).append(
+                getCSV(indices)).
+                toString());
+
+        if(strings.length>0)
+        {
+            write(DELIM);
+            write(getCSV(strings).toString());
+        }
+
+        write(CRLF);
+    }
+
+
+    /**
+     * Joins the values with DELIM (null values become empty fields) and appends CRLF.
+     *
+     * @return the delimited line including the line terminator
+     */
+    public static String getLine(String... strings)
+    {
+        final StringBuilder line = getCSV(strings);
+        line.append(CRLF);
+        return line.toString();
+    }
+
+    /**
+     * Converts every builder to its String content, keeping the order.
+     *
+     * @return a new array of the same length as the argument list
+     */
+    public static String[] asStrings(StringBuilder ... builders)
+    {
+        final String[] result = new String[builders.length];
+
+        int idx = 0;
+        for(StringBuilder builder : builders)
+        {
+            result[idx++] = builder.toString();
+        }
+
+        return result;
+    }
+
+    /**
+     * Converts every int to its decimal String form, keeping the order.
+     *
+     * @return a new array of the same length as the argument list
+     */
+    public static String[] asStrings(int ... ints)
+    {
+        final String[] result = new String[ints.length];
+
+        int idx = 0;
+        for(int value : ints)
+        {
+            result[idx++] = Integer.toString(value);
+        }
+
+        return result;
+    }
+
+    /**
+     * Joins the decimal form of the given ints with DELIM.
+     */
+    public static StringBuilder getCSV(int ... ints)
+    {
+        final String[] asText = asStrings(ints);
+        return getCSV(asText);
+    }
+
+    /**
+     * Joins the content of the given builders with DELIM.
+     */
+    public static StringBuilder getCSV(StringBuilder ... builders)
+    {
+        final String[] asText = asStrings(builders);
+        return getCSV(asText);
+    }
+
+    public static StringBuilder getCSV(String... strings)
+    {
+        StringBuilder sb = new StringBuilder();
+
+        for(int i=0; i < strings.length; i++)
+        {
+            String str = strings[i]==null? "":strings[i];
+
+            sb.append(str);
+
+            if(i!=strings.length-1)
+            {
+                sb.append(DELIM);
+            }
+        }
+        return sb;
+    }
+
+    //Part of Speech
+    /**
+     * Maps a part-of-speech tag to a value that is safe to use as a CSV field.
+     * A null or empty tag yields "0"; a tag containing a comma, period, colon, apostrophe,
+     * parenthesis or square bracket yields a symbolic name (checked in this order);
+     * every other tag is returned unchanged.
+     */
+    public static String getPOS(String pos)
+    {
+        if(pos==null || pos.length()==0)
+        {
+            return "0";
+        }
+
+        // first matching entry wins, so the order of the rows is significant
+        final String[][] replacements = {
+                {",", "pos_comma"},
+                {".", "pos_period"},
+                {":", "pos_colon"},
+                {"'", "pos_tic"},
+                {"(", "pos_paren"},
+                {")", "pos_paren"},
+                {"[", "pos_paren"},
+                {"]", "pos_paren"}
+        };
+
+        for(String[] rule : replacements)
+        {
+            if(pos.contains(rule[0]))
+            {
+                return rule[1];
+            }
+        }
+
+        return pos;
+    }
+
+    /**
+     * Replaces characters that would break a CSV row with parenthesized placeholders:
+     * DELIM becomes "(csv)", an apostrophe "(tic)", a backslash "(slash)" and a line
+     * separator "(crlf)". Each search string is interpreted as a regular expression by
+     * String.replaceAll, and the replacements run in the order written below.
+     *
+     * NOTE(review): _escape_crlf requires literal backslash characters, but all backslashes
+     * have already been turned into "(slash)" by the preceding step, so it looks like that
+     * replacement can never match -- confirm the intended order.
+     * NOTE(review): CRLF is the platform line separator; where that is "\r\n", a lone "\n"
+     * in the text is presumably left in place -- confirm whether that is acceptable.
+     *
+     * @param text text to escape; must not be null
+     * @return the escaped text
+     */
+    public static String getCoveredText(String text)
+    {
+        return text.replaceAll(DELIM,       enclose("csv")).
+                replaceAll(_escape_tic,     enclose("tic")).
+                replaceAll(_escape_slash,   enclose("slash")).
+                replaceAll(_escape_crlf,    enclose("crlf")).
+                replaceAll(CRLF,            enclose("crlf"));
+    }
+
+    // Wraps the given text in parentheses, e.g. "csv" -> "(csv)".
+    private static String enclose(String text) {return "("+text+")";}
+
+    /**
+     * Builds the CSV fields for a base token: the common annotation fields (see
+     * getCSV(String, Annotation)), followed by the token number and the CSV-safe
+     * part of speech (see getPOS).
+     *
+     * @return a builder holding the delimited fields, without a line terminator
+     */
+    public static StringBuilder getCSVBaseToken(String filename_short, BaseToken baseToken) throws IOException
+    {
+        final StringBuilder row = getCSV(filename_short, baseToken);
+
+        row.append(DELIM);
+        row.append(getCSV(asStrings(baseToken.getTokenNumber())));
+
+        row.append(DELIM);
+        row.append(getCSV(getPOS(baseToken.getPartOfSpeech())));
+
+        return row;
+    }
+
+    /**
+     * Builds the fields common to every annotation row: short type name, file name,
+     * begin offset and end offset, joined with DELIM.
+     *
+     * @return a builder holding the delimited fields, without a line terminator
+     */
+    public static StringBuilder getCSV(String filename_short, Annotation annot) throws IOException
+    {
+        final StringBuilder row = getCSV(annot.getType().getShortName(), filename_short);
+
+        row.append(DELIM);
+        row.append(getCSV(asStrings(annot.getBegin(), annot.getEnd())));
+
+        return row;
+    }
+
+    // Directory that receives one CSV file per enum constant; relative to the working directory.
+    // NOTE(review): these fields are public and non-final, so any code can reassign them
+    // at runtime -- confirm whether that is relied upon before making them final.
+    public static File   OUTPUT_DIR = new File("csv");
+
+    // Field delimiter and line terminator (the platform line separator) of the CSV output.
+    public static String DELIM      = ",";
+    public static String CRLF       = System.getProperty("line.separator");
+
+    // Regular expressions used by getCoveredText:
+    //   _escape_tic   matches an apostrophe,
+    //   _escape_slash matches a single backslash character,
+    //   _escape_crlf  matches the two characters backslash + 'n', optionally preceded by
+    //                 the two characters backslash + 'r' (not a real line break).
+    public static String _escape_tic   = "'";
+    public static String _escape_slash = "\\\\";
+    public static String _escape_crlf  = "(\\\\r)?\\\\n";
+
+    // DEBUG is evaluated once, when this class is initialized.
+    public static Logger log    =  Logger.getLogger(CSVAnnotation.class);
+    public static boolean DEBUG =  log.isDebugEnabled();
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,387 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.spin.scrubber.uima.consumer;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.collection.base_cpm.CasObjectProcessor;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.util.ProcessTrace;
+import org.spin.scrubber.uima.type.Calculation;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+import static org.spin.scrubber.uima.consumer.CSVAnnotation.*;
+import static org.spin.scrubber.uima.consumer.CSVAnnotation.getCSV;
+
+public class CSVAnnotationConsumer extends CasConsumer_ImplBase implements CasObjectProcessor
+{
+    public static final Logger log    =  Logger.getLogger(CSVAnnotationConsumer.class);
+    public static final boolean DEBUG =  log.isDebugEnabled();
+
+    public static final String OUTPUT_DIR = "outputFile";
+    public static final String DELIM      = ",";
+    public static final String CRLF       = System.getProperty("line.separator");
+
+    private File outDir;
+
+    public CSVAnnotationConsumer(){}
+
+    /**
+     * Initializes this CAS Consumer with the parameters specified in the descriptor.
+     *
+     * @throws org.apache.uima.resource.ResourceInitializationException
+     *          if there is error in initializing the resources
+     */
+    public void initialize() throws ResourceInitializationException  //TODO: refactor
+    {
+        log.debug("initialize()...");
+
+        String oPath = (String) getUimaContext().getConfigParameterValue(OUTPUT_DIR);
+
+        // Output file should be specified in the descriptor
+        if (oPath == null) {
+            throw new ResourceInitializationException(
+                    ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{OUTPUT_DIR});
+        }
+
+        // If specified output directory does not exist, try to create it
+        outDir = new File(oPath.trim());
+        if (outDir.getParentFile() != null && !outDir.getParentFile().exists()) {
+            if (!outDir.getParentFile().mkdirs())
+                throw new ResourceInitializationException(
+                        ResourceInitializationException.RESOURCE_DATA_NOT_VALID, new Object[]{oPath,OUTPUT_DIR}
+                );
+        }
+
+        log.debug("initialize() is done.");
+    }
+
+
+    /**
+     * Processes the CasContainer which was populated by the TextAnalysisEngines. <br>
+     * In this case, the CAS index is iterated over selected annotations and printed out into an
+     * output file
+     *
+     * @param aCAS CasContainer which has been populated by the TAEs
+     * @throws org.apache.uima.resource.ResourceProcessException
+     *          if there is an error in processing the Resource
+     * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
+     */
+    public synchronized void processCas(CAS aCAS) throws ResourceProcessException
+    {
+        JCas jcas;
+
+        try {
+            jcas = aCAS.getJCas();
+        } catch (CASException e) {
+            throw new ResourceProcessException(e);
+        }
+
+        String inFileURI  = null;
+        String inFilename = null;
+
+        Iterator iter = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
+
+        if (iter.hasNext())
+        {
+            SourceDocumentInformation srcDocInfo = (SourceDocumentInformation) iter.next();
+            inFileURI = srcDocInfo.getUri();
+
+            String split[] = inFileURI.split("/");
+            inFilename = split[split.length - 1];
+        }
+
+        if(DEBUG)
+            System.out.println(CSVAnnotation.getLine("processCas", inFilename, inFileURI));
+
+        try
+        {
+            processCasAnnotation(jcas, inFilename);
+            {
+                processCasOntologyMatch(jcas, inFilename);
+            }
+
+            processCasBaseToken(jcas, inFilename);
+            {
+                processCasSentence(jcas, inFilename);
+                processCasWordToken(jcas, inFilename);
+                processCasNewlineToken(jcas, inFilename);
+                processCasCalculation(jcas, inFilename);
+            }
+
+            CSVAnnotation.flushAll();
+        }
+        catch(IOException e)
+        {
+            throw new ResourceProcessException(e);
+        }
+    }
+
+    private void processCasAnnotation(JCas jcas, String filename_short) throws IOException
+    {
+        processCasAnnotation(jcas, filename_short, _Annotation.type);
+    }
+
+    private void processCasAnnotation(JCas jcas, String filename_short, int annotationIndex) throws IOException
+    {
+        Iterator annotationIter = jcas.getAnnotationIndex(annotationIndex).iterator();
+
+        while(annotationIter.hasNext())
+        {
+            Annotation annot = (Annotation) annotationIter.next();
+
+            if(isEmpty(annot.getCoveredText()))
+                continue;
+
+            if(DEBUG)  System.out.println(annot.toString());
+
+            _Annotation.writeLine(getCSV(
+                    getCSV(filename_short, annot)));
+        }
+    }
+
+    private void processCasBaseToken(JCas jcas, String filename_short) throws IOException
+    {
+        processCasBaseToken(jcas, filename_short, _BaseToken.type);
+    }
+
+    private void processCasBaseToken(JCas jcas, String filename_short, int annotationIndex) throws IOException
+    {
+        Iterator annotationIter = jcas.getAnnotationIndex(annotationIndex).iterator();
+
+        while(annotationIter.hasNext())
+        {
+            BaseToken annot = (BaseToken) annotationIter.next();
+
+            if(isEmpty(annot.getCoveredText()))
+                continue;
+
+            if(DEBUG)  System.out.println(annot.toString());
+
+            _BaseToken.writeLine(getCSV(
+                    getCSVBaseToken(filename_short, annot),
+                    getCSV(getCoveredText(annot))));
+        }
+    }
+
+    private void processCasSentence(JCas jcas, String filename_short) throws IOException
+    {
+        Iterator iter = jcas.getAnnotationIndex(_Sentence.type).iterator();
+
+        while (iter.hasNext())
+        {
+            Sentence annot = (Sentence) iter.next();
+
+            _Sentence.writeLine(
+                    getCSV( getCSV(filename_short, annot),
+                            getCSV(annot.getSentenceNumber())));
+
+            _SentenceText.writeLine(annot.getCoveredText());
+        }
+    }
+
+    private void processCasWordToken(JCas jcas, String filename_short) throws IOException
+    {
+        Iterator annotationIter = jcas.getAnnotationIndex(_WordToken.type).iterator();
+
+        while(annotationIter.hasNext())
+        {
+            WordToken annot = (WordToken) annotationIter.next();
+
+            if(isEmpty(annot.getCoveredText()))
+                continue;
+
+            _WordToken.writeLine(
+                    getCSV( getCSVBaseToken(filename_short, annot),
+                            getCSV(annot.getCapitalization()),
+                            getCSV(getCoveredText(annot))));
+        }
+    }
+
+    private void processCasNewlineToken(JCas jcas, String filename_short) throws IOException
+    {
+        Iterator iter = jcas.getAnnotationIndex(_NewLineToken.type).iterator();
+
+        while (iter.hasNext())
+        {
+            NewlineToken annot = (NewlineToken) iter.next();
+
+            _NewLineToken.writeLine(
+                    getCSVBaseToken(filename_short, annot)
+            );
+        }
+    }
+
+    private void processCasOntologyMatch(JCas jcas, String filename_short) throws IOException
+    {
+        Iterator iter = jcas.getAnnotationIndex(_OntologyMatch.type).iterator();
+
+        while (iter.hasNext())
+        {
+            OntologyMatch annot = (OntologyMatch) iter.next();
+
+            _OntologyMatch.writeLine(getCSV(
+                            getCSV(filename_short, annot),
+                            getCSV(annot.getOntology(), annot.getCode(), getCoveredText(annot))));
+        }
+    }
+
+    private void processCasCalculation(JCas jcas, String filename_short) throws IOException
+    {
+        Iterator iter = jcas.getAnnotationIndex(_Calculation.type).iterator();
+
+        while (iter.hasNext())
+        {
+            Calculation annot = (Calculation) iter.next();
+
+            _Calculation.writeLine(getCSV(
+                    getCSV(filename_short, annot),
+                    getCSV(annot.getCalculationName(), annot.getCalculationValue())));
+        }
+    }
+
+    private String getCoveredText(Annotation annot)
+    {
+        return CSVAnnotation.getCoveredText(annot.getCoveredText());
+    }
+
+    private boolean isEmpty(String coveredText)
+    {
+        coveredText = coveredText.replace('\n', ' ');
+        coveredText = coveredText.replace('\r', ' ');
+
+        return coveredText.trim().length()< 1;
+    }
+
+
+    /**
+     * Called when a batch of processing is completed.
+     *
+     * @param aTrace ProcessTrace object that will log events in this method.
+     * @throws org.apache.uima.resource.ResourceProcessException
+     *                             if there is an error in processing the Resource
+     * @throws java.io.IOException if there is an IO Error
+     * @see org.apache.uima.collection.CasConsumer#batchProcessComplete(org.apache.uima.util.ProcessTrace)
+     */
+    public void batchProcessComplete(ProcessTrace aTrace) throws ResourceProcessException,
+            IOException {
+        // Intentionally empty: processCas() calls CSVAnnotation.flushAll() after each CAS,
+        // and this consumer adds no per-batch step.
+    }
+
+    /**
+     * Called when the entire collection is completed.
+     *
+     * @param aTrace ProcessTrace object that will log events in this method.
+     * @throws org.apache.uima.resource.ResourceProcessException
+     *                             if there is an error in processing the Resource
+     * @throws java.io.IOException if there is an IO Error
+     * @see org.apache.uima.collection.CasConsumer#collectionProcessComplete(org.apache.uima.util.ProcessTrace)
+     */
+    public void collectionProcessComplete(ProcessTrace aTrace) throws ResourceProcessException, IOException
+    {
+        CSVAnnotation.closeAll();
+    }
+
+
+    /**
+     * Reconfigures the parameters of this Consumer. <br>
+     * This is used in conjunction with the setConfigurationParameterValue to set the configuration
+     * parameter values to values other than the ones specified in the descriptor.
+     *
+     * @throws org.apache.uima.resource.ResourceConfigurationException
+     *          if the output parameter is absent, its parent directory cannot be created, or closing the writers fails
+     * @see org.apache.uima.resource.ConfigurableResource#reconfigure()
+     */
+    public void reconfigure() throws ResourceConfigurationException {
+        super.reconfigure();
+        // extract configuration parameter settings
+        String oPath = (String) getUimaContext().getConfigParameterValue(OUTPUT_DIR);
+        if (oPath == null) {  // same check as initialize(); avoids a NullPointerException on trim() below
+            throw new ResourceConfigurationException(
+                    ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[]{OUTPUT_DIR});
+        }
+        File oFile = new File(oPath.trim());
+        // if the output location has changed, close the existing writers before switching
+        if (!oFile.equals(this.outDir)) {
+            this.outDir = oFile;
+            try {
+                CSVAnnotation.closeAll();
+            } catch (IOException e) {
+                throw new ResourceConfigurationException(e);  // keep the I/O failure as the cause
+            }
+            File parent = oFile.getParentFile();  // create the parent directory when it is missing
+            if (parent != null && !parent.exists() && !parent.mkdirs()) {
+                throw new ResourceConfigurationException(
+                        ResourceInitializationException.RESOURCE_DATA_NOT_VALID, new Object[]{oPath, OUTPUT_DIR});
+            }
+        }
+    }
+
+    /**
+     * Called if clean up is needed in case of exit under error conditions.
+     * Closes the CSV writers; a failure is logged together with its cause and is not rethrown.
+     * @see org.apache.uima.resource.Resource#destroy()
+     */
+    public void destroy()
+    {
+        try
+        {
+            CSVAnnotation.closeAll();
+        }
+        catch(Exception e)
+        {
+            log.error("Could not close writers during destroy()", e);
+        }
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/CSVAnnotationConsumer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,474 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.consumer;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceProcessException;
+import org.apache.uima.util.ProcessTrace;
+import org.spin.scrubber.uima.dao.BaseDAO;
+import org.spin.scrubber.uima.type.Calculation;
+import org.spin.scrubber.uima.type.KnownPHI;
+import org.spin.scrubber.uima.type.OntologyMatch;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.util.Iterator;
+
+/**
+ * @author bf19
+ *
+ */
+public class JDBCCasConsumer extends CasConsumer_ImplBase
+{
+    private static Logger log    =  Logger.getLogger(JDBCCasConsumer.class);
+
+	private static Connection conn;
+	private static String tableName;
+	
+	/**
+	   * Initializes CAS Consumer with DB params from xml descriptor.
+	   * 
+	   * @throws ResourceInitializationException
+	   *           if the table name is missing or empty, or the database connection cannot be obtained
+	   */
+	  public void initialize() throws ResourceInitializationException 
+	  {	    
+		  // extract configuration parameter settings
+		  tableName = (String) getUimaContext().getConfigParameterValue("tableName");
+
+		  //if any params are null or empty throw exception
+		  if (tableName==null || tableName.equals(""))
+		  {
+			  throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[] { "requires db table name." });
+		  }
+		  
+		  try 
+		  {
+              //TODO: remove?
+//			  	//get db connection info
+//				Properties props= new Properties();
+//				ClassLoader loader = this.getClass().getClassLoader();
+//		        InputStream inputStream = loader.getResourceAsStream(PROPERTIES_FILE_NAME);
+//		        props.load(inputStream);
+//
+//				String driver = props.getProperty("JDBC_DRIVER");
+//				String connString = props.getProperty("JDBC_CONNECTION_STRING");
+//				String user = props.getProperty("DB_USER");
+//				String pw = props.getProperty("DB_PW");
+//
+//				Class.forName(driver);
+//				conn = DriverManager.getConnection(connString, user, pw);
+
+              conn = BaseDAO.getConnectionToScrubber();
+		  }
+		  catch (Exception e)
+		  {
+			// log with the stack trace and chain the failure as the cause so the root error is not lost
+			log.error("unable to initialize JDBCCasConsumer: " + e.getMessage(), e);
+			throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, new Object[] { "requires driver, connectionString, user, and pw to connect to db." }, e);
+		  }		  
+	  }
+	  
+	  /**
+	   * much of this method was cannibalized from Apache's AnnotationPrinter.java
+	   */
+	  public synchronized void processCas(CAS aCAS) throws ResourceProcessException 
+	  {		
+		JCas jcas;
+	    try 
+	    {
+	      jcas = aCAS.getJCas();
+	    } 
+	    catch (CASException e) 
+	    {
+	      throw new ResourceProcessException(e);
+	    }
+		
+	    String filenameLong = null;
+	    String filenameShort = null;
+	    Iterator it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
+	    if (it.hasNext()) 
+	    {
+	      SourceDocumentInformation srcDocInfo = (SourceDocumentInformation) it.next();
+	      filenameLong = srcDocInfo.getUri();	      
+	      filenameShort = filenameLong.substring(filenameLong.lastIndexOf("/")+1);
+	    }
+
+	    //iterate and records annotations
+	    processKnownPHI(filenameLong, filenameShort, jcas);
+	    processWordToken(filenameLong, filenameShort, jcas);
+	    processOntologyMatch(filenameLong, filenameShort, jcas);
+	    processNumToken(filenameLong, filenameShort, jcas);
+	    processCalculation(filenameLong, filenameShort, jcas);
+	    
+	  }
+
+	  private void processNumToken(String filenameLong, String filenameShort, JCas jcas) throws ResourceProcessException
+	  {
+		  //we only want to record the 'NumToken' annotations...
+		  Iterator annotationIter = jcas.getAnnotationIndex(NumToken.type).iterator();
+		    while (annotationIter.hasNext()) 
+		    {
+		      Annotation annot = (Annotation) annotationIter.next();
+		     
+		      // get the text that is enclosed within the annotation in the CAS
+		      String token = annot.getCoveredText();
+		      token = token.replace('\n', ' ');
+		      token = token.replace('\r', ' ');
+
+		      //if aText is empty, continue;
+		      if (token.trim().length()<1)
+		      {
+		    	  continue;
+		      }
+		      //if token length > 100, trim it down
+		      if (token.length()>100)
+		      {
+		    	  log.info("INFO: trimming token: " + token + "(" + token.length() + ")");
+		    	  token = token.substring(0, 100);
+		      }
+		     	      
+		      //get Features... //TODO: there must be a better way to do this...
+		      Feature posFeat=null;
+		      String posTag=null;
+		      try
+		      {
+		    	  String featName = (String) getUimaContext().getConfigParameterValue("pos");
+		    	  posFeat = annot.getCAS().getTypeSystem().getFeatureByFullName(featName);
+		    	  if (posFeat!=null)
+		    		  posTag = annot.getFeatureValueAsString(posFeat);
+		      }
+		      catch(Exception e)
+		      {
+		    	  log.warn("unable to get pos feature value. may be null. continuing.", e);
+		      }
+		      
+		      try
+		      {
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), posTag, "pos");
+		      }
+		      catch(SQLException e)
+		      {
+		    	  throw new ResourceProcessException(e);
+		      }
+		    }
+	  }
+	  /**
+	   * Stores each OntologyMatch annotation of the CAS (covered text, span, code, ontology) through insert().
+	   * @throws ResourceProcessException if a database insert fails
+	   */
+	  private void processOntologyMatch(String filenameLong, String filenameShort, JCas jcas) throws ResourceProcessException
+	  {
+		  //we only want to record the 'OntologyMatch' annotations...
+		  Iterator annotationIter = jcas.getAnnotationIndex(OntologyMatch.type).iterator();
+		    while (annotationIter.hasNext()) 
+		    {
+		      OntologyMatch annot = (OntologyMatch) annotationIter.next();
+		      // get the text that is enclosed within the annotation in the CAS
+		      String token = "";
+		      try
+		      {
+		    	  token = annot.getCoveredText();
+		      }
+		      catch(StringIndexOutOfBoundsException e)
+		      {
+		    	  // offsets do not fit the document text: report through the logger (with stack trace) and skip this annotation
+		    	  log.error("ERROR: index error on file: " + filenameShort + " position: " + annot.getBegin() +"-"+ annot.getEnd(), e);
+		    	  continue;
+		      }
+		      token = token.replace('\n', ' ');
+		      token = token.replace('\r', ' ');
+		      //if aText is empty, continue;
+		      if (token.trim().length()<1)
+		      {
+		    	  continue;
+		      }
+		      //if token length > 100, trim it down
+		      if (token.length()>100)
+		      {
+		    	  log.info("INFO: trimming token: " + token + "(" + token.length() + ")");
+		    	  token = token.substring(0, 100);
+		      }
+		      String code = annot.getCode();
+		      String ontology = annot.getOntology();
+		      
+		    //trim code if its too long (an unset code stays null and is passed on unchanged)
+		      if (code!=null && code.length()>50)
+		      {
+		    	  code = code.substring(0,50);
+		      }
+		      
+		    //trim ontology if its too long (an unset ontology stays null and is passed on unchanged)
+		      if (ontology!=null && ontology.length()>50)
+		      {
+		    	  ontology = ontology.substring(0,50);
+		      }
+		      		      
+		      try
+		      {
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), code, ontology);
+		      }
+		      catch(SQLException e)
+		      {
+		    	  throw new ResourceProcessException(e);
+		      }
+		    }
+	  }
+
+	  private void processKnownPHI(String filenameLong, String filenameShort, JCas jcas) throws ResourceProcessException
+	  {
+		  //we only want to record the 'KnownPHI' annotations...
+		  Iterator annotationIter = jcas.getAnnotationIndex(KnownPHI.type).iterator();		
+		  while (annotationIter.hasNext()) 
+		  {
+		      KnownPHI annot = (KnownPHI) annotationIter.next();
+		     
+		      // get the text that is enclosed within the annotation in the CAS
+		      // NOTE: for knownPHI type the covered text is in getContent(). 
+		      // this is because knownPHI annots are made by the reader, not an annotator, this should be fixed in the future.
+		      String token = annot.getContent(); 
+		      token = token.replace('\n', ' ');
+		      token = token.replace('\r', ' ');
+
+		      //if aText is empty, continue;
+		      if (token.trim().length()<1)
+		      {
+		    	  continue;
+		      }
+		      //if token length > 100, trim it down
+		      if (token.length()>100)
+		      {
+		    	  System.out.println("INFO: trimming token: " + token + "(" + token.length() + ")");
+		    	  token = token.substring(0, 100);
+		      }
+		     
+		      String code = annot.getCode();
+		      String ontology = annot.getOntology();
+		      
+		    //trim code if its too long
+		      if (code.length()>50)
+		      {
+		    	  code = code.substring(0,50);
+		      }
+		      
+		    //trim ontology if its too long
+		      if (ontology.length()>50)
+		      {
+		    	  ontology = ontology.substring(0,50);
+		      }
+		      		      
+		      try
+		      {
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), code, ontology);
+		      }
+		      catch(SQLException e)
+		      {
+		    	  throw new ResourceProcessException(e);
+		      }
+		  }
+	  }
+
+	  private void processCalculation(String filenameLong, String filenameShort, JCas jcas) throws ResourceProcessException
+	  {
+		  //we only want to record the 'Calculation' annotations...
+		  Iterator annotationIter = jcas.getAnnotationIndex(Calculation.type).iterator();		
+		    while (annotationIter.hasNext()) 
+		    {
+		    	Calculation annot = (Calculation) annotationIter.next();
+		     
+		      // get the text that is enclosed within the annotation in the CAS
+		      String token = annot.getCoveredText();
+		      token = token.replace('\n', ' ');
+		      token = token.replace('\r', ' ');
+
+		      //if aText is empty, continue;
+		      if (token.trim().length()<1)
+		      {
+		    	  continue;
+		      }
+		      //if token length > 100, trim it down
+		      if (token.length()>100)
+		      {
+		    	  log.info("trimming token: " + token + "(" + token.length() + ")");
+		    	  token = token.substring(0, 100);
+		      }
+		     
+		      String calcName = annot.getCalculationName();
+		      String calcValue = annot.getCalculationValue();
+		      
+		      //trim calcName if its too long
+		      if (calcName.length()>50)
+		      {
+		    	  calcName = calcName.substring(0,50);
+		      }
+		      		      
+		      try
+		      {
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), calcValue, calcName);
+		      }
+		      catch(SQLException e)
+		      {
+		    	  throw new ResourceProcessException(e);
+		      }
+		    }
+	  }
+	  
+	  private void processWordToken(String filenameLong, String filenameShort, JCas jcas) throws ResourceProcessException
+	  {
+		  //we only want to record the 'WordToken' annotations...
+		  Iterator annotationIter = jcas.getAnnotationIndex(WordToken.type).iterator();
+		    while (annotationIter.hasNext()) 
+		    {
+		      Annotation annot = (Annotation) annotationIter.next();
+		     
+		      // get the text that is enclosed within the annotation in the CAS
+		      String token = annot.getCoveredText();
+		      token = token.replace('\n', ' ');
+		      token = token.replace('\r', ' ');
+
+		      //if aText is empty, continue;
+		      if (token.trim().length()<1)
+		      {
+		    	  continue;
+		      }
+		      //if token length > 100, trim it down
+		      if (token.length()>100)
+		      {
+		    	  log.info("INFO: trimming token: " + token + "(" + token.length() + ")");
+		    	  token = token.substring(0, 100);
+		      }
+		     	      
+		      //get Features... //TODO: there must be a better way to do this...
+		      Feature posFeat=null;
+		      String posTag=null;
+		      try
+		      {
+		    	  String featName = (String) getUimaContext().getConfigParameterValue("pos");
+		    	  posFeat = annot.getCAS().getTypeSystem().getFeatureByFullName(featName);
+		    	  if (posFeat!=null)
+		    		  posTag = annot.getFeatureValueAsString(posFeat);		    	  
+		      }
+		      catch(Exception e)
+		      {
+		    	  log.warn("unable to get POS feature declared in consumer.xml. may be null. continuing.");
+		    	  e.printStackTrace();
+		      }
+		      
+		    //get Features... //TODO: there must be a better way to do this...
+		      Feature capFeat=null;
+		      String capTag=null;
+		      try
+		      {
+		    	  String capName = (String) getUimaContext().getConfigParameterValue("capitalization");
+		    	  capFeat = annot.getCAS().getTypeSystem().getFeatureByFullName(capName);
+		    	  if (capFeat!=null)
+		    		  capTag = annot.getFeatureValueAsString(capFeat);
+		      }
+		      catch(Exception e)
+		      {
+		    	  log.warn("unable to get CAP feature declared in consumer.xml. may be null. continuing.");
+		    	  e.printStackTrace();
+		      }
+		      
+		      try
+		      {
+		    	  //insert pos & capitalization features.
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), posTag, "pos");
+		    	  insert(filenameLong, filenameShort, annot.getType().getShortName(), annot.getType().getName(), token, annot.getBegin(), annot.getEnd(), capTag, "cap");
+		      }
+		      catch(SQLException e)
+		      {
+		    	  throw new ResourceProcessException(e);
+		      }
+		    }
+	  }
+	  
+	  /**
+	   * Inserts one annotation row into the table named by {@code tableName}, binding all nine columns as parameters.
+	   * @throws SQLException if the statement fails or does not insert exactly one row
+	   */
+	  private void insert(String filenameLong, String filenameShort, String annot_type_short, String annot_type_long, String token, int start_idx, int end_idx, String match_value, String match_source) throws SQLException
+		{
+			java.sql.PreparedStatement ps = null;  // JDBC interface: casting to the MySQL driver class fails for any other or wrapped connection
+			String sql = "insert into "+tableName+" (filename_long, filename_short, annot_type_short, annot_type_long, token, start_idx, end_idx, match_value, match_source)  values (?,?,?,?,?,?,?,?,?);";
+			try 
+			{
+				ps = conn.prepareStatement(sql);
+				int i=1;
+				ps.setString(i++, filenameLong);
+				ps.setString(i++, filenameShort);
+				ps.setString(i++, annot_type_short);
+				ps.setString(i++, annot_type_long);
+				ps.setString(i++, token);
+				ps.setInt(i++, start_idx);
+				ps.setInt(i++, end_idx);
+				ps.setString(i++, match_value);
+				ps.setString(i++, match_source);
+				int updated = ps.executeUpdate();
+			    if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); } 
+			}
+			catch (SQLException e)
+			{
+				log.error("processing CAS in JDBCCasConsumer: " + e.getMessage());
+				throw e;
+			}
+			finally
+			{
+				if (ps!=null)
+				{
+					ps.close();
+				}
+			}
+		}	 
+	
+	/**
+	 * close db connection; a failure to close is logged (with stack trace) and not rethrown.
+	 */
+	public void collectionProcessComplete(ProcessTrace arg0) throws ResourceProcessException,IOException 
+    {
+		try
+		{
+			if (conn!=null)
+			{
+				conn.close();
+			}
+		}
+		catch (Exception e)
+		{
+			// best effort at end of run: hand the exception to the logger instead of printing it to stderr
+			log.error("closing DB connection for JDBCCasConsumer - SWALLOWING: " + e.getMessage(), e);
+		}
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/consumer/JDBCCasConsumer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,232 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.core;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+
+import java.io.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author bf19
+ */
+public class ReferenceTextStripper implements Runnable
+{
+    private static Logger log    =  Logger.getLogger(ReferenceTextStripper.class);
+
+	public static final String REGEX_ET_AL="(\\b[A-Z][A-Za-z-]*(\\.)?(\\s+))+([E|e][T|t]\\s+[A|a][L|l])";
+
+	private String dirInputPublicationsTXT;
+	private String dirInputPublicationsProcessed;
+
+	public ReferenceTextStripper()
+    {
+        this(   ScrubberProperties.getDirInputPublicationsTXT(),
+                ScrubberProperties.getDirInputPublicationsProcessed()
+        );
+    }
+
+	public ReferenceTextStripper(String dirInputPublicationsTXT, String dirInputPublicationsProcessed)
+	{
+		this.dirInputPublicationsTXT       = dirInputPublicationsTXT;
+		this.dirInputPublicationsProcessed = dirInputPublicationsProcessed;
+	}
+
+    	/**
+	 * @param args
+	 * @throws Exception
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		ReferenceTextStripper stripper = new ReferenceTextStripper();
+		stripper.run();
+	}
+	
+	public void run()
+	{		
+		try 
+		{
+			File inDir = new File(dirInputPublicationsTXT);
+			
+			if (!inDir.exists())
+			{
+				inDir.createNewFile();
+			}
+			
+			File[] files = inDir.listFiles();
+			int i=1;
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					log.debug("SKIPPING File - isDirectory: " + i++ + " - " + f.getName() + "\t");
+					continue;
+				}
+								
+				String fname = f.getName();
+				
+				//make new outfile 
+				File newFile = new File(dirInputPublicationsProcessed + File.separatorChar + fname);
+				if (newFile.exists())
+				{
+					log.debug("SKIPPING File - already exists: " + i + " - " + newFile.getName() + "\t") ;
+					continue;
+				}
+								
+				log.debug("File: " + i++ + " - " + fname + "\t") ;
+								
+				//read infile
+				String strLine;
+				BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(f)));
+				StringBuffer buff = new StringBuffer();
+				String content;
+				
+				while ((strLine = br.readLine()) != null)
+				{
+					buff.append(strLine);
+					buff.append("\n");
+				}
+							
+				content = buff.toString();
+								
+			    FileWriter writer = new FileWriter(newFile);
+			    
+			    //strip references - substr from start to token 'References'
+//			    content = stripReferencesSection(content);
+			    
+			    //strip "<Token(s)> et al." - gets a series of init case tokens followed by "et al"
+			    //TODO: maybe also strip (*<Token(s)> et al.*) to get cases like: 
+			    	//"(Grindlay, 1997; Ono et al., 2001; Hirose, 2005; Terashima et al., 2005; Niinemets, 2007)"
+			    	//"(Walters et al., 1999; Weaver and Amasino, 2001)"
+			    content = stripInLineReferences(content);
+			    
+			    //TODO: strip pub authors & institutions - there has got to be a better way than this...
+//			    content = stripPubAuthors(content);
+			    		    
+			    //write new contents.
+			    writer.write(content);
+			    writer.flush();
+			    
+			    //wrap up
+			    writer.close();
+			    br.close();
+			    buff=null;
+			}		
+		}
+		catch(Exception e)
+		{
+			log.error("Unknown error during reference text stripping", e);
+		}
+	}
+
+	private String stripInLineReferences(String content)
+	{
+		try
+		{
+			Pattern p = Pattern.compile(REGEX_ET_AL);
+			Matcher m = p.matcher(content);
+			
+			while (m.find()) 
+			{
+	        	String matched = m.group();
+	        	
+	        	System.out.println("Stripping inline reference: "+matched);
+	        	
+	        	// remove any reg ex special chars
+	        	String[] specialChars = { "\\", "[", "{", "}", "[", "]", "$", "^", "|", "(", ")", "*", "?", "+", "]", ".", "/"};
+	        	
+	        	String escaped = matched;
+	        	
+	        	for (String specialChar : specialChars) {
+	        		escaped = escaped.replaceAll("([^\\" + specialChar + "]*)(\\" + specialChar + ")([^\\" + specialChar + "]*)", "$1\\\\$2$3");
+	        	}
+	        	
+	        	escaped = "(\\W|\\b)" + escaped + "(\\W|\\b)";
+	        	
+	        	content = content.replaceAll(escaped, " AUTHOR et al.");        	
+	        }
+			
+			return content;
+		}
+		catch(Exception e)
+		{
+			e.printStackTrace();
+		}
+		
+		//if method fails to match regex, just return empty string.
+		return "";
+		
+//		try
+//		{
+//			return content.replaceAll(REGEX_ET_AL, " AUTHOR et al.");
+//		}
+//		catch(Exception e)
+//		{
+//			e.printStackTrace();
+//		}
+//		
+//		//if method fails to match regex, just return empty string.
+//		return "";
+	}
+
+    	/**
+	 * over cautious stripping. if for some reason the token 'references' only existed as a word in the body
+	 * and not a section heading, this would make the pub basically useless as it would strip most of the content.
+	 * @param content
+	 * @return
+	 */
+    @Deprecated
+	private String stripReferencesSection(String content)
+	{
+		if (content!=null && content.toLowerCase().contains("references"))
+		{
+			//System.out.println("Found REFERENCES section. Stripping it...");
+			return content.substring(0, content.toLowerCase().lastIndexOf("references"));
+		}
+		else
+		{
+			return content;
+		}
+	}
+
+	/**
+	 * over cautious stripping. if pub did not have the standard IMRD sections, this may erroneously cut more than needed.
+	 * @param content
+	 * @return
+	 */
+    @Deprecated
+	private String stripPubAuthors(String content)
+	{
+		if (content!=null && content.toLowerCase().contains("introduction"))
+		{
+			//System.out.println("Found REFERENCES section. Stripping it...");
+			return content.substring(content.toLowerCase().indexOf("introduction"));
+		}
+		else
+		{
+			return content;
+		}
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/ReferenceTextStripper.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.core;
+
+import org.apache.log4j.Logger;
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.collection.CollectionProcessingEngine;
+import org.apache.uima.collection.metadata.CpeDescription;
+import org.apache.uima.util.XMLInputSource;
+
+import java.io.File;
+
+/**
+ *  @author britt fitch
+ *  @author Andrew McMurry
+ *
+ */
+public class UIMARunner implements Runnable
+{
+    private static Logger log =  Logger.getLogger(UIMARunner.class);
+    private String cpe;
+    
+    public UIMARunner(String cpe)
+    {
+        log.debug("CPE file: "+ cpe);
+
+    	this.cpe = cpe;
+    }
+
+	/**
+	 * @param args
+	 * @throws Exception 
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		log.info("Running UIMA CPE Pipeline (Collection processing engine)...");
+
+        if(args.length < 1)
+        {
+            System.out.println("You must provide a CPE file. See UIMA documentation for help.");
+        }
+        else
+        {
+            UIMARunner runner = new UIMARunner(args[0]);
+            runner.run();
+        }
+	}
+	
+	public void run()
+	{
+		try
+		{
+			File cpeFile = new File(cpe);
+	
+	        if(!cpeFile.exists())
+	        {
+	            System.out.println("CPE file does not exist: "+ cpeFile.getAbsolutePath());
+	        }
+	        else
+	        {
+	            log.debug("Parsing CPE descriptor from "+ cpeFile.getAbsolutePath());
+	            CpeDescription cpeDesc = UIMAFramework.getXMLParser().parseCpeDescription(new XMLInputSource(cpeFile));
+	
+	            log.debug("Instantiating CPE...");
+	            CollectionProcessingEngine mCpe = UIMAFramework.produceCollectionProcessingEngine(cpeDesc);
+	
+	            log.debug("Starting process...");
+	            mCpe.process();
+	            
+	            log.debug("UIMA step complete...");
+	        }
+		}
+		catch(Exception e)
+		{
+			log.error("Encountered error while executing UIMARunner.", e);
+		}
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/core/UIMARunner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain