You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:23:07 UTC
svn commit: r1500511 [2/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./ main/ main/java/ main/java/org/ main/java/org/apache/ main/java/org/apache/uima/ main/java/org/apache/uima/examples/ main/java/org/spin/ main/java/org/spin/scrubber/ main/java...

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,211 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Class for extracting different types of PHI tags out of annotated i2b2 smoking data.
+ * Writes to the database
+ * tables: human_annotations
+ * 
+ * @author britt fitch bf19
+ *
+ */
+public class HumanAnnotationsExtractorI2B2 implements HumanAnnotationsExtractor
+{
+	private static final Logger log = Logger.getLogger(HumanAnnotationsExtractorI2B2.class);
+
+	/** Matches a single line break (CRLF, lone CR or lone LF); compiled once instead of once per node. */
+	private static final Pattern LINE_BREAK = Pattern.compile("\\r\\n|\\r|\\n");
+
+	/** Suffix handed to {@link HumanAnnotationsDAO} when it is constructed. */
+	protected String tableSuffix;
+
+	/** Directory whose non-directory children are parsed as annotated XML. */
+	private File dirInputHumanAnnotations;
+
+	/**
+	 * Convenience constructor taking the input directory as a path string.
+	 *
+	 * @param dirInputHumanAnnotations path of the directory holding the annotated XML files
+	 * @param tableSuffix suffix passed to the DAO
+	 */
+	public HumanAnnotationsExtractorI2B2(String dirInputHumanAnnotations, String tableSuffix)
+	{
+		this(new File(dirInputHumanAnnotations), tableSuffix);
+	}
+
+	/**
+	 * @param dirInputHumanAnnotations directory holding the annotated XML files
+	 * @param tableSuffix suffix passed to the DAO
+	 */
+	public HumanAnnotationsExtractorI2B2(File dirInputHumanAnnotations, String tableSuffix)
+	{
+		this.dirInputHumanAnnotations = dirInputHumanAnnotations;
+		this.tableSuffix = tableSuffix;
+
+		log.info("Starting Human Annotations Extractor (I2B2) @ "+ dirInputHumanAnnotations.getAbsolutePath());
+	}
+
+	/**
+	 * Command line entry point; prints usage unless exactly two arguments are given.
+	 *
+	 * @param args input directory followed by the table suffix
+	 * @throws Exception
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		if(args.length!=2)
+		{
+			System.out.println("USAGE:\t\t HumanAnnotationsExtractorI2B2 input_directory table_suffix");
+			System.out.println("EXAMPLE:\t HumanAnnotationsExtractorI2B2 ../data/ _test");
+		}
+		else
+		{
+			HumanAnnotationsExtractorI2B2 runner= new HumanAnnotationsExtractorI2B2(args[0], args[1]);
+			runner.parseHumanAnnotations();
+		}
+	}
+
+	/**
+	 * Parse XML such that the "real" absolute character positions can be obtained from the input XML.
+	 * Every file directly inside the input directory is parsed, the children of each TEXT element are
+	 * given "start"/"end" attributes, and each token of every PHI element is inserted through the DAO.
+	 * Any exception is logged and ends the run; the DAO is closed in all cases.
+	 */
+	public void parseHumanAnnotations()
+	{
+		HumanAnnotationsDAO dao = null;
+
+		try
+		{
+			dao = new HumanAnnotationsDAO(tableSuffix);
+
+			log.debug("Input path "+dirInputHumanAnnotations.getAbsolutePath());
+
+			// listFiles() returns null when the path does not exist or is not a directory.
+			// (The previous code created a regular *file* at that path and then failed with a
+			// NullPointerException while iterating.)
+			File[] files = dirInputHumanAnnotations.listFiles();
+			if (files == null)
+			{
+				log.warn("Input path is not a readable directory: " + dirInputHumanAnnotations.getAbsolutePath());
+				return;
+			}
+
+			// Parser and XPath expressions do not depend on the file, so build them once.
+			DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+			XPath xpath = XPathFactory.newInstance().newXPath();
+			XPathExpression textExpr = xpath.compile("//TEXT");
+			XPathExpression phiExpr = xpath.compile("//PHI");
+
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					continue;
+				}
+
+				log.debug("Reading: " + f.getName());
+
+				Document doc = builder.parse(f);
+
+				// first pass: stamp start/end offsets onto the tags inside TEXT
+				annotateOffsets(doc, textExpr);
+
+				// second pass: read the PHI tags together with the offsets just written
+				insertPhiTokens(doc, phiExpr, dao);
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("Failed to parse human annotations from i2b2 input", e) ;
+		}
+		finally
+		{
+			// dao is still null when its constructor threw; closing blindly would hide that error.
+			if (dao != null)
+			{
+				dao.close();
+			}
+		}
+	}
+
+	/**
+	 * Walks the direct children of every TEXT element and writes "start" and "end" attributes
+	 * onto each child element, using a running character offset that restarts at 0 per TEXT element.
+	 */
+	private static void annotateOffsets(Document doc, XPathExpression textExpr) throws Exception
+	{
+		NodeList textNodes = (NodeList) textExpr.evaluate(doc, XPathConstants.NODESET);
+		for (int t = 0; t < textNodes.getLength(); t++)
+		{
+			int offset = 0;
+			NodeList children = textNodes.item(t).getChildNodes();
+
+			for (int c = 0; c < children.getLength(); c++)
+			{
+				Node child = children.item(c);
+
+				if (child.getNodeType() == Node.ELEMENT_NODE)
+				{
+					String content = child.getTextContent();
+
+					// line breaks inside a tag shift the tag's offsets by one each
+					offset += countLineBreaks(content);
+
+					Element tag = (Element) child;
+					tag.setAttribute("start", Integer.toString(offset));
+
+					offset += content.length();
+					tag.setAttribute("end", Integer.toString(offset));
+				}
+				else if (child.getNodeType() == Node.TEXT_NODE)
+				{
+					// Line breaks in plain text are intentionally not added on top of the length:
+					// per the original author, adding them makes case 1.txt correct and all others wrong.
+					offset += child.getTextContent().length();
+				}
+			}
+		}
+	}
+
+	/** Returns how many line breaks (CRLF counted once) occur in the given text. */
+	private static int countLineBreaks(String text)
+	{
+		Matcher m = LINE_BREAK.matcher(text);
+		int count = 0;
+		while (m.find())
+		{
+			count++;
+		}
+		return count;
+	}
+
+	/**
+	 * Inserts one row per whitespace-separated token of every PHI element. The file id is taken from
+	 * the "ID" attribute of the PHI element's grandparent, with ".txt" appended.
+	 */
+	private static void insertPhiTokens(Document doc, XPathExpression phiExpr, HumanAnnotationsDAO dao) throws Exception
+	{
+		NodeList phiNodes = (NodeList) phiExpr.evaluate(doc, XPathConstants.NODESET);
+
+		//for each PHI node in a file
+		for (int i = 0; i < phiNodes.getLength(); i++)
+		{
+			Node n = phiNodes.item(i);
+			String id = n.getParentNode().getParentNode().getAttributes().getNamedItem("ID").getNodeValue()+".txt";
+			String type = n.getAttributes().getNamedItem("TYPE").getNodeValue();
+			int startIdx = Integer.parseInt(n.getAttributes().getNamedItem("start").getNodeValue());
+			int endIdx = Integer.parseInt(n.getAttributes().getNamedItem("end").getNodeValue());
+			String phi = n.getTextContent().toLowerCase();
+
+			// every token of a multi-word PHI shares the offsets of the whole tag
+			for (String token : phi.split(" "))
+			{
+				token = token.trim();
+				if (token.length()>0 && !token.equals(","))
+				{
+					dao.insert(id, type, token, startIdx, endIdx);
+				}
+			}
+		}
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,279 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * class for extracting different types of PHI tags out of protege/knowtator annotated data.
+ *
+ * Writes to the database
+ * tables: 
+ * 		human_annotations_test
+ * 		human_annotations_train
+ *
+ * @author britt fitch bf19
+ *
+ */
+public class HumanAnnotationsExtractorProtege implements HumanAnnotationsExtractor
+{
+	private static final Logger log = Logger.getLogger(HumanAnnotationsExtractorProtege.class);
+
+	/** Suffix handed to {@link HumanAnnotationsDAO} when it is constructed. */
+	protected String tableSuffix;
+
+	/** Directory whose non-directory children are parsed as annotation XML. */
+	private File dirInputHumanAnnotations;
+
+	/**
+	 * Convenience constructor taking the input directory as a path string.
+	 *
+	 * @param dirInputHumanAnnotations path of the directory holding the annotation files
+	 * @param tableSuffix suffix passed to the DAO
+	 */
+	public HumanAnnotationsExtractorProtege(String dirInputHumanAnnotations, String tableSuffix)
+	{
+		this(new File(dirInputHumanAnnotations), tableSuffix);
+	}
+
+	/**
+	 * @param dirInputHumanAnnotations directory holding the annotation files
+	 * @param tableSuffix suffix passed to the DAO
+	 */
+	public HumanAnnotationsExtractorProtege(File dirInputHumanAnnotations, String tableSuffix)
+	{
+		this.dirInputHumanAnnotations = dirInputHumanAnnotations;
+		this.tableSuffix = tableSuffix;
+
+		log.info("Starting Human Annotations Extractor (Protege) @ "+ dirInputHumanAnnotations.getAbsolutePath());
+	}
+
+	/**
+	 * Command line entry point; prints usage unless exactly two arguments are given.
+	 *
+	 * @param args input directory followed by the table suffix
+	 * @throws Exception
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		if(args.length!=2)
+		{
+			System.out.println("USAGE:\t\t HumanAnnotationsExtractorProtege input_directory {_test|_train}");
+		}
+		else
+		{
+			HumanAnnotationsExtractorProtege runner = new HumanAnnotationsExtractorProtege(args[0], args[1]);
+			runner.parseHumanAnnotations();
+		}
+	}
+
+	/**
+	 * Parses every file directly inside the input directory: collects the "annotation" elements,
+	 * maps each "classMention" id to its class text, and inserts one row per token of every
+	 * annotation through the DAO. Any exception is logged and ends the run; the DAO is closed
+	 * in all cases.
+	 */
+	public void parseHumanAnnotations()
+	{
+		log.info("BEGIN Parsing human annotations.");
+
+		HumanAnnotationsDAO dao = null;
+		try
+		{
+			dao = new HumanAnnotationsDAO(tableSuffix);
+
+			log.debug("Input path "+dirInputHumanAnnotations.getAbsolutePath());
+
+			// listFiles() returns null when the path does not exist or is not a directory.
+			// (The previous code created a regular *file* at that path, warned, and then still
+			// iterated over the null array.)
+			File[] files = dirInputHumanAnnotations.listFiles();
+
+			if(files==null || files.length==0)
+			{
+				log.warn("There were no human annotations in dir: "+ dirInputHumanAnnotations.getAbsolutePath());
+				files = new File[0];
+			}
+
+			// Parser and XPath expressions do not depend on the file, so build them once.
+			DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
+			XPath xpath = XPathFactory.newInstance().newXPath();
+			XPathExpression annotExpr = xpath.compile("//annotation");
+			XPathExpression classMentionExpr = xpath.compile("//classMention");
+
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					continue;
+				}
+
+				log.debug("Reading: " + f.getName());
+
+				Document doc = builder.parse(f);
+
+				List<KnowtatorAnnot> annotationList = readAnnotations(doc, annotExpr);
+				Map<String,String> classMentionMap = readClassMentions(doc, classMentionExpr);
+
+				insertAnnotations(annotationList, classMentionMap, dao);
+			}
+
+			log.info("DONE Parsing human annotations.");
+		}
+		catch (Exception e)
+		{
+			log.error("Could not parse human annotations", e);
+		}
+		finally
+		{
+			// dao is still null when its constructor threw; closing blindly would hide that error.
+			if (dao != null)
+			{
+				dao.close();
+			}
+		}
+	}
+
+	/**
+	 * Builds one {@link KnowtatorAnnot} per "annotation" element from its "mention", "span" and
+	 * "spannedText" children; the file name comes from the parent's "textSource" attribute.
+	 */
+	private static List<KnowtatorAnnot> readAnnotations(Document doc, XPathExpression annotExpr) throws Exception
+	{
+		List<KnowtatorAnnot> annotationList = new ArrayList<KnowtatorAnnot>();
+		NodeList nodes = (NodeList) annotExpr.evaluate(doc, XPathConstants.NODESET);
+
+		//for each ANNOTATION node in a file
+		for (int i=0; i<nodes.getLength(); i++)
+		{
+			Node n = nodes.item(i);
+			NodeList kids = n.getChildNodes();
+
+			KnowtatorAnnot a = new KnowtatorAnnot();
+			a.setFilenameShort(n.getParentNode().getAttributes().getNamedItem("textSource").getNodeValue());
+
+			for (int k=0; k<kids.getLength(); k++)
+			{
+				Node kid = kids.item(k);
+				String kidName = kid.getNodeName();
+
+				if (kidName.equalsIgnoreCase("mention"))
+				{
+					a.setMentionId(kid.getAttributes().getNamedItem("id").getNodeValue());
+				}
+				else if (kidName.equalsIgnoreCase("span"))
+				{
+					a.setStartIdx(Integer.parseInt(kid.getAttributes().getNamedItem("start").getNodeValue()));
+					a.setEndIdx(Integer.parseInt(kid.getAttributes().getNamedItem("end").getNodeValue()));
+				}
+				else if (kidName.equalsIgnoreCase("spannedText"))
+				{
+					a.setToken(kid.getTextContent());
+				}
+			}
+			annotationList.add(a);
+		}
+		return annotationList;
+	}
+
+	/**
+	 * Maps the "id" attribute of every "classMention" element to the text content of the node
+	 * following its first child.
+	 */
+	private static Map<String,String> readClassMentions(Document doc, XPathExpression classMentionExpr) throws Exception
+	{
+		Map<String,String> classMentionMap = new HashMap<String,String>();
+		NodeList classMentionNodes = (NodeList) classMentionExpr.evaluate(doc, XPathConstants.NODESET);
+
+		//for each CLASSMENTION node in a file
+		for(int i=0; i<classMentionNodes.getLength(); i++)
+		{
+			Node n = classMentionNodes.item(i);
+			String key = n.getAttributes().getNamedItem("id").getNodeValue();
+			// NOTE(review): presumably the first child is a whitespace text node and its sibling
+			// the class element; verify against the annotation file layout.
+			String val = n.getFirstChild().getNextSibling().getTextContent();
+
+			classMentionMap.put(key,val);
+		}
+		return classMentionMap;
+	}
+
+	/**
+	 * Inserts one row per whitespace-separated token of each annotation, tagged with the
+	 * upper-cased class looked up by mention id. Annotations without spanned text are skipped.
+	 */
+	private static void insertAnnotations(List<KnowtatorAnnot> annotationList, Map<String,String> classMentionMap, HumanAnnotationsDAO dao) throws Exception
+	{
+		for (KnowtatorAnnot a : annotationList)
+		{
+			// An annotation with no spannedText has nothing to tokenize. The old guard also
+			// required start==0 && end==0, so a span without text crashed on getToken().split().
+			if (a.getToken()==null)
+			{
+				log.warn("Encountered empty annotation for " + a.getMentionId());
+				continue;
+			}
+
+			// every token of a multi-word annotation shares the offsets of the whole span
+			for (String token : a.getToken().split(" "))
+			{
+				token = token.trim();
+				if (token.length()>0 && !token.equals(","))
+				{
+					dao.insert(a.getFilenameShort(), classMentionMap.get(a.getMentionId()).toUpperCase(), token, a.getStartIdx(), a.getEndIdx());
+				}
+			}
+		}
+	}
+
+	/**
+	 * Plain holder for the values read from one "annotation" element. Static because it
+	 * never touches the enclosing extractor instance.
+	 */
+	private static class KnowtatorAnnot
+	{
+		private String  token;
+		private String  filenameShort;
+		private int     startIdx;
+		private int     endIdx;
+		private String  mentionClass;
+		private String  mentionId;
+
+		public String getToken()
+		{
+			return token;
+		}
+
+		public void setToken(String token)
+		{
+			this.token = token;
+		}
+
+		public String getFilenameShort()
+		{
+			return filenameShort;
+		}
+		public void setFilenameShort(String filenameShort)
+		{
+			this.filenameShort = filenameShort;
+		}
+		public int getStartIdx()
+		{
+			return startIdx;
+		}
+		public void setStartIdx(int startIdx)
+		{
+			this.startIdx = startIdx;
+		}
+		public int getEndIdx()
+		{
+			return endIdx;
+		}
+		public void setEndIdx(int endIdx)
+		{
+			this.endIdx = endIdx;
+		}
+		public String getMentionClass()
+		{
+			return mentionClass;
+		}
+		public void setMentionClass(String mentionClass)
+		{
+			this.mentionClass = mentionClass;
+		}
+		public String getMentionId()
+		{
+			return mentionId;
+		}
+		public void setMentionId(String mentionId)
+		{
+			this.mentionId = mentionId;
+		}
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,276 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import weka.classifiers.Classifier;
+import weka.classifiers.Evaluation;
+import weka.classifiers.meta.CostSensitiveClassifier;
+import weka.core.Instances;
+import weka.core.Utils;
+import weka.core.converters.ConverterUtils.DataSource;
+import weka.filters.Filter;
+import weka.filters.unsupervised.attribute.Remove;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * TODO: Serious refactoring needed, only use what we need for the implementation we published. (britt)
+ */
+
+public class WekaClassifier
+{
+	private String testModelFilepath = null;
+	private String trainModelFilepath =null;
+	
+	private Remove removeFilter = null; 
+	private Classifier classifier = null;
+	
+	private String tableSuffix = "_test"; //WekaClassifier only updates "test" tables. 
+	
+	public WekaClassifier()
+	{
+		this(ScrubberProperties.getFileModelTrainAbsolutePath(), ScrubberProperties.getFileModelTestAbsolutePath());
+	}
+	
+	public WekaClassifier(String trainModelFilepath, String testModelFilepath)
+	{
+		this.trainModelFilepath = trainModelFilepath;
+		this.testModelFilepath  = testModelFilepath;
+	}
+
+	public static void main(String[] args) throws Exception
+	{
+		WekaClassifier wc = new WekaClassifier();
+		wc.test();
+	}
+
+	public void test() throws Exception
+	{
+		//get data
+		DataSource trainSource = new DataSource(getTrainModelFilepath());
+		DataSource testSource = new DataSource(getTestModelFilepath());
+		Instances trainData = trainSource.getDataSet();
+		Instances testData = testSource.getDataSet();
+		Instances orig = new Instances(testData);
+		 
+		//remove filter 
+		trainData = Filter.useFilter(trainData, getRemoveFilter(trainData));
+		testData = Filter.useFilter(testData, getRemoveFilter(trainData));
+		
+		//set class index
+		trainData.setClassIndex(trainData.numAttributes()-1);
+		System.out.println("class index: " + trainData.classIndex() +"\t"+ trainData.attribute(trainData.classIndex()));
+		
+		testData.setClassIndex(testData.numAttributes()-1);
+		System.out.println("class index: " + testData.classIndex() +"\t"+ testData.attribute(testData.classIndex()));
+		
+		//check headers
+		if (!trainData.equalHeaders(testData))
+		{
+			System.out.println();
+			throw new IllegalStateException("Incompatible train and test set!");
+		}
+		else
+		{
+			System.out.println("headers match...");
+		}
+		
+		//build classifier
+		System.out.println("building classifier...");
+		Classifier base = getClassifier();
+		base.buildClassifier(trainData);
+		System.out.println(base);
+		
+		//evaluate
+		System.out.println("evaluating...");
+		Evaluation eval = new Evaluation(trainData);
+
+		eval.evaluateModel(base, testData);
+		System.out.println(eval.toSummaryString());
+    	System.out.println(eval.toClassDetailsString());
+    	System.out.println(eval.toMatrixString());
+						
+		//output txt results
+		List<String> classifiedAsPHIList = printSummary(base, eval, testData, orig);
+		
+		//update db w/ classification
+		recordClassification(classifiedAsPHIList);
+	}
+		
+	private void recordClassification(List<String> classifiedAsPHIList) throws Exception
+	{
+		String[] keys;
+		int id;
+		String classifiedAs;
+		FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
+		
+		for (String s : classifiedAsPHIList)
+		{
+			keys = s.split("\\|");
+			if (keys.length!=2)
+			{
+				System.out.println("ERROR: unable to record classification, insufficient number of keys for '"+s+"'.");
+			}
+			else
+			{
+				id = Integer.parseInt(keys[0]);
+				classifiedAs = keys[1];
+				
+				dao.updateClassification(classifiedAs, id);
+			}
+		}
+	}
+
+	private List<String> printSummary(Classifier base, Evaluation eval, Instances data, Instances orig) throws Exception
+	{
+		//return list of cases classified as PHI
+		List<String> classifiedAsPHIList = new ArrayList<String>();
+		
+		// output evaluation
+	    System.out.println();
+	    System.out.println("=== Setup ===");
+	    System.out.println("Classifier: " + getClassifier().getClass().getName() + " " + Utils.joinOptions(base.getOptions()));
+	    System.out.println("Dataset: " + data.relationName());
+	    System.out.println();
+    
+	    // output predictions
+	    int totalMisclass = 0;
+	    int totalPHIClass = 0;
+	    int totalNonPHIClass = 0;
+	    System.out.println("# -\t actual -\t predicted -\t token");
+	    for (int i = 0; i < data.numInstances(); i++) 
+	    {
+	    	double pred = base.classifyInstance(data.instance(i));	    	
+	    	double actual = data.instance(i).classValue();
+	    	String predString = data.classAttribute().value((int) pred);
+//			double[] dist = base.distributionForInstance(data.instance(i));
+			
+			//save data for cases classified as PHI
+			//if (pred>0)
+	    	if(!predString.equalsIgnoreCase("NA"))
+			{
+				totalPHIClass++;
+				classifiedAsPHIList.add(orig.instance(i).stringValue(0)+"|"+predString);
+			}
+			else 
+			{
+				totalNonPHIClass++;
+			}
+			
+			//output misclassifications
+			if (pred != actual && predString.equalsIgnoreCase("NA"))
+//				if (pred != actual && actual>1)
+	    	{
+	    		totalMisclass++;
+	    		System.out.print((i+1));
+				System.out.print(" -\t ");
+				System.out.print(data.instance(i).toString(data.classIndex()));
+				System.out.print(" -\t ");
+				System.out.print(predString);				
+				System.out.print(" -\t ");
+				//System.out.print(data.instance(i)); //comment out classified instance. 
+				System.out.println(orig.instance(i).stringValue(0)); //show identifying part of the instance base on original instance data.
+				//System.out.print("\t\t\t");
+				System.out.println();
+	    	}
+	    }
+	    
+	    System.out.println("total misclassifications: " + totalMisclass);
+    	System.out.println(eval.toSummaryString());
+    	System.out.println(eval.toClassDetailsString());
+    	System.out.println(eval.toMatrixString());
+	
+    	System.out.println("total PHI class: " + totalPHIClass);
+    	System.out.println("total non-PHI class: " + totalNonPHIClass);
+    	
+    	return classifiedAsPHIList;
+	}
+		
+	/*
+	 * filters - only initialize once or it causes problems running on test/train sets
+	 */
+	private Filter getRemoveFilter(Instances data) throws Exception
+	{
+		if (removeFilter == null)
+		{
+			removeFilter = new Remove();
+			System.out.println("\tExecuting Remove Filter...");
+			String[] options = new String[2];
+			options[0] = "-R";
+			options[1] = "1"; 
+			removeFilter.setOptions(options);
+			removeFilter.setInputFormat(data);
+		}
+		return removeFilter;
+	}
+
+    public Classifier getClassifier() throws Exception
+    {
+        return getClassifier(ScrubberProperties.getClassificationCostMatrix());
+    }
+	
+	public Classifier getClassifier(String classificationCostMatrix) throws Exception
+	{
+		if (classifier==null)
+		{
+			classifier = new CostSensitiveClassifier();
+			String[] options = new String[11];
+			int i=0;
+			options[i++] = "-cost-matrix";
+			options[i++] = classificationCostMatrix;
+			options[i++] = "-S";
+			options[i++] = "1";
+			options[i++] = "-W";
+			options[i++] = "weka.classifiers.trees.J48";
+			options[i++] = "--";
+			options[i++] = "-C";
+			options[i++] = "0.25 ";
+			options[i++] = "-M";
+			options[i++] = "2";
+			
+			classifier.setOptions(options);
+		}
+		
+		return classifier;
+	}
+	
+	public String getTrainModelFilepath()
+	{
+		return trainModelFilepath;
+	}
+
+	public void setTrainModelFilepath(String trainModelFilepath)
+	{
+		this.trainModelFilepath = trainModelFilepath;
+	}
+
+	public String getTestModelFilepath()
+	{
+		return testModelFilepath;
+	}
+
+	public void setTestModelFilepath(String testModelFilepath)
+	{
+		this.testModelFilepath = testModelFilepath;
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+
+import org.apache.log4j.Logger;
+
+/**
+ * Base class for model extractors: holds the model directory and model name and offers small
+ * helpers for reading, writing and deleting files. Subclasses implement {@link #generateModel()}.
+ *
+ * @author britt fitch
+ *
+ */
+public abstract class WekaDataExtractor
+{
+	private static Logger log =  Logger.getLogger(WekaDataExtractor.class);
+
+	private String dirModels = null;
+	private String modelName = null;
+	// not read or written anywhere in this class
+	private String tableSuffix = null;
+
+	/**
+	 * @param dirModels value returned by {@link #getDirModels()}
+	 * @param modelName value returned by {@link #getModelName()}
+	 */
+	public WekaDataExtractor(String dirModels, String modelName)
+	{
+		this.dirModels = dirModels;
+		this.modelName = modelName;
+	}
+
+	/**
+	 * Writes {@code content} to the given file, replacing any previous content.
+	 *
+	 * @param pathToFile file to write
+	 * @param content text to write
+	 * @throws IOException when the file cannot be opened, written or closed
+	 */
+	public void writeFile(String pathToFile, String content) throws IOException
+	{
+		Writer out = null;
+		try
+		{
+			out = new OutputStreamWriter(new FileOutputStream(pathToFile));
+			out.write(content);
+		}
+		catch (IOException e)
+		{
+			log.error("Unable to write to file: " + pathToFile, e);
+			throw e;
+		}
+		finally
+		{
+			// out is still null when the file could not be opened; closing it blindly
+			// replaced the IOException with a NullPointerException.
+			if (out != null)
+			{
+				out.close();
+			}
+		}
+	}
+
+	/**
+	 * Reads the whole file, terminating every line with the platform line separator.
+	 *
+	 * @param pathToFile file to read
+	 * @return the file content, empty for an empty file
+	 * @throws FileNotFoundException when the file cannot be opened
+	 */
+	public String readFile(String pathToFile) throws FileNotFoundException
+	{
+		StringBuilder text = new StringBuilder();
+		String NL = System.getProperty("line.separator");
+		Scanner scanner = null;
+		try
+		{
+			scanner = new Scanner(new FileInputStream(pathToFile));
+			while (scanner.hasNextLine())
+			{
+				text.append(scanner.nextLine()).append(NL);
+			}
+		}
+		catch (FileNotFoundException e)
+		{
+			log.error("Unable to read file: " + pathToFile, e);
+			throw e;
+		}
+		finally
+		{
+			// scanner is still null when the file was not found; closing it blindly
+			// replaced the FileNotFoundException with a NullPointerException.
+			if (scanner != null)
+			{
+				scanner.close();
+			}
+		}
+
+		return text.toString();
+	}
+
+	/** Generates the model; the details are left to the subclass. */
+	public abstract void generateModel() throws Exception;
+
+	/**
+	 * Delete old model.
+	 * @param pathToFile - file path of the file to be deleted.
+	 */
+	protected void deleteModel(String pathToFile)
+	{
+		File model = new File(pathToFile);
+		if(model.exists())
+		{
+			log.info("deleting model: " + pathToFile);
+			// File.delete() reports failure through its return value only
+			if (!model.delete())
+			{
+				log.warn("unable to delete model: " + pathToFile);
+			}
+		}
+	}
+
+	public String getDirModels()
+	{
+		return dirModels;
+	}
+
+	public void setDirModels(String dirModels)
+	{
+		this.dirModels = dirModels;
+	}
+
+	public String getModelName()
+	{
+		return modelName;
+	}
+
+	public void setModelName(String modelName)
+	{
+		this.modelName = modelName;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ * Writes the arff file for the test data set: the contents of weka_header.txt
+ * followed by one cleaned feature-matrix row per line.
+ *
+ * @author britt fitch
+ *
+ */
+public class WekaDataExtractorTest extends WekaDataExtractor
+{
+	private static Logger log =  Logger.getLogger(WekaDataExtractorTest.class);
+
+	/**
+	 * Ordered {regex, replacement} pairs applied to every row (the .sed clean-up).
+	 * The order matters because each substitution works on the result of the previous one.
+	 */
+	private static final String[][] ROW_SUBSTITUTIONS =
+	{
+		{",',", ",apos,"},
+		{",,,", ",comma,"},
+		{",\\.,", ",period,"},
+		{",:,", ",colon,"},
+		{",\\(,", ",none,"},
+		{",\\),", ",none,"},
+		{",\\$,", ",none,"}
+	};
+
+	private String tableSuffix = "_test";
+
+	/**
+	 * Configures the extractor with the models directory and the test model file name
+	 * taken from ScrubberProperties.
+	 */
+	public WekaDataExtractorTest()
+	{
+		super(ScrubberProperties.getDirModels(), ScrubberProperties.getFileModelTest());
+	}
+
+	/**
+	 * Command line entry point: generates the test model.
+	 *
+	 * @param args unused
+	 * @throws Exception if model generation fails
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		WekaDataExtractor testExtractor = new WekaDataExtractorTest();
+		testExtractor.generateModel();
+	}
+
+	/**
+	 * Deletes any previous arff file, then writes a new one made of the weka header
+	 * and the cleaned rows selected from the feature matrix.
+	 *
+	 * @throws Exception if reading the header, querying the rows or writing the file fails
+	 */
+	@Override
+	public void generateModel() throws Exception
+	{
+		String modelDir = getDirModels();
+		String arffPath = modelDir + File.separator + getModelName();
+
+		//delete old arff
+		deleteModel(arffPath);
+
+		//get weka header
+		String header = readFile(modelDir + File.separator + "weka_header.txt"); //TODO: refactor
+		StringBuilder arff = new StringBuilder(header);
+
+		//select records for output model
+		List<String> rows = new FeatureMatrixDAO(tableSuffix).selectDataSetTest();
+
+		//clean file according to .sed
+		for (String row : rows)
+		{
+			String cleaned = row;
+			for (String[] substitution : ROW_SUBSTITUTIONS)
+			{
+				cleaned = cleaned.replaceAll(substitution[0], substitution[1]);
+			}
+			arff.append(cleaned).append("\n");
+		}
+
+		writeFile(arffPath, arff.toString());
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+/**
+ * Writes the arff file for the train data set: the contents of weka_header.txt
+ * followed by one cleaned feature-matrix row per line.
+ *
+ * @author britt fitch
+ *
+ */
+public class WekaDataExtractorTrain extends WekaDataExtractor
+{
+	private static Logger log =  Logger.getLogger(WekaDataExtractorTrain.class);
+
+	/**
+	 * Ordered {regex, replacement} pairs applied to every row (the .sed clean-up).
+	 * The order matters because each substitution works on the result of the previous one.
+	 */
+	private static final String[][] ROW_SUBSTITUTIONS =
+	{
+		{",',", ",apos,"},
+		{",,,", ",comma,"},
+		{",\\.,", ",period,"},
+		{",:,", ",colon,"},
+		{",\\(,", ",none,"},
+		{",\\),", ",none,"},
+		{",\\$,", ",none,"}
+	};
+
+	private String tableSuffix = "_train";
+
+	/**
+	 * Configures the extractor with the models directory and the train model file name
+	 * taken from ScrubberProperties.
+	 */
+	public WekaDataExtractorTrain()
+	{
+		super(ScrubberProperties.getDirModels(), ScrubberProperties.getFileModelTrain());
+	}
+
+	/**
+	 * Command line entry point: generates the train model.
+	 *
+	 * @param args unused
+	 * @throws Exception if model generation fails
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		WekaDataExtractor trainExtractor = new WekaDataExtractorTrain();
+		trainExtractor.generateModel();
+	}
+
+	/**
+	 * Deletes any previous arff file, then writes a new one made of the weka header
+	 * and the cleaned rows selected from the feature matrix.
+	 *
+	 * @throws Exception if reading the header, querying the rows or writing the file fails
+	 */
+	@Override
+	public void generateModel() throws Exception
+	{
+		String modelDir = getDirModels();
+		String arffPath = modelDir + File.separator + getModelName();
+
+		//delete old arff
+		deleteModel(arffPath);
+
+		//get weka header
+		String header = readFile(modelDir + File.separator + "weka_header.txt"); //TODO: refactor
+		StringBuilder arff = new StringBuilder(header);
+
+		//select records for output model
+		List<String> rows = new FeatureMatrixDAO(tableSuffix).selectDataSetTrain();
+
+		//clean file according to .sed
+		for (String row : rows)
+		{
+			String cleaned = row;
+			for (String[] substitution : ROW_SUBSTITUTIONS)
+			{
+				cleaned = cleaned.replaceAll(substitution[0], substitution[1]);
+			}
+			arff.append(cleaned).append("\n");
+		}
+
+		writeFile(arffPath, arff.toString());
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.uima.dao.AnnotationsPubsDAO;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+@Deprecated
+public class AnnotationsPubsPosCounter
+{
+
+	/**
+	 * Accumulates part-of-speech counts over every distinct publication file name
+	 * returned by the DAO, then inserts one count per part of speech.
+	 *
+	 * this class was used to generate a distribution of parts of speech across the set of pubs
+	 * for comparison with the distribution of part of speech across the cases
+	 * and the distribution of PoS for known phi (based on gold standard)
+	 *
+	 * @param args unused
+	 * @throws Exception if a DAO call fails
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		Map<String,Integer> posTotals = new HashMap<String,Integer>();
+		AnnotationsPubsDAO pubsDao = new AnnotationsPubsDAO();
+		List<String> shortNames = pubsDao.selectDistinctFilenameShort();
+
+		//sum pos for pubs
+		for (String shortName : shortNames)
+		{
+			posTotals = pubsDao.selectDistinctPOS(posTotals, shortName);
+		}
+
+		//insert pos for pubs
+		for (Map.Entry<String,Integer> total : posTotals.entrySet())
+		{
+			int posCnt = total.getValue();
+			pubsDao.insertPubsPOS(total.getKey(), posCnt);
+		}
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,90 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+
+import java.util.List;
+
+/**
+ * Deprecated one-off utility whose implementation is entirely commented out,
+ * so the class currently declares no members. The disabled code is kept for reference only.
+ *
+ * NOTE(review): judging from the disabled code, it looked up a PHI label per case feature
+ * (by short file name and start index) and wrote it back to the feature matrix -
+ * confirm against the DAOs before reviving it.
+ */
+@Deprecated
+public class CaseFeaturePHITypeUpdater //implements Runnable
+{	
+	
+//	public static void main(String[] args) throws Exception
+//	{		
+//		CaseFeaturePHITypeUpdater runner = new CaseFeaturePHITypeUpdater();
+//		runner.run();
+//	}
+//
+//	public void run()
+//	{
+//		FeatureMatrixDAO cfDAO;
+//		HumanAnnotationsDAO phiDao;
+//		try
+//		{
+//			cfDAO = new FeatureMatrixDAO();
+//			phiDao = new HumanAnnotationsDAO();
+//			
+////			//update TRAIN set
+////			List<CaseFeature> caseFeatureList = cfDAO.selectAllCaseFeatures();			
+////			System.out.println("INFO: " + caseFeatureList.size() + " train instances to be updated...");
+////			for (CaseFeature cf : caseFeatureList)
+////			{
+////				try
+////				{
+////					String phiLabel = phiDao.selectPHIType(cf.getFilename_short(), cf.getStartIdx());
+////					if (phiLabel!=null)
+////					{
+////						cfDAO.updateCaseFeaturePHITypeTrain(cf.getId(), phiLabel);
+////					}
+////				}
+////				catch(Exception e)
+////				{
+////					System.out.println("ERROR: (train) token|id: " +cf.getToken()+"|"+cf.getId() );
+////					e.printStackTrace();
+////				}
+////			}
+//			
+//			//update TEST set
+//			List<CaseFeature> caseFeatureTESTList = cfDAO.selectAllTestCaseFeatures();			
+//			System.out.println("INFO: " + caseFeatureTESTList.size() + " test instances to be updated...");
+//			for (CaseFeature cf : caseFeatureTESTList)
+//			{
+//				try
+//				{
+//					String phiLabel = phiDao.selectPHIType(cf.getFilename_short(), cf.getStartIdx());
+//					if (phiLabel!=null)
+//					{
+//						cfDAO.updateCaseFeaturePHITypeTest(cf.getId(), phiLabel);
+//					}
+//				}
+//				catch(Exception e)
+//				{
+//					System.out.println("ERROR: (test) token|id: " +cf.getToken()+"|"+cf.getId() );
+//					e.printStackTrace();
+//				}
+//			}
+//		} catch (Exception e1)
+//		{
+//			e1.printStackTrace();
+//		}
+//	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,129 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.spin.scrubber.uima.dao.TfDAO;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class is intended to be run as a one-off process (thus the package *.oneoff) to update the TF features. 
+ * The normal process of calculating the TF occurs in TFAnnotator.
+ *
+ * The class is deprecated and its implementation is entirely commented out, so it
+ * currently declares no members. The disabled code is kept for reference only.
+ *  
+ * @author BF19
+ *
+ */
+@Deprecated
+public class CaseFeatureTFUpdater //implements Runnable
+{
+//	public static void main(String[] args) throws Exception
+//	{		
+//		CaseFeatureTFUpdater runner = new CaseFeatureTFUpdater();
+//		runner.run();
+//	}
+//
+//	public void run()
+//	{
+//		//select all pub token/cnt/pos
+//		Map<String, Integer> pubsTFMap;
+//		try
+//		{
+//			pubsTFMap = new TfDAO().selectPubTFMap();
+//			
+//			updateTrain(pubsTFMap);			
+//			updateTest(pubsTFMap);
+//		} 
+//		catch (Exception e)
+//		{
+//			e.printStackTrace();
+//		}
+//	}
+//	
+//	private void updateTrain(Map<String,Integer> pubsTFMap)
+//	{
+//		FeatureMatrixDAO cfDAO;
+//		try
+//		{
+//			cfDAO = new FeatureMatrixDAO();
+//						
+//			//select all feature records (to be updated)
+//			List<CaseFeature> caseFeatureList = cfDAO.selectAllCaseFeatures();
+//			
+//			for (CaseFeature cf : caseFeatureList)
+//			{
+//				try
+//				{
+//					//update all_pubs features
+//					int pubTermPosCnt = (pubsTFMap.get(cf.getToken()+"|"+cf.getPos())==null) ? 0 : pubsTFMap.get(cf.getToken()+"|"+cf.getPos());
+//					int pubTermCnt = (pubsTFMap.get(cf.getToken())==null) ? 0 : pubsTFMap.get(cf.getToken());
+//					float pubTotalCnt = Float.valueOf(Integer.toString(pubsTFMap.get("totalPubCount")));
+//					cfDAO.updateCaseFeatureTFAllPubs(cf.getId(), pubTermPosCnt/pubTotalCnt, pubTermCnt/pubTotalCnt);
+//				}
+//				catch(Exception e)
+//				{
+//					System.out.println("ERROR: token|id: " +cf.getToken()+"|"+cf.getId() );
+//					e.printStackTrace();
+//				}
+//			}
+//		} 
+//		catch (Exception e1)
+//		{
+//			e1.printStackTrace();
+//		}
+//	}
+//	
+//	private void updateTest(Map<String,Integer> pubsTFMap)
+//	{
+//		FeatureMatrixDAO cfDAO;
+//		try
+//		{
+//			cfDAO = new FeatureMatrixDAO();
+//						
+//			//select all feature records (to be updated)
+//			List<CaseFeature> caseFeatureList = cfDAO.selectAllTestCaseFeatures();
+//			
+//			for (CaseFeature cf : caseFeatureList)
+//			{
+//				try
+//				{
+//					//update all_pubs features
+//					int pubTermPosCnt = (pubsTFMap.get(cf.getToken()+"|"+cf.getPos())==null) ? 0 : pubsTFMap.get(cf.getToken()+"|"+cf.getPos());
+//					int pubTermCnt = (pubsTFMap.get(cf.getToken())==null) ? 0 : pubsTFMap.get(cf.getToken());
+//					float pubTotalCnt = Float.valueOf(Integer.toString(pubsTFMap.get("totalPubCount")));
+//					cfDAO.updateTestCaseFeatureTFAllPubs(cf.getId(), pubTermPosCnt/pubTotalCnt, pubTermCnt/pubTotalCnt);
+//				}
+//				catch(Exception e)
+//				{
+//					System.out.println("ERROR: token|id: " +cf.getToken()+"|"+cf.getId() );
+//					e.printStackTrace();
+//				}
+//			}
+//		} 
+//		catch (Exception e1)
+//		{
+//			e1.printStackTrace();
+//		}
+//	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,847 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.oneoff;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author britt fitch bf19
+ *
+ * takes 2 command line params:
+ * inDirectory : containing xml files
+ * outDirectory : where to place txt files (assumes this dir has "train" and "test" subdirs)
+ * 
+ * parses i2b2 xml file into individual text files for use by scrubber.
+ * 
+ * THIS IS REQUIRED TO REPRODUCE FINDINGS REPORTED IN THE PAPER. 
+ */
+public class XmlToTextI2B2 implements Runnable
+{
+	private String inDirectory;
+	private String outDirectory;
+	private List<String> allTrainCaseList;
+	
+	/**
+	 * Stores the two directories used by run().
+	 *
+	 * @param in  path of the directory whose xml files are read
+	 * @param out path of the directory under which the txt files are written
+	 */
+	public XmlToTextI2B2(String in, String out)
+	{
+		this.inDirectory = in;
+		this.outDirectory = out;
+	}
+	
+	/**
+	 * Converts every file directly inside the input directory into per-record text files.
+	 * Each TEXT element is written to {outDirectory}/{train|test}/{ID}.txt, where ID is the
+	 * "ID" attribute of the element's parent node and the subdirectory is chosen by isTrainCase.
+	 * Subdirectories of the input directory are skipped.
+	 *
+	 * Nothing is rethrown: a missing or unreadable input directory is reported and the run
+	 * ends; any other failure is printed with its stack trace and also ends the run.
+	 */
+	public void run()
+	{
+		try
+		{
+			File inDir = new File(inDirectory);
+
+			// listFiles() returns null for anything that is not a readable directory.
+			// Creating the missing path as a regular file cannot turn it into one.
+			File[] files = inDir.listFiles();
+			if (files == null)
+			{
+				System.out.println("Input directory does not exist or cannot be read: " + inDirectory);
+				return;
+			}
+
+			// neither the parser factory nor the compiled expression depends on the file being read
+			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+			XPathFactory xPathfactory = XPathFactory.newInstance();
+			XPath xpath = xPathfactory.newXPath();
+			XPathExpression expr = xpath.compile("//TEXT");
+
+			for (File f : files)
+			{
+				if (f.isDirectory())
+				{
+					continue;
+				}
+
+				System.out.println("XmlToText for: " + f.getName());
+
+				//read infile
+				DocumentBuilder builder = factory.newDocumentBuilder();
+				Document doc = builder.parse(f);
+
+				//read all matching nodes
+				NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
+
+				//for each node in a file, write out to a flat txt file of the same name.
+				for (int i=0; i<nodes.getLength(); i++)
+				{
+					Node n = nodes.item(i);
+					String id = n.getParentNode().getAttributes().getNamedItem("ID").getNodeValue();
+
+					String subdir = (isTrainCase(id)) ? "train" : "test";
+
+					//make outfile
+					String fname = id+".txt";
+					String txt = n.getTextContent();
+					FileWriter writer = new FileWriter(new File(outDirectory + File.separatorChar + subdir + File.separatorChar + fname));
+					try
+					{
+						writer.write( txt + "\n");
+						writer.flush();
+					}
+					finally
+					{
+						// close even when the write fails so the file handle is not leaked
+						writer.close();
+					}
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println(e.getMessage());
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * i2b2 smoking deid data set is broken up into 4 files.
+	 * 
+	 * TRAIN set - annotated (A) & unannotated (UA):
+	 * 	unannotated_records_deid_smoking = 889 records. (UA)
+	 * 	deid_surrogate_train_all_version2 = 669 records. (A)
+	 * 
+	 * TEST set - annotated (A) & unannotated (UA):
+	 * 	deid_surrogate_test_all_version2 = 220 records. (UA) 
+	 * 	deid_surrogate_test_all_groundtruth_v2 = 220 records. (A)
+	 * 
+	 * the train file has all of the 889 records in it, 
+	 * but we only want to use the 669 records for training. 
+	 * leaving the 220 records for testing.
+	 * 
+	 * this method will just ignore the 220 in the process of generating the text files.
+	 * 
+	 * 
+	 */
+	private List<String> getAllTrainCaseList()
+	{
+		if (allTrainCaseList ==null)
+		{
+			allTrainCaseList = new ArrayList<String>();
+			allTrainCaseList.add("1");
+			allTrainCaseList.add("10");
+			allTrainCaseList.add("100");
+			allTrainCaseList.add("101");
+			allTrainCaseList.add("102");
+			allTrainCaseList.add("103");
+			allTrainCaseList.add("104");
+			allTrainCaseList.add("105");
+			allTrainCaseList.add("106");
+			allTrainCaseList.add("107");
+			allTrainCaseList.add("108");
+			allTrainCaseList.add("11");
+			allTrainCaseList.add("110");
+			allTrainCaseList.add("112");
+			allTrainCaseList.add("113");
+			allTrainCaseList.add("114");
+			allTrainCaseList.add("115");
+			allTrainCaseList.add("116");
+			allTrainCaseList.add("117");
+			allTrainCaseList.add("118");
+			allTrainCaseList.add("119");
+			allTrainCaseList.add("12");
+			allTrainCaseList.add("120");
+			allTrainCaseList.add("122");
+			allTrainCaseList.add("123");
+			allTrainCaseList.add("124");
+			allTrainCaseList.add("125");
+			allTrainCaseList.add("126");
+			allTrainCaseList.add("127");
+			allTrainCaseList.add("128");
+			allTrainCaseList.add("129");
+			allTrainCaseList.add("13");
+			allTrainCaseList.add("130");
+			allTrainCaseList.add("131");
+			allTrainCaseList.add("132");
+			allTrainCaseList.add("134");
+			allTrainCaseList.add("137");
+			allTrainCaseList.add("138");
+			allTrainCaseList.add("139");
+			allTrainCaseList.add("140");
+			allTrainCaseList.add("141");
+			allTrainCaseList.add("143");
+			allTrainCaseList.add("144");
+			allTrainCaseList.add("145");
+			allTrainCaseList.add("146");
+			allTrainCaseList.add("147");
+			allTrainCaseList.add("148");
+			allTrainCaseList.add("149");
+			allTrainCaseList.add("15");
+			allTrainCaseList.add("150");
+			allTrainCaseList.add("152");
+			allTrainCaseList.add("153");
+			allTrainCaseList.add("154");
+			allTrainCaseList.add("155");
+			allTrainCaseList.add("156");
+			allTrainCaseList.add("157");
+			allTrainCaseList.add("158");
+			allTrainCaseList.add("159");
+			allTrainCaseList.add("16");
+			allTrainCaseList.add("160");
+			allTrainCaseList.add("161");
+			allTrainCaseList.add("162");
+			allTrainCaseList.add("163");
+			allTrainCaseList.add("164");
+			allTrainCaseList.add("165");
+			allTrainCaseList.add("166");
+			allTrainCaseList.add("169");
+			allTrainCaseList.add("17");
+			allTrainCaseList.add("170");
+			allTrainCaseList.add("171");
+			allTrainCaseList.add("172");
+			allTrainCaseList.add("173");
+			allTrainCaseList.add("174");
+			allTrainCaseList.add("175");
+			allTrainCaseList.add("178");
+			allTrainCaseList.add("179");
+			allTrainCaseList.add("18");
+			allTrainCaseList.add("180");
+			allTrainCaseList.add("181");
+			allTrainCaseList.add("182");
+			allTrainCaseList.add("183");
+			allTrainCaseList.add("184");
+			allTrainCaseList.add("186");
+			allTrainCaseList.add("187");
+			allTrainCaseList.add("188");
+			allTrainCaseList.add("189");
+			allTrainCaseList.add("19");
+			allTrainCaseList.add("190");
+			allTrainCaseList.add("191");
+			allTrainCaseList.add("192");
+			allTrainCaseList.add("193");
+			allTrainCaseList.add("195");
+			allTrainCaseList.add("196");
+			allTrainCaseList.add("197");
+			allTrainCaseList.add("198");
+			allTrainCaseList.add("199");
+			allTrainCaseList.add("2");
+			allTrainCaseList.add("20");
+			allTrainCaseList.add("200");
+			allTrainCaseList.add("201");
+			allTrainCaseList.add("203");
+			allTrainCaseList.add("204");
+			allTrainCaseList.add("205");
+			allTrainCaseList.add("207");
+			allTrainCaseList.add("208");
+			allTrainCaseList.add("209");
+			allTrainCaseList.add("21");
+			allTrainCaseList.add("210");
+			allTrainCaseList.add("211");
+			allTrainCaseList.add("212");
+			allTrainCaseList.add("213");
+			allTrainCaseList.add("215");
+			allTrainCaseList.add("216");
+			allTrainCaseList.add("217");
+			allTrainCaseList.add("218");
+			allTrainCaseList.add("219");
+			allTrainCaseList.add("22");
+			allTrainCaseList.add("221");
+			allTrainCaseList.add("222");
+			allTrainCaseList.add("223");
+			allTrainCaseList.add("224");
+			allTrainCaseList.add("225");
+			allTrainCaseList.add("226");
+			allTrainCaseList.add("227");
+			allTrainCaseList.add("228");
+			allTrainCaseList.add("229");
+			allTrainCaseList.add("23");
+			allTrainCaseList.add("230");
+			allTrainCaseList.add("231");
+			allTrainCaseList.add("232");
+			allTrainCaseList.add("234");
+			allTrainCaseList.add("235");
+			allTrainCaseList.add("236");
+			allTrainCaseList.add("237");
+			allTrainCaseList.add("238");
+			allTrainCaseList.add("239");
+			allTrainCaseList.add("24");
+			allTrainCaseList.add("240");
+			allTrainCaseList.add("241");
+			allTrainCaseList.add("242");
+			allTrainCaseList.add("243");
+			allTrainCaseList.add("244");
+			allTrainCaseList.add("245");
+			allTrainCaseList.add("246");
+			allTrainCaseList.add("247");
+			allTrainCaseList.add("248");
+			allTrainCaseList.add("249");
+			allTrainCaseList.add("250");
+			allTrainCaseList.add("251");
+			allTrainCaseList.add("252");
+			allTrainCaseList.add("253");
+			allTrainCaseList.add("254");
+			allTrainCaseList.add("255");
+			allTrainCaseList.add("256");
+			allTrainCaseList.add("257");
+			allTrainCaseList.add("258");
+			allTrainCaseList.add("259");
+			allTrainCaseList.add("26");
+			allTrainCaseList.add("260");
+			allTrainCaseList.add("261");
+			allTrainCaseList.add("262");
+			allTrainCaseList.add("264");
+			allTrainCaseList.add("265");
+			allTrainCaseList.add("266");
+			allTrainCaseList.add("267");
+			allTrainCaseList.add("269");
+			allTrainCaseList.add("27");
+			allTrainCaseList.add("270");
+			allTrainCaseList.add("271");
+			allTrainCaseList.add("272");
+			allTrainCaseList.add("273");
+			allTrainCaseList.add("274");
+			allTrainCaseList.add("275");
+			allTrainCaseList.add("276");
+			allTrainCaseList.add("277");
+			allTrainCaseList.add("278");
+			allTrainCaseList.add("279");
+			allTrainCaseList.add("28");
+			allTrainCaseList.add("280");
+			allTrainCaseList.add("281");
+			allTrainCaseList.add("282");
+			allTrainCaseList.add("283");
+			allTrainCaseList.add("284");
+			allTrainCaseList.add("285");
+			allTrainCaseList.add("286");
+			allTrainCaseList.add("287");
+			allTrainCaseList.add("288");
+			allTrainCaseList.add("289");
+			allTrainCaseList.add("29");
+			allTrainCaseList.add("290");
+			allTrainCaseList.add("291");
+			allTrainCaseList.add("292");
+			allTrainCaseList.add("293");
+			allTrainCaseList.add("294");
+			allTrainCaseList.add("295");
+			allTrainCaseList.add("296");
+			allTrainCaseList.add("297");
+			allTrainCaseList.add("299");
+			allTrainCaseList.add("3");
+			allTrainCaseList.add("30");
+			allTrainCaseList.add("300");
+			allTrainCaseList.add("301");
+			allTrainCaseList.add("302");
+			allTrainCaseList.add("303");
+			allTrainCaseList.add("304");
+			allTrainCaseList.add("305");
+			allTrainCaseList.add("306");
+			allTrainCaseList.add("307");
+			allTrainCaseList.add("308");
+			allTrainCaseList.add("309");
+			allTrainCaseList.add("31");
+			allTrainCaseList.add("310");
+			allTrainCaseList.add("311");
+			allTrainCaseList.add("312");
+			allTrainCaseList.add("313");
+			allTrainCaseList.add("314");
+			allTrainCaseList.add("315");
+			allTrainCaseList.add("316");
+			allTrainCaseList.add("317");
+			allTrainCaseList.add("318");
+			allTrainCaseList.add("32");
+			allTrainCaseList.add("320");
+			allTrainCaseList.add("321");
+			allTrainCaseList.add("322");
+			allTrainCaseList.add("323");
+			allTrainCaseList.add("324");
+			allTrainCaseList.add("325");
+			allTrainCaseList.add("326");
+			allTrainCaseList.add("327");
+			allTrainCaseList.add("329");
+			allTrainCaseList.add("33");
+			allTrainCaseList.add("330");
+			allTrainCaseList.add("331");
+			allTrainCaseList.add("332");
+			allTrainCaseList.add("333");
+			allTrainCaseList.add("334");
+			allTrainCaseList.add("335");
+			allTrainCaseList.add("336");
+			allTrainCaseList.add("337");
+			allTrainCaseList.add("338");
+			allTrainCaseList.add("339");
+			allTrainCaseList.add("34");
+			allTrainCaseList.add("340");
+			allTrainCaseList.add("341");
+			allTrainCaseList.add("342");
+			allTrainCaseList.add("343");
+			allTrainCaseList.add("344");
+			allTrainCaseList.add("345");
+			allTrainCaseList.add("346");
+			allTrainCaseList.add("347");
+			allTrainCaseList.add("348");
+			allTrainCaseList.add("349");
+			allTrainCaseList.add("350");
+			allTrainCaseList.add("351");
+			allTrainCaseList.add("352");
+			allTrainCaseList.add("354");
+			allTrainCaseList.add("355");
+			allTrainCaseList.add("356");
+			allTrainCaseList.add("357");
+			allTrainCaseList.add("358");
+			allTrainCaseList.add("359");
+			allTrainCaseList.add("36");
+			allTrainCaseList.add("360");
+			allTrainCaseList.add("361");
+			allTrainCaseList.add("362");
+			allTrainCaseList.add("363");
+			allTrainCaseList.add("364");
+			allTrainCaseList.add("366");
+			allTrainCaseList.add("367");
+			allTrainCaseList.add("368");
+			allTrainCaseList.add("369");
+			allTrainCaseList.add("37");
+			allTrainCaseList.add("370");
+			allTrainCaseList.add("372");
+			allTrainCaseList.add("373");
+			allTrainCaseList.add("374");
+			allTrainCaseList.add("375");
+			allTrainCaseList.add("376");
+			allTrainCaseList.add("378");
+			allTrainCaseList.add("379");
+			allTrainCaseList.add("38");
+			allTrainCaseList.add("380");
+			allTrainCaseList.add("381");
+			allTrainCaseList.add("382");
+			allTrainCaseList.add("383");
+			allTrainCaseList.add("384");
+			allTrainCaseList.add("385");
+			allTrainCaseList.add("386");
+			allTrainCaseList.add("387");
+			allTrainCaseList.add("388");
+			allTrainCaseList.add("389");
+			allTrainCaseList.add("39");
+			allTrainCaseList.add("390");
+			allTrainCaseList.add("391");
+			allTrainCaseList.add("392");
+			allTrainCaseList.add("393");
+			allTrainCaseList.add("394");
+			allTrainCaseList.add("395");
+			allTrainCaseList.add("396");
+			allTrainCaseList.add("397");
+			allTrainCaseList.add("398");
+			allTrainCaseList.add("399");
+			allTrainCaseList.add("4");
+			allTrainCaseList.add("40");
+			allTrainCaseList.add("400");
+			allTrainCaseList.add("401");
+			allTrainCaseList.add("402");
+			allTrainCaseList.add("403");
+			allTrainCaseList.add("404");
+			allTrainCaseList.add("405");
+			allTrainCaseList.add("407");
+			allTrainCaseList.add("408");
+			allTrainCaseList.add("409");
+			allTrainCaseList.add("411");
+			allTrainCaseList.add("412");
+			allTrainCaseList.add("414");
+			allTrainCaseList.add("415");
+			allTrainCaseList.add("416");
+			allTrainCaseList.add("417");
+			allTrainCaseList.add("418");
+			allTrainCaseList.add("419");
+			allTrainCaseList.add("42");
+			allTrainCaseList.add("421");
+			allTrainCaseList.add("43");
+			allTrainCaseList.add("434");
+			allTrainCaseList.add("44");
+			allTrainCaseList.add("45");
+			allTrainCaseList.add("452");
+			allTrainCaseList.add("46");
+			allTrainCaseList.add("464");
+			allTrainCaseList.add("468");
+			allTrainCaseList.add("47");
+			allTrainCaseList.add("48");
+			allTrainCaseList.add("485");
+			allTrainCaseList.add("49");
+			allTrainCaseList.add("497");
+			allTrainCaseList.add("5");
+			allTrainCaseList.add("50");
+			allTrainCaseList.add("51");
+			allTrainCaseList.add("52");
+			allTrainCaseList.add("53");
+			allTrainCaseList.add("54");
+			allTrainCaseList.add("55");
+			allTrainCaseList.add("57");
+			allTrainCaseList.add("58");
+			allTrainCaseList.add("59");
+			allTrainCaseList.add("6");
+			allTrainCaseList.add("60");
+			allTrainCaseList.add("61");
+			allTrainCaseList.add("62");
+			allTrainCaseList.add("63");
+			allTrainCaseList.add("64");
+			allTrainCaseList.add("640");
+			allTrainCaseList.add("641");
+			allTrainCaseList.add("642");
+			allTrainCaseList.add("643");
+			allTrainCaseList.add("644");
+			allTrainCaseList.add("645");
+			allTrainCaseList.add("646");
+			allTrainCaseList.add("647");
+			allTrainCaseList.add("648");
+			allTrainCaseList.add("649");
+			allTrainCaseList.add("65");
+			allTrainCaseList.add("650");
+			allTrainCaseList.add("651");
+			allTrainCaseList.add("652");
+			allTrainCaseList.add("653");
+			allTrainCaseList.add("654");
+			allTrainCaseList.add("655");
+			allTrainCaseList.add("656");
+			allTrainCaseList.add("657");
+			allTrainCaseList.add("658");
+			allTrainCaseList.add("659");
+			allTrainCaseList.add("66");
+			allTrainCaseList.add("660");
+			allTrainCaseList.add("661");
+			allTrainCaseList.add("662");
+			allTrainCaseList.add("663");
+			allTrainCaseList.add("664");
+			allTrainCaseList.add("665");
+			allTrainCaseList.add("666");
+			allTrainCaseList.add("667");
+			allTrainCaseList.add("668");
+			allTrainCaseList.add("669");
+			allTrainCaseList.add("67");
+			allTrainCaseList.add("670");
+			allTrainCaseList.add("671");
+			allTrainCaseList.add("672");
+			allTrainCaseList.add("673");
+			allTrainCaseList.add("674");
+			allTrainCaseList.add("675");
+			allTrainCaseList.add("676");
+			allTrainCaseList.add("677");
+			allTrainCaseList.add("678");
+			allTrainCaseList.add("679");
+			allTrainCaseList.add("68");
+			allTrainCaseList.add("680");
+			allTrainCaseList.add("681");
+			allTrainCaseList.add("682");
+			allTrainCaseList.add("683");
+			allTrainCaseList.add("684");
+			allTrainCaseList.add("685");
+			allTrainCaseList.add("686");
+			allTrainCaseList.add("687");
+			allTrainCaseList.add("688");
+			allTrainCaseList.add("689");
+			allTrainCaseList.add("69");
+			allTrainCaseList.add("690");
+			allTrainCaseList.add("691");
+			allTrainCaseList.add("692");
+			allTrainCaseList.add("693");
+			allTrainCaseList.add("694");
+			allTrainCaseList.add("695");
+			allTrainCaseList.add("696");
+			allTrainCaseList.add("697");
+			allTrainCaseList.add("698");
+			allTrainCaseList.add("699");
+			allTrainCaseList.add("7");
+			allTrainCaseList.add("70");
+			allTrainCaseList.add("700");
+			allTrainCaseList.add("701");
+			allTrainCaseList.add("702");
+			allTrainCaseList.add("703");
+			allTrainCaseList.add("704");
+			allTrainCaseList.add("705");
+			allTrainCaseList.add("707");
+			allTrainCaseList.add("708");
+			allTrainCaseList.add("709");
+			allTrainCaseList.add("71");
+			allTrainCaseList.add("710");
+			allTrainCaseList.add("711");
+			allTrainCaseList.add("712");
+			allTrainCaseList.add("713");
+			allTrainCaseList.add("714");
+			allTrainCaseList.add("715");
+			allTrainCaseList.add("716");
+			allTrainCaseList.add("717");
+			allTrainCaseList.add("718");
+			allTrainCaseList.add("719");
+			allTrainCaseList.add("72");
+			allTrainCaseList.add("720");
+			allTrainCaseList.add("721");
+			allTrainCaseList.add("722");
+			allTrainCaseList.add("723");
+			allTrainCaseList.add("724");
+			allTrainCaseList.add("725");
+			allTrainCaseList.add("726");
+			allTrainCaseList.add("727");
+			allTrainCaseList.add("728");
+			allTrainCaseList.add("729");
+			allTrainCaseList.add("73");
+			allTrainCaseList.add("730");
+			allTrainCaseList.add("731");
+			allTrainCaseList.add("732");
+			allTrainCaseList.add("733");
+			allTrainCaseList.add("734");
+			allTrainCaseList.add("735");
+			allTrainCaseList.add("736");
+			allTrainCaseList.add("737");
+			allTrainCaseList.add("738");
+			allTrainCaseList.add("739");
+			allTrainCaseList.add("74");
+			allTrainCaseList.add("740");
+			allTrainCaseList.add("741");
+			allTrainCaseList.add("742");
+			allTrainCaseList.add("743");
+			allTrainCaseList.add("744");
+			allTrainCaseList.add("745");
+			allTrainCaseList.add("746");
+			allTrainCaseList.add("747");
+			allTrainCaseList.add("748");
+			allTrainCaseList.add("749");
+			allTrainCaseList.add("75");
+			allTrainCaseList.add("750");
+			allTrainCaseList.add("751");
+			allTrainCaseList.add("752");
+			allTrainCaseList.add("753");
+			allTrainCaseList.add("754");
+			allTrainCaseList.add("755");
+			allTrainCaseList.add("756");
+			allTrainCaseList.add("757");
+			allTrainCaseList.add("758");
+			allTrainCaseList.add("759");
+			allTrainCaseList.add("76");
+			allTrainCaseList.add("760");
+			allTrainCaseList.add("761");
+			allTrainCaseList.add("762");
+			allTrainCaseList.add("763");
+			allTrainCaseList.add("764");
+			allTrainCaseList.add("765");
+			allTrainCaseList.add("766");
+			allTrainCaseList.add("767");
+			allTrainCaseList.add("768");
+			allTrainCaseList.add("769");
+			allTrainCaseList.add("770");
+			allTrainCaseList.add("771");
+			allTrainCaseList.add("772");
+			allTrainCaseList.add("773");
+			allTrainCaseList.add("774");
+			allTrainCaseList.add("775");
+			allTrainCaseList.add("776");
+			allTrainCaseList.add("777");
+			allTrainCaseList.add("778");
+			allTrainCaseList.add("779");
+			allTrainCaseList.add("78");
+			allTrainCaseList.add("780");
+			allTrainCaseList.add("781");
+			allTrainCaseList.add("782");
+			allTrainCaseList.add("783");
+			allTrainCaseList.add("784");
+			allTrainCaseList.add("785");
+			allTrainCaseList.add("786");
+			allTrainCaseList.add("787");
+			allTrainCaseList.add("788");
+			allTrainCaseList.add("789");
+			allTrainCaseList.add("79");
+			allTrainCaseList.add("790");
+			allTrainCaseList.add("791");
+			allTrainCaseList.add("792");
+			allTrainCaseList.add("793");
+			allTrainCaseList.add("794");
+			allTrainCaseList.add("795");
+			allTrainCaseList.add("796");
+			allTrainCaseList.add("797");
+			allTrainCaseList.add("798");
+			allTrainCaseList.add("799");
+			allTrainCaseList.add("8");
+			allTrainCaseList.add("80");
+			allTrainCaseList.add("800");
+			allTrainCaseList.add("801");
+			allTrainCaseList.add("802");
+			allTrainCaseList.add("803");
+			allTrainCaseList.add("804");
+			allTrainCaseList.add("805");
+			allTrainCaseList.add("806");
+			allTrainCaseList.add("807");
+			allTrainCaseList.add("808");
+			allTrainCaseList.add("809");
+			allTrainCaseList.add("81");
+			allTrainCaseList.add("810");
+			allTrainCaseList.add("811");
+			allTrainCaseList.add("812");
+			allTrainCaseList.add("813");
+			allTrainCaseList.add("814");
+			allTrainCaseList.add("815");
+			allTrainCaseList.add("816");
+			allTrainCaseList.add("817");
+			allTrainCaseList.add("818");
+			allTrainCaseList.add("819");
+			allTrainCaseList.add("82");
+			allTrainCaseList.add("820");
+			allTrainCaseList.add("821");
+			allTrainCaseList.add("822");
+			allTrainCaseList.add("823");
+			allTrainCaseList.add("824");
+			allTrainCaseList.add("825");
+			allTrainCaseList.add("826");
+			allTrainCaseList.add("827");
+			allTrainCaseList.add("828");
+			allTrainCaseList.add("829");
+			allTrainCaseList.add("83");
+			allTrainCaseList.add("830");
+			allTrainCaseList.add("831");
+			allTrainCaseList.add("832");
+			allTrainCaseList.add("833");
+			allTrainCaseList.add("834");
+			allTrainCaseList.add("835");
+			allTrainCaseList.add("836");
+			allTrainCaseList.add("837");
+			allTrainCaseList.add("838");
+			allTrainCaseList.add("839");
+			allTrainCaseList.add("84");
+			allTrainCaseList.add("840");
+			allTrainCaseList.add("841");
+			allTrainCaseList.add("842");
+			allTrainCaseList.add("843");
+			allTrainCaseList.add("844");
+			allTrainCaseList.add("845");
+			allTrainCaseList.add("846");
+			allTrainCaseList.add("847");
+			allTrainCaseList.add("848");
+			allTrainCaseList.add("849");
+			allTrainCaseList.add("85");
+			allTrainCaseList.add("850");
+			allTrainCaseList.add("851");
+			allTrainCaseList.add("852");
+			allTrainCaseList.add("853");
+			allTrainCaseList.add("854");
+			allTrainCaseList.add("855");
+			allTrainCaseList.add("856");
+			allTrainCaseList.add("857");
+			allTrainCaseList.add("858");
+			allTrainCaseList.add("859");
+			allTrainCaseList.add("86");
+			allTrainCaseList.add("860");
+			allTrainCaseList.add("861");
+			allTrainCaseList.add("862");
+			allTrainCaseList.add("863");
+			allTrainCaseList.add("864");
+			allTrainCaseList.add("865");
+			allTrainCaseList.add("866");
+			allTrainCaseList.add("867");
+			allTrainCaseList.add("868");
+			allTrainCaseList.add("869");
+			allTrainCaseList.add("87");
+			allTrainCaseList.add("870");
+			allTrainCaseList.add("871");
+			allTrainCaseList.add("872");
+			allTrainCaseList.add("873");
+			allTrainCaseList.add("874");
+			allTrainCaseList.add("875");
+			allTrainCaseList.add("876");
+			allTrainCaseList.add("877");
+			allTrainCaseList.add("878");
+			allTrainCaseList.add("879");
+			allTrainCaseList.add("88");
+			allTrainCaseList.add("880");
+			allTrainCaseList.add("881");
+			allTrainCaseList.add("882");
+			allTrainCaseList.add("883");
+			allTrainCaseList.add("884");
+			allTrainCaseList.add("885");
+			allTrainCaseList.add("886");
+			allTrainCaseList.add("887");
+			allTrainCaseList.add("888");
+			allTrainCaseList.add("889");
+			allTrainCaseList.add("89");
+			allTrainCaseList.add("890");
+			allTrainCaseList.add("891");
+			allTrainCaseList.add("892");
+			allTrainCaseList.add("893");
+			allTrainCaseList.add("894");
+			allTrainCaseList.add("895");
+			allTrainCaseList.add("896");
+			allTrainCaseList.add("897");
+			allTrainCaseList.add("898");
+			allTrainCaseList.add("899");
+			allTrainCaseList.add("9");
+			allTrainCaseList.add("90");
+			allTrainCaseList.add("900");
+			allTrainCaseList.add("901");
+			allTrainCaseList.add("902");
+			allTrainCaseList.add("903");
+			allTrainCaseList.add("905");
+			allTrainCaseList.add("906");
+			allTrainCaseList.add("907");
+			allTrainCaseList.add("908");
+			allTrainCaseList.add("909");
+			allTrainCaseList.add("91");
+			allTrainCaseList.add("910");
+			allTrainCaseList.add("911");
+			allTrainCaseList.add("912");
+			allTrainCaseList.add("913");
+			allTrainCaseList.add("914");
+			allTrainCaseList.add("915");
+			allTrainCaseList.add("916");
+			allTrainCaseList.add("917");
+			allTrainCaseList.add("918");
+			allTrainCaseList.add("919");
+			allTrainCaseList.add("92");
+			allTrainCaseList.add("920");
+			allTrainCaseList.add("921");
+			allTrainCaseList.add("922");
+			allTrainCaseList.add("93");
+			allTrainCaseList.add("94");
+			allTrainCaseList.add("95");
+			allTrainCaseList.add("96");
+			allTrainCaseList.add("98");
+			allTrainCaseList.add("99");
+		}
+		
+		return allTrainCaseList;
+	}
+	
+	/**
+	 * Reports whether the given case id is present in the collection
+	 * returned by {@link #getAllTrainCaseList()}.
+	 *
+	 * @param id case identifier to look up
+	 * @return true when the collection contains {@code id}, false otherwise
+	 */
+	private boolean isTrainCase(String id)
+	{
+		// contains() already yields the boolean answer; no branching needed.
+		return this.getAllTrainCaseList().contains(id);
+	}
+	/**
+	 * Command-line entry point: builds an {@code XmlToTextI2B2} from the two
+	 * arguments and runs it. When the argument count is not exactly two, the
+	 * usage line is printed and nothing else happens.
+	 *
+	 * @param args exactly two values, handed to the constructor in order
+	 *             (the usage text names them inDir and outDir)
+	 * @throws Exception whatever the constructor or {@code run()} throws
+	 */
+	public static void main(String[] args) throws Exception
+	{
+		if (args.length!=2)
+		{
+			System.out.println("USAGE:\t\t XmlToText inDir outDir");
+			// Stop here: falling through would index args[0]/args[1] and fail with
+			// ArrayIndexOutOfBoundsException when fewer than two arguments are given.
+			return;
+		}
+
+		XmlToTextI2B2 runner = new XmlToTextI2B2(args[0], args[1]);
+		runner.run();
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+
+/**
+ * Bean for a single annotation entry, aliased to "annotation" for XStream.
+ * It carries a mention, the annotator, a span and the spanned text; every
+ * property is a plain read/write accessor pair with no validation.
+ */
+@XStreamAlias("annotation")
+public class Annotation
+{
+	private Mention    mention;
+	private Annotator  annotator;
+	private Span       span;
+	private String     spannedText;
+
+	/** @return the mention, or null if none has been set */
+	public Mention getMention()
+	{
+		return this.mention;
+	}
+
+	/** @param mention the mention to store */
+	public void setMention(Mention mention)
+	{
+		this.mention = mention;
+	}
+
+	/** @return the annotator, or null if none has been set */
+	public Annotator getAnnotator()
+	{
+		return this.annotator;
+	}
+
+	/** @param annotator the annotator to store */
+	public void setAnnotator(Annotator annotator)
+	{
+		this.annotator = annotator;
+	}
+
+	/** @return the span, or null if none has been set */
+	public Span getSpan()
+	{
+		return this.span;
+	}
+
+	/** @param span the span to store */
+	public void setSpan(Span span)
+	{
+		this.span = span;
+	}
+
+	/** @return the spanned text, or null if none has been set */
+	public String getSpannedText()
+	{
+		return this.spannedText;
+	}
+
+	/** @param spannedText the spanned text to store */
+	public void setSpannedText(String spannedText)
+	{
+		this.spannedText = spannedText;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+import com.thoughtworks.xstream.annotations.XStreamImplicit;
+
+/**
+ * Root bean aliased to "annotations" for XStream. It holds a textSource
+ * attribute plus two implicit collections whose items are named
+ * "annotation" and "classMention".
+ */
+@XStreamAlias("annotations")
+public class Annotations
+{
+	@XStreamAlias("textSource")
+	@XStreamAsAttribute
+	private String textSource;
+	
+	@XStreamImplicit(itemFieldName="annotation")
+	private List<Annotation> annotList = new ArrayList<Annotation>();
+	
+	@XStreamImplicit(itemFieldName="classMention")
+	private List<ClassMention> cmList = new ArrayList<ClassMention>();
+
+	/** @return the text source, or null if none has been set */
+	public String getTextSource()
+	{
+		return this.textSource;
+	}
+
+	/** @param textSource the text source to store */
+	public void setTextSource(String textSource)
+	{
+		this.textSource = textSource;
+	}
+
+	/** @return the annotation list held by this bean (the live list, not a copy) */
+	public List<Annotation> getAnnotList()
+	{
+		return this.annotList;
+	}
+
+	/** @param annotList list that replaces the current annotation list */
+	public void setAnnotList(List<Annotation> annotList)
+	{
+		this.annotList = annotList;
+	}
+
+	/** @return the class-mention list held by this bean (the live list, not a copy) */
+	public List<ClassMention> getCmList()
+	{
+		return this.cmList;
+	}
+
+	/** @param cmList list that replaces the current class-mention list */
+	public void setCmList(List<ClassMention> cmList)
+	{
+		this.cmList = cmList;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+import com.thoughtworks.xstream.annotations.XStreamConverter;
+import com.thoughtworks.xstream.converters.extended.ToAttributedValueConverter;
+
+/**
+ * Bean aliased to "annotator" for XStream, carrying an id and a name.
+ * The id field is flagged as an attribute; the class-level converter is
+ * configured with the "name" field.
+ */
+@XStreamAlias("annotator")
+@XStreamConverter(value=ToAttributedValueConverter.class, strings={"name"})
+public class Annotator
+{
+	@XStreamAlias("id")
+	@XStreamAsAttribute
+	private String id;
+	
+	private String name;
+
+	/** Creates an annotator with both id and name left null. */
+	public Annotator()
+	{
+	}
+
+	/**
+	 * Creates an annotator with the given id and name.
+	 *
+	 * @param id   annotator identifier
+	 * @param name annotator name
+	 */
+	public Annotator(String id, String name)
+	{
+		// Assign the fields directly instead of going through the public
+		// setters: this class is not final, so a subclass override would
+		// otherwise run before the subclass is fully constructed.
+		this.id = id;
+		this.name = name;
+	}
+
+	/** @return the annotator id, or null if none has been set */
+	public String getId()
+	{
+		return id;
+	}
+
+	/** @param id the annotator id to store */
+	public void setId(String id)
+	{
+		this.id = id;
+	}
+
+	/** @return the annotator name, or null if none has been set */
+	public String getName()
+	{
+		return name;
+	}
+
+	/** @param name the annotator name to store */
+	public void setName(String name)
+	{
+		this.name = name;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain