You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:23:07 UTC
svn commit: r1500511 [2/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./
main/ main/java/ main/java/org/ main/java/org/apache/
main/java/org/apache/uima/ main/java/org/apache/uima/examples/
main/java/org/spin/ main/java/org/spin/scrubber/ main/java...
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,211 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+/**
+ * Class for extracting different types of PHI tags out of annotated i2b2 smoking data.
+ * <p>
+ * Every child element of a TEXT node is first stamped with "start" and "end" attributes
+ * holding its computed character offsets. The PHI elements are then split on single spaces
+ * and each resulting token is handed to {@link HumanAnnotationsDAO#insert}.
+ * <p>
+ * Writes to the database
+ * tables: human_annotations
+ *
+ * @author britt fitch bf19
+ *
+ */
+public class HumanAnnotationsExtractorI2B2 implements HumanAnnotationsExtractor
+{
+    private static Logger log = Logger.getLogger(HumanAnnotationsExtractorI2B2.class);
+
+    /** Matches one line break; a CRLF pair counts once. Compiled once instead of once per node. */
+    private static final Pattern LINE_BREAK = Pattern.compile("\\r\\n|\\r|\\n");
+
+    protected String tableSuffix;
+    private File dirInputHumanAnnotations;
+
+    /**
+     * @param dirInputHumanAnnotations path of the directory holding the annotated XML files
+     * @param tableSuffix suffix handed to {@link HumanAnnotationsDAO}
+     */
+    public HumanAnnotationsExtractorI2B2(String dirInputHumanAnnotations, String tableSuffix)
+    {
+        this(new File(dirInputHumanAnnotations), tableSuffix);
+    }
+
+    /**
+     * @param dirInputHumanAnnotations directory holding the annotated XML files
+     * @param tableSuffix suffix handed to {@link HumanAnnotationsDAO}
+     */
+    public HumanAnnotationsExtractorI2B2(File dirInputHumanAnnotations, String tableSuffix)
+    {
+        this.dirInputHumanAnnotations = dirInputHumanAnnotations;
+        this.tableSuffix = tableSuffix;
+
+        log.info("Starting Human Annotations Extractor (I2B2) @ "+ dirInputHumanAnnotations.getAbsolutePath());
+    }
+
+    /**
+     * Command line entry point; prints a usage message unless exactly two arguments are given.
+     *
+     * @param args input_directory and table_suffix
+     * @throws Exception
+     */
+    public static void main(String[] args) throws Exception
+    {
+        if(args.length!=2)
+        {
+            System.out.println("USAGE:\t\t HumanAnnotationsExtractorI2B2 input_directory table_suffix");
+            System.out.println("EXAMPLE:\t HumanAnnotationsExtractorI2B2 ../data/ _test");
+        }
+        else
+        {
+            HumanAnnotationsExtractorI2B2 runner= new HumanAnnotationsExtractorI2B2(args[0], args[1]);
+            runner.parseHumanAnnotations();
+        }
+    }
+
+    /**
+     * Parse XML such that the "real" absolute character positions can be obtained from the input XML.
+     * <p>
+     * Sub-directories of the input directory are skipped. Any exception is logged, not
+     * propagated, and it ends the run: files after the failing one are not processed.
+     */
+    public void parseHumanAnnotations()
+    {
+        HumanAnnotationsDAO dao = null;
+
+        try
+        {
+            dao = new HumanAnnotationsDAO(tableSuffix);
+
+            log.debug("Input path "+dirInputHumanAnnotations.getAbsolutePath());
+
+            // listFiles() returns null when the path is missing or is not a directory.
+            // (Creating a regular file in its place, as was done before, only led to an NPE below.)
+            File[] files = dirInputHumanAnnotations.listFiles();
+
+            if (files == null)
+            {
+                log.warn("Input path is not a readable directory: " + dirInputHumanAnnotations.getAbsolutePath());
+            }
+            else
+            {
+                // factories and compiled expressions do not depend on the file, build them once
+                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+                XPath xpath = XPathFactory.newInstance().newXPath();
+                XPathExpression textExpr = xpath.compile("//TEXT");
+                XPathExpression phiExpr = xpath.compile("//PHI");
+
+                for (File f : files)
+                {
+                    if (f.isDirectory())
+                    {
+                        continue;
+                    }
+
+                    log.debug("Reading: " + f.getName());
+
+                    //read infile
+                    DocumentBuilder builder = factory.newDocumentBuilder();
+                    Document doc = builder.parse(f);
+
+                    //update the tags below TEXT with start & end attribs, then store the PHI tokens
+                    annotateOffsets(doc, textExpr);
+                    insertPhiTokens(doc, phiExpr, dao);
+                }
+            }
+        }
+        catch (Exception e)
+        {
+            log.error("Failed to parse human annotations from i2b2 input", e) ;
+        }
+        finally
+        {
+            // dao is still null when its constructor threw
+            if (dao != null)
+            {
+                dao.close();
+            }
+        }
+    }
+
+    /**
+     * Stamps every child element of each TEXT node with "start" and "end" attributes.
+     * The running offset restarts at 0 for every TEXT node.
+     *
+     * @param doc parsed input document, modified in place
+     * @param textExpr compiled "//TEXT" expression
+     */
+    private void annotateOffsets(Document doc, XPathExpression textExpr) throws Exception
+    {
+        NodeList textNodes = (NodeList) textExpr.evaluate(doc, XPathConstants.NODESET);
+        for (int t=0; t<textNodes.getLength(); t++)
+        {
+            int offset = 0;
+            NodeList children = textNodes.item(t).getChildNodes();
+
+            for (int c=0; c<children.getLength(); c++)
+            {
+                Node child = children.item(c);
+
+                if (child.getNodeType()==Node.ELEMENT_NODE)
+                {
+                    String content = child.getTextContent();
+
+                    // every line break inside the tag pushes its start one character further
+                    offset += countLineBreaks(content);
+                    ((Element)child).setAttribute("start", Integer.toString(offset));
+
+                    offset += content.length();
+                    ((Element)child).setAttribute("end", Integer.toString(offset));
+                }
+                else if (child.getNodeType()==Node.TEXT_NODE)
+                {
+                    // Line breaks in plain text are deliberately NOT added on top of the length:
+                    // adding them made case 1.txt correct and all other cases wrong.
+                    offset += child.getTextContent().length();
+                }
+            }
+        }
+    }
+
+    /**
+     * Stores every PHI element of the document, one row per space-separated token.
+     * The text is lower-cased first; empty tokens and a lone "," are skipped.
+     * All tokens of one PHI element share that element's start and end offsets.
+     *
+     * @param doc document already processed by {@link #annotateOffsets}
+     * @param phiExpr compiled "//PHI" expression
+     * @param dao destination for the tokens
+     */
+    private void insertPhiTokens(Document doc, XPathExpression phiExpr, HumanAnnotationsDAO dao) throws Exception
+    {
+        NodeList phiNodes = (NodeList) phiExpr.evaluate(doc, XPathConstants.NODESET);
+
+        //for each PHI node in a file
+        for (int i=0; i<phiNodes.getLength(); i++)
+        {
+            Node n = phiNodes.item(i);
+
+            // the record id is read from the grandparent of the PHI element
+            String id = n.getParentNode().getParentNode().getAttributes().getNamedItem("ID").getNodeValue()+".txt";
+            String type = n.getAttributes().getNamedItem("TYPE").getNodeValue();
+            int startIdx = Integer.parseInt(n.getAttributes().getNamedItem("start").getNodeValue());
+            int endIdx = Integer.parseInt(n.getAttributes().getNamedItem("end").getNodeValue());
+            String phi = n.getTextContent().toLowerCase();
+
+            for (String token : phi.split(" "))
+            {
+                token = token.trim();
+                if (token.length()>0 && !token.equals(","))
+                {
+                    dao.insert(id, type, token, startIdx, endIdx);
+                }
+            }
+        }
+    }
+
+    /**
+     * @param text text to scan
+     * @return number of line breaks in text, counting a CRLF pair once
+     */
+    private static int countLineBreaks(String text)
+    {
+        Matcher m = LINE_BREAK.matcher(text);
+        int count = 0;
+        while (m.find())
+        {
+            count++;
+        }
+        return count;
+    }
+
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorI2B2.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,279 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * class for extracting different types of PHI tags out of protege/knowtator annotated data.
+ * <p>
+ * For every input file the "annotation" elements (mention id, span, spanned text) are joined
+ * with the "classMention" elements through the mention id, and each space-separated token of
+ * the spanned text is handed to {@link HumanAnnotationsDAO#insert}.
+ * <p>
+ * Writes to the database
+ * tables:
+ * human_annotations_test
+ * human_annotations_train
+ *
+ * @author britt fitch bf19
+ *
+ */
+public class HumanAnnotationsExtractorProtege implements HumanAnnotationsExtractor
+{
+    private static Logger log = Logger.getLogger(HumanAnnotationsExtractorProtege.class);
+    protected String tableSuffix;
+    private File dirInputHumanAnnotations;
+
+    /**
+     * @param dirInputHumanAnnotations path of the directory holding the annotation XML files
+     * @param tableSuffix suffix handed to {@link HumanAnnotationsDAO}
+     */
+    public HumanAnnotationsExtractorProtege(String dirInputHumanAnnotations, String tableSuffix)
+    {
+        this(new File(dirInputHumanAnnotations), tableSuffix);
+    }
+
+    /**
+     * @param dirInputHumanAnnotations directory holding the annotation XML files
+     * @param tableSuffix suffix handed to {@link HumanAnnotationsDAO}
+     */
+    public HumanAnnotationsExtractorProtege(File dirInputHumanAnnotations, String tableSuffix)
+    {
+        this.dirInputHumanAnnotations = dirInputHumanAnnotations;
+        this.tableSuffix = tableSuffix;
+
+        log.info("Starting Human Annotations Extractor (Protege) @ "+ dirInputHumanAnnotations.getAbsolutePath());
+    }
+
+    /**
+     * Command line entry point; prints a usage message unless exactly two arguments are given.
+     *
+     * @param args input_directory and table suffix
+     * @throws Exception
+     */
+    public static void main(String[] args) throws Exception
+    {
+        if(args.length!=2)
+        {
+            System.out.println("USAGE:\t\t HumanAnnotationsExtractorProtege input_directory {_test|_train}");
+        }
+        else
+        {
+            HumanAnnotationsExtractorProtege runner = new HumanAnnotationsExtractorProtege(args[0], args[1]);
+            runner.parseHumanAnnotations();
+        }
+    }
+
+    /**
+     * Reads every file of the input directory (sub-directories are skipped) and stores its
+     * annotations. Any exception is logged, not propagated, and it ends the run: files after
+     * the failing one are not processed.
+     */
+    public void parseHumanAnnotations()
+    {
+        log.info("BEGIN Parsing human annotations.");
+
+        HumanAnnotationsDAO dao = null;
+        try
+        {
+            dao = new HumanAnnotationsDAO(tableSuffix);
+
+            log.debug("Input path "+dirInputHumanAnnotations.getAbsolutePath());
+
+            // listFiles() returns null when the path is missing or is not a directory.
+            // (Creating a regular file in its place, as was done before, only led to an NPE below.)
+            File[] files = dirInputHumanAnnotations.listFiles();
+
+            if(files==null || files.length==0)
+            {
+                log.warn("There were no human annotations in dir: "+ dirInputHumanAnnotations.getAbsolutePath());
+            }
+            else
+            {
+                // factories and compiled expressions do not depend on the file, build them once
+                DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+                XPath xpath = XPathFactory.newInstance().newXPath();
+                XPathExpression annotExpr = xpath.compile("//annotation");
+                XPathExpression classMentionExpr = xpath.compile("//classMention");
+
+                for (File f : files)
+                {
+                    if (f.isDirectory())
+                    {
+                        continue;
+                    }
+
+                    log.debug("Reading: " + f.getName());
+                    parseFile(f, factory, annotExpr, classMentionExpr, dao);
+                }
+            }
+
+            log.info("DONE Parsing human annotations.");
+        }
+        catch (Exception e)
+        {
+            log.error("Could not parse human annotations", e);
+        }
+        finally
+        {
+            // dao is still null when its constructor threw
+            if (dao != null)
+            {
+                dao.close();
+            }
+        }
+    }
+
+    /**
+     * Parses one annotation file and inserts a row for every usable token.
+     * Annotations without spanned text, or whose mention id has no class mention,
+     * are logged and skipped instead of aborting the run.
+     */
+    private void parseFile(File f, DocumentBuilderFactory factory, XPathExpression annotExpr,
+                           XPathExpression classMentionExpr, HumanAnnotationsDAO dao) throws Exception
+    {
+        //read infile
+        DocumentBuilder builder = factory.newDocumentBuilder();
+        Document doc = builder.parse(f);
+
+        List<KnowtatorAnnot> annotationList = readAnnotations(doc, annotExpr);
+        Map<String,String> classMentionMap = readClassMentions(doc, classMentionExpr);
+
+        //INSERT phi
+        for (KnowtatorAnnot a : annotationList)
+        {
+            //check for empty annotations; a null token cannot be split whatever the span says
+            if (a.getToken()==null)
+            {
+                log.warn("Encountered empty annotation for " + a.getMentionId());
+                continue;
+            }
+
+            String mentionClass = classMentionMap.get(a.getMentionId());
+            if (mentionClass==null)
+            {
+                log.warn("Encountered annotation without class mention for " + a.getMentionId());
+                continue;
+            }
+            String type = mentionClass.toUpperCase();
+
+            for (String token : a.getToken().split(" "))
+            {
+                token = token.trim();
+                if (token.length()>0 && !token.equals(","))
+                {
+                    dao.insert(a.getFilenameShort(), type, token, a.getStartIdx(), a.getEndIdx());
+                }
+            }
+        }
+    }
+
+    /**
+     * Collects one {@link KnowtatorAnnot} per "annotation" element. The file name is read from
+     * the "textSource" attribute of the element's parent; mention id, span and spanned text
+     * come from the "mention", "span" and "spannedText" children.
+     */
+    private List<KnowtatorAnnot> readAnnotations(Document doc, XPathExpression annotExpr) throws Exception
+    {
+        List<KnowtatorAnnot> annotationList = new ArrayList<KnowtatorAnnot>();
+        NodeList nodes = (NodeList) annotExpr.evaluate(doc, XPathConstants.NODESET);
+
+        //for each ANNOTATION node in a file
+        for (int i=0; i<nodes.getLength(); i++)
+        {
+            Node n = nodes.item(i);
+            NodeList kids = n.getChildNodes();
+
+            String filename = n.getParentNode().getAttributes().getNamedItem("textSource").getNodeValue();
+
+            KnowtatorAnnot a = new KnowtatorAnnot();
+            a.setFilenameShort(filename);
+
+            for (int k=0; k<kids.getLength(); k++)
+            {
+                Node kid = kids.item(k);
+                String kidName = kid.getNodeName();
+
+                if (kidName.equalsIgnoreCase("mention"))
+                {
+                    a.setMentionId(kid.getAttributes().getNamedItem("id").getNodeValue());
+                }
+                else if (kidName.equalsIgnoreCase("span"))
+                {
+                    a.setStartIdx(Integer.parseInt(kid.getAttributes().getNamedItem("start").getNodeValue()));
+                    a.setEndIdx(Integer.parseInt(kid.getAttributes().getNamedItem("end").getNodeValue()));
+                }
+                else if (kidName.equalsIgnoreCase("spannedText"))
+                {
+                    a.setToken(kid.getTextContent());
+                }
+            }
+            annotationList.add(a);
+        }
+
+        return annotationList;
+    }
+
+    /**
+     * Maps the "id" attribute of every "classMention" element to the text of its first
+     * child element. Looking for the first element (instead of "the node after the first
+     * child") keeps this independent of the whitespace between the tags.
+     */
+    private Map<String,String> readClassMentions(Document doc, XPathExpression classMentionExpr) throws Exception
+    {
+        Map<String,String> classMentionMap = new HashMap<String,String>();
+        NodeList classMentionNodes = (NodeList) classMentionExpr.evaluate(doc, XPathConstants.NODESET);
+
+        //for each CLASSMENTION node in a file
+        for(int i=0; i<classMentionNodes.getLength(); i++)
+        {
+            Node n = classMentionNodes.item(i);
+            String key = n.getAttributes().getNamedItem("id").getNodeValue();
+
+            Node firstElement = n.getFirstChild();
+            while (firstElement!=null && firstElement.getNodeType()!=Node.ELEMENT_NODE)
+            {
+                firstElement = firstElement.getNextSibling();
+            }
+
+            if (firstElement==null)
+            {
+                log.warn("Encountered class mention without child element for " + key);
+                continue;
+            }
+
+            classMentionMap.put(key, firstElement.getTextContent());
+        }
+
+        return classMentionMap;
+    }
+
+
+    /**
+     * Plain value holder for one knowtator annotation. Static: it needs no reference
+     * to the enclosing extractor.
+     */
+    private static class KnowtatorAnnot
+    {
+        private String token;
+        private String filenameShort;
+        private int startIdx;
+        private int endIdx;
+        private String mentionClass;
+        private String mentionId;
+
+        public String getToken()
+        {
+            return token;
+        }
+
+        public void setToken(String token)
+        {
+            this.token = token;
+        }
+
+        public String getFilenameShort()
+        {
+            return filenameShort;
+        }
+        public void setFilenameShort(String filenameShort)
+        {
+            this.filenameShort = filenameShort;
+        }
+        public int getStartIdx()
+        {
+            return startIdx;
+        }
+        public void setStartIdx(int startIdx)
+        {
+            this.startIdx = startIdx;
+        }
+        public int getEndIdx()
+        {
+            return endIdx;
+        }
+        public void setEndIdx(int endIdx)
+        {
+            this.endIdx = endIdx;
+        }
+        public String getMentionClass()
+        {
+            return mentionClass;
+        }
+        public void setMentionClass(String mentionClass)
+        {
+            this.mentionClass = mentionClass;
+        }
+        public String getMentionId()
+        {
+            return mentionId;
+        }
+        public void setMentionId(String mentionId)
+        {
+            this.mentionId = mentionId;
+        }
+    }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/HumanAnnotationsExtractorProtege.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,276 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import weka.classifiers.Classifier;
+import weka.classifiers.Evaluation;
+import weka.classifiers.meta.CostSensitiveClassifier;
+import weka.core.Instances;
+import weka.core.Utils;
+import weka.core.converters.ConverterUtils.DataSource;
+import weka.filters.Filter;
+import weka.filters.unsupervised.attribute.Remove;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * TODO: Serious refactoring needed, only use what we need for the implementation we published. (britt)
+ */
+
+public class WekaClassifier
+{
+ private String testModelFilepath = null;
+ private String trainModelFilepath =null;
+
+ private Remove removeFilter = null;
+ private Classifier classifier = null;
+
+ private String tableSuffix = "_test"; //WekaClassifier only updates "test" tables.
+
+ public WekaClassifier()
+ {
+ this(ScrubberProperties.getFileModelTrainAbsolutePath(), ScrubberProperties.getFileModelTestAbsolutePath());
+ }
+
+ public WekaClassifier(String trainModelFilepath, String testModelFilepath)
+ {
+ this.trainModelFilepath = trainModelFilepath;
+ this.testModelFilepath = testModelFilepath;
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ WekaClassifier wc = new WekaClassifier();
+ wc.test();
+ }
+
+ public void test() throws Exception
+ {
+ //get data
+ DataSource trainSource = new DataSource(getTrainModelFilepath());
+ DataSource testSource = new DataSource(getTestModelFilepath());
+ Instances trainData = trainSource.getDataSet();
+ Instances testData = testSource.getDataSet();
+ Instances orig = new Instances(testData);
+
+ //remove filter
+ trainData = Filter.useFilter(trainData, getRemoveFilter(trainData));
+ testData = Filter.useFilter(testData, getRemoveFilter(trainData));
+
+ //set class index
+ trainData.setClassIndex(trainData.numAttributes()-1);
+ System.out.println("class index: " + trainData.classIndex() +"\t"+ trainData.attribute(trainData.classIndex()));
+
+ testData.setClassIndex(testData.numAttributes()-1);
+ System.out.println("class index: " + testData.classIndex() +"\t"+ testData.attribute(testData.classIndex()));
+
+ //check headers
+ if (!trainData.equalHeaders(testData))
+ {
+ System.out.println();
+ throw new IllegalStateException("Incompatible train and test set!");
+ }
+ else
+ {
+ System.out.println("headers match...");
+ }
+
+ //build classifier
+ System.out.println("building classifier...");
+ Classifier base = getClassifier();
+ base.buildClassifier(trainData);
+ System.out.println(base);
+
+ //evaluate
+ System.out.println("evaluating...");
+ Evaluation eval = new Evaluation(trainData);
+
+ eval.evaluateModel(base, testData);
+ System.out.println(eval.toSummaryString());
+ System.out.println(eval.toClassDetailsString());
+ System.out.println(eval.toMatrixString());
+
+ //output txt results
+ List<String> classifiedAsPHIList = printSummary(base, eval, testData, orig);
+
+ //update db w/ classification
+ recordClassification(classifiedAsPHIList);
+ }
+
+ private void recordClassification(List<String> classifiedAsPHIList) throws Exception
+ {
+ String[] keys;
+ int id;
+ String classifiedAs;
+ FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
+
+ for (String s : classifiedAsPHIList)
+ {
+ keys = s.split("\\|");
+ if (keys.length!=2)
+ {
+ System.out.println("ERROR: unable to record classification, insufficient number of keys for '"+s+"'.");
+ }
+ else
+ {
+ id = Integer.parseInt(keys[0]);
+ classifiedAs = keys[1];
+
+ dao.updateClassification(classifiedAs, id);
+ }
+ }
+ }
+
+ private List<String> printSummary(Classifier base, Evaluation eval, Instances data, Instances orig) throws Exception
+ {
+ //return list of cases classified as PHI
+ List<String> classifiedAsPHIList = new ArrayList<String>();
+
+ // output evaluation
+ System.out.println();
+ System.out.println("=== Setup ===");
+ System.out.println("Classifier: " + getClassifier().getClass().getName() + " " + Utils.joinOptions(base.getOptions()));
+ System.out.println("Dataset: " + data.relationName());
+ System.out.println();
+
+ // output predictions
+ int totalMisclass = 0;
+ int totalPHIClass = 0;
+ int totalNonPHIClass = 0;
+ System.out.println("# -\t actual -\t predicted -\t token");
+ for (int i = 0; i < data.numInstances(); i++)
+ {
+ double pred = base.classifyInstance(data.instance(i));
+ double actual = data.instance(i).classValue();
+ String predString = data.classAttribute().value((int) pred);
+// double[] dist = base.distributionForInstance(data.instance(i));
+
+ //save data for cases classified as PHI
+ //if (pred>0)
+ if(!predString.equalsIgnoreCase("NA"))
+ {
+ totalPHIClass++;
+ classifiedAsPHIList.add(orig.instance(i).stringValue(0)+"|"+predString);
+ }
+ else
+ {
+ totalNonPHIClass++;
+ }
+
+ //output misclassifications
+ if (pred != actual && predString.equalsIgnoreCase("NA"))
+// if (pred != actual && actual>1)
+ {
+ totalMisclass++;
+ System.out.print((i+1));
+ System.out.print(" -\t ");
+ System.out.print(data.instance(i).toString(data.classIndex()));
+ System.out.print(" -\t ");
+ System.out.print(predString);
+ System.out.print(" -\t ");
+ //System.out.print(data.instance(i)); //comment out classified instance.
+ System.out.println(orig.instance(i).stringValue(0)); //show identifying part of the instance base on original instance data.
+ //System.out.print("\t\t\t");
+ System.out.println();
+ }
+ }
+
+ System.out.println("total misclassifications: " + totalMisclass);
+ System.out.println(eval.toSummaryString());
+ System.out.println(eval.toClassDetailsString());
+ System.out.println(eval.toMatrixString());
+
+ System.out.println("total PHI class: " + totalPHIClass);
+ System.out.println("total non-PHI class: " + totalNonPHIClass);
+
+ return classifiedAsPHIList;
+ }
+
+ /*
+ * filters - only initialize once or it causes problems running on test/train sets
+ */
+ private Filter getRemoveFilter(Instances data) throws Exception
+ {
+ if (removeFilter == null)
+ {
+ removeFilter = new Remove();
+ System.out.println("\tExecuting Remove Filter...");
+ String[] options = new String[2];
+ options[0] = "-R";
+ options[1] = "1";
+ removeFilter.setOptions(options);
+ removeFilter.setInputFormat(data);
+ }
+ return removeFilter;
+ }
+
+ public Classifier getClassifier() throws Exception
+ {
+ return getClassifier(ScrubberProperties.getClassificationCostMatrix());
+ }
+
+ public Classifier getClassifier(String classificationCostMatrix) throws Exception
+ {
+ if (classifier==null)
+ {
+ classifier = new CostSensitiveClassifier();
+ String[] options = new String[11];
+ int i=0;
+ options[i++] = "-cost-matrix";
+ options[i++] = classificationCostMatrix;
+ options[i++] = "-S";
+ options[i++] = "1";
+ options[i++] = "-W";
+ options[i++] = "weka.classifiers.trees.J48";
+ options[i++] = "--";
+ options[i++] = "-C";
+ options[i++] = "0.25 ";
+ options[i++] = "-M";
+ options[i++] = "2";
+
+ classifier.setOptions(options);
+ }
+
+ return classifier;
+ }
+
+ public String getTrainModelFilepath()
+ {
+ return trainModelFilepath;
+ }
+
+ public void setTrainModelFilepath(String trainModelFilepath)
+ {
+ this.trainModelFilepath = trainModelFilepath;
+ }
+
+ public String getTestModelFilepath()
+ {
+ return testModelFilepath;
+ }
+
+ public void setTestModelFilepath(String testModelFilepath)
+ {
+ this.testModelFilepath = testModelFilepath;
+ }
+
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaClassifier.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Scanner;
+
+import org.apache.log4j.Logger;
+
+/**
+ *
+ * @author britt fitch
+ *
+ */
+public abstract class WekaDataExtractor
+{
+ private static Logger log = Logger.getLogger(WekaDataExtractor.class);
+
+ private String dirModels = null;
+ private String modelName = null;
+ private String tableSuffix = null;
+
+ public WekaDataExtractor(String dirModels, String modelName)
+ {
+ this.dirModels = dirModels;
+ this.modelName = modelName;
+ }
+
+ public void writeFile(String pathToFile, String content) throws IOException
+ {
+ Writer out = null;
+ try
+ {
+ out = new OutputStreamWriter(new FileOutputStream(pathToFile));
+ out.write(content);
+ }
+ catch (IOException e)
+ {
+ log.error("Unable to write to file: " + pathToFile, e);
+ throw e;
+ }
+ finally
+ {
+ out.close();
+ }
+ }
+
+ public String readFile(String pathToFile) throws FileNotFoundException
+ {
+ StringBuilder text = new StringBuilder();
+ String NL = System.getProperty("line.separator");
+ Scanner scanner = null;
+ try
+ {
+ scanner = new Scanner(new FileInputStream(pathToFile));
+ while (scanner.hasNextLine())
+ {
+ text.append(scanner.nextLine() + NL);
+ }
+ }
+ catch (FileNotFoundException e)
+ {
+ log.error("Unable to read file: " + pathToFile, e);
+ throw e;
+ }
+ finally
+ {
+ scanner.close();
+ }
+
+ return text.toString();
+ }
+
+ public abstract void generateModel() throws Exception;
+
+ /**
+ * Delete old model.
+ * @param pathToFile - file path of the file to be deleted.
+ */
+ protected void deleteModel(String pathToFile)
+ {
+ File model = new File(pathToFile);
+ if(model.exists())
+ {
+ log.info("deleting model: " + pathToFile);
+ model.delete();
+ }
+ }
+
+ public String getDirModels()
+ {
+ return dirModels;
+ }
+
+ public void setDirModels(String dirModels)
+ {
+ this.dirModels = dirModels;
+ }
+
+ public String getModelName()
+ {
+ return modelName;
+ }
+
+ public void setModelName(String modelName)
+ {
+ this.modelName = modelName;
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractor.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+import java.io.File;
+import java.util.List;
+
+/**
+ *
+ * @author britt fitch
+ *
+ */
+public class WekaDataExtractorTest extends WekaDataExtractor
+{
+ private static Logger log = Logger.getLogger(WekaDataExtractorTest.class);
+ private String tableSuffix = "_test";
+
+ public WekaDataExtractorTest()
+ {
+ super(ScrubberProperties.getDirModels(), ScrubberProperties.getFileModelTest());
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ WekaDataExtractor extractor = new WekaDataExtractorTest();
+ extractor.generateModel();
+ }
+
+ public void generateModel() throws Exception
+ {
+ String pathToArff = getDirModels() + File.separator + getModelName();
+
+ //delete old arff
+ deleteModel(pathToArff);
+
+ //get weka header
+ StringBuilder sb = new StringBuilder(readFile(getDirModels() + File.separator + "weka_header.txt")); //TODO: refactor
+
+ //select records for output model
+ List<String> rows = new FeatureMatrixDAO(tableSuffix).selectDataSetTest();
+
+ //clean file according to .sed
+ for (String row : rows)
+ {
+ row = row.replaceAll(",',", ",apos,");
+ row = row.replaceAll(",,,", ",comma,");
+ row = row.replaceAll(",\\.,", ",period,");
+ row = row.replaceAll(",:,", ",colon,");
+ row = row.replaceAll(",\\(,", ",none,");
+ row = row.replaceAll(",\\),", ",none,");
+ row = row.replaceAll(",\\$,", ",none,");
+ sb.append(row);
+ sb.append("\n");
+ }
+
+ writeFile(pathToArff, sb.toString());
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,78 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.classification;
+
+import java.io.File;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+
+/**
+ *
+ * @author britt fitch
+ *
+ */
+public class WekaDataExtractorTrain extends WekaDataExtractor
+{
+ private static Logger log = Logger.getLogger(WekaDataExtractorTrain.class);
+ private String tableSuffix = "_train";
+
+ public WekaDataExtractorTrain()
+ {
+ super(ScrubberProperties.getDirModels(), ScrubberProperties.getFileModelTrain());
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ WekaDataExtractor extractor = new WekaDataExtractorTrain();
+ extractor.generateModel();
+ }
+
+ public void generateModel() throws Exception
+ {
+ String pathToArff = getDirModels() + File.separator + getModelName();
+
+ //delete old arff
+ deleteModel(pathToArff);
+
+ //get weka header
+ StringBuilder sb = new StringBuilder(readFile(getDirModels() + File.separator + "weka_header.txt")); //TODO: refactor
+
+ //select records for output model
+ List<String> rows = new FeatureMatrixDAO(tableSuffix).selectDataSetTrain();
+
+ //clean file according to .sed
+ for (String row : rows)
+ {
+ row = row.replaceAll(",',", ",apos,");
+ row = row.replaceAll(",,,", ",comma,");
+ row = row.replaceAll(",\\.,", ",period,");
+ row = row.replaceAll(",:,", ",colon,");
+ row = row.replaceAll(",\\(,", ",none,");
+ row = row.replaceAll(",\\),", ",none,");
+ row = row.replaceAll(",\\$,", ",none,");
+ sb.append(row);
+ sb.append("\n");
+ }
+
+ writeFile(pathToArff, sb.toString());
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/classification/WekaDataExtractorTrain.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.uima.dao.AnnotationsPubsDAO;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+ @Deprecated
+ public class AnnotationsPubsPosCounter
+ {
+
+     /**
+      * This class was used to generate a distribution of parts of speech across the set of pubs
+      * for comparison with the distribution of part of speech across the cases
+      * and the distribution of PoS for known phi (based on gold standard).
+      * <p>
+      * For every file name returned by {@code selectDistinctFilenameShort()} the
+      * running map is passed through {@code selectDistinctPOS}; each entry of the
+      * resulting map is then stored with {@code insertPubsPOS}.
+      *
+      * @param args not read
+      * @throws Exception anything thrown by {@link AnnotationsPubsDAO}
+      */
+     public static void main(String[] args) throws Exception
+     {
+         Map<String,Integer> pubPosMap = new HashMap<String,Integer>();
+         AnnotationsPubsDAO dao = new AnnotationsPubsDAO();
+         List<String> fileList = dao.selectDistinctFilenameShort();
+
+         //sum pos for pubs
+         for (String filename : fileList)
+         {
+             pubPosMap = dao.selectDistinctPOS(pubPosMap, filename);
+         }
+
+         //insert pos for pubs; walk the entries directly instead of a second lookup per key
+         for (Map.Entry<String,Integer> entry : pubPosMap.entrySet())
+         {
+             int posCnt = entry.getValue();
+             dao.insertPubsPOS(entry.getKey(), posCnt);
+         }
+     }
+
+ }
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/AnnotationsPubsPosCounter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,90 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.spin.scrubber.uima.dao.HumanAnnotationsDAO;
+
+import java.util.List;
+
+ /**
+ * Deprecated one-off updater. The whole implementation below is commented out,
+ * so the class currently declares no members.
+ *
+ * The disabled code read every test case feature from FeatureMatrixDAO, looked up
+ * its PHI type with HumanAnnotationsDAO.selectPHIType(filename_short, startIdx) and,
+ * when a label was found, stored it through updateCaseFeaturePHITypeTest.
+ * The equivalent pass over the train set is disabled a second time (lines starting with "////").
+ */
+ @Deprecated
+ public class CaseFeaturePHITypeUpdater //implements Runnable
+ {
+
+// public static void main(String[] args) throws Exception
+// {
+// CaseFeaturePHITypeUpdater runner = new CaseFeaturePHITypeUpdater();
+// runner.run();
+// }
+//
+// public void run()
+// {
+// FeatureMatrixDAO cfDAO;
+// HumanAnnotationsDAO phiDao;
+// try
+// {
+// cfDAO = new FeatureMatrixDAO();
+// phiDao = new HumanAnnotationsDAO();
+//
+//// //update TRAIN set
+//// List<CaseFeature> caseFeatureList = cfDAO.selectAllCaseFeatures();
+//// System.out.println("INFO: " + caseFeatureList.size() + " train instances to be updated...");
+//// for (CaseFeature cf : caseFeatureList)
+//// {
+//// try
+//// {
+//// String phiLabel = phiDao.selectPHIType(cf.getFilename_short(), cf.getStartIdx());
+//// if (phiLabel!=null)
+//// {
+//// cfDAO.updateCaseFeaturePHITypeTrain(cf.getId(), phiLabel);
+//// }
+//// }
+//// catch(Exception e)
+//// {
+//// System.out.println("ERROR: (train) token|id: " +cf.getToken()+"|"+cf.getId() );
+//// e.printStackTrace();
+//// }
+//// }
+//
+// //update TEST set
+// List<CaseFeature> caseFeatureTESTList = cfDAO.selectAllTestCaseFeatures();
+// System.out.println("INFO: " + caseFeatureTESTList.size() + " test instances to be updated...");
+// for (CaseFeature cf : caseFeatureTESTList)
+// {
+// try
+// {
+// String phiLabel = phiDao.selectPHIType(cf.getFilename_short(), cf.getStartIdx());
+// if (phiLabel!=null)
+// {
+// cfDAO.updateCaseFeaturePHITypeTest(cf.getId(), phiLabel);
+// }
+// }
+// catch(Exception e)
+// {
+// System.out.println("ERROR: (test) token|id: " +cf.getToken()+"|"+cf.getId() );
+// e.printStackTrace();
+// }
+// }
+// } catch (Exception e1)
+// {
+// e1.printStackTrace();
+// }
+// }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeaturePHITypeUpdater.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,129 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.oneoff;
+
+import org.spin.scrubber.beans.CaseFeature;
+import org.spin.scrubber.uima.dao.FeatureMatrixDAO;
+import org.spin.scrubber.uima.dao.TfDAO;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class is intended to be run as a one-off process (thus the package *.oneoff) to update the TF features.
+ * The normal process of calculating the TF occurs in TFAnnotator.
+ *
+ * @author BF19
+ *
+ */
+@Deprecated
+public class CaseFeatureTFUpdater //implements Runnable
+{
+// public static void main(String[] args) throws Exception
+// {
+// CaseFeatureTFUpdater runner = new CaseFeatureTFUpdater();
+// runner.run();
+// }
+//
+// public void run()
+// {
+// //select all pub token/cnt/pos
+// Map<String, Integer> pubsTFMap;
+// try
+// {
+// pubsTFMap = new TfDAO().selectPubTFMap();
+//
+// updateTrain(pubsTFMap);
+// updateTest(pubsTFMap);
+// }
+// catch (Exception e)
+// {
+// e.printStackTrace();
+// }
+// }
+//
+// private void updateTrain(Map<String,Integer> pubsTFMap)
+// {
+// FeatureMatrixDAO cfDAO;
+// try
+// {
+// cfDAO = new FeatureMatrixDAO();
+//
+// //select all feature records (to be updated)
+// List<CaseFeature> caseFeatureList = cfDAO.selectAllCaseFeatures();
+//
+// for (CaseFeature cf : caseFeatureList)
+// {
+// try
+// {
+// //update all_pubs features
+// int pubTermPosCnt = (pubsTFMap.get(cf.getToken()+"|"+cf.getPos())==null) ? 0 : pubsTFMap.get(cf.getToken()+"|"+cf.getPos());
+// int pubTermCnt = (pubsTFMap.get(cf.getToken())==null) ? 0 : pubsTFMap.get(cf.getToken());
+// float pubTotalCnt = Float.valueOf(Integer.toString(pubsTFMap.get("totalPubCount")));
+// cfDAO.updateCaseFeatureTFAllPubs(cf.getId(), pubTermPosCnt/pubTotalCnt, pubTermCnt/pubTotalCnt);
+// }
+// catch(Exception e)
+// {
+// System.out.println("ERROR: token|id: " +cf.getToken()+"|"+cf.getId() );
+// e.printStackTrace();
+// }
+// }
+// }
+// catch (Exception e1)
+// {
+// e1.printStackTrace();
+// }
+// }
+//
+// private void updateTest(Map<String,Integer> pubsTFMap)
+// {
+// FeatureMatrixDAO cfDAO;
+// try
+// {
+// cfDAO = new FeatureMatrixDAO();
+//
+// //select all feature records (to be updated)
+// List<CaseFeature> caseFeatureList = cfDAO.selectAllTestCaseFeatures();
+//
+// for (CaseFeature cf : caseFeatureList)
+// {
+// try
+// {
+// //update all_pubs features
+// int pubTermPosCnt = (pubsTFMap.get(cf.getToken()+"|"+cf.getPos())==null) ? 0 : pubsTFMap.get(cf.getToken()+"|"+cf.getPos());
+// int pubTermCnt = (pubsTFMap.get(cf.getToken())==null) ? 0 : pubsTFMap.get(cf.getToken());
+// float pubTotalCnt = Float.valueOf(Integer.toString(pubsTFMap.get("totalPubCount")));
+// cfDAO.updateTestCaseFeatureTFAllPubs(cf.getId(), pubTermPosCnt/pubTotalCnt, pubTermCnt/pubTotalCnt);
+// }
+// catch(Exception e)
+// {
+// System.out.println("ERROR: token|id: " +cf.getToken()+"|"+cf.getId() );
+// e.printStackTrace();
+// }
+// }
+// }
+// catch (Exception e1)
+// {
+// e1.printStackTrace();
+// }
+// }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/CaseFeatureTFUpdater.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,847 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.oneoff;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author britt fitch bf19
+ *
+ * takes 2 command line params:
+ * inDirectory : containing xml files
+ * outDirectory : where to place txt files (assumes this dir has "train" and "test" subdirs)
+ *
+ * parses i2b2 xml file into individual text files for use by scrubber.
+ *
+ * THIS IS REQUIRED TO REPRODUCE FINDINGS REPORTED IN THE PAPER.
+ */
+public class XmlToTextI2B2 implements Runnable
+{
+ private String inDirectory;
+ private String outDirectory;
+ private List<String> allTrainCaseList;
+
+ /**
+  * Stores the two directories the conversion works with.
+  *
+  * @param in  directory containing the xml files
+  * @param out directory where the txt files are placed (assumed to have
+  *            "train" and "test" subdirs)
+  */
+ public XmlToTextI2B2(String in, String out)
+ {
+     outDirectory = out;
+     inDirectory = in;
+ }
+
+ public void run()
+ {
+ try
+ {
+ File inDir = new File(inDirectory);
+
+ if (!inDir.exists())
+ {
+ inDir.createNewFile();
+ }
+
+ File[] files = inDir.listFiles();
+
+ for (File f : files)
+ {
+ if (f.isDirectory())
+ {
+ continue;
+ }
+
+ System.out.println("XmlToText for: " + f.getName());
+
+ //read infile
+ DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+ DocumentBuilder builder = factory.newDocumentBuilder();
+ Document doc = builder.parse(f);
+ XPathFactory xPathfactory = XPathFactory.newInstance();
+ XPath xpath = xPathfactory.newXPath();
+ XPathExpression expr = xpath.compile("//TEXT");
+
+ //read all matching nodes
+ NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
+
+ FileWriter writer = null;
+
+ //for each node in a file, write out to a flat txt file of the same name.
+ for (int i=0; i<nodes.getLength(); i++)
+ {
+ Node n = nodes.item(i);
+ String id = n.getParentNode().getAttributes().getNamedItem("ID").getNodeValue();//attributes.getNamedItem("ID").getNodeValue();
+
+ String subdir = (isTrainCase(id)) ? "train" : "test";
+
+ //make outfile
+ String fname = id+".txt";
+ writer = new FileWriter(new File(outDirectory + File.separatorChar + subdir + File.separatorChar + fname));
+ String txt = n.getTextContent();
+
+ writer.write( txt + "\n");
+ writer.flush();
+ writer.close();
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ System.out.println(e.getMessage());
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * i2b2 smoking deid data set is broken up into 4 files.
+ *
+ * TRAIN set - annotated (A) & unannotated (UA):
+ * unannotated_records_deid_smoking = 889 records. (UA)
+ * deid_surrogate_train_all_version2 = 669 records. (A)
+ *
+ * TEST set - annotated (A) & unannotated (UA):
+ * deid_surrogate_test_all_version2 = 220 records. (UA)
+ * deid_surrogate_test_all_groundtruth_v2 = 220 records. (A)
+ *
+ * the train file has all of the 889 records in it,
+ * but we only want to use the 669 records for training.
+ * leaving the 220 records for testing.
+ *
+ * this method will just ignore the 220 in the process of generating the text files.
+ *
+ *
+ */
+ private List<String> getAllTrainCaseList()
+ {
+ if (allTrainCaseList ==null)
+ {
+ allTrainCaseList = new ArrayList<String>();
+ allTrainCaseList.add("1");
+ allTrainCaseList.add("10");
+ allTrainCaseList.add("100");
+ allTrainCaseList.add("101");
+ allTrainCaseList.add("102");
+ allTrainCaseList.add("103");
+ allTrainCaseList.add("104");
+ allTrainCaseList.add("105");
+ allTrainCaseList.add("106");
+ allTrainCaseList.add("107");
+ allTrainCaseList.add("108");
+ allTrainCaseList.add("11");
+ allTrainCaseList.add("110");
+ allTrainCaseList.add("112");
+ allTrainCaseList.add("113");
+ allTrainCaseList.add("114");
+ allTrainCaseList.add("115");
+ allTrainCaseList.add("116");
+ allTrainCaseList.add("117");
+ allTrainCaseList.add("118");
+ allTrainCaseList.add("119");
+ allTrainCaseList.add("12");
+ allTrainCaseList.add("120");
+ allTrainCaseList.add("122");
+ allTrainCaseList.add("123");
+ allTrainCaseList.add("124");
+ allTrainCaseList.add("125");
+ allTrainCaseList.add("126");
+ allTrainCaseList.add("127");
+ allTrainCaseList.add("128");
+ allTrainCaseList.add("129");
+ allTrainCaseList.add("13");
+ allTrainCaseList.add("130");
+ allTrainCaseList.add("131");
+ allTrainCaseList.add("132");
+ allTrainCaseList.add("134");
+ allTrainCaseList.add("137");
+ allTrainCaseList.add("138");
+ allTrainCaseList.add("139");
+ allTrainCaseList.add("140");
+ allTrainCaseList.add("141");
+ allTrainCaseList.add("143");
+ allTrainCaseList.add("144");
+ allTrainCaseList.add("145");
+ allTrainCaseList.add("146");
+ allTrainCaseList.add("147");
+ allTrainCaseList.add("148");
+ allTrainCaseList.add("149");
+ allTrainCaseList.add("15");
+ allTrainCaseList.add("150");
+ allTrainCaseList.add("152");
+ allTrainCaseList.add("153");
+ allTrainCaseList.add("154");
+ allTrainCaseList.add("155");
+ allTrainCaseList.add("156");
+ allTrainCaseList.add("157");
+ allTrainCaseList.add("158");
+ allTrainCaseList.add("159");
+ allTrainCaseList.add("16");
+ allTrainCaseList.add("160");
+ allTrainCaseList.add("161");
+ allTrainCaseList.add("162");
+ allTrainCaseList.add("163");
+ allTrainCaseList.add("164");
+ allTrainCaseList.add("165");
+ allTrainCaseList.add("166");
+ allTrainCaseList.add("169");
+ allTrainCaseList.add("17");
+ allTrainCaseList.add("170");
+ allTrainCaseList.add("171");
+ allTrainCaseList.add("172");
+ allTrainCaseList.add("173");
+ allTrainCaseList.add("174");
+ allTrainCaseList.add("175");
+ allTrainCaseList.add("178");
+ allTrainCaseList.add("179");
+ allTrainCaseList.add("18");
+ allTrainCaseList.add("180");
+ allTrainCaseList.add("181");
+ allTrainCaseList.add("182");
+ allTrainCaseList.add("183");
+ allTrainCaseList.add("184");
+ allTrainCaseList.add("186");
+ allTrainCaseList.add("187");
+ allTrainCaseList.add("188");
+ allTrainCaseList.add("189");
+ allTrainCaseList.add("19");
+ allTrainCaseList.add("190");
+ allTrainCaseList.add("191");
+ allTrainCaseList.add("192");
+ allTrainCaseList.add("193");
+ allTrainCaseList.add("195");
+ allTrainCaseList.add("196");
+ allTrainCaseList.add("197");
+ allTrainCaseList.add("198");
+ allTrainCaseList.add("199");
+ allTrainCaseList.add("2");
+ allTrainCaseList.add("20");
+ allTrainCaseList.add("200");
+ allTrainCaseList.add("201");
+ allTrainCaseList.add("203");
+ allTrainCaseList.add("204");
+ allTrainCaseList.add("205");
+ allTrainCaseList.add("207");
+ allTrainCaseList.add("208");
+ allTrainCaseList.add("209");
+ allTrainCaseList.add("21");
+ allTrainCaseList.add("210");
+ allTrainCaseList.add("211");
+ allTrainCaseList.add("212");
+ allTrainCaseList.add("213");
+ allTrainCaseList.add("215");
+ allTrainCaseList.add("216");
+ allTrainCaseList.add("217");
+ allTrainCaseList.add("218");
+ allTrainCaseList.add("219");
+ allTrainCaseList.add("22");
+ allTrainCaseList.add("221");
+ allTrainCaseList.add("222");
+ allTrainCaseList.add("223");
+ allTrainCaseList.add("224");
+ allTrainCaseList.add("225");
+ allTrainCaseList.add("226");
+ allTrainCaseList.add("227");
+ allTrainCaseList.add("228");
+ allTrainCaseList.add("229");
+ allTrainCaseList.add("23");
+ allTrainCaseList.add("230");
+ allTrainCaseList.add("231");
+ allTrainCaseList.add("232");
+ allTrainCaseList.add("234");
+ allTrainCaseList.add("235");
+ allTrainCaseList.add("236");
+ allTrainCaseList.add("237");
+ allTrainCaseList.add("238");
+ allTrainCaseList.add("239");
+ allTrainCaseList.add("24");
+ allTrainCaseList.add("240");
+ allTrainCaseList.add("241");
+ allTrainCaseList.add("242");
+ allTrainCaseList.add("243");
+ allTrainCaseList.add("244");
+ allTrainCaseList.add("245");
+ allTrainCaseList.add("246");
+ allTrainCaseList.add("247");
+ allTrainCaseList.add("248");
+ allTrainCaseList.add("249");
+ allTrainCaseList.add("250");
+ allTrainCaseList.add("251");
+ allTrainCaseList.add("252");
+ allTrainCaseList.add("253");
+ allTrainCaseList.add("254");
+ allTrainCaseList.add("255");
+ allTrainCaseList.add("256");
+ allTrainCaseList.add("257");
+ allTrainCaseList.add("258");
+ allTrainCaseList.add("259");
+ allTrainCaseList.add("26");
+ allTrainCaseList.add("260");
+ allTrainCaseList.add("261");
+ allTrainCaseList.add("262");
+ allTrainCaseList.add("264");
+ allTrainCaseList.add("265");
+ allTrainCaseList.add("266");
+ allTrainCaseList.add("267");
+ allTrainCaseList.add("269");
+ allTrainCaseList.add("27");
+ allTrainCaseList.add("270");
+ allTrainCaseList.add("271");
+ allTrainCaseList.add("272");
+ allTrainCaseList.add("273");
+ allTrainCaseList.add("274");
+ allTrainCaseList.add("275");
+ allTrainCaseList.add("276");
+ allTrainCaseList.add("277");
+ allTrainCaseList.add("278");
+ allTrainCaseList.add("279");
+ allTrainCaseList.add("28");
+ allTrainCaseList.add("280");
+ allTrainCaseList.add("281");
+ allTrainCaseList.add("282");
+ allTrainCaseList.add("283");
+ allTrainCaseList.add("284");
+ allTrainCaseList.add("285");
+ allTrainCaseList.add("286");
+ allTrainCaseList.add("287");
+ allTrainCaseList.add("288");
+ allTrainCaseList.add("289");
+ allTrainCaseList.add("29");
+ allTrainCaseList.add("290");
+ allTrainCaseList.add("291");
+ allTrainCaseList.add("292");
+ allTrainCaseList.add("293");
+ allTrainCaseList.add("294");
+ allTrainCaseList.add("295");
+ allTrainCaseList.add("296");
+ allTrainCaseList.add("297");
+ allTrainCaseList.add("299");
+ allTrainCaseList.add("3");
+ allTrainCaseList.add("30");
+ allTrainCaseList.add("300");
+ allTrainCaseList.add("301");
+ allTrainCaseList.add("302");
+ allTrainCaseList.add("303");
+ allTrainCaseList.add("304");
+ allTrainCaseList.add("305");
+ allTrainCaseList.add("306");
+ allTrainCaseList.add("307");
+ allTrainCaseList.add("308");
+ allTrainCaseList.add("309");
+ allTrainCaseList.add("31");
+ allTrainCaseList.add("310");
+ allTrainCaseList.add("311");
+ allTrainCaseList.add("312");
+ allTrainCaseList.add("313");
+ allTrainCaseList.add("314");
+ allTrainCaseList.add("315");
+ allTrainCaseList.add("316");
+ allTrainCaseList.add("317");
+ allTrainCaseList.add("318");
+ allTrainCaseList.add("32");
+ allTrainCaseList.add("320");
+ allTrainCaseList.add("321");
+ allTrainCaseList.add("322");
+ allTrainCaseList.add("323");
+ allTrainCaseList.add("324");
+ allTrainCaseList.add("325");
+ allTrainCaseList.add("326");
+ allTrainCaseList.add("327");
+ allTrainCaseList.add("329");
+ allTrainCaseList.add("33");
+ allTrainCaseList.add("330");
+ allTrainCaseList.add("331");
+ allTrainCaseList.add("332");
+ allTrainCaseList.add("333");
+ allTrainCaseList.add("334");
+ allTrainCaseList.add("335");
+ allTrainCaseList.add("336");
+ allTrainCaseList.add("337");
+ allTrainCaseList.add("338");
+ allTrainCaseList.add("339");
+ allTrainCaseList.add("34");
+ allTrainCaseList.add("340");
+ allTrainCaseList.add("341");
+ allTrainCaseList.add("342");
+ allTrainCaseList.add("343");
+ allTrainCaseList.add("344");
+ allTrainCaseList.add("345");
+ allTrainCaseList.add("346");
+ allTrainCaseList.add("347");
+ allTrainCaseList.add("348");
+ allTrainCaseList.add("349");
+ allTrainCaseList.add("350");
+ allTrainCaseList.add("351");
+ allTrainCaseList.add("352");
+ allTrainCaseList.add("354");
+ allTrainCaseList.add("355");
+ allTrainCaseList.add("356");
+ allTrainCaseList.add("357");
+ allTrainCaseList.add("358");
+ allTrainCaseList.add("359");
+ allTrainCaseList.add("36");
+ allTrainCaseList.add("360");
+ allTrainCaseList.add("361");
+ allTrainCaseList.add("362");
+ allTrainCaseList.add("363");
+ allTrainCaseList.add("364");
+ allTrainCaseList.add("366");
+ allTrainCaseList.add("367");
+ allTrainCaseList.add("368");
+ allTrainCaseList.add("369");
+ allTrainCaseList.add("37");
+ allTrainCaseList.add("370");
+ allTrainCaseList.add("372");
+ allTrainCaseList.add("373");
+ allTrainCaseList.add("374");
+ allTrainCaseList.add("375");
+ allTrainCaseList.add("376");
+ allTrainCaseList.add("378");
+ allTrainCaseList.add("379");
+ allTrainCaseList.add("38");
+ allTrainCaseList.add("380");
+ allTrainCaseList.add("381");
+ allTrainCaseList.add("382");
+ allTrainCaseList.add("383");
+ allTrainCaseList.add("384");
+ allTrainCaseList.add("385");
+ allTrainCaseList.add("386");
+ allTrainCaseList.add("387");
+ allTrainCaseList.add("388");
+ allTrainCaseList.add("389");
+ allTrainCaseList.add("39");
+ allTrainCaseList.add("390");
+ allTrainCaseList.add("391");
+ allTrainCaseList.add("392");
+ allTrainCaseList.add("393");
+ allTrainCaseList.add("394");
+ allTrainCaseList.add("395");
+ allTrainCaseList.add("396");
+ allTrainCaseList.add("397");
+ allTrainCaseList.add("398");
+ allTrainCaseList.add("399");
+ allTrainCaseList.add("4");
+ allTrainCaseList.add("40");
+ allTrainCaseList.add("400");
+ allTrainCaseList.add("401");
+ allTrainCaseList.add("402");
+ allTrainCaseList.add("403");
+ allTrainCaseList.add("404");
+ allTrainCaseList.add("405");
+ allTrainCaseList.add("407");
+ allTrainCaseList.add("408");
+ allTrainCaseList.add("409");
+ allTrainCaseList.add("411");
+ allTrainCaseList.add("412");
+ allTrainCaseList.add("414");
+ allTrainCaseList.add("415");
+ allTrainCaseList.add("416");
+ allTrainCaseList.add("417");
+ allTrainCaseList.add("418");
+ allTrainCaseList.add("419");
+ allTrainCaseList.add("42");
+ allTrainCaseList.add("421");
+ allTrainCaseList.add("43");
+ allTrainCaseList.add("434");
+ allTrainCaseList.add("44");
+ allTrainCaseList.add("45");
+ allTrainCaseList.add("452");
+ allTrainCaseList.add("46");
+ allTrainCaseList.add("464");
+ allTrainCaseList.add("468");
+ allTrainCaseList.add("47");
+ allTrainCaseList.add("48");
+ allTrainCaseList.add("485");
+ allTrainCaseList.add("49");
+ allTrainCaseList.add("497");
+ allTrainCaseList.add("5");
+ allTrainCaseList.add("50");
+ allTrainCaseList.add("51");
+ allTrainCaseList.add("52");
+ allTrainCaseList.add("53");
+ allTrainCaseList.add("54");
+ allTrainCaseList.add("55");
+ allTrainCaseList.add("57");
+ allTrainCaseList.add("58");
+ allTrainCaseList.add("59");
+ allTrainCaseList.add("6");
+ allTrainCaseList.add("60");
+ allTrainCaseList.add("61");
+ allTrainCaseList.add("62");
+ allTrainCaseList.add("63");
+ allTrainCaseList.add("64");
+ allTrainCaseList.add("640");
+ allTrainCaseList.add("641");
+ allTrainCaseList.add("642");
+ allTrainCaseList.add("643");
+ allTrainCaseList.add("644");
+ allTrainCaseList.add("645");
+ allTrainCaseList.add("646");
+ allTrainCaseList.add("647");
+ allTrainCaseList.add("648");
+ allTrainCaseList.add("649");
+ allTrainCaseList.add("65");
+ allTrainCaseList.add("650");
+ allTrainCaseList.add("651");
+ allTrainCaseList.add("652");
+ allTrainCaseList.add("653");
+ allTrainCaseList.add("654");
+ allTrainCaseList.add("655");
+ allTrainCaseList.add("656");
+ allTrainCaseList.add("657");
+ allTrainCaseList.add("658");
+ allTrainCaseList.add("659");
+ allTrainCaseList.add("66");
+ allTrainCaseList.add("660");
+ allTrainCaseList.add("661");
+ allTrainCaseList.add("662");
+ allTrainCaseList.add("663");
+ allTrainCaseList.add("664");
+ allTrainCaseList.add("665");
+ allTrainCaseList.add("666");
+ allTrainCaseList.add("667");
+ allTrainCaseList.add("668");
+ allTrainCaseList.add("669");
+ allTrainCaseList.add("67");
+ allTrainCaseList.add("670");
+ allTrainCaseList.add("671");
+ allTrainCaseList.add("672");
+ allTrainCaseList.add("673");
+ allTrainCaseList.add("674");
+ allTrainCaseList.add("675");
+ allTrainCaseList.add("676");
+ allTrainCaseList.add("677");
+ allTrainCaseList.add("678");
+ allTrainCaseList.add("679");
+ allTrainCaseList.add("68");
+ allTrainCaseList.add("680");
+ allTrainCaseList.add("681");
+ allTrainCaseList.add("682");
+ allTrainCaseList.add("683");
+ allTrainCaseList.add("684");
+ allTrainCaseList.add("685");
+ allTrainCaseList.add("686");
+ allTrainCaseList.add("687");
+ allTrainCaseList.add("688");
+ allTrainCaseList.add("689");
+ allTrainCaseList.add("69");
+ allTrainCaseList.add("690");
+ allTrainCaseList.add("691");
+ allTrainCaseList.add("692");
+ allTrainCaseList.add("693");
+ allTrainCaseList.add("694");
+ allTrainCaseList.add("695");
+ allTrainCaseList.add("696");
+ allTrainCaseList.add("697");
+ allTrainCaseList.add("698");
+ allTrainCaseList.add("699");
+ allTrainCaseList.add("7");
+ allTrainCaseList.add("70");
+ allTrainCaseList.add("700");
+ allTrainCaseList.add("701");
+ allTrainCaseList.add("702");
+ allTrainCaseList.add("703");
+ allTrainCaseList.add("704");
+ allTrainCaseList.add("705");
+ allTrainCaseList.add("707");
+ allTrainCaseList.add("708");
+ allTrainCaseList.add("709");
+ allTrainCaseList.add("71");
+ allTrainCaseList.add("710");
+ allTrainCaseList.add("711");
+ allTrainCaseList.add("712");
+ allTrainCaseList.add("713");
+ allTrainCaseList.add("714");
+ allTrainCaseList.add("715");
+ allTrainCaseList.add("716");
+ allTrainCaseList.add("717");
+ allTrainCaseList.add("718");
+ allTrainCaseList.add("719");
+ allTrainCaseList.add("72");
+ allTrainCaseList.add("720");
+ allTrainCaseList.add("721");
+ allTrainCaseList.add("722");
+ allTrainCaseList.add("723");
+ allTrainCaseList.add("724");
+ allTrainCaseList.add("725");
+ allTrainCaseList.add("726");
+ allTrainCaseList.add("727");
+ allTrainCaseList.add("728");
+ allTrainCaseList.add("729");
+ allTrainCaseList.add("73");
+ allTrainCaseList.add("730");
+ allTrainCaseList.add("731");
+ allTrainCaseList.add("732");
+ allTrainCaseList.add("733");
+ allTrainCaseList.add("734");
+ allTrainCaseList.add("735");
+ allTrainCaseList.add("736");
+ allTrainCaseList.add("737");
+ allTrainCaseList.add("738");
+ allTrainCaseList.add("739");
+ allTrainCaseList.add("74");
+ allTrainCaseList.add("740");
+ allTrainCaseList.add("741");
+ allTrainCaseList.add("742");
+ allTrainCaseList.add("743");
+ allTrainCaseList.add("744");
+ allTrainCaseList.add("745");
+ allTrainCaseList.add("746");
+ allTrainCaseList.add("747");
+ allTrainCaseList.add("748");
+ allTrainCaseList.add("749");
+ allTrainCaseList.add("75");
+ allTrainCaseList.add("750");
+ allTrainCaseList.add("751");
+ allTrainCaseList.add("752");
+ allTrainCaseList.add("753");
+ allTrainCaseList.add("754");
+ allTrainCaseList.add("755");
+ allTrainCaseList.add("756");
+ allTrainCaseList.add("757");
+ allTrainCaseList.add("758");
+ allTrainCaseList.add("759");
+ allTrainCaseList.add("76");
+ allTrainCaseList.add("760");
+ allTrainCaseList.add("761");
+ allTrainCaseList.add("762");
+ allTrainCaseList.add("763");
+ allTrainCaseList.add("764");
+ allTrainCaseList.add("765");
+ allTrainCaseList.add("766");
+ allTrainCaseList.add("767");
+ allTrainCaseList.add("768");
+ allTrainCaseList.add("769");
+ allTrainCaseList.add("770");
+ allTrainCaseList.add("771");
+ allTrainCaseList.add("772");
+ allTrainCaseList.add("773");
+ allTrainCaseList.add("774");
+ allTrainCaseList.add("775");
+ allTrainCaseList.add("776");
+ allTrainCaseList.add("777");
+ allTrainCaseList.add("778");
+ allTrainCaseList.add("779");
+ allTrainCaseList.add("78");
+ allTrainCaseList.add("780");
+ allTrainCaseList.add("781");
+ allTrainCaseList.add("782");
+ allTrainCaseList.add("783");
+ allTrainCaseList.add("784");
+ allTrainCaseList.add("785");
+ allTrainCaseList.add("786");
+ allTrainCaseList.add("787");
+ allTrainCaseList.add("788");
+ allTrainCaseList.add("789");
+ allTrainCaseList.add("79");
+ allTrainCaseList.add("790");
+ allTrainCaseList.add("791");
+ allTrainCaseList.add("792");
+ allTrainCaseList.add("793");
+ allTrainCaseList.add("794");
+ allTrainCaseList.add("795");
+ allTrainCaseList.add("796");
+ allTrainCaseList.add("797");
+ allTrainCaseList.add("798");
+ allTrainCaseList.add("799");
+ allTrainCaseList.add("8");
+ allTrainCaseList.add("80");
+ allTrainCaseList.add("800");
+ allTrainCaseList.add("801");
+ allTrainCaseList.add("802");
+ allTrainCaseList.add("803");
+ allTrainCaseList.add("804");
+ allTrainCaseList.add("805");
+ allTrainCaseList.add("806");
+ allTrainCaseList.add("807");
+ allTrainCaseList.add("808");
+ allTrainCaseList.add("809");
+ allTrainCaseList.add("81");
+ allTrainCaseList.add("810");
+ allTrainCaseList.add("811");
+ allTrainCaseList.add("812");
+ allTrainCaseList.add("813");
+ allTrainCaseList.add("814");
+ allTrainCaseList.add("815");
+ allTrainCaseList.add("816");
+ allTrainCaseList.add("817");
+ allTrainCaseList.add("818");
+ allTrainCaseList.add("819");
+ allTrainCaseList.add("82");
+ allTrainCaseList.add("820");
+ allTrainCaseList.add("821");
+ allTrainCaseList.add("822");
+ allTrainCaseList.add("823");
+ allTrainCaseList.add("824");
+ allTrainCaseList.add("825");
+ allTrainCaseList.add("826");
+ allTrainCaseList.add("827");
+ allTrainCaseList.add("828");
+ allTrainCaseList.add("829");
+ allTrainCaseList.add("83");
+ allTrainCaseList.add("830");
+ allTrainCaseList.add("831");
+ allTrainCaseList.add("832");
+ allTrainCaseList.add("833");
+ allTrainCaseList.add("834");
+ allTrainCaseList.add("835");
+ allTrainCaseList.add("836");
+ allTrainCaseList.add("837");
+ allTrainCaseList.add("838");
+ allTrainCaseList.add("839");
+ allTrainCaseList.add("84");
+ allTrainCaseList.add("840");
+ allTrainCaseList.add("841");
+ allTrainCaseList.add("842");
+ allTrainCaseList.add("843");
+ allTrainCaseList.add("844");
+ allTrainCaseList.add("845");
+ allTrainCaseList.add("846");
+ allTrainCaseList.add("847");
+ allTrainCaseList.add("848");
+ allTrainCaseList.add("849");
+ allTrainCaseList.add("85");
+ allTrainCaseList.add("850");
+ allTrainCaseList.add("851");
+ allTrainCaseList.add("852");
+ allTrainCaseList.add("853");
+ allTrainCaseList.add("854");
+ allTrainCaseList.add("855");
+ allTrainCaseList.add("856");
+ allTrainCaseList.add("857");
+ allTrainCaseList.add("858");
+ allTrainCaseList.add("859");
+ allTrainCaseList.add("86");
+ allTrainCaseList.add("860");
+ allTrainCaseList.add("861");
+ allTrainCaseList.add("862");
+ allTrainCaseList.add("863");
+ allTrainCaseList.add("864");
+ allTrainCaseList.add("865");
+ allTrainCaseList.add("866");
+ allTrainCaseList.add("867");
+ allTrainCaseList.add("868");
+ allTrainCaseList.add("869");
+ allTrainCaseList.add("87");
+ allTrainCaseList.add("870");
+ allTrainCaseList.add("871");
+ allTrainCaseList.add("872");
+ allTrainCaseList.add("873");
+ allTrainCaseList.add("874");
+ allTrainCaseList.add("875");
+ allTrainCaseList.add("876");
+ allTrainCaseList.add("877");
+ allTrainCaseList.add("878");
+ allTrainCaseList.add("879");
+ allTrainCaseList.add("88");
+ allTrainCaseList.add("880");
+ allTrainCaseList.add("881");
+ allTrainCaseList.add("882");
+ allTrainCaseList.add("883");
+ allTrainCaseList.add("884");
+ allTrainCaseList.add("885");
+ allTrainCaseList.add("886");
+ allTrainCaseList.add("887");
+ allTrainCaseList.add("888");
+ allTrainCaseList.add("889");
+ allTrainCaseList.add("89");
+ allTrainCaseList.add("890");
+ allTrainCaseList.add("891");
+ allTrainCaseList.add("892");
+ allTrainCaseList.add("893");
+ allTrainCaseList.add("894");
+ allTrainCaseList.add("895");
+ allTrainCaseList.add("896");
+ allTrainCaseList.add("897");
+ allTrainCaseList.add("898");
+ allTrainCaseList.add("899");
+ allTrainCaseList.add("9");
+ allTrainCaseList.add("90");
+ allTrainCaseList.add("900");
+ allTrainCaseList.add("901");
+ allTrainCaseList.add("902");
+ allTrainCaseList.add("903");
+ allTrainCaseList.add("905");
+ allTrainCaseList.add("906");
+ allTrainCaseList.add("907");
+ allTrainCaseList.add("908");
+ allTrainCaseList.add("909");
+ allTrainCaseList.add("91");
+ allTrainCaseList.add("910");
+ allTrainCaseList.add("911");
+ allTrainCaseList.add("912");
+ allTrainCaseList.add("913");
+ allTrainCaseList.add("914");
+ allTrainCaseList.add("915");
+ allTrainCaseList.add("916");
+ allTrainCaseList.add("917");
+ allTrainCaseList.add("918");
+ allTrainCaseList.add("919");
+ allTrainCaseList.add("92");
+ allTrainCaseList.add("920");
+ allTrainCaseList.add("921");
+ allTrainCaseList.add("922");
+ allTrainCaseList.add("93");
+ allTrainCaseList.add("94");
+ allTrainCaseList.add("95");
+ allTrainCaseList.add("96");
+ allTrainCaseList.add("98");
+ allTrainCaseList.add("99");
+ }
+
+ return allTrainCaseList;
+ }
+
+ /**
+  * Reports whether the given case id appears in the list returned by
+  * {@link #getAllTrainCaseList()}.
+  *
+  * @param id case identifier to look up
+  * @return true when the list contains the id, false otherwise
+  */
+ private boolean isTrainCase(String id)
+ {
+     return this.getAllTrainCaseList().contains(id);
+ }
+ /**
+  * Command-line entry point: builds an {@code XmlToTextI2B2} from the two
+  * arguments and runs it. When the argument count is not exactly two, a usage
+  * message is printed and nothing is run.
+  *
+  * @param args exactly two values, passed unchanged to the constructor
+  *             (the usage text names them inDir and outDir)
+  * @throws Exception anything propagated by the constructor or by {@code run()}
+  */
+ public static void main(String[] args) throws Exception
+ {
+     if (args.length!=2)
+     {
+         System.out.println("USAGE:\t\t XmlToText inDir outDir");
+         // Stop here: falling through would index past the end of args when
+         // fewer than two are given, and would run on a bad command line otherwise.
+         return;
+     }
+
+     XmlToTextI2B2 runner = new XmlToTextI2B2(args[0], args[1]);
+     runner.run();
+ }
+
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/oneoff/XmlToTextI2B2.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,72 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+
+ /**
+  * Bean aliased to the {@code annotation} XML element for XStream.
+  * Holds a mention, an annotator, a span and the spanned text, each exposed
+  * through a plain getter/setter pair.
+  */
+ @XStreamAlias("annotation")
+ public class Annotation
+ {
+     // Field declaration order is kept as in the original bean.
+     private Mention mention;
+     private Annotator annotator;
+     private Span span;
+     private String spannedText;
+
+     /** @return the mention held by this annotation (may be null if never set) */
+     public Mention getMention()
+     {
+         return this.mention;
+     }
+
+     /** @param mention the mention to store */
+     public void setMention(Mention mention)
+     {
+         this.mention = mention;
+     }
+
+     /** @return the annotator held by this annotation (may be null if never set) */
+     public Annotator getAnnotator()
+     {
+         return this.annotator;
+     }
+
+     /** @param annotator the annotator to store */
+     public void setAnnotator(Annotator annotator)
+     {
+         this.annotator = annotator;
+     }
+
+     /** @return the span held by this annotation (may be null if never set) */
+     public Span getSpan()
+     {
+         return this.span;
+     }
+
+     /** @param span the span to store */
+     public void setSpan(Span span)
+     {
+         this.span = span;
+     }
+
+     /** @return the spanned text (may be null if never set) */
+     public String getSpannedText()
+     {
+         return this.spannedText;
+     }
+
+     /** @param spannedText the spanned text to store */
+     public void setSpannedText(String spannedText)
+     {
+         this.spannedText = spannedText;
+     }
+ }
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,65 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+import com.thoughtworks.xstream.annotations.XStreamImplicit;
+
+ /**
+  * Bean aliased to the {@code annotations} XML element for XStream.
+  * Carries a {@code textSource} attribute plus two implicit collections whose
+  * items appear as {@code annotation} and {@code classMention} child elements.
+  *
+  * The list getters never return null: XStream can leave an implicit
+  * collection field null after unmarshalling XML that contains no matching
+  * child elements, so an empty list is restored on demand.
+  */
+ @XStreamAlias("annotations")
+ public class Annotations
+ {
+     @XStreamAlias("textSource")
+     @XStreamAsAttribute
+     private String textSource;
+
+     @XStreamImplicit(itemFieldName="annotation")
+     private List<Annotation> annotList = new ArrayList<Annotation>();
+
+     @XStreamImplicit(itemFieldName="classMention")
+     private List<ClassMention> cmList = new ArrayList<ClassMention>();
+
+     /**
+      * @return the annotation list; a fresh empty mutable list is stored and
+      *         returned when the field is currently null
+      */
+     public List<Annotation> getAnnotList()
+     {
+         if (annotList == null)
+         {
+             // Field initializers may not have run if this instance came from XStream.
+             annotList = new ArrayList<Annotation>();
+         }
+         return annotList;
+     }
+
+     /** @param annotList the annotation list to store */
+     public void setAnnotList(List<Annotation> annotList)
+     {
+         this.annotList = annotList;
+     }
+
+     /**
+      * @return the class-mention list; a fresh empty mutable list is stored and
+      *         returned when the field is currently null
+      */
+     public List<ClassMention> getCmList()
+     {
+         if (cmList == null)
+         {
+             // Same guard as getAnnotList(): the implicit collection can be null after unmarshalling.
+             cmList = new ArrayList<ClassMention>();
+         }
+         return cmList;
+     }
+
+     /** @param cmList the class-mention list to store */
+     public void setCmList(List<ClassMention> cmList)
+     {
+         this.cmList = cmList;
+     }
+
+     /** @return the value of the textSource attribute (may be null if never set) */
+     public String getTextSource()
+     {
+         return textSource;
+     }
+
+     /** @param textSource the value to store for the textSource attribute */
+     public void setTextSource(String textSource)
+     {
+         this.textSource = textSource;
+     }
+ }
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotations.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,60 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.protege.beans;
+
+import com.thoughtworks.xstream.annotations.XStreamAlias;
+import com.thoughtworks.xstream.annotations.XStreamAsAttribute;
+import com.thoughtworks.xstream.annotations.XStreamConverter;
+import com.thoughtworks.xstream.converters.extended.ToAttributedValueConverter;
+
+ /**
+  * Bean aliased to the {@code annotator} XML element for XStream.
+  * The {@code id} field is declared as an attribute; the class-level
+  * {@code ToAttributedValueConverter} is configured with {@code name} as its
+  * value field.
+  */
+ @XStreamAlias("annotator")
+ @XStreamConverter(value=ToAttributedValueConverter.class, strings={"name"})
+ public class Annotator
+ {
+     @XStreamAlias("id")
+     @XStreamAsAttribute
+     private String id;
+
+     private String name;
+
+     /** Creates an annotator with both fields left null. */
+     public Annotator()
+     {
+     }
+
+     /**
+      * Creates an annotator with the given id and name.
+      *
+      * @param id value for the id attribute
+      * @param name value for the name field
+      */
+     public Annotator(String id, String name)
+     {
+         // Assign directly rather than through the public setters, so a subclass
+         // override cannot run against a partially constructed instance.
+         this.id = id;
+         this.name = name;
+     }
+
+     /** @return the id (may be null if never set) */
+     public String getId()
+     {
+         return id;
+     }
+
+     /** @param id the id to store */
+     public void setId(String id)
+     {
+         this.id = id;
+     }
+
+     /** @return the name (may be null if never set) */
+     public String getName()
+     {
+         return name;
+     }
+
+     /** @param name the name to store */
+     public void setName(String name)
+     {
+         this.name = name;
+     }
+ }
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/protege/beans/Annotator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain