You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:01:48 UTC

svn commit: r1500504 - in /ctakes/sandbox/ctakes-scrubber-deid/etc: ./ scripts/

Author: brittfitch
Date: Sun Jul  7 19:01:47 2013
New Revision: 1500504

URL: http://svn.apache.org/r1500504
Log:
CTAKES-64
adding runtime scripts. 

Added:
    ctakes/sandbox/ctakes-scrubber-deid/etc/
    ctakes/sandbox/ctakes-scrubber-deid/etc/demo.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/install.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/prerequisites.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/processPublications.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/runXmlToTextI2B2.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/
    ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runPubExtractor.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runReferenceTextStripper.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/setClassPath.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/test.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/train.sh
    ctakes/sandbox/ctakes-scrubber-deid/etc/uninstall.sh

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/demo.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/demo.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/demo.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/demo.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Demo entry point: print a usage notice, mark every *.sh here executable, then run test.sh.
+# Each printf argument below becomes one output line; "" yields an empty line.
+printf '%s\n' \
+  " By default, we assume that you want to use the existing model that was trained in the paper." \
+  " Citation:" \
+  "" \
+  " The data for train was obtained using this data use agreement" \
+  " https://www.i2b2.org/NLP/DataSets/Agreement.php" \
+  "" \
+  " You can train your own model by running train.sh " \
+  " test.sh"
+# TODO andy: more documentation here
+chmod +x *.sh
+./test.sh
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/install.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/install.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/install.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/install.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Installer: creates the scrubber MySQL database/user, loads the lookup tables, and fetches cTAKES/Weka in parallel.
+echo "######################################################################################"
+echo "NOTE: admin to your mysql instance is required to create the scrubber database	    "
+echo "######################################################################################"
+
+source ./setClassPath.sh
+source ./scrubber.properties.sh
+
+echo "Creating database and user"
+echo "Prompting for root password  "
+mysql -u root --password < sql/create_database_and_user.sql
+
+echo "Fetching prerequisites cTakes and Weka in the background while we build your scrubber database ...."
+source ./prerequisites.sh &
+prereq_pid=$!  # remembered so the download can be awaited before reporting success
+echo "Creating tables"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/create_tables.sql
+
+echo "Inserting 1990 USA census names"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/insert_lookup_dictionary.sql
+
+echo "Inserting concepts from UMLS"
+echo "note that CUIDs are not supplied due to license restrictions"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/insert_lookup_umls.sql
+
+echo "Inserting Term Frequencies from Open Access Publications"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/insert_lookup_term_frequency.sql
+# Do not announce completion until the background prerequisite download has finished.
+wait "$prereq_pid"
+echo " ######################################################################### "
+echo " Done with installer. Ready to scrub! "
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/prerequisites.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/prerequisites.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/prerequisites.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/prerequisites.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,79 @@
+#!/bin/sh
+# Downloads Weka and icTAKES from SourceForge: Weka jars are copied into lib/, icTAKES is unpacked one directory up.
+#######################################################################
+# britt fitch
+# 2/22/12
+# wget scrubber dependencies:
+#   ctakes  (v.1.2.2 at this time)
+#   weka    (v.3.6.3)
+#######################################################################
+
+CTAKES_ZIP=icTAKES.zip
+CTAKES_WGET="http://iweb.dl.sourceforge.net/project/ohnlp/icTAKES/$CTAKES_ZIP"
+
+WEKA_ZIP=weka-3-6-3.zip
+WEKA_DIR=weka-3-6-3
+WEKA_WGET="http://iweb.dl.sourceforge.net/project/weka/weka-3-6/3.6.3/$WEKA_ZIP"
+
+echo "STARTING..."
+echo "getting dependencies..."
+echo ""
+
+#######################################################################
+# getting weka
+#######################################################################
+echo ""
+echo "getting $WEKA_ZIP from sourceforge..."
+wget -O "$WEKA_ZIP" "$WEKA_WGET"
+
+echo ""
+echo "moving $WEKA_ZIP to lib directory..."
+mv "$WEKA_ZIP" lib/
+
+echo ""
+echo "change working directory to lib directory..."
+# Stop if the cd fails: all later relative paths depend on it. 'return' covers being sourced, 'exit' being executed.
+cd lib || return 1 2>/dev/null || exit 1
+echo ""
+echo "unzipping $WEKA_ZIP ..."
+unzip -q "$WEKA_ZIP"
+
+echo ""
+echo "cp weka libs..."
+cp "$WEKA_DIR"/*.jar .
+
+echo ""
+echo "DONE with setup of WEKA."
+
+#######################################################################
+# reset working dir
+#######################################################################
+echo ""
+echo "change working directory to scrubber directory..."
+cd ../ || return 1 2>/dev/null || exit 1
+
+#######################################################################
+# getting ctakes
+#######################################################################
+echo ""
+echo "getting $CTAKES_ZIP from sourceforge..."
+wget -O "$CTAKES_ZIP" "$CTAKES_WGET"
+
+echo ""
+echo "moving $CTAKES_ZIP to parent directory..."
+mv "$CTAKES_ZIP" ../
+
+echo ""
+echo "change working directory to parent directory..."
+cd ../ || return 1 2>/dev/null || exit 1
+
+echo ""
+echo "unzipping $CTAKES_ZIP..."
+unzip -q "$CTAKES_ZIP"
+
+echo ""
+echo "DONE with setup of cTakes."
+echo ""
+echo "DONE with setup of prerequisites."
+echo "Installing....."
+

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/processPublications.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/processPublications.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/processPublications.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/processPublications.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+source ./setClasspath.sh
+source ./scrubber.properties.sh
+
+echo "Processing Open Access publications "
+
+echo "Parsing files from $SCRUBBER_DIR_INPUT_PUBS_XML"
+java $JAVA_OPTS -cp $CP org.spin.scrubber.oa.core.PublicationsParserOpenAccessXML
+
+echo "Reading parsed files from database."
+java $JAVA_OPTS -cp $CP org.spin.scrubber.oa.core.PublicationsExtractorJDBC
+
+echo "Stripping references such as author names from publications ."
+java $JAVA_OPTS -cp $CP org.spin.scrubber.oa.core.ReferenceTextStripper
+
+echo "Running UIMA to annotate the publications."
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_PUBS.xml
+
+echo "Creating indexes on publication annotations to speed up performance"
+mysql -D $SCRUBBER_DB_NAME -u $SCRUBBER_DB_USER -p$SCRUBBER_DB_PWD < sql/create_indexes_pubs.sql
+
+echo "Calculating term frequencies..."
+mysql -D $SCRUBBER_DB_NAME -u $SCRUBBER_DB_USER -p$SCRUBBER_DB_PWD < sql/populateTFTables.sql
+
+echo "Done."
+
+

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/runXmlToTextI2B2.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/runXmlToTextI2B2.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/runXmlToTextI2B2.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/runXmlToTextI2B2.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Preprocessing helper: runs XmlToTextI2B2 on data/input/cases/train with data/input/cases/ as second argument.
+echo ""
+echo "Extracting individual cases from I2B2 XML file to seperate text files."
+echo ""
+
+source ./setClassPath.sh
+source ./scrubber.properties.sh
+
+#########################################################
+# The 2nd param is data/input/cases because the code already assumes there is a 'train' and a 'test' dir in there.
+# This could be refactored, but it is only used as a preprocessing step for a very specific data source.
+# This script is provided only as a helper to reproduce findings reported in the paper.
+#########################################################
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.oneoff.XmlToTextI2B2 data/input/cases/train data/input/cases/

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runPubExtractor.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runPubExtractor.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runPubExtractor.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runPubExtractor.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+echo "Starting..."
+
+source ./runSetClasspath.sh
+
+java -Xmx3000m -cp $CP org.spin.scrubber.publications.PubExtractor ../../open_pubs_raw/pub_txt/

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runReferenceTextStripper.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runReferenceTextStripper.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runReferenceTextStripper.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/scripts/runReferenceTextStripper.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+echo "Starting..."
+
+source ./runSetClasspath.sh
+
+java -Xmx3000m -cp $CP org.spin.scrubber.uima.core.ReferenceTextStripper $DIR_PUBS_RAW $DIR_PUBS_CLEAN 
+
+#java -Xmx3000m -cp $CP org.spin.scrubber.uima.core.ReferenceTextStripper ../../open_pubs_raw/pub_txt/all/ ../../open_pubs_raw/pub_txt_scrubbed/all/

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/setClassPath.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/setClassPath.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/setClassPath.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/setClassPath.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+JAVA_OPTS="-Xmx1000m"
+
+SCRUBBER_HOME=`dirname $0`
+
+# Set up our classpath with our library dependencies...
+for jar in `ls $SCRUBBER_HOME/lib/*.jar`;
+do
+    SCRUBBER_CP=$jar:$SCRUBBER_CP
+done
+
+# add ctakes project to classpath
+CTAKES_CP="../icTAKES/:../icTAKES/resources:../icTAKES/cTAKESdesc:../icTAKES/cTAKES.jar"
+
+#export combined classpath
+CP=".:log4j.properties:scrubber.properties:$SCRUBBER_CP:$CTAKES_CP"
+
+# comment out this line if you dont want to update templates
+java $JAVA_OPTS -cp $CP org.spin.scrubber.templates.TemplateFileProcessor all
+
+ # exports SCRUBBER_* properties to shell
+java $JAVA_OPTS -cp $CP org.spin.scrubber.ScrubberProperties export
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/test.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/test.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/test.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/test.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,39 @@
+#!/bin/bash
+# TEST pipeline: reset the test tables and output dir, load human annotations, annotate with UIMA,
+# build the feature set, select the test set, classify, and redact.
+# $JAVA_OPTS stays unquoted on purpose so several JVM options split into separate arguments.
+source ./setClassPath.sh
+source ./scrubber.properties.sh
+
+echo "Truncating database tables containing TEST data"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/truncate_tables_test.sql
+
+echo "Removing output files from previous TEST execution"
+# ${VAR:?} aborts the script when the output dir is unset or empty, so this can never expand to "rm -f /*".
+rm -f -- "${SCRUBBER_DIR_OUTPUT_TEST:?}"/*
+
+echo "Reading human annotations"
+java $JAVA_OPTS -cp "$CP" "$SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION" "$SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TEST" _test
+
+echo "Running UIMA to annotate TEST cases"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_test.xml
+
+echo "Generating feature set (matrix) in the database"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.classification.FeatureSetGenerator _test
+
+echo "Selecting the TEST set"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.classification.WekaDataExtractorTest
+
+echo "Classification step: identifying PHI"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.classification.WekaClassifier
+
+echo "Redacting...."
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.redactor.Redactor
+
+echo "Done."
+echo "Scrubbed output dir : $SCRUBBER_DIR_OUTPUT_TEST"
+
+
+
+
+

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/train.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/train.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/train.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/train.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,33 @@
+#!/bin/bash
+# TRAIN pipeline: reset the train tables, load human annotations, annotate with UIMA,
+# build the feature set, and select the train set.
+# $JAVA_OPTS stays unquoted on purpose so several JVM options split into separate arguments.
+source ./setClassPath.sh
+source ./scrubber.properties.sh
+
+echo "Truncating database tables containing TRAIN data"
+mysql -D "$SCRUBBER_DB_NAME" -u "$SCRUBBER_DB_USER" -p"$SCRUBBER_DB_PWD" < sql/truncate_tables_train.sql
+
+echo "Reading human annotations"
+java $JAVA_OPTS -cp "$CP" "$SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION" "$SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TRAIN" _train
+
+# TODO rename CPE files to use CamelCase (Andy)
+echo "Running UIMA to annotate TRAIN cases"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_train.xml
+
+echo "Generating feature set (matrix) in the database"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.classification.FeatureSetGenerator _train
+
+echo "Selecting the TRAIN set"
+java $JAVA_OPTS -cp "$CP" org.spin.scrubber.classification.WekaDataExtractorTrain
+
+echo "Done."
+echo "Train model output dir : $SCRUBBER_DIR_MODELS"
+
+
+
+
+
+
+
+

Added: ctakes/sandbox/ctakes-scrubber-deid/etc/uninstall.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/etc/uninstall.sh?rev=1500504&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/etc/uninstall.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/etc/uninstall.sh Sun Jul  7 19:01:47 2013
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Uninstaller: runs sql/drop_database_and_user.sql against MySQL as root.
+echo " ######################################################################### "
+echo " NOTE: this script requires admin rights to your MySQL scrubber database # "
+echo " ######################################################################### "
+
+echo "Dropping existing database and user if they exist"
+# --password makes mysql prompt for the root password, as install.sh does; without it mysql connects with no password unless an option file supplies one.
+mysql -u root --password < sql/drop_database_and_user.sql
+
+echo " ######################################################################### "
+echo " DONE. "
+
+
+