You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:39:01 UTC
svn commit: r1500518 [1/2] - in /ctakes/sandbox/ctakes-scrubber-deid/sql: ./
CSVAnnotation/ CSVAnnotation/human/ CSVAnnotation/machine/
CSVAnnotation/schema/ tmp/
Author: brittfitch
Date: Sun Jul 7 19:39:00 2013
New Revision: 1500518
URL: http://svn.apache.org/r1500518
Log:
add sql scripts used by scrubber
Added:
ctakes/sandbox/ctakes-scrubber-deid/sql/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh (with props)
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh (with props)
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh
ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/
ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/%40dep_etl.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+source ./scrubber.properties.sh
+
+scrubberdb='mysql -D $SCRUBBER_DB_NAME -u $SCRUBBER_DB_USER -p$SCRUBBER_DB_PWD';
+
+echo "######################################################################################"
+echo "step0_load_human_annotations.sql"
+scrubberdb< step0_load_human_annotations.sql
+
+echo "######################################################################################"
+echo "step1_load_machine_annotations.sql"
+scrubberdb< step1_load_machine_annotations.sql
+
+echo "######################################################################################"
+echo "step2_FeatureBin.sql"
+scrubberdb< step2_FeatureBin.sql
+
+echo "######################################################################################"
+echo "step3_Feature.sql"
+scrubberdb< step3_Feature.sql
+
+
+echo "######################################################################################"
+echo "step4_FeatureMatrix.sql"
+scrubberdb< step4_FeatureMatrix.sql
+scrubberdb< step4_FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+echo "step5_export_csv.sql"
+
+pushd . ;
+ cd /tmp/mysql; rm -f Sample.csv FeatureMatrix.csv FeatureMatrixOnt.csv Sentence.csv TermFrequency.csv MyFeatureIdx.csv MyFeatureHeader.csv;
+popd
+
+scrubberdb< step5_export_csv.sql
+
+mkdir -p csv
+mv /tmp/mysql/Sample.csv ./csv/.
+mv /tmp/mysql/FeatureMatrix.csv ./csv/.
+mv /tmp/mysql/FeatureMatrixOnt.csv ./csv/.
+mv /tmp/mysql/TermFrequency.csv ./csv/.
+mv /tmp/mysql/MyFeatureSelection.csv ./csv/.
+mv /tmp/mysql/MyFeatureIdx.csv ./csv/.
+mv /tmp/mysql/MyFeatureHeader.csv ./csv/.
+
+echo "######################################################################################"
+cat README
+
+
+
+
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO Sun Jul 7 19:39:00 2013
@@ -0,0 +1,67 @@
+1. human annotations
+java $JAVA_OPTS -cp $CP $SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION $SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TRAIN _train
+java $JAVA_OPTS -cp $CP $SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION $SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TEST _test
+
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_train.xml
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_test.xml
+
+
+2. run UIMA : train
+mv csv/*.csv csv/train/
+
+2. run UIMA : test
+
+mv csv/*.csv csv/test/
+
+3. copy csv files to temp dir
+
+rm -rf /tmp/mysql/
+mkdir -p /tmp/mysql/
+chmod -R 777 /tmp/mysql/.
+cd csv/train
+cp *.csv /tmp/mysql/.
+
+3. load csv files into mysql: train
+
+source schema.sh
+source load_train.sh
+
+
+
+
+
+last_build/scrubber/csv/train $
+
+rm -rf /tmp/mysql/
+mkdir -p /tmp/mysql/
+chmod -R 777 /tmp/mysql/.
+cp csv/*.csv /tmp/mysql/.
+
+...
+
+3. test
+
+
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_test.xml
+
+
+3. create SCHEMA
+CSVAnnotation$ source ./schema.sh
+
+4. train
+CSVAnnotation$ source ./load_train.sh
+
+6. analysis
+
+data/csv/test $ mv /tmp/mysql/*.csv .
+data/csv/test $ matlab etl.m
+
+
+
+
+
+
+
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+echo "######################################################################################"
+source schema/schema.sh
+
+echo "######################################################################################"
+scrubberdb -e 'call USE_TEST'
+scrubberdb < machine/load.sql
+
+echo "######################################################################################"
+scrubberdb< machine/Sample.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureMatrix.sql
+scrubberdb< machine/FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+scrubberdb< machine/export.sql
\ No newline at end of file
Propchange: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+echo "######################################################################################"
+source schema/schema.sh
+
+echo "######################################################################################"
+scrubberdb -e 'call USE_TRAIN'
+scrubberdb < human/load.sql
+scrubberdb < machine/load.sql
+
+echo "######################################################################################"
+scrubberdb< machine/Sample.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureBin.sql
+scrubberdb< machine/Feature.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureMatrix.sql
+scrubberdb< machine/FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+scrubberdb< machine/export.sql
\ No newline at end of file
Propchange: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,133 @@
+-- Export Sample/feature tables to CSV files under /tmp/mysql for WEKA/Matlab.
+-- SELECT ... INTO OUTFILE fails if the target file already exists, so the
+-- caller is expected to clear /tmp/mysql first (see @dep_etl.sh).
+call Session;
+
+call log('export_csv.sql', 'begin');
+-- ####
+
+-- Keep only the feature datatypes the downstream tools consume.
+-- Casing fixed: table names are case-sensitive on case-sensitive filesystems,
+-- so the original "drop table if exists myFeatureSelection" never dropped
+-- MyFeatureSelection and re-runs failed on CREATE.
+drop table if exists MyFeatureSelection;
+create table MyFeatureSelection like Feature;
+
+insert into MyFeatureSelection
+select * from Feature where datatype in (
+ 'double',
+ 'nominal(int)',
+ 'nominal(4)',
+ 'binary'
+);
+
+call log('Sample.csv', 'begin');
+
+select file_idx, sample_idx, sentence_idx, start_idx, end_idx, token_seq, class_idx
+from Sample
+order by sample_idx
+into
+ outfile '/tmp/mysql/Sample.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('Sample.csv', 'done');
+call log('SentenceIdx.csv', 'begin');
+
+-- Sentence-to-sample alignment (explicit join replaces the comma join).
+select s.file_idx, s.sample_idx, s.sentence_idx, sample_idx_start, sample_idx_end, s.sentence_seq
+from Sentence sentence
+inner join Sample s on sentence.sentence_idx = s.sentence_idx
+order by s.sample_idx asc
+into
+ outfile '/tmp/mysql/SentenceIdx.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('SentenceIdx.csv', 'done');
+call log('TermFrequency.csv', 'begin');
+
+-- TODO HACK ! SYM: unresolved pos_% placeholders are lumped under SYM so the
+-- POS left join below still finds a row.
+update TermFrequency set pos = 'SYM' where pos like 'pos_%';
+
+select file_idx, sample_idx,
+ TF.tf_ham_without_pos, TF.tf_ham_with_pos , TF.token_len,
+ POS.pos_idx, POS.pos_bin_idx
+from TermFrequency TF left join POS on TF.pos = POS.pos
+order by sample_idx
+into
+ outfile '/tmp/mysql/TermFrequency.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('TermFrequency.csv', 'done');
+call log('MyFeatureIdx.csv', 'begin');
+
+-- Datatype codes for the Matlab side: 1=binary, 3=double, 2=everything else.
+select feature_idx, CASE datatype
+when 'binary' then 1
+WHEN 'double' THEN 3
+ELSE 2 end
+from MyFeatureSelection
+order by feature_idx
+into
+ outfile '/tmp/mysql/MyFeatureIdx.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('MyFeatureIdx.csv', 'done');
+-- call log('MyFeatureSelection.csv', 'csv');
+
+-- NOTE(review): @dep_etl.sh moves MyFeatureSelection.csv into ./csv; if that
+-- driver is used this export needs to be re-enabled.
+-- select *
+-- from MyFeatureSelection
+-- order by feature_idx
+-- into
+-- outfile '/tmp/mysql/MyFeatureSelection.csv'
+-- FIELDS TERMINATED BY ','
+-- LINES TERMINATED BY '\n';
+--
+-- call log('MyFeatureSelection.csv', 'csv');
+
+-- call log('MyFeatureHeader.csv', 'begin');
+
+set group_concat_max_len = 8096;
+
+-- Build human-readable column labels: "<idx>.<type>.<source>.<value>.<datatype>".
+drop table if exists my_FeatureHeader;
+create temporary table my_FeatureHeader
+select feature_idx, concat(feature_idx, '.',annotation_type, '.', match_source, '.', match_value, '.',datatype) as feature_label
+from MyFeatureSelection
+order by feature_idx;
+
+-- Row 0 is the class column label.
+insert into my_FeatureHeader values (0,'Class_PHI_Label');
+
+-- Collapse the '..' left behind by empty match_value fields.
+update my_FeatureHeader set feature_label=replace(feature_label, '..', '.');
+
+-- Header is written as a single ';'-separated line.
+select feature_label
+from my_FeatureHeader
+order by feature_idx
+into
+ outfile '/tmp/mysql/MyFeatureHeader.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY ';';
+
+call log('MyFeatureHeader.csv', 'done');
+call log('FeatureMatrix.csv', 'begin');
+
+select s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx, m.feature_value
+from FeatureMatrix m
+inner join Sample s on m.sample_idx = s.sample_idx
+order by m.sample_idx, m.feature_idx
+into
+ outfile '/tmp/mysql/FeatureMatrix.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('FeatureMatrix.csv', 'done');
+call log('FeatureMatrixOnt.csv', 'begin');
+
+select s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx, m.count_matches
+from FeatureMatrixOnt m
+inner join Sample s on m.sample_idx = s.sample_idx
+order by m.sample_idx, m.feature_idx
+into
+ outfile '/tmp/mysql/FeatureMatrixOnt.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+-- ####
+call log('FeatureMatrixOnt.csv', 'done');
+-- Moved to the very end so the log reflects the whole export (originally
+-- this fired before the FeatureMatrix exports ran).
+call log('export_csv.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,56 @@
+-- Merge the train/test human annotation tables into HumanAnnotations and
+-- build the Class lookup table (PHI tag name -> class_idx).
+-- NOTE(review): @DATASET is assumed to be set by 'call Session' (or the
+-- USE_TRAIN/USE_TEST procedures) before this script runs -- confirm.
+call Session;
+
+call log('load_human_annotations.sql', 'begin');
+
+call log('HumanAnnotations', 'begin');
+
+call drop_table('HumanAnnotations');
+
+create table HumanAnnotations like human_annotations_train;
+alter table HumanAnnotations drop column id;
+alter table HumanAnnotations add column DATASET varchar(20);
+
+-- (Was a duplicate 'begin' log entry.)
+call log('HumanAnnotations', 'created');
+call log('HumanAnnotations', 'TRAIN');
+
+insert into HumanAnnotations (DATASET, subject_id, tag_name, tag_value, start_idx,end_idx)
+select 'TRAIN' , subject_id, tag_name, tag_value, start_idx,end_idx
+from human_annotations_train;
+
+-- Count AFTER the insert so the logged number reflects the TRAIN rows
+-- (the original captured @num_rows before inserting, always logging 0).
+select count(*) from HumanAnnotations into @num_rows;
+
+call log('HumanAnnotations', @num_rows);
+call log('HumanAnnotations', 'TEST');
+
+insert into HumanAnnotations (DATASET, subject_id, tag_name, tag_value, start_idx,end_idx)
+select 'TEST' , subject_id, tag_name, tag_value, start_idx,end_idx
+from human_annotations_test;
+
+select count(*) from HumanAnnotations into @num_rows;
+
+call log('HumanAnnotations', @num_rows);
+call log('HumanAnnotations', 'done');
+call log('Class', 'begin');
+
+call drop_table('Class');
+
+create table Class
+(
+ class_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ class varchar(20),
+ cnt integer,
+ DATASET varchar(10)
+) engine = myisam;
+
+-- 'NA' is the default class for tokens with no human annotation.
+insert into Class (DATASET, class) values (@DATASET, 'NA');
+insert into Class (DATASET, class, cnt)
+select @DATASET, tag_name as class, count(*) as cnt
+from HumanAnnotations
+where DATASET = @DATASET
+group by tag_name;
+
+alter table Class add index (class);
+
+call log('Class', 'done');
+call log('load_human_annotations.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,148 @@
+-- Build the Feature catalog: one row per model feature (term-frequency
+-- stats, bins, cTAKES token/annotation types, POS tags, regex rules), then
+-- back-fill FeatureBin with the assigned feature indexes and build the POS
+-- lookup table used by the CSV export.
+call Session;
+
+call log('Feature', 'begin');
+
+call drop_table('Feature');
+
+create table Feature
+(
+ annotation_type varchar(25) not null,
+ match_source varchar(25) not null,
+ match_value varchar(25) null default '',
+ cnt int null,
+ datatype varchar(25) null,
+ description text null
+) engine=myisam;
+
+alter table Feature add index (match_source);
+alter table Feature add index (match_value);
+alter table Feature add index (match_source, match_value);
+
+call log('Feature', 'TermFrequency');
+
+insert into Feature
+select 'TermFrequency', 'tf_ham_without_pos',null, count(*) as cnt, 'double', 'Publication Term Frequency'
+from TermFrequency;
+
+insert into Feature
+select 'TermFrequency', 'tf_ham_with_pos',null, count(*) as cnt, 'double', 'Publication Term Frequency with Part of Speech'
+from TermFrequency;
+
+insert into Feature
+select 'TermFrequency', 'token_len', null, count(*) as cnt, 'nominal(int)', 'String Length {WordToken or NumToken}'
+from TermFrequency;
+
+call log('Feature', 'bin');
+
+-- One 'bin' feature per ontology source, plus one per FeatureBin bin.
+insert into Feature
+select 'bin', 'ontology', match_source, count(*) as cnt, 'nominal(int)', '#matches in Dictionary'
+from OntologyMatch
+group by match_source
+order by cnt desc;
+
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'nominal(int)', '@see FeatureBin'
+from FeatureBin where match_source = 'regex'
+group by match_source, match_bin;
+
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'nominal(int)', '@see FeatureBin'
+from FeatureBin where match_source != 'regex'
+group by match_source, match_bin;
+
+call log('Feature','WordToken:cap');
+
+insert into Feature
+select 'WordToken', 'cap', null, count(*) as cnt, 'nominal(4)', '(ctakes) Word Capitalization'
+from WordToken;
+
+call log('Feature','BaseToken...*');
+
+insert into Feature
+select annotation_type, 'BaseToken', null, count(*), 'binary', '(ctakes) token type'
+from BaseToken group by annotation_type order by count(*) desc;
+
+call log('Feature','Annotation...*');
+
+insert into Feature
+select annotation_type, 'Annotation', null, count(*), 'binary', '(ctakes) context dependent'
+from Annotation
+where annotation_type in
+(
+ 'RomanNumeralAnnotation',
+ 'FractionAnnotation',
+ 'DateAnnotation',
+ 'MeasurementAnnotation',
+ 'RangeAnnotation',
+ 'PersonTitleAnnotation',
+ 'TimeAnnotation'
+)
+group by annotation_type
+order by count(*) desc;
+
+call log('Feature','BaseToken:pos');
+
+insert into Feature
+select 'BaseToken', 'pos', pos, count(*) as cnt, 'binary', '(ctakes) Part Of Speech'
+from BaseToken
+where annotation_type = 'WordToken' or annotation_type = 'NumToken'
+group by pos
+order by cnt desc;
+
+call log('Feature','regex');
+
+insert into Feature
+select annotation_type, match_source, match_value, count(*) as cnt, 'binary', '(Beckwith) regular expression'
+from OntologyMatch
+where match_source = 'regex'
+group by match_source, match_value
+order by count(*) desc;
+
+-- Assign feature indexes only after all inserts so the ids are contiguous.
+ALTER TABLE Feature add feature_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+ALTER TABLE Feature add index (annotation_type, match_source, match_value);
+
+update Feature set match_value = '' where match_value is null;
+
+CALL log('Feature', 'FeatureBin:index');
+
+alter table FeatureBin add index (match_source, match_value);
+alter table FeatureBin add index (match_source, match_value, match_bin);
+
+alter table FeatureBin add column match_value_feature_idx smallint unsigned;
+alter table FeatureBin add column match_bin_feature_idx smallint unsigned;
+
+alter table FeatureBin add index(match_value_feature_idx);
+
+update FeatureBin b, Feature f
+set b.match_bin_feature_idx = f.feature_idx
+where f.annotation_type = 'bin' and f.match_value = b.match_bin;
+
+-- BUGFIX: the original predicate was "f.match_source = f.match_source"
+-- (a tautology), so rows were joined on match_value alone and could pick
+-- up a feature_idx from the wrong source.  Join on the FeatureBin source.
+update FeatureBin b, Feature f
+set b.match_value_feature_idx = f.feature_idx
+where f.match_source = b.match_source and f.match_value = b.match_value;
+
+-- CALL log('Feature', 'removing sparse features');
+--
+-- select * from Feature where annotation_type != 'bin' and cnt < 10;
+-- delete from Feature where annotation_type != 'bin' and cnt < 10;
+
+CALL log('Feature', 'FeatureBin:done');
+CALL log('Feature', 'POS:begin');
+
+-- POS lookup consumed by the CSV export (TermFrequency join).
+drop table if exists POS;
+create table POS
+(
+ pos varchar(25) null,
+ pos_idx smallint unsigned,
+ pos_bin_idx smallint unsigned
+)
+engine = myisam;
+
+insert into POS (pos, pos_idx, pos_bin_idx)
+select distinct match_value, match_value_feature_idx, match_bin_feature_idx from FeatureBin where match_source='pos';
+
+alter table POS add index (pos);
+CALL log('Feature', 'POS:done');
+call log('Feature', 'done');
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,192 @@
+-- FeatureBin: static lookup mapping fine-grained feature values (Penn
+-- Treebank POS tags, regex rule names) to coarse feature bins.
+-- Feature.sql later back-fills feature indexes onto this table.
+call Session;
+call log('Step(2)', 'begin');
+
+CALL log('FeatureBin', 'create');
+
+call drop_table('FeatureBin');
+
+create table FeatureBin
+(
+ match_source varchar(50) not null,
+ match_value varchar(50) not null,
+ match_bin varchar(50) not null
+)
+engine = myisam;
+
+-- POS tags grouped into coarse grammatical bins.
+insert into FeatureBin values
+('pos','CC', 'com-dep-wd'),
+('pos','CT', 'com-dep-wd'),
+('pos','DT', 'com-dep-wd'),
+('pos','EX', 'com-dep-wd'),
+('pos','IN', 'com-dep-wd'),
+('pos','MD', 'com-dep-wd'),
+('pos','PDT', 'com-dep-wd'),
+('pos','RP', 'com-dep-wd'),
+('pos','TO', 'com-dep-wd'),
+('pos','UH', 'com-dep-wd'),
+('pos','WDT', 'com-dep-wd'),
+('pos','FW', 'FW-Symb'),
+('pos','SYM', 'FW-Symb'),
+('pos','JJ', 'Adjectives'),
+('pos','JJR', 'Adjectives'),
+('pos','JJS', 'Adjectives'),
+('pos','NN', 'Nouns'),
+('pos','NNS', 'Nouns'),
+('pos','NNP', 'Nouns'),
+('pos','NNPS', 'Nouns'),
+('pos','WRB', 'Adverbs'),
+('pos','RB', 'Adverbs'),
+('pos','RBR', 'Adverbs'),
+('pos','RBS', 'Adverbs'),
+('pos','VB', 'Verbs'),
+('pos','VBD', 'Verbs'),
+('pos','VBG', 'Verbs'),
+('pos','VBN', 'Verbs'),
+('pos','VBP', 'Verbs'),
+('pos','VBZ', 'Verbs'),
+('pos','PRP', 'Pronouns'),
+('pos','PRP$', 'Pronouns'),
+('pos','WP', 'Pronouns'),
+('pos','WP$', 'Pronouns'),
+('pos','CD', 'Numbers'),
+('pos','LS', 'Numbers')
+;
+
+-- Regex rule names grouped into PHI category bins.
+insert into FeatureBin values
+('regex', 'HOSPITAL_1', 'hospital'),
+('regex', 'HOSPITAL_2', 'hospital');
+
+
+-- Scratch list reused for each regex bin below (insert names, copy into
+-- FeatureBin with the bin label, truncate, repeat).
+drop table if exists temp_regex_list;
+create temporary table temp_regex_list
+(
+match_value varchar(50) not null
+);
+
+-- 'phone' bin
+insert into temp_regex_list values
+('EMAIL_ADDRESS'),
+('TELEPHONE2'),
+('TELEPHONE3'),
+('TELEPHONE0'),
+('TELEPHONE1'),
+('EXTENSION'),
+('IP');
+
+insert into FeatureBin select 'regex', match_value, 'phone' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'age' bin
+insert into temp_regex_list values
+('WRITTEN_AGE_10_100_DIV10'),
+('WRITTEN_AGE_110_TO_119'),
+('WRITTEN_AGE_20_TO_99'),
+('WRITTEN_AGE_1_TO_19'),
+('AGE'),
+('AGED_OVER'),
+('AGE4'),
+('AGE5'),
+('AGE6'),
+('AGE7'),
+('AGE8');
+
+insert into FeatureBin select 'regex', match_value, 'age' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'date' bin
+insert into temp_regex_list values
+('DATE'),
+('DATE1'),
+('DATE2'),
+('DATE3'),
+('DATE4'),
+('DATE5'),
+('DATE5_U'),
+('DATE6'),
+('DATE7'),
+('DATE8'),
+('DATE9'),
+('DATE11'),
+('DATE12'),
+('DATE13'),
+('DATE14'),
+('DATE15'),
+('DATE16'),
+('DATE17'),
+('DATE18'),
+('DATE19'),
+('DATE20'),
+('DATE21'),
+('DATE22'),
+('DATE_SEPARATORS'),
+('YEAR_CENTURY'),
+('MONTH'),
+('MONTH2'),
+('DISCHARGED'),
+('DISCHARGED2'),
+('DATE_23');
+
+insert into FeatureBin select 'regex', match_value, 'date' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'address' bin
+insert into temp_regex_list values
+('ADDRESS'),
+('ADDRESS2'),
+('ADDRESS4'),
+('ADDRESS5'),
+('ADDRESS6'),
+('ADDRESS7'),
+('ADDRESS8'),
+('POBOX'),
+('ROOM'),
+('ZIP_CODE'),
+('LOCATION_FLOOR_1'),
+('LOCATION_FLOOR_2');
+
+insert into FeatureBin select 'regex', match_value, 'address' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'doctor' bin
+insert into temp_regex_list values
+('SURGEON'),
+('SURGEON2'),
+('SURGEON3'),
+('SURGEON4'),
+('SURGEON5'),
+('SURGEON6'),
+('SURGEON7'),
+('SURGEON8'),
+('ASST'),
+('ASST2'),
+('ASST3'),
+('PA'),
+('CC'),
+('CC2'),
+('FROM'),
+('CONSULTATION'),
+('TECH'),
+('DOCTOR0_00'),
+('DOCTOR0_0'),
+('DOCTOR0_1'),
+('DOCTOR0_2'),
+('DOCTOR0'),
+('DOCTOR1'),
+('DOCTOR2'),
+('DOCTOR3'),
+('DOCTOR4'),
+('DOCTOR_OLDER'),
+('DOCTOR_GEN'),
+('DOCTOR_SUBHEAD');
+
+insert into FeatureBin select 'regex', match_value, 'doctor' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'patient' bin
+insert into temp_regex_list values
+('TITLES'),
+('TITLE_THIRD'),
+('PATIENT_NAME'),
+('MR'),
+('MS');
+
+
+insert into FeatureBin select 'regex', match_value, 'patient' from temp_regex_list;
+truncate temp_regex_list;
+
+CALL log('FeatureBin', 'create is done');
+call log('Step(2)', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,168 @@
+-- Populate FeatureMatrix with one (file, sample, feature, value) row per
+-- active feature per sample token.  Statement order matters: the final
+-- "bin" roll-up aggregates rows inserted by the earlier steps.
+call Session;
+
+call log('FeatureMatrix.sql', 'begin');
+
+call log('FeatureMatrix', 'begin');
+call log('FeatureMatrix', 'create');
+drop table if exists FeatureMatrix;
+create table FeatureMatrix
+(
+ file_idx smallint not null,
+ sample_idx integer not null,
+ feature_idx smallint not null,
+ feature_value float unsigned default 0
+) engine=myisam;
+
+call log('FeatureMatrix', 'TermFrequency');
+
+-- Feature indexes were assigned in Feature.sql; cache them in session vars.
+select feature_idx into @tf_ham_without_pos from Feature where match_source = 'tf_ham_without_pos';
+select feature_idx into @tf_ham_with_pos from Feature where match_source = 'tf_ham_with_pos';
+select feature_idx into @token_len from Feature where match_source = 'token_len';
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @tf_ham_without_pos, tf_ham_without_pos from TermFrequency;
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @tf_ham_with_pos, tf_ham_with_pos from TermFrequency;
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @token_len, token_len from TermFrequency;
+
+call log('FeatureMatrix', 'TermFrequency:done');
+call log('FeatureMatrix', 'WordToken:cap');
+
+select feature_idx into @cap from Feature where annotation_type = 'WordToken' and match_source = 'cap';
+
+-- Capitalization code of the word token aligned with each sample.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, @cap, w.cap
+from Sample s, WordToken w
+where w.file_idx = s.file_idx and
+ w.token_seq = s.token_seq ;
+
+call log('FeatureMatrix', 'WordToken:cap:done');
+call log('FeatureMatrix', 'BaseToken');
+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+
+-- | WordToken | BaseToken | NULL | binary | 5 |
+-- | PunctuationToken | BaseToken | NULL | binary | 6 |
+-- | NumToken | BaseToken | NULL | binary | 7 |
+-- | SymbolToken | BaseToken | NULL | binary | 8 |
+-- | ContractionToken | BaseToken | NULL | binary | 9 |
+
+-- NumToken and WordToken only
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s
+where f.match_source = 'BaseToken' and
+ f.annotation_type = s.annotation_type;
+
+-- Flags samples whose PRECEDING token (token_seq-1) is punctuation/symbol/
+-- contraction.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, BaseToken b
+where f.match_source = 'BaseToken' and
+ f.annotation_type = b.annotation_type and
+ b.file_idx = s.file_idx and
+ b.token_seq = s.token_seq-1 and
+ b.annotation_type in ('PunctuationToken', 'SymbolToken', 'ContractionToken');
+
+call log('FeatureMatrix', 'BaseToken:done');
+call log('FeatureMatrix', 'Annotation');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | RomanNumeralAnnotation | Annotation | NULL | binary | 10 |
+-- | FractionAnnotation | Annotation | NULL | binary | 11 |
+-- | DateAnnotation | Annotation | NULL | binary | 12 |
+-- | MeasurementAnnotation | Annotation | NULL | binary | 13 |
+-- | RangeAnnotation | Annotation | NULL | binary | 14 |
+-- | PersonTitleAnnotation | Annotation | NULL | binary | 15 |
+-- | TimeAnnotation | Annotation | NULL | binary | 16 |
+
+-- Annotation is inside the WordNum
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, Annotation a
+where f.match_source = 'Annotation' and
+ f.annotation_type = a.annotation_type and
+ a.file_idx = s.file_idx and
+ a.start_idx >= s.start_idx and
+ a.end_idx <= s.end_idx ;
+
+-- WordNum is inside the Annotation
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, Annotation a
+where f.match_source = 'Annotation' and
+ f.annotation_type = a.annotation_type and
+ a.file_idx = s.file_idx and
+ a.start_idx <= s.start_idx and
+ a.end_idx >= s.end_idx ;
+
+call log('FeatureMatrix', 'Annotation:done');
+call log('FeatureMatrix', 'BaseToken:pos');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | BaseToken | pos | NN | binary | 32 |
+-- | BaseToken | pos | JJ | binary | 33 |
+-- | BaseToken | pos | CD | binary | 34 |
+-- | BaseToken | pos | IN | binary | 35 |
+-- | BaseToken | pos | NNP | binary | 36 |
+-- | BaseToken | pos | DT | binary | 37 |
+-- | ... | ... | ... | ... | .. |
+-- | BaseToken | pos | pos_paren | binary | 71 |
+-- +------------------------+--------------+-------------+------------+-------------+
+
+-- One binary feature per POS tag of the sample token.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s
+where f.annotation_type = 'BaseToken' and
+ f.match_source = 'pos' and
+ f.match_value = s.pos ;
+
+call log('FeatureMatrix', 'BaseToken:pos:done');
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | OntologyMatch | regex | DATE22 | binary | 72 |
+-- | OntologyMatch | regex | DATE21 | binary | 73 |
+-- | OntologyMatch | regex | DATE17 | binary | 74 |
+-- | OntologyMatch | regex | DATE18 | binary | 75 |
+-- | OntologyMatch | regex | DATE_SEPARATORS | binary | 76 |
+-- | OntologyMatch | regex | DATE1 | binary | 77 |
+-- | OntologyMatch | regex | DATE4 | binary | 78 |
+-- | OntologyMatch | regex | YEAR_CENTURY | binary | 79 |
+-- | OntologyMatch | regex | SUSPICIOUS_NUM | binary | 80 |
+-- | OntologyMatch | regex | MONTH | binary | 81 |
+-- | ... | ... | ... | ... | ... |
+-- | OntologyMatch | regex | ADDRESS | binary | 148 |
+-- +-----------------+--------------+-----------------+----------+-------------+
+
+-- Regex match spans that fully cover the sample token.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, OntologyMatch o
+where f.annotation_type = 'OntologyMatch' and
+ f.match_source = 'regex' and
+ f.match_source = o.match_source and
+ f.match_value = o.match_value and
+ o.file_idx = s.file_idx and
+ o.start_idx <= s.start_idx and
+ o.end_idx >= s.end_idx ;
+
+call log('FeatureMatrix', 'OntologyMatch:regex:done');
+
+call log('FeatureMatrix', 'bin');
+
+-- Roll the fine-grained rows inserted above up into their coarse bins
+-- (count per bin); reads and writes FeatureMatrix in one statement, so it
+-- must run last.
+insert into FeatureMatrix
+select fm.file_idx, fm.sample_idx, b.match_bin_feature_idx, count(*)
+from FeatureMatrix fm, FeatureBin b
+where fm.feature_idx = match_value_feature_idx
+group by fm.file_idx, fm.sample_idx, b.match_bin_feature_idx;
+
+call log('FeatureMatrix', 'bin:done');
+call log('FeatureMatrix.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,33 @@
+call Session;
+
+call log('FeatureMatrixOnt.sql', 'begin');
+
+call drop_table('FeatureMatrixOnt');
+
+create table FeatureMatrixOnt
+(
+ file_idx smallint unsigned not null,
+ sample_idx integer unsigned not null,
+ feature_idx smallint unsigned not null,
+ count_matches smallint unsigned not null
+)
+engine=myisam;
+
+call log('FeatureMatrixOnt', 'begin');
+
+call insert_FeatureMatrixOnt('dict') ;
+call insert_FeatureMatrixOnt('ICD10%') ;
+call insert_FeatureMatrixOnt('ICD9CM') ;
+call insert_FeatureMatrixOnt('RXNORM') ;
+call insert_FeatureMatrixOnt('HL7V3.0') ;
+call insert_FeatureMatrixOnt('HL7V2.5') ;
+call insert_FeatureMatrixOnt('COSTAR');
+call insert_FeatureMatrixOnt('MSH');
+call insert_FeatureMatrixOnt('LNC');
+call insert_FeatureMatrixOnt('SNOMEDCT');
+
+insert into FeatureMatrix select * from FeatureMatrixOnt;
+
+call log('FeatureMatrixOnt', 'done');
+
+call log('FeatureMatrixOnt.sql', 'done');
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,157 @@
+call Session;
+
+call log('Sample', 'begin');
+call log('Sample', 'create');
+call log('Sample', 'BaseToken');
+
+call drop_table('Sample');
+
+create table Sample
+ select * from BaseToken where
+ annotation_type = 'NumToken' or
+ annotation_type = 'WordToken'
+ order by file_idx, token_seq asc;
+
+call log('Sample', 'BaseToken');
+call log('Sample', 'index');
+
+alter table Sample engine = myisam;
+alter table Sample add class_idx tinyint UNSIGNED NULL;
+alter table Sample add sample_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+alter table Sample add index (file_idx);
+alter table Sample add index (file_idx, token_seq);
+alter table Sample add index (file_idx, start_idx, end_idx);
+
+call log('Sample', 'index:done');
+call log('Sample', 'class:begin');
+
+call log('Class', 'table');
+drop table if exists Class;
+create table Class
+(
+ class_idx tinyint unsigned,
+ class varchar(10) not null
+)
+engine = myisam;
+
+insert into Class values
+ (1, 'NA'),
+ (2, 'AGE'),
+ (3, 'DATE'),
+ (4, 'DOCTOR'),
+ (5, 'HOSPITAL'),
+ (6, 'ID'),
+ (7, 'LOCATION'),
+ (8, 'PATIENT'),
+ (9, 'PHONE');
+
+call log('Class', 'done');
+
+update Sample s, Class c
+set s.class_idx = c.class_idx
+where s.class = c.class;
+
+update Sample set class_idx = 0 where class_idx is null;
+
+call log('Sample', 'class:done');
+call log('Sample', 'sentence');
+
+call drop_table('tmp_sentence_bounds');
+
+create temporary table tmp_sentence_bounds select
+file_idx, sentence_seq, min(sample_idx) sample_idx_start, max(sample_idx) sample_idx_end
+from Sample group by file_idx, sentence_seq order by file_idx, sentence_seq;
+
+call drop_table('tmp_Sentence');
+
+create table tmp_Sentence select
+ annotation_type ,
+ filename_short ,
+ start_idx ,
+ end_idx ,
+ sentence_seq ,
+ file_idx
+from Sentence
+order by file_idx, sentence_seq;
+
+alter table tmp_Sentence add column sample_idx_start mediumint unsigned;
+alter table tmp_Sentence add column sample_idx_end mediumint unsigned;
+
+alter table tmp_Sentence add index (file_idx, sentence_seq);
+alter table tmp_Sentence add index (file_idx, sample_idx_start, sample_idx_end);
+
+update tmp_Sentence s, tmp_sentence_bounds bounds
+set s.sample_idx_start = bounds.sample_idx_start,
+ s.sample_idx_end = bounds.sample_idx_end
+where s.file_idx = bounds.file_idx and
+ s.sentence_seq = bounds.sentence_seq ;
+
+call drop_table('EmptySentence');
+
+create table EmptySentence
+select * from tmp_Sentence where sample_idx_end = 0;
+delete from tmp_Sentence where sample_idx_end = 0;
+
+alter table Sentence rename to dropme;
+alter table tmp_Sentence rename to Sentence;
+
+call log('Sample', 'sentence:index');
+
+alter table Sentence add column sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Sample add column sentence_idx integer unsigned ;
+drop table dropme;
+
+update Sample sample, Sentence s
+set sample.sentence_idx = s.sentence_idx
+where sample.file_idx = s.file_idx and
+ sample.start_idx >= s.start_idx and
+ sample.end_idx <= s.end_idx ;
+
+call log('Sample', 'sentence:done');
+call log('Sample', 'done');
+call log('TermFrequency', 'begin');
+
+call drop_table('TermFrequency');
+call log('TermFrequency', 'create');
+
+create table TermFrequency
+(
+ file_idx smallint unsigned,
+ sample_idx int(10) unsigned,
+
+ start_idx smallint unsigned,
+ end_idx smallint unsigned,
+
+ token varchar(100) not null,
+ token_len tinyint unsigned,
+
+ pos varchar(25) ,
+ tf_ham_with_pos double default -1,
+ tf_ham_without_pos double default -1
+
+) engine = myisam;
+
+insert into TermFrequency (file_idx, sample_idx, start_idx, end_idx, pos, token, token_len)
+select file_idx, sample_idx, start_idx, end_idx, pos, token, length(token)
+from Sample order by sample_idx;
+
+alter table TermFrequency add index (file_idx, sample_idx, start_idx);
+alter table TermFrequency add index (pos);
+
+update TermFrequency tf, Calculation c
+set tf_ham_without_pos = match_value
+where 'tf_ham_without_pos' = c.match_source and
+ tf.file_idx = c.file_idx and
+ tf.start_idx = c.start_idx and
+ tf.end_idx = c.end_idx ;
+
+update TermFrequency tf, Calculation c
+set tf_ham_with_pos = match_value
+where 'tf_ham_with_pos' = c.match_source and
+ tf.file_idx = c.file_idx and
+ tf.start_idx = c.start_idx and
+ tf.end_idx = c.end_idx ;
+
+call log('TermFrequency', 'done');
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,168 @@
+truncate log;
+
+call Session;
+
+call log('load_machine_annotations.sql', 'begin');
+
+CALL log('Annotation', 'begin');
+
+CALL drop_table('Annotation');
+
+create table Annotation
+(
+ annotation_type varchar(50),
+ filename_short varchar(20),
+ start_idx smallint,
+ end_idx smallint
+) engine = myisam;
+
+load data infile '/tmp/mysql/Annotation.csv' into table Annotation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Annotation', 'done');
+CALL log('Sentence', 'begin');
+
+CALL drop_table('Sentence');
+
+create table Sentence like Annotation;
+alter table Sentence add column sentence_seq smallint not null;
+
+load data infile '/tmp/mysql/Sentence.csv' into table Sentence
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Sentence', 'done');
+CALL log('BaseToken', 'begin');
+
+CALL drop_table('BaseToken');
+create table BaseToken like Annotation;
+
+alter table BaseToken add column token_seq smallint not null;
+alter table BaseToken add column pos varchar(25) not null;
+alter table BaseToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/BaseToken.csv' into table BaseToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('BaseToken', 'done');
+CALL log('NewLineToken', 'skipping...');
+-- CALL log('NewLineToken', 'begin');
+--
+-- create table NewLineToken like BaseToken;
+--
+-- load data infile '/tmp/mysql/NewLineToken.csv' into table NewLineToken
+-- fields terminated by ',' ignore 1 lines;
+--
+-- CALL log('NewLineToken', 'done');
+CALL log('WordToken', 'begin');
+
+CALL drop_table('WordToken');
+create table WordToken like Annotation;
+
+alter table WordToken add column token_seq smallint not null;
+alter table WordToken add column pos varchar(25) not null;
+alter table WordToken add column cap tinyint not null;
+alter table WordToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/WordToken.csv' into table WordToken
+fields terminated by ',' ignore 1 lines;
+
+alter table WordToken add index(pos, cap);
+
+CALL log('WordToken', 'done');
+CALL log('OntologyMatch','begin');
+
+CALL drop_table('OntologyMatch');
+create table OntologyMatch like Annotation;
+
+alter table OntologyMatch add column match_source varchar(50) not null;
+alter table OntologyMatch add column match_value varchar(50) not null;
+alter table OntologyMatch add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/OntologyMatch.csv' into table OntologyMatch
+fields terminated by ',' ignore 1 lines;
+
+CALL log('OntologyMatch', 'done');
+CALL log('Calculation', 'begin');
+
+CALL drop_table('Calculation');
+create table Calculation like OntologyMatch;
+
+alter table Calculation drop column token;
+
+load data infile '/tmp/mysql/Calculation.csv' into table Calculation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Calculation', 'done');
+CALL log('file_idx', 'begin');
+
+alter table Annotation add column file_idx smallint not null;
+alter table BaseToken add column file_idx smallint not null;
+alter table Sentence add column file_idx smallint not null;
+alter table WordToken add column file_idx smallint not null;
+alter table OntologyMatch add column file_idx smallint not null;
+alter table Calculation add column file_idx smallint not null;
+
+update Annotation set file_idx = trim(replace(filename_short, '.txt', ''));
+update BaseToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update Sentence set file_idx = trim(replace(filename_short, '.txt', ''));
+update WordToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update OntologyMatch set file_idx = trim(replace(filename_short, '.txt', ''));
+update Calculation set file_idx = trim(replace(filename_short, '.txt', ''));
+
+CALL log('file_idx', 'done');
+CALL log('index', 'begin');
+
+CALL log('WordToken', 'index');
+alter table WordToken add index (file_idx);
+alter table WordToken add index (file_idx, token_seq);
+alter table WordToken add index (file_idx, start_idx, end_idx);
+
+CALL log('BaseToken', 'index');
+alter table BaseToken add index (filename_short);
+alter table BaseToken add index (file_idx, token_seq);
+alter table BaseToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (annotation_type, file_idx, token_seq);
+
+CALL log('Annotation', 'index');
+alter table Annotation add index (annotation_type);
+alter table Annotation add index (file_idx, start_idx, end_idx);
+
+CALL log('Calculation', 'index');
+alter table Calculation add index (file_idx, start_idx, end_idx);
+
+CALL log('OntologyMatch', 'index');
+alter table OntologyMatch add index(match_source);
+alter table OntologyMatch add index(match_source, match_value);
+alter table OntologyMatch add index(file_idx, start_idx, end_idx);
+alter table OntologyMatch add index(match_source, match_value, file_idx, start_idx, end_idx);
+
+CALL log('Sentence', 'index');
+alter table Sentence add index(file_idx, start_idx, end_idx);
+alter table Sentence add index(file_idx, sentence_seq);
+
+CALL log('index', 'done');
+call log('BaseToken:sentence_seq', 'begin');
+
+alter table BaseToken add sentence_seq integer not null;
+
+update BaseToken b, Sentence s
+set b.sentence_seq = s.sentence_seq
+where b.file_idx = s.file_idx and
+ b.start_idx >= s.start_idx and
+ b.end_idx <= s.end_idx ;
+
+call log('BaseToken:sentence_seq', 'done');
+
+alter table BaseToken add class varchar(20) not null;
+
+update BaseToken b, HumanAnnotations phi
+set b.class = phi.tag_name
+where phi.DATASET = @DATASET and
+ b.filename_short= phi.subject_id and
+ b.start_idx >= phi.start_idx and
+ b.end_idx <= phi.end_idx ;
+
+call log('BaseToken:class', 'done.');
+CALL log('Annotation', 'done');
+
+call log('load_machine_annotations.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README Sun Jul 7 19:39:00 2013
@@ -0,0 +1,50 @@
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
+| entity_topic | entity_type | entity_name | example | description |
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
+| README | version | SCRUBBER 3.1-SNAPSHOT | | |
+| README | version | cTAKES 1.2.2 | | |
+| global | | | | |
+| global | variable | @DATASET | select TEST into @DATASET | (TRAIN | TEST) select dataset |
+| global | variable | @CSV_DIR | select /tmp/mysql/ into @CSV_DIR | call csvwrite(table) |
+| global | variable | @LOG_LEVEL | select 1 into @LOG_LEVEL | turn on printing log to console |
+| shell | | | | |
+| shell | procedure | mem | call mem | get schema +memory usage |
+| shell | procedure | ps | call ps | show current sql process |
+| shell | procedure | csvread | call csvread(WordToken) | csv->table |
+| shell | procedure | csvwrite | call csvwrite(WordToken) | table->csv |
+| logging | | | | |
+| logging | table | log | call log | tail | etime | scrubber application log |
+| logging | procedure | log | call log (tablename, message) | add new log event |
+| logging | procedure | tail | call tail | print recent log events |
+| logging | procedure | etime | call etime | print elapsed time from log |
+| ctakes | | | | |
+| ctakes | uima | Annotation | desc table | csvread | csvwrite | UIMA base class |
+| ctakes | -> | Sentence | desc table | csvread | csvwrite | sequence of annotation tokens |
+| ctakes | -> | Calculation | desc table | csvread | csvwrite | TermFrequency |
+| ctakes | -> | OntologyMatch | desc table | csvread | csvwrite | UMLS, Regex, and Private dictionaries |
+| ctakes | type | BaseToken | desc table | csvread | csvwrite | CTAKES base class |
+| ctakes | -> | NumToken | desc table | csvread | csvwrite | token is a number |
+| ctakes | -> | WordToken | desc table | csvread | csvwrite | token is a word |
+| Classification | | | | |
+| Classification | table | Class | desc table | show index | PHI| NA |
+| Classification | table | Sample | desc table | show index | Class ( NumToken | WordToken ) |
+| Classification | table | Feature | desc table | show index | Lexical | Stat | Pattern | Ontology |
+| Classification | table | FeatureBin | desc table | show index | POS | UMLS | regex |
+| Classification | table | FeatureMatrix | desc table | show index | (row,col) = (Sample,Feature) |
+| Classification | table | FeatureMatrixOnt | desc table | show index | (row,col) = (Sample,OntologyMatch) |
+| Features | | | | |
+| Features | type | Feature | Lexical | Stat | Pattern | Ontology | |
+| Features | -> | Lexical | POS* | Capital | cTAKES* | |
+| Features | -> | Lexical | POS* | Part of Speech (BIN)? |
+| Features | -> | Lexical | Capital | Token Capitalization |
+| Features | -> | Lexical | cTAKES* | (context dependent) |
+| Features | -> | Stat | TF* | TokenLength | BagOfWords | |
+| Features | -> | Stat | TF* | Term Frequency (POS)? |
+| Features | -> | Stat | TokenLength | length(token) |
+| Features | -> | Stat | BagOfWords | #OntologyMatches per ontology |
+| Features | -> | Pattern | | |
+| Features | -> | Pattern | Beckwith Regular Expressions | Beckwith et al. 2006 |
+| Features | -> | Pattern | custom | custom regex.list |
+| Features | -> | Ontology | | |
+| Features | -> | Ontology | UMLS | Hl7|ICD|LOINC|MESH|RXNORM|SNOMED|COSTAR |
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,103 @@
+drop table if exists README;
+create table README
+(
+ entity_topic varchar(50) not null default '',
+ entity_type varchar(50) not null default '',
+ entity_name varchar(50) not null default '',
+ example text null,
+ description text null
+);
+
+insert into README values
+('README', 'version', 'SCRUBBER 3.1-SNAPSHOT', null, null),
+('README', 'version', 'cTAKES 1.2.2', null, null);
+
+insert into README values
+('global','','','',''),
+('global', 'variable', '@DATASET', 'select TEST into @DATASET', '(TRAIN | TEST) select dataset'),
+('global', 'variable', '@CSV_DIR', 'select /tmp/mysql/ into @CSV_DIR', 'call csvwrite(table) '),
+('global', 'variable', '@LOG_LEVEL', 'select 1 into @LOG_LEVEL', 'turn on printing log to console');
+
+insert into README values
+('shell','','','',''),
+('shell', 'procedure', 'mem', 'call mem', 'get schema +memory usage'),
+('shell', 'procedure', 'ps', 'call ps', 'show current sql process'),
+('shell', 'procedure', 'csvread', 'call csvread(WordToken)', 'csv->table'),
+('shell', 'procedure', 'csvwrite', 'call csvwrite(WordToken)', 'table->csv');
+
+insert into README values
+('logging','','','',''),
+('logging', 'table', 'log', 'call log | tail | etime', 'scrubber application log'),
+('logging', 'procedure', 'log', 'call log (tablename, message)', 'add new log event'),
+('logging', 'procedure', 'tail', 'call tail', 'print recent log events'),
+('logging', 'procedure', 'etime', 'call etime', 'print elapsed time from log');
+
+select 'desc table | csvread | csvwrite' into @csv_readwrite;
+select 'desc table | show index' into @desc_table;
+
+insert into README values
+('ctakes', '', '', null,null),
+('ctakes', 'uima', 'Annotation', @csv_readwrite, 'UIMA base class'),
+('ctakes', '->', 'Sentence', @csv_readwrite, 'sequence of annotation tokens'),
+('ctakes', '->', 'Calculation', @csv_readwrite, 'TermFrequency'),
+('ctakes', '->', 'OntologyMatch', @csv_readwrite, 'UMLS, Regex, and Private dictionaries'),
+('ctakes', 'type', 'BaseToken', @csv_readwrite, 'CTAKES base class'),
+('ctakes', '->', 'NumToken', @csv_readwrite, 'token is a number'),
+('ctakes', '->', 'WordToken', @csv_readwrite, 'token is a word');
+
+
+
+select 'PHI| NA' into @class;
+select 'Class ( NumToken | WordToken )' into @sample;
+select 'Lexical | Stat | Pattern | Ontology' into @feature;
+
+insert into README values
+('Classification','','','',''),
+('Classification', 'table', 'Class', @desc_table, @class),
+('Classification', 'table', 'Sample', @desc_table, @sample),
+('Classification', 'table', 'Feature', @desc_table, @feature),
+('Classification', 'table', 'FeatureBin', @desc_table, 'POS | UMLS | regex'),
+('Classification', 'table', 'FeatureMatrix', @desc_table, '(row,col) = (Sample,Feature)'),
+('Classification', 'table', 'FeatureMatrixOnt', @desc_table, '(row,col) = (Sample,OntologyMatch)');
+
+select 'POS* | Capital | cTAKES*' into @lexical;
+
+insert into README values
+('Features','','','',''),
+('Features', 'type', 'Feature', @feature, null),
+('Features', '->', 'Lexical', @lexical, null),
+('Features', '->', 'Lexical', 'POS*', 'Part of Speech (BIN)?'),
+('Features', '->', 'Lexical', 'Capital', 'Token Capitalization'),
+('Features', '->', 'Lexical', 'cTAKES*', '(context dependent)');
+
+select 'TF* | TokenLength | BagOfWords' into @stat;
+
+insert into README values
+('Features', '->', 'Stat', @stat, null),
+('Features', '->', 'Stat', 'TF*', 'Term Frequency (POS)?'),
+('Features', '->', 'Stat', 'TokenLength', 'length(token)'),
+('Features', '->', 'Stat', 'BagOfWords', '#OntologyMatches per ontology');
+
+select 'Beckwith Regular Expressions' into @pattern;
+
+insert into README values
+('Features', '->', 'Pattern', null, null),
+('Features', '->', 'Pattern', @pattern, 'Beckwith et al. 2006'),
+('Features', '->', 'Pattern', 'custom', 'custom regex.list');
+
+select 'UMLS | custom' into @ont;
+select 'Hl7|ICD|LOINC|MESH|RXNORM|SNOMED|COSTAR' into @umls;
+
+insert into README values
+('Features', '->', 'Ontology', null, null),
+('Features', '->', 'Ontology', 'UMLS', @umls);
+
+
+-- TODO REFACTOR
+-- insert into README values
+-- ('Human', 'table', 'HumanAnnotation', 'REFACTOR', 'REFACTOR');
+
+update README set example='' where example is null;
+update README set description='' where description is null;
+update README set entity_name='' where entity_name is null;
+update README set entity_topic='' where entity_topic is null;
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,22 @@
+select 'Dropping Annotation Tables and Samples, Features, and FeatureMatrix' as 'note';
+
+drop table if exists README;
+drop table if exists Annotation;
+drop table if exists OntologyMatch;
+drop table if exists BaseToken;
+drop table if exists WordToken;
+drop table if exists Sentence;
+drop table if exists EmptySentence;
+drop table if exists NewLineToken;
+drop table if exists Calculation;
+drop table if exists POS;
+drop table if exists TermFrequency;
+drop table if exists Class;
+drop table if exists Sample;
+drop table if exists Feature;
+drop table if exists FeatureBin;
+drop table if exists FeatureMatrix;
+drop table if exists FeatureMatrixOnt;
+drop table if exists MyFeatureSelection;
+
+select 'Dropping tables is done' as 'note';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,324 @@
+select 'Creating procedures, see README for usage' as note;
+
+drop procedure if exists README;
+drop procedure if exists CSV_DIR;
+drop procedure if exists LOG_LEVEL;
+drop procedure if exists drop_table;
+drop procedure if exists Session;
+drop procedure if exists log;
+drop procedure if exists tail;
+drop procedure if exists head;
+drop procedure if exists etime;
+drop procedure if exists mem;
+drop procedure if exists ps;
+drop procedure if exists csvread;
+drop procedure if exists csvwrite;
+drop procedure if exists insert_FeatureMatrixOnt;
+
+drop procedure if exists README;
+delimiter //
+create procedure README()
+begin
+ select * from README;
+end//
+delimiter ;
+
+drop procedure if exists DATASET;
+delimiter //
+create procedure DATASET(user_dataset varchar(10))
+begin
+ if @DATASET is null then
+ call SessionResume;
+ end if;
+
+ select user_dataset into @DATASET;
+
+ call log('DATASET', @DATASET) ;
+ call Session;
+
+end//
+delimiter ;
+
+drop procedure if exists USE_TRAIN;
+delimiter //
+create procedure USE_TRAIN()
+begin
+
+ call log('DATASET', 'SWITCH') ;
+ call DATASET('TRAIN');
+end//
+delimiter ;
+
+drop procedure if exists USE_TEST;
+delimiter //
+create procedure USE_TEST()
+begin
+
+ call log('DATASET', 'SWITCH') ;
+ call DATASET('TEST');
+end//
+delimiter ;
+
+
+drop procedure if exists CSV_DIR;
+delimiter //
+create procedure CSV_DIR(csv_dir varchar(1000))
+begin
+ select csv_dir into @CSV_DIR;
+ call log('CSV_DIR', csv_dir) ;
+end//
+delimiter ;
+
+drop procedure if exists LOG_LEVEL;
+delimiter //
+create procedure LOG_LEVEL(log_level tinyint)
+begin
+
+ select log_level into @LOG_LEVEL;
+ call log('LOG_LEVEL', log_level) ;
+end//
+delimiter ;
+
+
+drop table if exists Session;
+CREATE TABLE Session (
+ DATASET varchar(5),
+ CSV_DIR varchar(1000),
+ LOG_LEVEL tinyint
+);
+
+alter table Session add column session_id int UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Session add column session_start timestamp default now();
+
+drop procedure if exists Session;
+delimiter //
+create procedure Session()
+begin
+
+ call SessionResume;
+
+ if @DATASET is null then
+
+ call README;
+ select '@@@@' as warning;
+ select '@DATASET is null? which one to use, train or test?' as warning;
+ select 'Terminating now.' from _WARNING_TRAIN_OR_TEST;
+ end if;
+
+ insert into Session(LOG_LEVEL, DATASET, CSV_DIR) select @LOG_LEVEL, @DATASET, @CSV_DIR;
+
+ select max(session_id) into @session_id from Session;
+ select concat('@DATASET=', @DATASET,' @CSV_DIR=',@CSV_DIR, ' @session_id=', @session_id,' @LOG_LEVEL=', @LOG_LEVEL)
+ as 'your current variable settings = ';
+
+ call log('Session', 'started');
+
+end//
+delimiter ;
+
+-- TODO: @@@NOTICE:
+insert into Session (DATASET, CSV_DIR, LOG_LEVEL) select 'TEST', '/tmp/mysql/', 1;
+
+drop procedure if exists SessionResume;
+delimiter //
+create procedure SessionResume()
+begin
+ if @CSV_DIR is null then
+ select CSV_DIR into @CSV_DIR from Session order by session_id desc limit 1;
+ end if;
+
+ if @DATASET is null then
+ select DATASET into @DATASET from Session order by session_id desc limit 1;
+ end if;
+
+ if @LOG_LEVEL is null then
+ select LOG_LEVEL into @LOG_LEVEL from Session order by session_id desc limit 1;
+ end if;
+end//
+delimiter ;
+
+
+drop table if exists log;
+CREATE TABLE log
+(
+ event_time timestamp default now(),
+ session_id integer NOT NULL,
+ entity_name varchar(100) NOT NULL,
+ message varchar(100) NULL,
+ DATASET varchar(5) NOT null default '?????'
+);
+ALTER TABLE log add idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+drop procedure if exists log;
+delimiter //
+create procedure log(log_entity_name varchar(100), log_message varchar(100))
+begin
+
+ insert into log
+ (DATASET, session_id, entity_name, message) select
+ @DATASET, @session_id, log_entity_name, log_message;
+
+ if @LOG_LEVEL > 0 then
+ select concat(DATASET, ',', entity_name, ',',message) as '' from log order by idx desc limit 1;
+ end if;
+end//
+delimiter ;
+
+call Session;
+call log('LOG', 'create table');
+
+
+drop procedure if exists tail;
+delimiter //
+create procedure tail()
+begin
+ select event_time, DATASET, entity_name, message from log order by idx DESC limit 50;
+end//
+delimiter ;
+
+drop procedure if exists head;
+delimiter //
+create procedure head()
+begin
+ select event_time, DATASET, entity_name, message from log order by idx ASC limit 50;
+end//
+delimiter ;
+
+drop procedure if exists etime;
+delimiter //
+create procedure etime()
+begin
+ select l1.idx, l1.event_time, l1.entity_name, l1.message, l1.event_time,
+ UNIX_TIMESTAMP(l1.event_time)-UNIX_TIMESTAMP(l2.event_time) as etime
+ from log l1, log l2
+ where l1.idx = l2.idx+1
+ order by l1.idx asc;
+end//
+delimiter ;
+
+drop procedure if exists mem;
+delimiter //
+create procedure mem()
+begin
+ select
+ table_schema,
+ ENGINE,
+ TABLE_NAME,
+ TABLE_ROWS,
+ DATA_LENGTH,
+ concat( round( data_length / ( 1024 *1024 ) , 2 ) , 'M' ) data_length,
+ concat( round( index_length / ( 1024 *1024 ) , 2 ) , 'M' ) index_length
+ from
+ information_schema.TABLES
+ where
+ TABLE_SCHEMA = 'scrubber'
+ order by
+ table_schema, engine, table_name;
+end//
+delimiter ;
+
+drop procedure if exists ps;
+delimiter //
+create procedure ps()
+begin
+ select * from information_schema.processlist
+ where information_schema.processlist.INFO not like '%information_schema.processlist%';
+end//
+delimiter ;
+
+drop procedure if exists csvread;
+delimiter //
+create procedure csvread(mytable varchar(500))
+begin
+
+ select concat( ' load data '
+ ' local infile ', "'", concat(@CSV_DIR, mytable, '.csv'), "'",
+ ' into table ', mytable,
+ ' fields terminated by ',"'", ",","'",' ignore 1 lines') into @csvout;
+
+ select @csvout;
+
+ prepare stmt from @csvout;
+ execute stmt;
+end//
+delimiter ;
+
+drop procedure if exists csvwrite;
+delimiter //
+create procedure csvwrite(mytable varchar(500))
+begin
+
+ select concat(mytable, '.csv') into @table_csv;
+
+ select concat( ' SELECT * from ', mytable,
+ ' into outfile ', "'", concat(@CSV_DIR, @table_csv), "'",
+ ' fields terminated by ', "'", ',', "'"
+ ' lines terminated by ', "'", '\\n', "'") into @csvout;
+
+-- select @csvout;
+
+ prepare stmt from @csvout;
+ execute stmt;
+end//
+delimiter ;
+
+drop procedure if exists drop_table;
+delimiter //
+create procedure drop_table(table_to_drop varchar(50))
+begin
+
+ set @s:=concat('drop table if exists ', table_to_drop);
+
+ call log(table_to_drop, 'drop_table()');
+
+ prepare stmt from @s;
+ execute stmt;
+end//
+delimiter ;
+
+
+-- TODO : refactor below
+-- call log('insert_FeatureMatrixOnt()', '');
+
+drop procedure if exists insert_FeatureMatrixOnt;
+delimiter //
+create procedure insert_FeatureMatrixOnt(ontology varchar(100))
+begin
+
+call log(ontology, 'temp');
+
+drop table if exists tmp_OM , tmp_FeatureMatrixOntOnt;
+
+create temporary table tmp_OM select file_idx, start_idx, end_idx, f.feature_idx, token from OntologyMatch om, Feature f
+ where om.match_source like ontology and om.match_source = f.match_value;
+
+alter table tmp_OM add index (file_idx, start_idx, end_idx);
+
+select count(*) into @count_before from FeatureMatrixOnt;
+
+call log('insert_FeatureMatrixOnt(before)', @count_before);
+
+insert into FeatureMatrixOnt
+select s.file_idx, s.sample_idx,o.feature_idx, count(*) as count_matches
+from Sample s, tmp_OM o
+where o.file_idx = s.file_idx and
+ o.start_idx <= s.start_idx and
+ o.end_idx >= s.end_idx
+group by s.file_idx, s.sample_idx,o.feature_idx;
+
+select count(*) into @count_now from FeatureMatrixOnt;
+select @count_now-@count_before into @count_diff;
+
+call log(ontology, @count_diff);
+call log('insert_FeatureMatrixOnt(now)', @count_now);
+
+call log(ontology, 'done');
+call log('insert_FeatureMatrixOnt()', ontology);
+
+end//
+delimiter ;
+
+call log('procedures.sql', 'done');
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+scrubberdb < drop_tables.sql
+scrubberdb < procedures.sql
+scrubberdb < README.sql
+scrubberdb -e 'call README';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,8 @@
+drop database if exists scrubber;
+
+create database scrubber
+default character set=utf8
+default collate = utf8_general_ci;
+
+create user 'scrubber'@'localhost' identified by 'scrubber';
+grant all on scrubber.* to 'scrubber'@'localhost';
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,25 @@
+-- CASES INDEXES-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- NOTE(review): these statements target tables named 'annotations',
+-- 'feature_matrix' and 'human_annotations', but create_tables.sql in this
+-- commit creates 'machine_annotations_train'/'_test',
+-- 'feature_matrix_train'/'_test' and 'human_annotations_train'/'_test'.
+-- Presumably a rename (or the train/test split being swapped in) happens
+-- before this script runs -- TODO confirm against the ETL scripts.
+-- Reusing an index name (e.g. idxShortFile, idxStartIdx) on different
+-- tables is fine: MySQL scopes index names per table.
+-- critical idx:
+-- create index idxCovering on annotations (filename_short,token);
+create index idxCovering2 on annotations (filename_short, token, start_idx, end_idx);
+create index idxToken on annotations(token);
+create index idxShortFile on annotations (filename_short);
+-- tf
+create index idxTokPos on lookup_term_frequency (token,pos);
+
+-- useful idx:
+create index idxMatchSource on annotations (match_source);
+create index idxMatchValue on annotations (match_value);
+create index idxMatchKVPair on annotations (match_source, match_value);
+create index idxShortAnnot on annotations (annot_type_short);
+
+
+-- case feature indexes -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+create index idxIsPhi on feature_matrix(is_phi);
+create index idxShortFile on feature_matrix(filename_short);
+create index idxStartIdx on feature_matrix(start_idx);
+
+-- other
+create index idxStartIdx on human_annotations(start_idx);
+create index idxEndIdx on human_annotations(end_idx);
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,17 @@
+-- PUBS INDEXES-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- critical idx:
+create index idxCovering on machine_annotations_pubs (filename_short,token);
+create index idxToken on machine_annotations_pubs(token);
+create index idxShortFile on machine_annotations_pubs (filename_short);
+create index idxMatchSource on machine_annotations_pubs (match_source);
+
+-- useful idx:
+create index idxMatchValue on machine_annotations_pubs (match_value);
+create index idxMatchKVPair on machine_annotations_pubs (match_source, match_value);
+create index idxShortAnnot on machine_annotations_pubs (annot_type_short);
+
+-- Pub metadata tables are created as pubs_keywords / pubs_authors /
+-- pubs_refs by create_tables.sql; the unprefixed names used previously
+-- ('keywords', 'authors', 'refs') do not exist and would make this
+-- script fail. The varchar(1000) columns also exceed MyISAM's 1000-byte
+-- key limit under utf8 (3 bytes/char), so index a 255-char prefix --
+-- the same convention as lookup_umls's X_STR (STR(255)) key.
+create index kwdIdx on pubs_keywords(keyword(255));
+create index pubIdIdx on pubs_authors(pub_id);
+create index pubIdIdx on pubs_refs(pub_id);
+create index jtitleIdx on pubs(journal_title(255));
+create index fileIdx on pubs(filepath(255));
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,192 @@
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- BEWARE: string comparisons are done against many tables
+-- and it is assumed "COLLATE utf8_general_ci" for default db collation
+-- (set by create_database_and_user.sql; 'utf8_ci' is not a real MySQL
+-- collation name).
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+-- Subset of UMLS concept names (CUI/source vocabulary/string) used by the
+-- scrubber dictionary annotator.
+CREATE TABLE lookup_umls (
+ id int(11) NOT NULL auto_increment,
+ CUI char(10) NOT NULL,
+ SAB varchar(20) NOT NULL,
+ STR text NOT NULL,
+ PRIMARY KEY (id),
+ KEY X_CUI (CUI),
+ KEY X_SAB (SAB),
+ -- 255-char prefix: a full TEXT column cannot be indexed, and MyISAM
+ -- caps key length at 1000 bytes (utf8 = up to 3 bytes per char).
+ KEY X_STR (STR(255))
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for dictionaryAnnotator, this implements the old txt dictionaries from earlier versions of scrubber.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+CREATE TABLE lookup_dictionary (
+ id int(11) NOT NULL auto_increment,
+ code varchar(50) NOT NULL,
+ ontology varchar(50) NOT NULL,
+ str text NOT NULL,
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id),
+ KEY strIdx (str(255))
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for tf of tokens across a corpus, group by token, pos
+-- (the (token,pos) index is added later by create_indexes_cases.sql)
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+CREATE TABLE lookup_term_frequency (
+ id int(11) NOT NULL auto_increment,
+ cnt int(11) NOT NULL ,
+ token varchar(100) NOT NULL,
+ pos varchar(10),
+ source varchar(20) NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- intermediate table used for comparing the distribution of part of speech
+-- between pubs, cases, and known phi tokens.
+-- pos_bin presumably holds a coarse bucket of the fine-grained pos tag --
+-- TODO confirm against the loader that populates this table.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE pubs_pos (
+ id int(11) NOT NULL auto_increment,
+ pos varchar(10),
+ pos_bin varchar(10),
+ pos_cnt int(11) NOT NULL ,
+ PRIMARY KEY (id)
+) ENGINE=innodb DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create feature table for CASES
+-- One row per token occurrence; the cnt_* columns appear to be per-token
+-- hit counts from the regex/dictionary annotators named in each column --
+-- TODO confirm against the feature-extraction code that fills this table.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE feature_matrix_train (
+ id int(11) NOT NULL auto_increment,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL ,
+ end_idx int(11) NOT NULL ,
+ filename_short varchar(20) NOT NULL,
+ pos varchar(10),
+ pos_bin varchar(10) NOT NULL,
+ has_capital int(11) NOT NULL default 0 ,
+ cnt_regex_phon int(11) NOT NULL ,
+ cnt_regex_date int(11) NOT NULL ,
+ cnt_regex_age int(11) NOT NULL ,
+ cnt_regex_id int(11) NOT NULL ,
+ cnt_regex_pat int(11) NOT NULL ,
+ cnt_regex_doc int(11) NOT NULL ,
+ cnt_regex_loc int(11) NOT NULL ,
+ cnt_regex_hosp int(11) NOT NULL ,
+ cnt_priv int(11) NOT NULL ,
+ cnt_hosp int(11) NOT NULL ,
+ cnt_name int(11) NOT NULL ,
+ cnt_dict_costar int(11) NOT NULL ,
+ cnt_dict_hl7v25 int(11) NOT NULL ,
+ cnt_dict_hl7v30 int(11) NOT NULL ,
+ cnt_dict_icd10cm int(11) NOT NULL ,
+ cnt_dict_icd10pcs int(11) NOT NULL ,
+ cnt_dict_icd9cm int(11) NOT NULL ,
+ cnt_dict_lnc int(11) NOT NULL ,
+ cnt_dict_msh int(11) NOT NULL ,
+ cnt_dict_rxnorm int(11) NOT NULL ,
+ cnt_dict_snomedct int(11) NOT NULL ,
+ -- NOTE(review): FLOAT(23,15) is still a 4-byte single-precision FLOAT
+ -- in MySQL (M <= 24); 15 fractional digits exceed its ~7-digit
+ -- precision. DOUBLE would preserve the stored values -- confirm whether
+ -- the extra precision matters to the classifier.
+ cnt_ham_w_pos float(23,15) NOT NULL ,
+ cnt_ham_wo_pos float(23,15) NOT NULL ,
+ is_phi int(11) NOT NULL ,
+ classified_as varchar(20) NOT NULL default 'NA',
+ -- phi_type is nullable with no default (unlike classified_as), so
+ -- queries comparing it to 'NA' must also handle NULL explicitly.
+ phi_type varchar(20),
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- identical structure for the held-out test split
+create table feature_matrix_test like feature_matrix_train;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for KNOWN PHI. determined by annotated gold standard for training data.
+-- start_idx/end_idx are character offsets into the source document --
+-- presumably matching the offsets in machine_annotations_*; TODO confirm.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE human_annotations_train (
+ id int(11) NOT NULL auto_increment,
+ subject_id varchar(100) NOT NULL,
+ tag_name varchar(1000) NOT NULL,
+ tag_value LONGTEXT NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- identical structure for the held-out test split
+create table human_annotations_test like human_annotations_train;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for annotations
+-- Machine-produced annotations, one row per annotated token span.
+-- Three structurally identical tables: train, test, and one for the
+-- open-access publication corpus.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE machine_annotations_train (
+ id int(11) NOT NULL auto_increment,
+ source varchar(20),
+ filename_long varchar(1000) NOT NULL,
+ filename_short varchar(20) NOT NULL,
+ annot_type_long varchar(1000) NOT NULL,
+ annot_type_short varchar(50) NOT NULL,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ match_value varchar(50),
+ match_source varchar(50),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+create table machine_annotations_test like machine_annotations_train;
+
+CREATE TABLE machine_annotations_pubs (
+ id int(11) NOT NULL auto_increment,
+ source varchar(20),
+ filename_long varchar(1000) NOT NULL,
+ filename_short varchar(20) NOT NULL,
+ annot_type_long varchar(1000) NOT NULL,
+ annot_type_short varchar(50) NOT NULL,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ match_value varchar(50),
+ match_source varchar(50),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- creates table for OPEN ACCESS pub data
+-- pubs holds one row per article (body = full text); the pubs_authors,
+-- pubs_refs and pubs_keywords child tables reference pubs.id via pub_id
+-- (enforced only by convention -- MyISAM does not support foreign keys).
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE pubs (
+ id int(11) NOT NULL auto_increment,
+ journal_title varchar(1000),
+ article_title varchar(1000),
+ pmc varchar(100),
+ pmid varchar(100),
+ doi varchar(100),
+ body LONGTEXT,
+ filepath varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+CREATE TABLE pubs_authors (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ surname varchar(1000),
+ given_name varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- cited-reference author names, same shape as pubs_authors
+CREATE TABLE pubs_refs (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ surname varchar(1000),
+ given_name varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+CREATE TABLE pubs_keywords (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ keyword varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,20 @@
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- used to house the subset of umls that is being used by scrubber dictionary annotator
+-- this is housed in the same schema as the umls data (currently that is expected to be called 'umls')
+--
+-- this will also populate the table with that subset.
+--
+-- NOTE(review): create_tables.sql also defines a narrower lookup_umls
+-- (CUI/SAB/STR only) in the scrubber schema; this full-width MRCONSO
+-- copy lives in the umls schema per the comment above -- confirm which
+-- one the annotator actually reads.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+create table lookup_umls like mrconso;
+
+-- select * is safe here: LIKE guarantees identical column layout.
+-- NOTE(review): 'SNOMEDCT' was renamed 'SNOMEDCT_US' in UMLS releases
+-- from 2012AB onward -- verify against the installed UMLS version.
+insert into lookup_umls select * from mrconso where sab in (
+'COSTAR',
+'HL7V2.5',
+'HL7V3.0',
+'ICD10CM',
+'ICD10PCS',
+'ICD9CM',
+'LNC',
+'MSH',
+'RXNORM',
+'SNOMEDCT'
+);
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,2 @@
+-- Remove the scrubber database and its dedicated MySQL account
+-- (inverse of create_database_and_user.sql).
+drop database if exists scrubber;
+-- NOTE(review): DROP USER gained IF EXISTS only in MySQL 5.7, so this
+-- line errors if the user was never created; on era-appropriate servers
+-- that error can simply be ignored.
+drop user 'scrubber'@'localhost';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,31 @@
+use scrubber;
+
+-- Tear down every scrubber table; IF EXISTS keeps the script idempotent.
+-- MySQL DROP TABLE accepts a comma-separated list, so related tables are
+-- dropped together, grouped by purpose.
+
+-- lookup / reference data
+drop table if exists lookup_umls, lookup_dictionary, lookup_term_frequency;
+
+-- intermediate part-of-speech distribution table
+drop table if exists pubs_pos;
+
+-- feature matrices (train/test splits)
+drop table if exists feature_matrix_train, feature_matrix_test;
+
+-- gold-standard and machine annotations
+drop table if exists human_annotations_train, human_annotations_test;
+drop table if exists machine_annotations_train, machine_annotations_test, machine_annotations_pubs;
+
+-- open-access publication data
+drop table if exists pubs, pubs_authors, pubs_refs, pubs_keywords;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,9 @@
+-- Confusion-matrix counts over the test feature matrix.
+-- classified_as defaults to 'NA' (= predicted non-PHI). phi_type is
+-- nullable with no default (see create_tables.sql), so a NULL phi_type
+-- matches neither "= 'NA'" nor "<> 'NA'" under three-valued logic and
+-- those rows would fall out of every bucket (TN+TP+FN+FP != ALL);
+-- coalesce NULL to 'NA' so each row lands in exactly one bucket.
+-- union all: the five labels are distinct by construction, so plain
+-- UNION's dedup/sort pass is wasted work.
+select 'ALL' as type , count(*) from feature_matrix_test
+union all
+select 'TN' as type , count(*) from feature_matrix_test where classified_as = 'NA' and coalesce(phi_type, 'NA') = 'NA'
+union all
+select 'TP' as type, count(*) from feature_matrix_test where classified_as <> 'NA' and coalesce(phi_type, 'NA') <> 'NA'
+union all
+select 'FN' as type, count(*) from feature_matrix_test where classified_as = 'NA' and coalesce(phi_type, 'NA') <> 'NA'
+union all
+select 'FP' as type, count(*) from feature_matrix_test where classified_as <> 'NA' and coalesce(phi_type, 'NA') = 'NA' ;