You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:39:01 UTC
svn commit: r1500518 [1/2] - in /ctakes/sandbox/ctakes-scrubber-deid/sql: ./
CSVAnnotation/ CSVAnnotation/human/ CSVAnnotation/machine/
CSVAnnotation/schema/ tmp/
Author: brittfitch
Date: Sun Jul 7 19:39:00 2013
New Revision: 1500518
URL: http://svn.apache.org/r1500518
Log:
add sql scripts used by scrubber
Added:
ctakes/sandbox/ctakes-scrubber-deid/sql/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh (with props)
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh (with props)
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh
ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/
ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql
ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/%40dep_etl.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/@dep_etl.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+source ./scrubber.properties.sh
+
+scrubberdb='mysql -D $SCRUBBER_DB_NAME -u $SCRUBBER_DB_USER -p$SCRUBBER_DB_PWD';
+
+echo "######################################################################################"
+echo "step0_load_human_annotations.sql"
+scrubberdb< step0_load_human_annotations.sql
+
+echo "######################################################################################"
+echo "step1_load_machine_annotations.sql"
+scrubberdb< step1_load_machine_annotations.sql
+
+echo "######################################################################################"
+echo "step2_FeatureBin.sql"
+scrubberdb< step2_FeatureBin.sql
+
+echo "######################################################################################"
+echo "step3_Feature.sql"
+scrubberdb< step3_Feature.sql
+
+
+echo "######################################################################################"
+echo "step4_FeatureMatrix.sql"
+scrubberdb< step4_FeatureMatrix.sql
+scrubberdb< step4_FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+echo "step5_export_csv.sql"
+
+pushd . ;
+ cd /tmp/mysql; rm -f Sample.csv FeatureMatrix.csv FeatureMatrixOnt.csv Sentence.csv TermFrequency.csv MyFeatureIdx.csv MyFeatureHeader.csv;
+popd
+
+scrubberdb< step5_export_csv.sql
+
+mkdir -p csv
+mv /tmp/mysql/Sample.csv ./csv/.
+mv /tmp/mysql/FeatureMatrix.csv ./csv/.
+mv /tmp/mysql/FeatureMatrixOnt.csv ./csv/.
+mv /tmp/mysql/TermFrequency.csv ./csv/.
+mv /tmp/mysql/MyFeatureSelection.csv ./csv/.
+mv /tmp/mysql/MyFeatureIdx.csv ./csv/.
+mv /tmp/mysql/MyFeatureHeader.csv ./csv/.
+
+echo "######################################################################################"
+cat README
+
+
+
+
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/HOWTO Sun Jul 7 19:39:00 2013
@@ -0,0 +1,67 @@
+1. human annotations
+java $JAVA_OPTS -cp $CP $SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION $SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TRAIN _train
+java $JAVA_OPTS -cp $CP $SCRUBBER_HUMAN_ANNOTATIONS_IMPLEMENTATION $SCRUBBER_DIR_INPUT_HUMAN_ANNOTATIONS_TEST _test
+
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_train.xml
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_test.xml
+
+
+2. run UIMA : train
+mv csv/*.csv csv/train/
+
+2. run UIMA : test
+
+mv csv/*.csv csv/test/
+
+3. copy csv files to temp dir
+
+rm -rf /tmp/mysql/
+mkdir -p /tmp/mysql/
+chmod -R 777 /tmp/mysql/.
+cd csv/train
+cp *.csv /tmp/mysql/.
+
+3. load csv files into mysql: train
+
+source schema.sh
+source load_train.sh
+
+
+
+
+
+last_build/scrubber/csv/train $
+
+rm -rf /tmp/mysql/
+mkdir -p /tmp/mysql/
+chmod -R 777 /tmp/mysql/.
+cp csv/*.csv /tmp/mysql/.
+
+...
+
+3. test
+
+
+java $JAVA_OPTS -cp $CP org.spin.scrubber.uima.core.UIMARunner desc/cpe/cpe_cases_test.xml
+
+
+3. create SCHEMA
+CSVAnnotation$ source ./schema.sh
+
+4. train
+CSVAnnotation$ source ./load_train.sh
+
+6. analysis
+
+data/csv/test $ mv /tmp/mysql/*.csv .
+data/csv/test $ matlab etl.m
+
+
+
+
+
+
+
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+echo "######################################################################################"
+source schema/schema.sh
+
+echo "######################################################################################"
+scrubberdb -e 'call USE_TEST'
+scrubberdb < machine/load.sql
+
+echo "######################################################################################"
+scrubberdb< machine/Sample.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureMatrix.sql
+scrubberdb< machine/FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+scrubberdb< machine/export.sql
\ No newline at end of file
Propchange: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_test.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+echo
+echo "######################################################################################"
+echo INPUT: UIMA+cTAKES annotations;
+echo OUTPUT: WEKA+Matlab CSV files;
+echo "######################################################################################"
+echo
+echo "You can monitor the Process, memory, and log info in the database (ps, mme, tail)"
+echo
+echo " mysql> call ps ; "
+echo " mysql> call mem ; "
+echo " mysql> call tail ; "
+echo
+echo
+echo "List of the Data entities handled by this script:"
+echo
+cat README.txt
+
+echo "######################################################################################"
+source schema/schema.sh
+
+echo "######################################################################################"
+scrubberdb -e 'call USE_TRAIN'
+scrubberdb < human/load.sql
+scrubberdb < machine/load.sql
+
+echo "######################################################################################"
+scrubberdb< machine/Sample.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureBin.sql
+scrubberdb< machine/Feature.sql
+
+echo "######################################################################################"
+scrubberdb< machine/FeatureMatrix.sql
+scrubberdb< machine/FeatureMatrixOnt.sql
+
+echo "######################################################################################"
+scrubberdb< machine/export.sql
\ No newline at end of file
Propchange: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/etl_train.sh
------------------------------------------------------------------------------
svn:executable = *
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/export.csv.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,133 @@
+-- Export Sample/feature tables to CSV files under /tmp/mysql for WEKA/Matlab.
+-- SELECT ... INTO OUTFILE fails if the target file already exists, so the
+-- caller is expected to clear /tmp/mysql first (see @dep_etl.sh).
+call Session;
+
+call log('export_csv.sql', 'begin');
+-- ####
+
+-- Keep only the feature datatypes the downstream tools consume.
+-- Casing fixed: table names are case-sensitive on case-sensitive filesystems,
+-- so the original "drop table if exists myFeatureSelection" never dropped
+-- MyFeatureSelection and re-runs failed on CREATE.
+drop table if exists MyFeatureSelection;
+create table MyFeatureSelection like Feature;
+
+insert into MyFeatureSelection
+select * from Feature where datatype in (
+ 'double',
+ 'nominal(int)',
+ 'nominal(4)',
+ 'binary'
+);
+
+call log('Sample.csv', 'begin');
+
+select file_idx, sample_idx, sentence_idx, start_idx, end_idx, token_seq, class_idx
+from Sample
+order by sample_idx
+into
+ outfile '/tmp/mysql/Sample.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('Sample.csv', 'done');
+call log('SentenceIdx.csv', 'begin');
+
+-- Sentence-to-sample alignment (explicit join replaces the comma join).
+select s.file_idx, s.sample_idx, s.sentence_idx, sample_idx_start, sample_idx_end, s.sentence_seq
+from Sentence sentence
+inner join Sample s on sentence.sentence_idx = s.sentence_idx
+order by s.sample_idx asc
+into
+ outfile '/tmp/mysql/SentenceIdx.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('SentenceIdx.csv', 'done');
+call log('TermFrequency.csv', 'begin');
+
+-- TODO HACK ! SYM: unresolved pos_% placeholders are lumped under SYM so the
+-- POS left join below still finds a row.
+update TermFrequency set pos = 'SYM' where pos like 'pos_%';
+
+select file_idx, sample_idx,
+ TF.tf_ham_without_pos, TF.tf_ham_with_pos , TF.token_len,
+ POS.pos_idx, POS.pos_bin_idx
+from TermFrequency TF left join POS on TF.pos = POS.pos
+order by sample_idx
+into
+ outfile '/tmp/mysql/TermFrequency.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('TermFrequency.csv', 'done');
+call log('MyFeatureIdx.csv', 'begin');
+
+-- Datatype codes for the Matlab side: 1=binary, 3=double, 2=everything else.
+select feature_idx, CASE datatype
+when 'binary' then 1
+WHEN 'double' THEN 3
+ELSE 2 end
+from MyFeatureSelection
+order by feature_idx
+into
+ outfile '/tmp/mysql/MyFeatureIdx.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('MyFeatureIdx.csv', 'done');
+-- call log('MyFeatureSelection.csv', 'csv');
+
+-- NOTE(review): @dep_etl.sh moves MyFeatureSelection.csv into ./csv; if that
+-- driver is used this export needs to be re-enabled.
+-- select *
+-- from MyFeatureSelection
+-- order by feature_idx
+-- into
+-- outfile '/tmp/mysql/MyFeatureSelection.csv'
+-- FIELDS TERMINATED BY ','
+-- LINES TERMINATED BY '\n';
+--
+-- call log('MyFeatureSelection.csv', 'csv');
+
+-- call log('MyFeatureHeader.csv', 'begin');
+
+set group_concat_max_len = 8096;
+
+-- Build human-readable column labels: "<idx>.<type>.<source>.<value>.<datatype>".
+drop table if exists my_FeatureHeader;
+create temporary table my_FeatureHeader
+select feature_idx, concat(feature_idx, '.',annotation_type, '.', match_source, '.', match_value, '.',datatype) as feature_label
+from MyFeatureSelection
+order by feature_idx;
+
+-- Row 0 is the class column label.
+insert into my_FeatureHeader values (0,'Class_PHI_Label');
+
+-- Collapse the '..' left behind by empty match_value fields.
+update my_FeatureHeader set feature_label=replace(feature_label, '..', '.');
+
+-- Header is written as a single ';'-separated line.
+select feature_label
+from my_FeatureHeader
+order by feature_idx
+into
+ outfile '/tmp/mysql/MyFeatureHeader.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY ';';
+
+call log('MyFeatureHeader.csv', 'done');
+call log('FeatureMatrix.csv', 'begin');
+
+select s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx, m.feature_value
+from FeatureMatrix m
+inner join Sample s on m.sample_idx = s.sample_idx
+order by m.sample_idx, m.feature_idx
+into
+ outfile '/tmp/mysql/FeatureMatrix.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('FeatureMatrix.csv', 'done');
+call log('FeatureMatrixOnt.csv', 'begin');
+
+select s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx, m.count_matches
+from FeatureMatrixOnt m
+inner join Sample s on m.sample_idx = s.sample_idx
+order by m.sample_idx, m.feature_idx
+into
+ outfile '/tmp/mysql/FeatureMatrixOnt.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+-- ####
+call log('FeatureMatrixOnt.csv', 'done');
+-- Moved to the very end so the log reflects the whole export (originally
+-- this fired before the FeatureMatrix exports ran).
+call log('export_csv.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/human/load.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,56 @@
+-- Merge the train/test human annotation tables into HumanAnnotations and
+-- build the Class lookup table (PHI tag name -> class_idx).
+-- NOTE(review): @DATASET is assumed to be set by 'call Session' (or the
+-- USE_TRAIN/USE_TEST procedures) before this script runs -- confirm.
+call Session;
+
+call log('load_human_annotations.sql', 'begin');
+
+call log('HumanAnnotations', 'begin');
+
+call drop_table('HumanAnnotations');
+
+create table HumanAnnotations like human_annotations_train;
+alter table HumanAnnotations drop column id;
+alter table HumanAnnotations add column DATASET varchar(20);
+
+-- (Was a duplicate 'begin' log entry.)
+call log('HumanAnnotations', 'created');
+call log('HumanAnnotations', 'TRAIN');
+
+insert into HumanAnnotations (DATASET, subject_id, tag_name, tag_value, start_idx,end_idx)
+select 'TRAIN' , subject_id, tag_name, tag_value, start_idx,end_idx
+from human_annotations_train;
+
+-- Count AFTER the insert so the logged number reflects the TRAIN rows
+-- (the original captured @num_rows before inserting, always logging 0).
+select count(*) from HumanAnnotations into @num_rows;
+
+call log('HumanAnnotations', @num_rows);
+call log('HumanAnnotations', 'TEST');
+
+insert into HumanAnnotations (DATASET, subject_id, tag_name, tag_value, start_idx,end_idx)
+select 'TEST' , subject_id, tag_name, tag_value, start_idx,end_idx
+from human_annotations_test;
+
+select count(*) from HumanAnnotations into @num_rows;
+
+call log('HumanAnnotations', @num_rows);
+call log('HumanAnnotations', 'done');
+call log('Class', 'begin');
+
+call drop_table('Class');
+
+create table Class
+(
+ class_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ class varchar(20),
+ cnt integer,
+ DATASET varchar(10)
+) engine = myisam;
+
+-- 'NA' is the default class for tokens with no human annotation.
+insert into Class (DATASET, class) values (@DATASET, 'NA');
+insert into Class (DATASET, class, cnt)
+select @DATASET, tag_name as class, count(*) as cnt
+from HumanAnnotations
+where DATASET = @DATASET
+group by tag_name;
+
+alter table Class add index (class);
+
+call log('Class', 'done');
+call log('load_human_annotations.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Feature.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,148 @@
+-- Build the Feature catalog: one row per model feature (term-frequency
+-- stats, bins, cTAKES token/annotation types, POS tags, regex rules), then
+-- back-fill FeatureBin with the assigned feature indexes and build the POS
+-- lookup table used by the CSV export.
+call Session;
+
+call log('Feature', 'begin');
+
+call drop_table('Feature');
+
+create table Feature
+(
+ annotation_type varchar(25) not null,
+ match_source varchar(25) not null,
+ match_value varchar(25) null default '',
+ cnt int null,
+ datatype varchar(25) null,
+ description text null
+) engine=myisam;
+
+alter table Feature add index (match_source);
+alter table Feature add index (match_value);
+alter table Feature add index (match_source, match_value);
+
+call log('Feature', 'TermFrequency');
+
+insert into Feature
+select 'TermFrequency', 'tf_ham_without_pos',null, count(*) as cnt, 'double', 'Publication Term Frequency'
+from TermFrequency;
+
+insert into Feature
+select 'TermFrequency', 'tf_ham_with_pos',null, count(*) as cnt, 'double', 'Publication Term Frequency with Part of Speech'
+from TermFrequency;
+
+insert into Feature
+select 'TermFrequency', 'token_len', null, count(*) as cnt, 'nominal(int)', 'String Length {WordToken or NumToken}'
+from TermFrequency;
+
+call log('Feature', 'bin');
+
+-- One 'bin' feature per ontology source, plus one per FeatureBin bin.
+insert into Feature
+select 'bin', 'ontology', match_source, count(*) as cnt, 'nominal(int)', '#matches in Dictionary'
+from OntologyMatch
+group by match_source
+order by cnt desc;
+
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'nominal(int)', '@see FeatureBin'
+from FeatureBin where match_source = 'regex'
+group by match_source, match_bin;
+
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'nominal(int)', '@see FeatureBin'
+from FeatureBin where match_source != 'regex'
+group by match_source, match_bin;
+
+call log('Feature','WordToken:cap');
+
+insert into Feature
+select 'WordToken', 'cap', null, count(*) as cnt, 'nominal(4)', '(ctakes) Word Capitalization'
+from WordToken;
+
+call log('Feature','BaseToken...*');
+
+insert into Feature
+select annotation_type, 'BaseToken', null, count(*), 'binary', '(ctakes) token type'
+from BaseToken group by annotation_type order by count(*) desc;
+
+call log('Feature','Annotation...*');
+
+insert into Feature
+select annotation_type, 'Annotation', null, count(*), 'binary', '(ctakes) context dependent'
+from Annotation
+where annotation_type in
+(
+ 'RomanNumeralAnnotation',
+ 'FractionAnnotation',
+ 'DateAnnotation',
+ 'MeasurementAnnotation',
+ 'RangeAnnotation',
+ 'PersonTitleAnnotation',
+ 'TimeAnnotation'
+)
+group by annotation_type
+order by count(*) desc;
+
+call log('Feature','BaseToken:pos');
+
+insert into Feature
+select 'BaseToken', 'pos', pos, count(*) as cnt, 'binary', '(ctakes) Part Of Speech'
+from BaseToken
+where annotation_type = 'WordToken' or annotation_type = 'NumToken'
+group by pos
+order by cnt desc;
+
+call log('Feature','regex');
+
+insert into Feature
+select annotation_type, match_source, match_value, count(*) as cnt, 'binary', '(Beckwith) regular expression'
+from OntologyMatch
+where match_source = 'regex'
+group by match_source, match_value
+order by count(*) desc;
+
+-- Assign feature indexes only after all inserts so the ids are contiguous.
+ALTER TABLE Feature add feature_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+ALTER TABLE Feature add index (annotation_type, match_source, match_value);
+
+update Feature set match_value = '' where match_value is null;
+
+CALL log('Feature', 'FeatureBin:index');
+
+alter table FeatureBin add index (match_source, match_value);
+alter table FeatureBin add index (match_source, match_value, match_bin);
+
+alter table FeatureBin add column match_value_feature_idx smallint unsigned;
+alter table FeatureBin add column match_bin_feature_idx smallint unsigned;
+
+alter table FeatureBin add index(match_value_feature_idx);
+
+update FeatureBin b, Feature f
+set b.match_bin_feature_idx = f.feature_idx
+where f.annotation_type = 'bin' and f.match_value = b.match_bin;
+
+-- BUGFIX: the original predicate was "f.match_source = f.match_source"
+-- (a tautology), so rows were joined on match_value alone and could pick
+-- up a feature_idx from the wrong source.  Join on the FeatureBin source.
+update FeatureBin b, Feature f
+set b.match_value_feature_idx = f.feature_idx
+where f.match_source = b.match_source and f.match_value = b.match_value;
+
+-- CALL log('Feature', 'removing sparse features');
+--
+-- select * from Feature where annotation_type != 'bin' and cnt < 10;
+-- delete from Feature where annotation_type != 'bin' and cnt < 10;
+
+CALL log('Feature', 'FeatureBin:done');
+CALL log('Feature', 'POS:begin');
+
+-- POS lookup consumed by the CSV export (TermFrequency join).
+drop table if exists POS;
+create table POS
+(
+ pos varchar(25) null,
+ pos_idx smallint unsigned,
+ pos_bin_idx smallint unsigned
+)
+engine = myisam;
+
+insert into POS (pos, pos_idx, pos_bin_idx)
+select distinct match_value, match_value_feature_idx, match_bin_feature_idx from FeatureBin where match_source='pos';
+
+alter table POS add index (pos);
+CALL log('Feature', 'POS:done');
+call log('Feature', 'done');
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureBin.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,192 @@
+-- FeatureBin: static lookup mapping fine-grained feature values (Penn
+-- Treebank POS tags, regex rule names) to coarse feature bins.
+-- Feature.sql later back-fills feature indexes onto this table.
+call Session;
+call log('Step(2)', 'begin');
+
+CALL log('FeatureBin', 'create');
+
+call drop_table('FeatureBin');
+
+create table FeatureBin
+(
+ match_source varchar(50) not null,
+ match_value varchar(50) not null,
+ match_bin varchar(50) not null
+)
+engine = myisam;
+
+-- POS tags grouped into coarse grammatical bins.
+insert into FeatureBin values
+('pos','CC', 'com-dep-wd'),
+('pos','CT', 'com-dep-wd'),
+('pos','DT', 'com-dep-wd'),
+('pos','EX', 'com-dep-wd'),
+('pos','IN', 'com-dep-wd'),
+('pos','MD', 'com-dep-wd'),
+('pos','PDT', 'com-dep-wd'),
+('pos','RP', 'com-dep-wd'),
+('pos','TO', 'com-dep-wd'),
+('pos','UH', 'com-dep-wd'),
+('pos','WDT', 'com-dep-wd'),
+('pos','FW', 'FW-Symb'),
+('pos','SYM', 'FW-Symb'),
+('pos','JJ', 'Adjectives'),
+('pos','JJR', 'Adjectives'),
+('pos','JJS', 'Adjectives'),
+('pos','NN', 'Nouns'),
+('pos','NNS', 'Nouns'),
+('pos','NNP', 'Nouns'),
+('pos','NNPS', 'Nouns'),
+('pos','WRB', 'Adverbs'),
+('pos','RB', 'Adverbs'),
+('pos','RBR', 'Adverbs'),
+('pos','RBS', 'Adverbs'),
+('pos','VB', 'Verbs'),
+('pos','VBD', 'Verbs'),
+('pos','VBG', 'Verbs'),
+('pos','VBN', 'Verbs'),
+('pos','VBP', 'Verbs'),
+('pos','VBZ', 'Verbs'),
+('pos','PRP', 'Pronouns'),
+('pos','PRP$', 'Pronouns'),
+('pos','WP', 'Pronouns'),
+('pos','WP$', 'Pronouns'),
+('pos','CD', 'Numbers'),
+('pos','LS', 'Numbers')
+;
+
+-- Regex rule names grouped into PHI category bins.
+insert into FeatureBin values
+('regex', 'HOSPITAL_1', 'hospital'),
+('regex', 'HOSPITAL_2', 'hospital');
+
+
+-- Scratch list reused for each regex bin below (insert names, copy into
+-- FeatureBin with the bin label, truncate, repeat).
+drop table if exists temp_regex_list;
+create temporary table temp_regex_list
+(
+match_value varchar(50) not null
+);
+
+-- 'phone' bin
+insert into temp_regex_list values
+('EMAIL_ADDRESS'),
+('TELEPHONE2'),
+('TELEPHONE3'),
+('TELEPHONE0'),
+('TELEPHONE1'),
+('EXTENSION'),
+('IP');
+
+insert into FeatureBin select 'regex', match_value, 'phone' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'age' bin
+insert into temp_regex_list values
+('WRITTEN_AGE_10_100_DIV10'),
+('WRITTEN_AGE_110_TO_119'),
+('WRITTEN_AGE_20_TO_99'),
+('WRITTEN_AGE_1_TO_19'),
+('AGE'),
+('AGED_OVER'),
+('AGE4'),
+('AGE5'),
+('AGE6'),
+('AGE7'),
+('AGE8');
+
+insert into FeatureBin select 'regex', match_value, 'age' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'date' bin
+insert into temp_regex_list values
+('DATE'),
+('DATE1'),
+('DATE2'),
+('DATE3'),
+('DATE4'),
+('DATE5'),
+('DATE5_U'),
+('DATE6'),
+('DATE7'),
+('DATE8'),
+('DATE9'),
+('DATE11'),
+('DATE12'),
+('DATE13'),
+('DATE14'),
+('DATE15'),
+('DATE16'),
+('DATE17'),
+('DATE18'),
+('DATE19'),
+('DATE20'),
+('DATE21'),
+('DATE22'),
+('DATE_SEPARATORS'),
+('YEAR_CENTURY'),
+('MONTH'),
+('MONTH2'),
+('DISCHARGED'),
+('DISCHARGED2'),
+('DATE_23');
+
+insert into FeatureBin select 'regex', match_value, 'date' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'address' bin
+insert into temp_regex_list values
+('ADDRESS'),
+('ADDRESS2'),
+('ADDRESS4'),
+('ADDRESS5'),
+('ADDRESS6'),
+('ADDRESS7'),
+('ADDRESS8'),
+('POBOX'),
+('ROOM'),
+('ZIP_CODE'),
+('LOCATION_FLOOR_1'),
+('LOCATION_FLOOR_2');
+
+insert into FeatureBin select 'regex', match_value, 'address' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'doctor' bin
+insert into temp_regex_list values
+('SURGEON'),
+('SURGEON2'),
+('SURGEON3'),
+('SURGEON4'),
+('SURGEON5'),
+('SURGEON6'),
+('SURGEON7'),
+('SURGEON8'),
+('ASST'),
+('ASST2'),
+('ASST3'),
+('PA'),
+('CC'),
+('CC2'),
+('FROM'),
+('CONSULTATION'),
+('TECH'),
+('DOCTOR0_00'),
+('DOCTOR0_0'),
+('DOCTOR0_1'),
+('DOCTOR0_2'),
+('DOCTOR0'),
+('DOCTOR1'),
+('DOCTOR2'),
+('DOCTOR3'),
+('DOCTOR4'),
+('DOCTOR_OLDER'),
+('DOCTOR_GEN'),
+('DOCTOR_SUBHEAD');
+
+insert into FeatureBin select 'regex', match_value, 'doctor' from temp_regex_list;
+truncate temp_regex_list;
+
+-- 'patient' bin
+insert into temp_regex_list values
+('TITLES'),
+('TITLE_THIRD'),
+('PATIENT_NAME'),
+('MR'),
+('MS');
+
+
+insert into FeatureBin select 'regex', match_value, 'patient' from temp_regex_list;
+truncate temp_regex_list;
+
+CALL log('FeatureBin', 'create is done');
+call log('Step(2)', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrix.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,168 @@
+-- Populate FeatureMatrix with one (file, sample, feature, value) row per
+-- active feature per sample token.  Statement order matters: the final
+-- "bin" roll-up aggregates rows inserted by the earlier steps.
+call Session;
+
+call log('FeatureMatrix.sql', 'begin');
+
+call log('FeatureMatrix', 'begin');
+call log('FeatureMatrix', 'create');
+drop table if exists FeatureMatrix;
+create table FeatureMatrix
+(
+ file_idx smallint not null,
+ sample_idx integer not null,
+ feature_idx smallint not null,
+ feature_value float unsigned default 0
+) engine=myisam;
+
+call log('FeatureMatrix', 'TermFrequency');
+
+-- Feature indexes were assigned in Feature.sql; cache them in session vars.
+select feature_idx into @tf_ham_without_pos from Feature where match_source = 'tf_ham_without_pos';
+select feature_idx into @tf_ham_with_pos from Feature where match_source = 'tf_ham_with_pos';
+select feature_idx into @token_len from Feature where match_source = 'token_len';
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @tf_ham_without_pos, tf_ham_without_pos from TermFrequency;
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @tf_ham_with_pos, tf_ham_with_pos from TermFrequency;
+
+insert into FeatureMatrix
+select file_idx, sample_idx, @token_len, token_len from TermFrequency;
+
+call log('FeatureMatrix', 'TermFrequency:done');
+call log('FeatureMatrix', 'WordToken:cap');
+
+select feature_idx into @cap from Feature where annotation_type = 'WordToken' and match_source = 'cap';
+
+-- Capitalization code of the word token aligned with each sample.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, @cap, w.cap
+from Sample s, WordToken w
+where w.file_idx = s.file_idx and
+ w.token_seq = s.token_seq ;
+
+call log('FeatureMatrix', 'WordToken:cap:done');
+call log('FeatureMatrix', 'BaseToken');
+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+
+-- | WordToken | BaseToken | NULL | binary | 5 |
+-- | PunctuationToken | BaseToken | NULL | binary | 6 |
+-- | NumToken | BaseToken | NULL | binary | 7 |
+-- | SymbolToken | BaseToken | NULL | binary | 8 |
+-- | ContractionToken | BaseToken | NULL | binary | 9 |
+
+-- NumToken and WordToken only
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s
+where f.match_source = 'BaseToken' and
+ f.annotation_type = s.annotation_type;
+
+-- Flags samples whose PRECEDING token (token_seq-1) is punctuation/symbol/
+-- contraction.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, BaseToken b
+where f.match_source = 'BaseToken' and
+ f.annotation_type = b.annotation_type and
+ b.file_idx = s.file_idx and
+ b.token_seq = s.token_seq-1 and
+ b.annotation_type in ('PunctuationToken', 'SymbolToken', 'ContractionToken');
+
+call log('FeatureMatrix', 'BaseToken:done');
+call log('FeatureMatrix', 'Annotation');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | RomanNumeralAnnotation | Annotation | NULL | binary | 10 |
+-- | FractionAnnotation | Annotation | NULL | binary | 11 |
+-- | DateAnnotation | Annotation | NULL | binary | 12 |
+-- | MeasurementAnnotation | Annotation | NULL | binary | 13 |
+-- | RangeAnnotation | Annotation | NULL | binary | 14 |
+-- | PersonTitleAnnotation | Annotation | NULL | binary | 15 |
+-- | TimeAnnotation | Annotation | NULL | binary | 16 |
+
+-- Annotation is inside the WordNum
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, Annotation a
+where f.match_source = 'Annotation' and
+ f.annotation_type = a.annotation_type and
+ a.file_idx = s.file_idx and
+ a.start_idx >= s.start_idx and
+ a.end_idx <= s.end_idx ;
+
+-- WordNum is inside the Annotation
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, Annotation a
+where f.match_source = 'Annotation' and
+ f.annotation_type = a.annotation_type and
+ a.file_idx = s.file_idx and
+ a.start_idx <= s.start_idx and
+ a.end_idx >= s.end_idx ;
+
+call log('FeatureMatrix', 'Annotation:done');
+call log('FeatureMatrix', 'BaseToken:pos');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | BaseToken | pos | NN | binary | 32 |
+-- | BaseToken | pos | JJ | binary | 33 |
+-- | BaseToken | pos | CD | binary | 34 |
+-- | BaseToken | pos | IN | binary | 35 |
+-- | BaseToken | pos | NNP | binary | 36 |
+-- | BaseToken | pos | DT | binary | 37 |
+-- | ... | ... | ... | ... | .. |
+-- | BaseToken | pos | pos_paren | binary | 71 |
+-- +------------------------+--------------+-------------+------------+-------------+
+
+-- One binary feature per POS tag of the sample token.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s
+where f.annotation_type = 'BaseToken' and
+ f.match_source = 'pos' and
+ f.match_value = s.pos ;
+
+call log('FeatureMatrix', 'BaseToken:pos:done');
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | OntologyMatch | regex | DATE22 | binary | 72 |
+-- | OntologyMatch | regex | DATE21 | binary | 73 |
+-- | OntologyMatch | regex | DATE17 | binary | 74 |
+-- | OntologyMatch | regex | DATE18 | binary | 75 |
+-- | OntologyMatch | regex | DATE_SEPARATORS | binary | 76 |
+-- | OntologyMatch | regex | DATE1 | binary | 77 |
+-- | OntologyMatch | regex | DATE4 | binary | 78 |
+-- | OntologyMatch | regex | YEAR_CENTURY | binary | 79 |
+-- | OntologyMatch | regex | SUSPICIOUS_NUM | binary | 80 |
+-- | OntologyMatch | regex | MONTH | binary | 81 |
+-- | ... | ... | ... | ... | ... |
+-- | OntologyMatch | regex | ADDRESS | binary | 148 |
+-- +-----------------+--------------+-----------------+----------+-------------+
+
+-- Regex match spans that fully cover the sample token.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx, 1
+from Feature f, Sample s, OntologyMatch o
+where f.annotation_type = 'OntologyMatch' and
+ f.match_source = 'regex' and
+ f.match_source = o.match_source and
+ f.match_value = o.match_value and
+ o.file_idx = s.file_idx and
+ o.start_idx <= s.start_idx and
+ o.end_idx >= s.end_idx ;
+
+call log('FeatureMatrix', 'OntologyMatch:regex:done');
+
+call log('FeatureMatrix', 'bin');
+
+-- Roll the fine-grained rows inserted above up into their coarse bins
+-- (count per bin); reads and writes FeatureMatrix in one statement, so it
+-- must run last.
+insert into FeatureMatrix
+select fm.file_idx, fm.sample_idx, b.match_bin_feature_idx, count(*)
+from FeatureMatrix fm, FeatureBin b
+where fm.feature_idx = match_value_feature_idx
+group by fm.file_idx, fm.sample_idx, b.match_bin_feature_idx;
+
+call log('FeatureMatrix', 'bin:done');
+call log('FeatureMatrix.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/FeatureMatrixOnt.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,33 @@
+call Session;
+
+call log('FeatureMatrixOnt.sql', 'begin');
+
+call drop_table('FeatureMatrixOnt');
+
+create table FeatureMatrixOnt
+(
+ file_idx smallint unsigned not null,
+ sample_idx integer unsigned not null,
+ feature_idx smallint unsigned not null,
+ count_matches smallint unsigned not null
+)
+engine=myisam;
+
+call log('FeatureMatrixOnt', 'begin');
+
+call insert_FeatureMatrixOnt('dict') ;
+call insert_FeatureMatrixOnt('ICD10%') ;
+call insert_FeatureMatrixOnt('ICD9CM') ;
+call insert_FeatureMatrixOnt('RXNORM') ;
+call insert_FeatureMatrixOnt('HL7V3.0') ;
+call insert_FeatureMatrixOnt('HL7V2.5') ;
+call insert_FeatureMatrixOnt('COSTAR');
+call insert_FeatureMatrixOnt('MSH');
+call insert_FeatureMatrixOnt('LNC');
+call insert_FeatureMatrixOnt('SNOMEDCT');
+
+insert into FeatureMatrix select * from FeatureMatrixOnt;
+
+call log('FeatureMatrixOnt', 'done');
+
+call log('FeatureMatrixOnt.sql', 'done');
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/Sample.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,157 @@
+call Session;
+
+call log('Sample', 'begin');
+call log('Sample', 'create');
+call log('Sample', 'BaseToken');
+
+call drop_table('Sample');
+
+create table Sample
+ select * from BaseToken where
+ annotation_type = 'NumToken' or
+ annotation_type = 'WordToken'
+ order by file_idx, token_seq asc;
+
+call log('Sample', 'BaseToken');
+call log('Sample', 'index');
+
+alter table Sample engine = myisam;
+alter table Sample add class_idx tinyint UNSIGNED NULL;
+alter table Sample add sample_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+alter table Sample add index (file_idx);
+alter table Sample add index (file_idx, token_seq);
+alter table Sample add index (file_idx, start_idx, end_idx);
+
+call log('Sample', 'index:done');
+call log('Sample', 'class:begin');
+
+call log('Class', 'table');
+drop table if exists Class;
+create table Class
+(
+ class_idx tinyint unsigned,
+ class varchar(10) not null
+)
+engine = myisam;
+
+insert into Class values
+ (1, 'NA'),
+ (2, 'AGE'),
+ (3, 'DATE'),
+ (4, 'DOCTOR'),
+ (5, 'HOSPITAL'),
+ (6, 'ID'),
+ (7, 'LOCATION'),
+ (8, 'PATIENT'),
+ (9, 'PHONE');
+
+call log('Class', 'done');
+
+update Sample s, Class c
+set s.class_idx = c.class_idx
+where s.class = c.class;
+
+update Sample set class_idx = 0 where class_idx is null;
+
+call log('Sample', 'class:done');
+call log('Sample', 'sentence');
+
+call drop_table('tmp_sentence_bounds');
+
+create temporary table tmp_sentence_bounds select
+file_idx, sentence_seq, min(sample_idx) sample_idx_start, max(sample_idx) sample_idx_end
+from Sample group by file_idx, sentence_seq order by file_idx, sentence_seq;
+
+call drop_table('tmp_Sentence');
+
+create table tmp_Sentence select
+ annotation_type ,
+ filename_short ,
+ start_idx ,
+ end_idx ,
+ sentence_seq ,
+ file_idx
+from Sentence
+order by file_idx, sentence_seq;
+
+alter table tmp_Sentence add column sample_idx_start mediumint unsigned;
+alter table tmp_Sentence add column sample_idx_end mediumint unsigned;
+
+alter table tmp_Sentence add index (file_idx, sentence_seq);
+alter table tmp_Sentence add index (file_idx, sample_idx_start, sample_idx_end);
+
+update tmp_Sentence s, tmp_sentence_bounds bounds
+set s.sample_idx_start = bounds.sample_idx_start,
+ s.sample_idx_end = bounds.sample_idx_end
+where s.file_idx = bounds.file_idx and
+ s.sentence_seq = bounds.sentence_seq ;
+
+call drop_table('EmptySentence');
+
+create table EmptySentence
+select * from tmp_Sentence where sample_idx_end = 0;
+delete from tmp_Sentence where sample_idx_end = 0;
+
+alter table Sentence rename to dropme;
+alter table tmp_Sentence rename to Sentence;
+
+call log('Sample', 'sentence:index');
+
+alter table Sentence add column sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Sample add column sentence_idx integer unsigned ;
+drop table dropme;
+
+update Sample sample, Sentence s
+set sample.sentence_idx = s.sentence_idx
+where sample.file_idx = s.file_idx and
+ sample.start_idx >= s.start_idx and
+ sample.end_idx <= s.end_idx ;
+
+call log('Sample', 'sentence:done');
+call log('Sample', 'done');
+call log('TermFrequency', 'begin');
+
+call drop_table('TermFrequency');
+call log('TermFrequency', 'create');
+
+create table TermFrequency
+(
+ file_idx smallint unsigned,
+ sample_idx int(10) unsigned,
+
+ start_idx smallint unsigned,
+ end_idx smallint unsigned,
+
+ token varchar(100) not null,
+ token_len tinyint unsigned,
+
+ pos varchar(25) ,
+ tf_ham_with_pos double default -1,
+ tf_ham_without_pos double default -1
+
+) engine = myisam;
+
+insert into TermFrequency (file_idx, sample_idx, start_idx, end_idx, pos, token, token_len)
+select file_idx, sample_idx, start_idx, end_idx, pos, token, length(token)
+from Sample order by sample_idx;
+
+alter table TermFrequency add index (file_idx, sample_idx, start_idx);
+alter table TermFrequency add index (pos);
+
+update TermFrequency tf, Calculation c
+set tf_ham_without_pos = match_value
+where 'tf_ham_without_pos' = c.match_source and
+ tf.file_idx = c.file_idx and
+ tf.start_idx = c.start_idx and
+ tf.end_idx = c.end_idx ;
+
+update TermFrequency tf, Calculation c
+set tf_ham_with_pos = match_value
+where 'tf_ham_with_pos' = c.match_source and
+ tf.file_idx = c.file_idx and
+ tf.start_idx = c.start_idx and
+ tf.end_idx = c.end_idx ;
+
+call log('TermFrequency', 'done');
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/machine/load.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,168 @@
+truncate log;
+
+call Session;
+
+call log('load_machine_annotations.sql', 'begin');
+
+CALL log('Annotation', 'begin');
+
+CALL drop_table('Annotation');
+
+create table Annotation
+(
+ annotation_type varchar(50),
+ filename_short varchar(20),
+ start_idx smallint,
+ end_idx smallint
+) engine = myisam;
+
+load data infile '/tmp/mysql/Annotation.csv' into table Annotation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Annotation', 'done');
+CALL log('Sentence', 'begin');
+
+CALL drop_table('Sentence');
+
+create table Sentence like Annotation;
+alter table Sentence add column sentence_seq smallint not null;
+
+load data infile '/tmp/mysql/Sentence.csv' into table Sentence
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Sentence', 'done');
+CALL log('BaseToken', 'begin');
+
+CALL drop_table('BaseToken');
+create table BaseToken like Annotation;
+
+alter table BaseToken add column token_seq smallint not null;
+alter table BaseToken add column pos varchar(25) not null;
+alter table BaseToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/BaseToken.csv' into table BaseToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('BaseToken', 'done');
+CALL log('NewLineToken', 'skipping...');
+-- CALL log('NewLineToken', 'begin');
+--
+-- create table NewLineToken like BaseToken;
+--
+-- load data infile '/tmp/mysql/NewLineToken.csv' into table NewLineToken
+-- fields terminated by ',' ignore 1 lines;
+--
+-- CALL log('NewLineToken', 'done');
+CALL log('WordToken', 'begin');
+
+CALL drop_table('WordToken');
+create table WordToken like Annotation;
+
+alter table WordToken add column token_seq smallint not null;
+alter table WordToken add column pos varchar(25) not null;
+alter table WordToken add column cap tinyint not null;
+alter table WordToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/WordToken.csv' into table WordToken
+fields terminated by ',' ignore 1 lines;
+
+alter table WordToken add index(pos, cap);
+
+CALL log('WordToken', 'done');
+CALL log('OntologyMatch','begin');
+
+CALL drop_table('OntologyMatch');
+create table OntologyMatch like Annotation;
+
+alter table OntologyMatch add column match_source varchar(50) not null;
+alter table OntologyMatch add column match_value varchar(50) not null;
+alter table OntologyMatch add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/OntologyMatch.csv' into table OntologyMatch
+fields terminated by ',' ignore 1 lines;
+
+CALL log('OntologyMatch', 'done');
+CALL log('Calculation', 'begin');
+
+CALL drop_table('Calculation');
+create table Calculation like OntologyMatch;
+
+alter table Calculation drop column token;
+
+load data infile '/tmp/mysql/Calculation.csv' into table Calculation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Calculation', 'done');
+CALL log('file_idx', 'begin');
+
+alter table Annotation add column file_idx smallint not null;
+alter table BaseToken add column file_idx smallint not null;
+alter table Sentence add column file_idx smallint not null;
+alter table WordToken add column file_idx smallint not null;
+alter table OntologyMatch add column file_idx smallint not null;
+alter table Calculation add column file_idx smallint not null;
+
+update Annotation set file_idx = trim(replace(filename_short, '.txt', ''));
+update BaseToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update Sentence set file_idx = trim(replace(filename_short, '.txt', ''));
+update WordToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update OntologyMatch set file_idx = trim(replace(filename_short, '.txt', ''));
+update Calculation set file_idx = trim(replace(filename_short, '.txt', ''));
+
+CALL log('file_idx', 'done');
+CALL log('index', 'begin');
+
+CALL log('WordToken', 'index');
+alter table WordToken add index (file_idx);
+alter table WordToken add index (file_idx, token_seq);
+alter table WordToken add index (file_idx, start_idx, end_idx);
+
+CALL log('BaseToken', 'index');
+alter table BaseToken add index (filename_short);
+alter table BaseToken add index (file_idx, token_seq);
+alter table BaseToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (annotation_type, file_idx, token_seq);
+
+CALL log('Annotation', 'index');
+alter table Annotation add index (annotation_type);
+alter table Annotation add index (file_idx, start_idx, end_idx);
+
+CALL log('Calculation', 'index');
+alter table Calculation add index (file_idx, start_idx, end_idx);
+
+CALL log('OntologyMatch', 'index');
+alter table OntologyMatch add index(match_source);
+alter table OntologyMatch add index(match_source, match_value);
+alter table OntologyMatch add index(file_idx, start_idx, end_idx);
+alter table OntologyMatch add index(match_source, match_value, file_idx, start_idx, end_idx);
+
+CALL log('Sentence', 'index');
+alter table Sentence add index(file_idx, start_idx, end_idx);
+alter table Sentence add index(file_idx, sentence_seq);
+
+CALL log('index', 'done');
+call log('BaseToken:sentence_seq', 'begin');
+
+alter table BaseToken add sentence_seq integer not null;
+
+update BaseToken b, Sentence s
+set b.sentence_seq = s.sentence_seq
+where b.file_idx = s.file_idx and
+ b.start_idx >= s.start_idx and
+ b.end_idx <= s.end_idx ;
+
+call log('BaseToken:sentence_seq', 'done');
+
+alter table BaseToken add class varchar(20) not null;
+
+update BaseToken b, HumanAnnotations phi
+set b.class = phi.tag_name
+where phi.DATASET = @DATASET and
+ b.filename_short= phi.subject_id and
+ b.start_idx >= phi.start_idx and
+ b.end_idx <= phi.end_idx ;
+
+call log('BaseToken:class', 'done.');
+CALL log('Annotation', 'done');
+
+call log('load_machine_annotations.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README Sun Jul 7 19:39:00 2013
@@ -0,0 +1,50 @@
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
+| entity_topic | entity_type | entity_name | example | description |
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
+| README | version | SCRUBBER 3.1-SNAPSHOT | | |
+| README | version | cTAKES 1.2.2 | | |
+| global | | | | |
+| global | variable | @DATASET | select TEST into @DATASET | (TRAIN | TEST) select dataset |
+| global | variable | @CSV_DIR | select /tmp/mysql/ into @CSV_DIR | call csvwrite(table) |
+| global | variable | @LOG_LEVEL | select 1 into @LOG_LEVEL | turn on printing log to console |
+| shell | | | | |
+| shell | procedure | mem | call mem | get schema +memory usage |
+| shell | procedure | ps | call ps | show current sql process |
+| shell | procedure | csvread | call csvread(WordToken) | csv->table |
+| shell | procedure | csvwrite | call csvwrite(WordToken) | table->csv |
+| logging | | | | |
+| logging | table | log | call log | tail | etime | scrubber application log |
+| logging | procedure | log | call log (tablename, message) | add new log event |
+| logging | procedure | tail | call tail | print recent log events |
+| logging | procedure | etime | call etime | print elapsed time from log |
+| ctakes | | | | |
+| ctakes | uima | Annotation | desc table | csvread | csvwrite | UIMA base class |
+| ctakes | -> | Sentence | desc table | csvread | csvwrite | sequence of annotation tokens |
+| ctakes | -> | Calculation | desc table | csvread | csvwrite | TermFrequency |
+| ctakes | -> | OntologyMatch | desc table | csvread | csvwrite | UMLS, Regex, and Private dictionaries |
+| ctakes | type | BaseToken | desc table | csvread | csvwrite | CTAKES base class |
+| ctakes | -> | NumToken | desc table | csvread | csvwrite | token is a number |
+| ctakes | -> | WordToken | desc table | csvread | csvwrite | token is a word |
+| Classification | | | | |
+| Classification | table | Class | desc table | show index | PHI| NA |
+| Classification | table | Sample | desc table | show index | Class ( NumToken | WordToken ) |
+| Classification | table | Feature | desc table | show index | Lexical | Stat | Pattern | Ontology |
+| Classification | table | FeatureBin | desc table | show index | POS | UMLS | regex |
+| Classification | table | FeatureMatrix | desc table | show index | (row,col) = (Sample,Feature) |
+| Classification | table | FeatureMatrixOnt | desc table | show index | (row,col) = (Sample,OntologyMatch) |
+| Features | | | | |
+| Features | type | Feature | Lexical | Stat | Pattern | Ontology | |
+| Features | -> | Lexical | POS* | Capital | cTAKES* | |
+| Features | -> | Lexical | POS* | Part of Speech (BIN)? |
+| Features | -> | Lexical | Capital | Token Capitalization |
+| Features | -> | Lexical | cTAKES* | (context dependent) |
+| Features | -> | Stat | TF* | TokenLength | BagOfWords | |
+| Features | -> | Stat | TF* | Term Frequency (POS)? |
+| Features | -> | Stat | TokenLength | length(token) |
+| Features | -> | Stat | BagOfWords | #OntologyMatches per ontology |
+| Features | -> | Pattern | | |
+| Features | -> | Pattern | Beckwith Regular Expressions | Beckwith et al. 2006 |
+| Features | -> | Pattern | custom | custom regex.list |
+| Features | -> | Ontology | | |
+| Features | -> | Ontology | UMLS | Hl7|ICD|LOINC|MESH|RXNORM|SNOMED|COSTAR |
++----------------+-------------+-----------------------+-------------------------------------+-----------------------------------------+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/README.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,103 @@
+drop table if exists README;
+create table README
+(
+ entity_topic varchar(50) not null default '',
+ entity_type varchar(50) not null default '',
+ entity_name varchar(50) not null default '',
+ example text null,
+ description text null
+);
+
+insert into README values
+('README', 'version', 'SCRUBBER 3.1-SNAPSHOT', null, null),
+('README', 'version', 'cTAKES 1.2.2', null, null);
+
+insert into README values
+('global','','','',''),
+('global', 'variable', '@DATASET', 'select TEST into @DATASET', '(TRAIN | TEST) select dataset'),
+('global', 'variable', '@CSV_DIR', 'select /tmp/mysql/ into @CSV_DIR', 'call csvwrite(table) '),
+('global', 'variable', '@LOG_LEVEL', 'select 1 into @LOG_LEVEL', 'turn on printing log to console');
+
+insert into README values
+('shell','','','',''),
+('shell', 'procedure', 'mem', 'call mem', 'get schema +memory usage'),
+('shell', 'procedure', 'ps', 'call ps', 'show current sql process'),
+('shell', 'procedure', 'csvread', 'call csvread(WordToken)', 'csv->table'),
+('shell', 'procedure', 'csvwrite', 'call csvwrite(WordToken)', 'table->csv');
+
+insert into README values
+('logging','','','',''),
+('logging', 'table', 'log', 'call log | tail | etime', 'scrubber application log'),
+('logging', 'procedure', 'log', 'call log (tablename, message)', 'add new log event'),
+('logging', 'procedure', 'tail', 'call tail', 'print recent log events'),
+('logging', 'procedure', 'etime', 'call etime', 'print elapsed time from log');
+
+select 'desc table | csvread | csvwrite' into @csv_readwrite;
+select 'desc table | show index' into @desc_table;
+
+insert into README values
+('ctakes', '', '', null,null),
+('ctakes', 'uima', 'Annotation', @csv_readwrite, 'UIMA base class'),
+('ctakes', '->', 'Sentence', @csv_readwrite, 'sequence of annotation tokens'),
+('ctakes', '->', 'Calculation', @csv_readwrite, 'TermFrequency'),
+('ctakes', '->', 'OntologyMatch', @csv_readwrite, 'UMLS, Regex, and Private dictionaries'),
+('ctakes', 'type', 'BaseToken', @csv_readwrite, 'CTAKES base class'),
+('ctakes', '->', 'NumToken', @csv_readwrite, 'token is a number'),
+('ctakes', '->', 'WordToken', @csv_readwrite, 'token is a word');
+
+
+
+select 'PHI| NA' into @class;
+select 'Class ( NumToken | WordToken )' into @sample;
+select 'Lexical | Stat | Pattern | Ontology' into @feature;
+
+insert into README values
+('Classification','','','',''),
+('Classification', 'table', 'Class', @desc_table, @class),
+('Classification', 'table', 'Sample', @desc_table, @sample),
+('Classification', 'table', 'Feature', @desc_table, @feature),
+('Classification', 'table', 'FeatureBin', @desc_table, 'POS | UMLS | regex'),
+('Classification', 'table', 'FeatureMatrix', @desc_table, '(row,col) = (Sample,Feature)'),
+('Classification', 'table', 'FeatureMatrixOnt', @desc_table, '(row,col) = (Sample,OntologyMatch)');
+
+select 'POS* | Capital | cTAKES*' into @lexical;
+
+insert into README values
+('Features','','','',''),
+('Features', 'type', 'Feature', @feature, null),
+('Features', '->', 'Lexical', @lexical, null),
+('Features', '->', 'Lexical', 'POS*', 'Part of Speech (BIN)?'),
+('Features', '->', 'Lexical', 'Capital', 'Token Capitalization'),
+('Features', '->', 'Lexical', 'cTAKES*', '(context dependent)');
+
+select 'TF* | TokenLength | BagOfWords' into @stat;
+
+insert into README values
+('Features', '->', 'Stat', @stat, null),
+('Features', '->', 'Stat', 'TF*', 'Term Frequency (POS)?'),
+('Features', '->', 'Stat', 'TokenLength', 'length(token)'),
+('Features', '->', 'Stat', 'BagOfWords', '#OntologyMatches per ontology');
+
+select 'Beckwith Regular Expressions' into @pattern;
+
+insert into README values
+('Features', '->', 'Pattern', null, null),
+('Features', '->', 'Pattern', @pattern, 'Beckwith et al. 2006'),
+('Features', '->', 'Pattern', 'custom', 'custom regex.list');
+
+select 'UMLS | custom' into @ont;
+select 'Hl7|ICD|LOINC|MESH|RXNORM|SNOMED|COSTAR' into @umls;
+
+insert into README values
+('Features', '->', 'Ontology', null, null),
+('Features', '->', 'Ontology', 'UMLS', @umls);
+
+
+-- TODO REFACTOR
+-- insert into README values
+-- ('Human', 'table', 'HumanAnnotation', 'REFACTOR', 'REFACTOR');
+
+update README set example='' where example is null;
+update README set description='' where description is null;
+update README set entity_name='' where entity_name is null;
+update README set entity_topic='' where entity_topic is null;
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/drop_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,22 @@
+select 'Dropping Annotation Tables and Samples, Features, and FeatureMatrix' as 'note';
+
+drop table if exists README;
+drop table if exists Annotation;
+drop table if exists OntologyMatch;
+drop table if exists BaseToken;
+drop table if exists WordToken;
+drop table if exists Sentence;
+drop table if exists EmptySentence;
+drop table if exists NewLineToken;
+drop table if exists Calculation;
+drop table if exists POS;
+drop table if exists TermFrequency;
+drop table if exists Class;
+drop table if exists Sample;
+drop table if exists Feature;
+drop table if exists FeatureBin;
+drop table if exists FeatureMatrix;
+drop table if exists FeatureMatrixOnt;
+drop table if exists MyFeatureSelection;
+
+select 'Dropping tables is done' as 'note';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/procedures.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,324 @@
+select 'Creating procedures, see README for usage' as note;
+
+drop procedure if exists README;
+drop procedure if exists CSV_DIR;
+drop procedure if exists LOG_LEVEL;
+drop procedure if exists drop_table;
+drop procedure if exists Session;
+drop procedure if exists log;
+drop procedure if exists tail;
+drop procedure if exists head;
+drop procedure if exists etime;
+drop procedure if exists mem;
+drop procedure if exists ps;
+drop procedure if exists csvread;
+drop procedure if exists csvwrite;
+drop procedure if exists insert_FeatureMatrixOnt;
+
+drop procedure if exists README;
+delimiter //
+create procedure README()
+begin
+ select * from README;
+end//
+delimiter ;
+
+drop procedure if exists DATASET;
+delimiter //
+create procedure DATASET(user_dataset varchar(10))
+begin
+ if @DATASET is null then
+ call SessionResume;
+ end if;
+
+ select user_dataset into @DATASET;
+
+ call log('DATASET', @DATASET) ;
+ call Session;
+
+end//
+delimiter ;
+
+drop procedure if exists USE_TRAIN;
+delimiter //
+create procedure USE_TRAIN()
+begin
+
+ call log('DATASET', 'SWITCH') ;
+ call DATASET('TRAIN');
+end//
+delimiter ;
+
+drop procedure if exists USE_TEST;
+delimiter //
+create procedure USE_TEST()
+begin
+
+ call log('DATASET', 'SWITCH') ;
+ call DATASET('TEST');
+end//
+delimiter ;
+
+
+drop procedure if exists CSV_DIR;
+delimiter //
+create procedure CSV_DIR(csv_dir varchar(1000))
+begin
+ select csv_dir into @CSV_DIR;
+ call log('CSV_DIR', csv_dir) ;
+end//
+delimiter ;
+
+drop procedure if exists LOG_LEVEL;
+delimiter //
+create procedure LOG_LEVEL(log_level tinyint)
+begin
+
+ select log_level into @LOG_LEVEL;
+ call log('LOG_LEVEL', log_level) ;
+end//
+delimiter ;
+
+
+drop table if exists Session;
+CREATE TABLE Session (
+ DATASET varchar(5),
+ CSV_DIR varchar(1000),
+ LOG_LEVEL tinyint
+);
+
+alter table Session add column session_id int UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Session add column session_start timestamp default now();
+
+drop procedure if exists Session;
+delimiter //
+create procedure Session()
+begin
+
+ call SessionResume;
+
+ if @DATASET is null then
+
+ call README;
+ select '@@@@' as warning;
+ select '@DATASET is null? which one to use, train or test?' as warning;
+ select 'Terminating now.' from _WARNING_TRAIN_OR_TEST;
+ end if;
+
+ insert into Session(LOG_LEVEL, DATASET, CSV_DIR) select @LOG_LEVEL, @DATASET, @CSV_DIR;
+
+ select max(session_id) into @session_id from Session;
+ select concat('@DATASET=', @DATASET,' @CSV_DIR=',@CSV_DIR, ' @session_id=', @session_id,' @LOG_LEVEL=', @LOG_LEVEL)
+ as 'your current variable settings = ';
+
+ call log('Session', 'started');
+
+end//
+delimiter ;
+
+-- TODO: @@@NOTICE:
+insert into Session (DATASET, CSV_DIR, LOG_LEVEL) select 'TEST', '/tmp/mysql/', 1;
+
+drop procedure if exists SessionResume;
+delimiter //
+create procedure SessionResume()
+begin
+ if @CSV_DIR is null then
+ select CSV_DIR into @CSV_DIR from Session order by session_id desc limit 1;
+ end if;
+
+ if @DATASET is null then
+ select DATASET into @DATASET from Session order by session_id desc limit 1;
+ end if;
+
+ if @LOG_LEVEL is null then
+ select LOG_LEVEL into @LOG_LEVEL from Session order by session_id desc limit 1;
+ end if;
+end//
+delimiter ;
+
+
+drop table if exists log;
+CREATE TABLE log
+(
+ event_time timestamp default now(),
+ session_id integer NOT NULL,
+ entity_name varchar(100) NOT NULL,
+ message varchar(100) NULL,
+ DATASET varchar(5) NOT null default '?????'
+);
+ALTER TABLE log add idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+drop procedure if exists log;
+delimiter //
+create procedure log(log_entity_name varchar(100), log_message varchar(100))
+begin
+
+ insert into log
+ (DATASET, session_id, entity_name, message) select
+ @DATASET, @session_id, log_entity_name, log_message;
+
+ if @LOG_LEVEL > 0 then
+ select concat(DATASET, ',', entity_name, ',',message) as '' from log order by idx desc limit 1;
+ end if;
+end//
+delimiter ;
+
+call Session;
+call log('LOG', 'create table');
+
+
+drop procedure if exists tail;
+delimiter //
+create procedure tail()
+begin
+ select event_time, DATASET, entity_name, message from log order by idx DESC limit 50;
+end//
+delimiter ;
+
+drop procedure if exists head;
+delimiter //
+create procedure head()
+begin
+ select event_time, DATASET, entity_name, message from log order by idx ASC limit 50;
+end//
+delimiter ;
+
+drop procedure if exists etime;
+delimiter //
+create procedure etime()
+begin
+ select l1.idx, l1.event_time, l1.entity_name, l1.message, l1.event_time,
+ UNIX_TIMESTAMP(l1.event_time)-UNIX_TIMESTAMP(l2.event_time) as etime
+ from log l1, log l2
+ where l1.idx = l2.idx+1
+ order by l1.idx asc;
+end//
+delimiter ;
+
+drop procedure if exists mem;
+delimiter //
+create procedure mem()
+begin
+ select
+ table_schema,
+ ENGINE,
+ TABLE_NAME,
+ TABLE_ROWS,
+ DATA_LENGTH,
+ concat( round( data_length / ( 1024 *1024 ) , 2 ) , 'M' ) data_length,
+ concat( round( index_length / ( 1024 *1024 ) , 2 ) , 'M' ) index_length
+ from
+ information_schema.TABLES
+ where
+ TABLE_SCHEMA = 'scrubber'
+ order by
+ table_schema, engine, table_name;
+end//
+delimiter ;
+
+drop procedure if exists ps;
+delimiter //
+create procedure ps()
+begin
+ select * from information_schema.processlist
+ where information_schema.processlist.INFO not like '%information_schema.processlist%';
+end//
+delimiter ;
+
+drop procedure if exists csvread;
+delimiter //
+create procedure csvread(mytable varchar(500))
+begin
+
+ select concat( ' load data '
+ ' local infile ', "'", concat(@CSV_DIR, mytable, '.csv'), "'",
+ ' into table ', mytable,
+ ' fields terminated by ',"'", ",","'",' ignore 1 lines') into @csvout;
+
+ select @csvout;
+
+ prepare stmt from @csvout;
+ execute stmt;
+end//
+delimiter ;
+
+drop procedure if exists csvwrite;
+delimiter //
+create procedure csvwrite(mytable varchar(500))
+begin
+
+ select concat(mytable, '.csv') into @table_csv;
+
+ select concat( ' SELECT * from ', mytable,
+ ' into outfile ', "'", concat(@CSV_DIR, @table_csv), "'",
+ ' fields terminated by ', "'", ',', "'"
+ ' lines terminated by ', "'", '\\n', "'") into @csvout;
+
+-- select @csvout;
+
+ prepare stmt from @csvout;
+ execute stmt;
+end//
+delimiter ;
+
+drop procedure if exists drop_table;
+delimiter //
+create procedure drop_table(table_to_drop varchar(50))
+begin
+
+ set @s:=concat('drop table if exists ', table_to_drop);
+
+ call log(table_to_drop, 'drop_table()');
+
+ prepare stmt from @s;
+ execute stmt;
+end//
+delimiter ;
+
+
+-- TODO : refactor below
+-- call log('insert_FeatureMatrixOnt()', '');
+
+drop procedure if exists insert_FeatureMatrixOnt;
+delimiter //
+create procedure insert_FeatureMatrixOnt(ontology varchar(100))
+begin
+
+call log(ontology, 'temp');
+
+drop table if exists tmp_OM , tmp_FeatureMatrixOntOnt;
+
+create temporary table tmp_OM select file_idx, start_idx, end_idx, f.feature_idx, token from OntologyMatch om, Feature f
+ where om.match_source like ontology and om.match_source = f.match_value;
+
+alter table tmp_OM add index (file_idx, start_idx, end_idx);
+
+select count(*) into @count_before from FeatureMatrixOnt;
+
+call log('insert_FeatureMatrixOnt(before)', @count_before);
+
+insert into FeatureMatrixOnt
+select s.file_idx, s.sample_idx,o.feature_idx, count(*) as count_matches
+from Sample s, tmp_OM o
+where o.file_idx = s.file_idx and
+ o.start_idx <= s.start_idx and
+ o.end_idx >= s.end_idx
+group by s.file_idx, s.sample_idx,o.feature_idx;
+
+select count(*) into @count_now from FeatureMatrixOnt;
+select @count_now-@count_before into @count_diff;
+
+call log(ontology, @count_diff);
+call log('insert_FeatureMatrixOnt(now)', @count_now);
+
+call log(ontology, 'done');
+call log('insert_FeatureMatrixOnt()', ontology);
+
+end//
+delimiter ;
+
+call log('procedures.sql', 'done');
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/CSVAnnotation/schema/schema.sh Sun Jul 7 19:39:00 2013
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+scrubberdb < drop_tables.sql
+scrubberdb < procedures.sql
+scrubberdb < README.sql
+scrubberdb -e 'call README';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_database_and_user.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,8 @@
+drop database if exists scrubber;
+
+create database scrubber
+default character set=utf8
+default collate = utf8_general_ci;
+
+create user 'scrubber'@'localhost' identified by 'scrubber';
+grant all on scrubber.* to 'scrubber'@'localhost';
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_cases.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,25 @@
+-- CASES INDEXES-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- NOTE(review): these statements target tables named 'annotations',
+-- 'feature_matrix' and 'human_annotations', but create_tables.sql in this
+-- commit creates 'machine_annotations_train'/'_test',
+-- 'feature_matrix_train'/'_test' and 'human_annotations_train'/'_test'.
+-- Presumably a rename (or the train/test split being swapped in) happens
+-- before this script runs -- TODO confirm against the ETL scripts.
+-- Reusing an index name (e.g. idxShortFile, idxStartIdx) on different
+-- tables is fine: MySQL scopes index names per table.
+-- critical idx:
+-- create index idxCovering on annotations (filename_short,token);
+create index idxCovering2 on annotations (filename_short, token, start_idx, end_idx);
+create index idxToken on annotations(token);
+create index idxShortFile on annotations (filename_short);
+-- tf
+create index idxTokPos on lookup_term_frequency (token,pos);
+
+-- useful idx:
+create index idxMatchSource on annotations (match_source);
+create index idxMatchValue on annotations (match_value);
+create index idxMatchKVPair on annotations (match_source, match_value);
+create index idxShortAnnot on annotations (annot_type_short);
+
+
+-- case feature indexes -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+create index idxIsPhi on feature_matrix(is_phi);
+create index idxShortFile on feature_matrix(filename_short);
+create index idxStartIdx on feature_matrix(start_idx);
+
+-- other
+create index idxStartIdx on human_annotations(start_idx);
+create index idxEndIdx on human_annotations(end_idx);
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_indexes_pubs.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,17 @@
+-- PUBS INDEXES-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- critical idx:
+create index idxCovering on machine_annotations_pubs (filename_short,token);
+create index idxToken on machine_annotations_pubs(token);
+create index idxShortFile on machine_annotations_pubs (filename_short);
+create index idxMatchSource on machine_annotations_pubs (match_source);
+
+-- useful idx:
+create index idxMatchValue on machine_annotations_pubs (match_value);
+create index idxMatchKVPair on machine_annotations_pubs (match_source, match_value);
+create index idxShortAnnot on machine_annotations_pubs (annot_type_short);
+
+-- Pub metadata tables are created as pubs_keywords / pubs_authors /
+-- pubs_refs by create_tables.sql; the unprefixed names used previously
+-- ('keywords', 'authors', 'refs') do not exist and would make this
+-- script fail. The varchar(1000) columns also exceed MyISAM's 1000-byte
+-- key limit under utf8 (3 bytes/char), so index a 255-char prefix --
+-- the same convention as lookup_umls's X_STR (STR(255)) key.
+create index kwdIdx on pubs_keywords(keyword(255));
+create index pubIdIdx on pubs_authors(pub_id);
+create index pubIdIdx on pubs_refs(pub_id);
+create index jtitleIdx on pubs(journal_title(255));
+create index fileIdx on pubs(filepath(255));
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,192 @@
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- BEWARE: string comparisons are done against many tables
+-- and it is assumed "COLLATE utf8_general_ci" for default db collation
+-- (set by create_database_and_user.sql; 'utf8_ci' is not a real MySQL
+-- collation name).
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+-- Subset of UMLS concept names (CUI/source vocabulary/string) used by the
+-- scrubber dictionary annotator.
+CREATE TABLE lookup_umls (
+ id int(11) NOT NULL auto_increment,
+ CUI char(10) NOT NULL,
+ SAB varchar(20) NOT NULL,
+ STR text NOT NULL,
+ PRIMARY KEY (id),
+ KEY X_CUI (CUI),
+ KEY X_SAB (SAB),
+ -- 255-char prefix: a full TEXT column cannot be indexed, and MyISAM
+ -- caps key length at 1000 bytes (utf8 = up to 3 bytes per char).
+ KEY X_STR (STR(255))
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for dictionaryAnnotator, this implements the old txt dictionaries from earlier versions of scrubber.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+CREATE TABLE lookup_dictionary (
+ id int(11) NOT NULL auto_increment,
+ code varchar(50) NOT NULL,
+ ontology varchar(50) NOT NULL,
+ str text NOT NULL,
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id),
+ KEY strIdx (str(255))
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for tf of tokens across a corpus, group by token, pos
+-- (the (token,pos) index is added later by create_indexes_cases.sql)
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+
+CREATE TABLE lookup_term_frequency (
+ id int(11) NOT NULL auto_increment,
+ cnt int(11) NOT NULL ,
+ token varchar(100) NOT NULL,
+ pos varchar(10),
+ source varchar(20) NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- intermediate table used for comparing the distribution of part of speech
+-- between pubs, cases, and known phi tokens.
+-- pos_bin presumably holds a coarse bucket of the fine-grained pos tag --
+-- TODO confirm against the loader that populates this table.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE pubs_pos (
+ id int(11) NOT NULL auto_increment,
+ pos varchar(10),
+ pos_bin varchar(10),
+ pos_cnt int(11) NOT NULL ,
+ PRIMARY KEY (id)
+) ENGINE=innodb DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create feature table for CASES
+-- One row per token occurrence; the cnt_* columns appear to be per-token
+-- hit counts from the regex/dictionary annotators named in each column --
+-- TODO confirm against the feature-extraction code that fills this table.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE feature_matrix_train (
+ id int(11) NOT NULL auto_increment,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL ,
+ end_idx int(11) NOT NULL ,
+ filename_short varchar(20) NOT NULL,
+ pos varchar(10),
+ pos_bin varchar(10) NOT NULL,
+ has_capital int(11) NOT NULL default 0 ,
+ cnt_regex_phon int(11) NOT NULL ,
+ cnt_regex_date int(11) NOT NULL ,
+ cnt_regex_age int(11) NOT NULL ,
+ cnt_regex_id int(11) NOT NULL ,
+ cnt_regex_pat int(11) NOT NULL ,
+ cnt_regex_doc int(11) NOT NULL ,
+ cnt_regex_loc int(11) NOT NULL ,
+ cnt_regex_hosp int(11) NOT NULL ,
+ cnt_priv int(11) NOT NULL ,
+ cnt_hosp int(11) NOT NULL ,
+ cnt_name int(11) NOT NULL ,
+ cnt_dict_costar int(11) NOT NULL ,
+ cnt_dict_hl7v25 int(11) NOT NULL ,
+ cnt_dict_hl7v30 int(11) NOT NULL ,
+ cnt_dict_icd10cm int(11) NOT NULL ,
+ cnt_dict_icd10pcs int(11) NOT NULL ,
+ cnt_dict_icd9cm int(11) NOT NULL ,
+ cnt_dict_lnc int(11) NOT NULL ,
+ cnt_dict_msh int(11) NOT NULL ,
+ cnt_dict_rxnorm int(11) NOT NULL ,
+ cnt_dict_snomedct int(11) NOT NULL ,
+ -- NOTE(review): FLOAT(23,15) is still a 4-byte single-precision FLOAT
+ -- in MySQL (M <= 24); 15 fractional digits exceed its ~7-digit
+ -- precision. DOUBLE would preserve the stored values -- confirm whether
+ -- the extra precision matters to the classifier.
+ cnt_ham_w_pos float(23,15) NOT NULL ,
+ cnt_ham_wo_pos float(23,15) NOT NULL ,
+ is_phi int(11) NOT NULL ,
+ classified_as varchar(20) NOT NULL default 'NA',
+ -- phi_type is nullable with no default (unlike classified_as), so
+ -- queries comparing it to 'NA' must also handle NULL explicitly.
+ phi_type varchar(20),
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- identical structure for the held-out test split
+create table feature_matrix_test like feature_matrix_train;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for KNOWN PHI. determined by annotated gold standard for training data.
+-- start_idx/end_idx are character offsets into the source document --
+-- presumably matching the offsets in machine_annotations_*; TODO confirm.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE human_annotations_train (
+ id int(11) NOT NULL auto_increment,
+ subject_id varchar(100) NOT NULL,
+ tag_name varchar(1000) NOT NULL,
+ tag_value LONGTEXT NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- identical structure for the held-out test split
+create table human_annotations_test like human_annotations_train;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- create table for annotations
+-- Machine-produced annotations, one row per annotated token span.
+-- Three structurally identical tables: train, test, and one for the
+-- open-access publication corpus.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE machine_annotations_train (
+ id int(11) NOT NULL auto_increment,
+ source varchar(20),
+ filename_long varchar(1000) NOT NULL,
+ filename_short varchar(20) NOT NULL,
+ annot_type_long varchar(1000) NOT NULL,
+ annot_type_short varchar(50) NOT NULL,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ match_value varchar(50),
+ match_source varchar(50),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+create table machine_annotations_test like machine_annotations_train;
+
+CREATE TABLE machine_annotations_pubs (
+ id int(11) NOT NULL auto_increment,
+ source varchar(20),
+ filename_long varchar(1000) NOT NULL,
+ filename_short varchar(20) NOT NULL,
+ annot_type_long varchar(1000) NOT NULL,
+ annot_type_short varchar(50) NOT NULL,
+ token varchar(100) NOT NULL,
+ start_idx int(11) NOT NULL,
+ end_idx int(11) NOT NULL,
+ match_value varchar(50),
+ match_source varchar(50),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- creates table for OPEN ACCESS pub data
+-- pubs holds one row per article (body = full text); the pubs_authors,
+-- pubs_refs and pubs_keywords child tables reference pubs.id via pub_id
+-- (enforced only by convention -- MyISAM does not support foreign keys).
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+CREATE TABLE pubs (
+ id int(11) NOT NULL auto_increment,
+ journal_title varchar(1000),
+ article_title varchar(1000),
+ pmc varchar(100),
+ pmid varchar(100),
+ doi varchar(100),
+ body LONGTEXT,
+ filepath varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+CREATE TABLE pubs_authors (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ surname varchar(1000),
+ given_name varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+-- cited-reference author names, same shape as pubs_authors
+CREATE TABLE pubs_refs (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ surname varchar(1000),
+ given_name varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
+
+CREATE TABLE pubs_keywords (
+ id int(11) NOT NULL auto_increment,
+ pub_id int(11) NOT NULL ,
+ keyword varchar(1000),
+ update_date timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
+ PRIMARY KEY (id)
+) ENGINE=MyISAM DEFAULT CHARSET=utf8;
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/create_umls_tables_from_local_install.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,20 @@
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+-- used to house the subset of umls that is being used by scrubber dictionary annotator
+-- this is housed in the same schema as the umls data (currently that is expected to be called 'umls')
+--
+-- this will also populate the table with that subset.
+--
+-- NOTE(review): create_tables.sql also defines a narrower lookup_umls
+-- (CUI/SAB/STR only) in the scrubber schema; this full-width MRCONSO
+-- copy lives in the umls schema per the comment above -- confirm which
+-- one the annotator actually reads.
+-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -
+create table lookup_umls like mrconso;
+
+-- select * is safe here: LIKE guarantees identical column layout.
+-- NOTE(review): 'SNOMEDCT' was renamed 'SNOMEDCT_US' in UMLS releases
+-- from 2012AB onward -- verify against the installed UMLS version.
+insert into lookup_umls select * from mrconso where sab in (
+'COSTAR',
+'HL7V2.5',
+'HL7V3.0',
+'ICD10CM',
+'ICD10PCS',
+'ICD9CM',
+'LNC',
+'MSH',
+'RXNORM',
+'SNOMEDCT'
+);
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/drop_database_and_user.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,2 @@
+-- Remove the scrubber database and its dedicated MySQL account
+-- (inverse of create_database_and_user.sql).
+drop database if exists scrubber;
+-- NOTE(review): DROP USER gained IF EXISTS only in MySQL 5.7, so this
+-- line errors if the user was never created; on era-appropriate servers
+-- that error can simply be ignored.
+drop user 'scrubber'@'localhost';
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/drop_tables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,31 @@
+use scrubber;
+
+-- Tear down every scrubber table; IF EXISTS keeps the script idempotent.
+-- MySQL DROP TABLE accepts a comma-separated list, so related tables are
+-- dropped together, grouped by purpose.
+
+-- lookup / reference data
+drop table if exists lookup_umls, lookup_dictionary, lookup_term_frequency;
+
+-- intermediate part-of-speech distribution table
+drop table if exists pubs_pos;
+
+-- feature matrices (train/test splits)
+drop table if exists feature_matrix_train, feature_matrix_test;
+
+-- gold-standard and machine annotations
+drop table if exists human_annotations_train, human_annotations_test;
+drop table if exists machine_annotations_train, machine_annotations_test, machine_annotations_pubs;
+
+-- open-access publication data
+drop table if exists pubs, pubs_authors, pubs_refs, pubs_keywords;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/evaluate_results.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,9 @@
+-- Confusion-matrix counts over the test feature matrix.
+-- classified_as defaults to 'NA' (= predicted non-PHI). phi_type is
+-- nullable with no default (see create_tables.sql), so a NULL phi_type
+-- matches neither "= 'NA'" nor "<> 'NA'" under three-valued logic and
+-- those rows would fall out of every bucket (TN+TP+FN+FP != ALL);
+-- coalesce NULL to 'NA' so each row lands in exactly one bucket.
+-- union all: the five labels are distinct by construction, so plain
+-- UNION's dedup/sort pass is wasted work.
+select 'ALL' as type , count(*) from feature_matrix_test
+union all
+select 'TN' as type , count(*) from feature_matrix_test where classified_as = 'NA' and coalesce(phi_type, 'NA') = 'NA'
+union all
+select 'TP' as type, count(*) from feature_matrix_test where classified_as <> 'NA' and coalesce(phi_type, 'NA') <> 'NA'
+union all
+select 'FN' as type, count(*) from feature_matrix_test where classified_as = 'NA' and coalesce(phi_type, 'NA') <> 'NA'
+union all
+select 'FP' as type, count(*) from feature_matrix_test where classified_as <> 'NA' and coalesce(phi_type, 'NA') = 'NA' ;