You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:39:01 UTC
svn commit: r1500518 [2/2] - in /ctakes/sandbox/ctakes-scrubber-deid/sql: ./
CSVAnnotation/ CSVAnnotation/human/ CSVAnnotation/machine/
CSVAnnotation/schema/ tmp/
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,382 @@
+-- Rebuild Sentence in (file_idx, sentence_seq) order and assign a global,
+-- densely increasing sentence_idx via AUTO_INCREMENT on the sorted copy.
+call log('Sentence', 'begin');
+
+drop table if exists t_sort ;
+create table t_sort like Sentence;
+
+-- AUTO_INCREMENT numbers rows in insertion order, so inserting pre-sorted
+-- rows makes sentence_idx follow document / sentence order.
+insert into t_sort select * from Sentence order by file_idx, sentence_seq;
+alter table t_sort add sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+drop table Sentence;
+alter table t_sort rename to Sentence;
+
+alter table BaseToken add sentence_idx integer not null;
+
+-- Tag every token with the id of the sentence that contains it
+-- (same file, token span inside sentence span).
+update BaseToken b, Sentence s
+set b.sentence_idx = s.sentence_idx
+where b.file_idx = s.file_idx and
+ b.start_idx >= s.start_idx and
+ b.end_idx <= s.end_idx ;
+
+call log('Sentence', 'done');
+call log('Sample', 'begin');
+call log('Sample', 'BaseToken');
+
+-- Sample = the classifiable tokens (words and numbers only), in document order.
+drop table if exists Sample;
+
+create table Sample
+ select * from BaseToken where
+ annotation_type = 'NumToken' or
+ annotation_type = 'WordToken'
+ order by file_idx, token_seq asc;
+
+call log('Sample', 'BaseToken');
+call log('Sample', 'index');
+alter table Sample engine = myisam;
+alter table Sample add sample_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Sample add class_idx tinyint UNSIGNED NULL;
+alter table Sample add index (file_idx);
+alter table Sample add index (file_idx, token_seq);
+alter table Sample add index (file_idx, start_idx, end_idx);
+
+call log('Sample', 'index');
+call log('Sample', 'class');
+-- Resolve the human-annotation class label to its numeric id.
+update Sample s, Class c
+set s.class_idx = c.class_idx
+where s.class = c.class;
+-- Unlabeled samples become class 0 / 'NA'.
+-- NOTE(review): load_csv_annotations.sql seeds 'NA' into Class first, where
+-- AUTO_INCREMENT numbers it 1, not 0 -- confirm consumers agree on the NA id.
+update Sample set class = 'NA', class_idx = 0 where class_idx is null;
+
+call log('Sample', 'class');
+call log('Sample', 'sentence');
+
+-- alter table Sentence drop column sample_idx_start ;
+-- alter table Sentence drop column sample_idx_end ;
+
+-- Record, per sentence, the range of sample ids it covers; sentences with no
+-- samples keep the implicit 0/0 defaults of the NOT NULL columns.
+alter table Sentence add sample_idx_start int unsigned not null;
+alter table Sentence add sample_idx_end int unsigned not null;
+
+create temporary table sentence_bounds select
+sentence_idx, min(sample_idx) sample_idx_start, max(sample_idx) sample_idx_end
+from Sample group by sentence_idx order by sentence_idx;
+
+update Sentence s, sentence_bounds bounds
+set s.sample_idx_start = bounds.sample_idx_start,
+ s.sample_idx_end = bounds.sample_idx_end
+where s.sentence_idx = bounds.sentence_idx;
+
+call log('Sample', 'sentence');
+call log('Sample', 'done');
+call log('Feature', 'begin');
+
+-- Feature catalog: one row per (annotation_type, match_source, match_value);
+-- feature_idx (added at the end) becomes the column id of the feature matrix.
+drop table if exists Feature;
+create table Feature
+(
+ annotation_type varchar(25) not null,
+ match_source varchar(25) not null,
+ match_value varchar(25) null,
+ cnt int null,
+ datatype varchar(25) null
+) engine=myisam;
+
+alter table Feature add index (match_source);
+alter table Feature add index (match_value);
+alter table Feature add index (match_source, match_value);
+
+
+-- Capitalization pattern of word tokens (4 nominal values).
+insert into Feature
+select 'WordToken', 'cap', cap, count(*) as cnt, 'nominal(4)'
+from WordToken group by cap order by cnt desc;
+
+-- One boolean feature per token type (WordToken, NumToken, ...).
+insert into Feature
+select annotation_type, 'BaseToken', null, count(*), 'bool'
+from BaseToken group by annotation_type order by count(*) desc;
+
+-- One boolean feature per higher-level cTAKES annotation type of interest.
+insert into Feature
+select annotation_type, 'Annotation', null, count(*), 'bool'
+from Annotation
+where annotation_type in
+(
+ 'RomanNumeralAnnotation',
+ 'FractionAnnotation',
+ 'DateAnnotation',
+ 'MeasurementAnnotation',
+ 'RangeAnnotation',
+ 'PersonTitleAnnotation',
+ 'TimeAnnotation'
+)
+group by annotation_type
+order by count(*) desc;
+
+-- Coarse "bin" features (POS groups, regex groups) from FeatureBin.
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'bool'
+from FeatureBin
+group by match_source, match_bin;
+
+-- One boolean feature per fine-grained POS tag of word/number tokens.
+insert into Feature
+select 'BaseToken', 'pos', pos, count(*) as cnt, 'bool'
+from BaseToken
+where annotation_type = 'WordToken' or annotation_type = 'NumToken'
+group by pos
+order by cnt desc;
+
+-- One boolean feature per individual regex ontology match value.
+insert into Feature
+select annotation_type, match_source, match_value, count(*) as cnt, 'bool'
+from OntologyMatch
+where match_source = 'regex'
+group by match_source, match_value
+order by count(*) desc;
+
+-- One integer feature per ontology match source.
+-- NOTE(review): annotation_type is not in the GROUP BY; this relies on
+-- MySQL's non-ONLY_FULL_GROUP_BY mode picking an arbitrary value.
+insert into Feature
+select annotation_type, match_source, null, count(*) as cnt, 'INTEGER'
+from OntologyMatch
+group by match_source
+order by cnt desc;
+
+ALTER TABLE Feature add feature_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+ALTER TABLE Feature add index (annotation_type, match_source, match_value);
+
+call log('Feature', 'done');
+call log('FeatureMatrix', 'begin');
+
+-- FeatureMatrix: the sparse boolean matrix, stored as (sample, feature) pairs.
+drop table if exists FeatureMatrix;
+create table FeatureMatrix
+(
+ file_idx smallint not null,
+ sample_idx integer not null,
+ feature_idx smallint not null
+) engine=myisam;
+
+call log('FeatureMatrix', 'done');
+call log('FeatureMatrix', 'WordToken:cap');
+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+-------------+
+-- | WordToken | cap | 0 | nominal(4) | 1 |
+-- | WordToken | cap | 3 | nominal(4) | 2 |
+-- | WordToken | cap | 1 | nominal(4) | 3 |
+-- | WordToken | cap | 2 | nominal(4) | 4 |
+
+-- Emit the capitalization feature of every word token that is a sample,
+-- matched by file and token sequence number.
+insert into FeatureMatrix
+select s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, WordToken w
+where f.match_source = 'cap' and
+ f.match_value = w.cap and
+ w.file_idx = s.file_idx and
+ w.token_seq = s.token_seq ;
+
+call log('FeatureMatrix', 'WordToken:cap');
+call log('FeatureMatrix', 'BaseToken');
+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+
+-- | WordToken | BaseToken | NULL | bool | 5 |
+-- | PunctuationToken | BaseToken | NULL | bool | 6 |
+-- | NumToken | BaseToken | NULL | bool | 7 |
+-- | SymbolToken | BaseToken | NULL | bool | 8 |
+-- | ContractionToken | BaseToken | NULL | bool | 9 |
+
+-- Token-type feature per sample; distinct guards against duplicate matches.
+insert into FeatureMatrix
+select distinct
+ s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, BaseToken t
+where f.match_source = 'BaseToken' and
+ f.annotation_type = t.annotation_type and
+ t.file_idx = s.file_idx and
+ t.token_seq = s.token_seq ;
+
+call log('FeatureMatrix', 'BaseToken');
+call log('FeatureMatrix', 'Annotation');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | RomanNumeralAnnotation | Annotation | NULL | bool | 10 |
+-- | FractionAnnotation | Annotation | NULL | bool | 11 |
+-- | DateAnnotation | Annotation | NULL | bool | 12 |
+-- | MeasurementAnnotation | Annotation | NULL | bool | 13 |
+-- | RangeAnnotation | Annotation | NULL | bool | 14 |
+-- | PersonTitleAnnotation | Annotation | NULL | bool | 15 |
+-- | TimeAnnotation | Annotation | NULL | bool | 16 |
+
+-- Higher-level annotation features.
+-- NOTE(review): the containment requires the ANNOTATION span to lie inside
+-- the token span, so a multi-token annotation (e.g. a long date) never fires
+-- for its member tokens -- confirm this direction is intended.
+insert into FeatureMatrix
+select distinct
+ s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, Annotation a
+where f.match_source = 'Annotation' and
+ f.annotation_type = a.annotation_type and
+ a.file_idx = s.file_idx and
+ a.start_idx >= s.start_idx and
+ a.end_idx <= s.end_idx ;
+
+call log('FeatureMatrix', 'Annotation');
+call log('FeatureMatrix', 'bin');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | bin | pos | Adjectives | bool | 17 |
+-- | bin | pos | Adverbs | bool | 18 |
+-- | bin | pos | com-dep-wd | bool | 19 |
+-- | bin | pos | FW-Symb | bool | 20 |
+-- | bin | pos | Nouns | bool | 21 |
+-- | bin | pos | Numbers | bool | 22 |
+-- | bin | pos | Pronouns | bool | 23 |
+-- | bin | pos | Verbs | bool | 24 |
+-- | bin | regex | address | bool | 25 |
+-- | bin | regex | age | bool | 26 |
+-- | bin | regex | date | bool | 27 |
+-- | bin | regex | doctor | bool | 28 |
+-- | bin | regex | hospital | bool | 29 |
+-- | bin | regex | patient | bool | 30 |
+-- | bin | regex | phone | bool | 31 |
+
+call log('FeatureMatrix', 'bin:pos');
+
+-- POS-bin features: a sample fires when its POS tag belongs to the bin.
+-- No file join is needed -- a bin applies to every sample carrying that tag.
+insert into FeatureMatrix
+select distinct
+ s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, FeatureBin b
+where f.match_source = 'pos' and
+ f.match_source = b.match_source and
+ f.match_value = b.match_bin and
+ s.pos = b.match_value ;
+
+call log('FeatureMatrix', 'bin:pos');
+call log('FeatureMatrix', 'bin:regex');
+
+-- Regex-bin features: a sample fires when a regex ontology match belonging to
+-- the bin falls within the sample's span in the same file.
+-- NOTE(review): as above, the match span must lie INSIDE the token span --
+-- multi-token regex matches will not fire; confirm intended.
+insert into FeatureMatrix
+select distinct
+ s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, FeatureBin b, OntologyMatch o
+where f.annotation_type = 'bin' and
+ f.match_source = 'regex' and
+ f.match_source = b.match_source and
+ f.match_source = o.match_source and
+ f.match_value = b.match_bin and
+ o.match_value = b.match_value and
+ o.file_idx = s.file_idx and
+ o.start_idx >= s.start_idx and
+ o.end_idx <= s.end_idx ;
+
+call log('FeatureMatrix', 'bin:regex');
+call log('FeatureMatrix', 'BaseToken:pos');
+
+-- Fine-grained POS features occupy feature ids 32-71 in the catalog,
+-- e.g. (BaseToken, pos, NN) ... (BaseToken, pos, pos_paren).
+
+-- Each sample fires exactly the POS feature whose match_value equals the
+-- sample's own POS tag; distinct collapses any duplicate catalog rows.
+insert into FeatureMatrix
+select distinct smp.file_idx, smp.sample_idx, feat.feature_idx
+from Sample smp
+inner join Feature feat
+ on feat.match_value = smp.pos
+where feat.annotation_type = 'BaseToken'
+ and feat.match_source = 'pos' ;
+
+call log('FeatureMatrix', 'BaseToken:pos');
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | annotation_type | match_source | match_value | datatype | feature_idx |
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | OntologyMatch | regex | DATE22 | bool | 72 |
+-- | OntologyMatch | regex | DATE21 | bool | 73 |
+-- | OntologyMatch | regex | DATE17 | bool | 74 |
+-- | OntologyMatch | regex | DATE18 | bool | 75 |
+-- | OntologyMatch | regex | DATE_SEPARATORS | bool | 76 |
+-- | OntologyMatch | regex | DATE1 | bool | 77 |
+-- | OntologyMatch | regex | DATE4 | bool | 78 |
+-- | OntologyMatch | regex | YEAR_CENTURY | bool | 79 |
+-- | OntologyMatch | regex | SUSPICIOUS_NUM | bool | 80 |
+-- | OntologyMatch | regex | MONTH | bool | 81 |
+-- | ... | ... | ... | ... | ... |
+-- | OntologyMatch | regex | ADDRESS | bool | 148 |
+-- +-----------------+--------------+-----------------+----------+-------------+
+
+-- Individual regex features: a sample fires when a specific regex match
+-- covers its span in the same file.
+-- NOTE(review): containment again requires the match span inside the token
+-- span; multi-token matches will not fire -- confirm intended.
+insert into FeatureMatrix
+select distinct
+ s.file_idx, s.sample_idx, f.feature_idx
+from Feature f, Sample s, OntologyMatch o
+where f.annotation_type = 'OntologyMatch' and
+ f.match_source = 'regex' and
+ f.match_source = o.match_source and
+ f.match_value = o.match_value and
+ o.file_idx = s.file_idx and
+ o.start_idx >= s.start_idx and
+ o.end_idx <= s.end_idx ;
+
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- TODO refactor
+
+-- Sentences that gathered no samples still carry the 0/0 defaults in
+-- sample_idx_start/sample_idx_end; park them in EmptySentence and renumber
+-- the remaining sentences densely for the exports below.
+drop table if exists EmptySentence;  -- added: keeps this script rerunnable, like every other create here
+create table EmptySentence
+select * from Sentence where sample_idx_start = 0 and sample_idx_end = 0;
+
+delete from Sentence where sample_idx_start = 0 and sample_idx_end = 0;
+alter table Sentence drop column sentence_idx ;
+alter table Sentence add column sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+-- Re-point tokens and samples at the renumbered sentences.
+update BaseToken b, Sentence s
+set b.sentence_idx = s.sentence_idx
+where b.file_idx = s.file_idx and
+ b.start_idx >= s.start_idx and
+ b.end_idx <= s.end_idx ;
+
+update Sample sample, Sentence s
+set sample.sentence_idx = s.sentence_idx
+where sample.file_idx = s.file_idx and
+ sample.start_idx >= s.start_idx and
+ sample.end_idx <= s.end_idx ;
+
+call log('FeatureMatrix', 'csv');
+
+-- Export the sparse matrix as one (file, sample, sentence, feature) row per
+-- firing, for the MATLAB consumer.
+select s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx
+from FeatureMatrix m, Sample s
+where m.sample_idx = s.sample_idx
+order by m.sample_idx, m.feature_idx
+into
+ outfile '/tmp/mysql/FeatureMatrix.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('FeatureMatrix', 'csv');
+call log('Sample', 'csv');
+
+-- Per-sample metadata: spans, sequence number and class label id.
+select file_idx, sample_idx, sentence_idx, start_idx, end_idx, token_seq, class_idx
+from Sample
+order by sample_idx
+into
+ outfile '/tmp/mysql/Sample.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('Sample', 'csv');
+call log('SentenceBoundary', 'csv');
+
+-- Per-sentence sample ranges; sample_idx column padded with 0 so the CSV
+-- layout matches the other exports.
+-- Fixed: the original "group by sentence_idx" selected non-aggregated columns
+-- (valid only with ONLY_FULL_GROUP_BY disabled) and is redundant anyway --
+-- sentence_idx is the table's PRIMARY KEY after the renumbering above, so
+-- rows are already unique per sentence.
+select file_idx, 0 as sample_idx, sentence_idx, sample_idx_start, sample_idx_end, sentence_seq
+from Sentence
+-- where sample_idx_start >0 and sample_idx_end > 0
+order by sentence_idx
+into
+ outfile '/tmp/mysql/SentenceBoundary.csv'
+ FIELDS TERMINATED BY ','
+ LINES TERMINATED BY '\n';
+
+call log('SentenceBoundary', 'csv');
+
+
+
+
+
+
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,174 @@
+-- Load the cTAKES-exported CSVs from /tmp/mysql into staging tables.
+truncate etl_log;
+
+CALL log('load_csv_annotations.sql', 'begin');
+
+CALL log('Drop CSV tables', 'begin');
+drop table if exists Annotation, OntologyMatch, Calculation;
+drop table if exists BaseToken, WordToken;
+drop table if exists Sentence;
+
+CALL log('Drop CSV tables', 'done');
+CALL log('Annotation', 'begin');
+drop table if exists Annotation;
+
+-- Base span schema reused (via CREATE ... LIKE) by all other staging tables.
+create table Annotation
+(
+ annotation_type varchar(50),
+ filename_short varchar(20),
+ start_idx smallint,
+ end_idx smallint
+) engine = myisam;
+
+-- 'ignore 1 lines' skips the CSV header row (same for every load below).
+load data infile '/tmp/mysql/Annotation.csv' into table Annotation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Annotation', 'done');
+CALL log('Sentence', 'begin');
+
+drop table if exists Sentence;
+create table Sentence like Annotation;
+alter table Sentence add column sentence_seq smallint not null;
+
+load data infile '/tmp/mysql/Sentence.csv' into table Sentence
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Sentence', 'done');
+CALL log('BaseToken', 'begin');
+
+-- BaseToken: every token with its sequence number, POS tag and surface text.
+drop table if exists BaseToken;
+create table BaseToken like Annotation;
+
+alter table BaseToken add column token_seq smallint not null;
+alter table BaseToken add column pos varchar(25) not null;
+alter table BaseToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/BaseToken.csv' into table BaseToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('BaseToken', 'done');
+CALL log('NewLineToken', 'begin');
+
+drop table if exists NewLineToken;
+create table NewLineToken like BaseToken;
+
+load data infile '/tmp/mysql/NewLineToken.csv' into table NewLineToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('NewLineToken', 'done');
+CALL log('WordToken', 'begin');
+
+-- WordToken adds the capitalization code ('cap') on top of the token columns.
+drop table if exists WordToken;
+create table WordToken like Annotation;
+
+alter table WordToken add column token_seq smallint not null;
+alter table WordToken add column pos varchar(25) not null;
+alter table WordToken add column cap tinyint not null;
+alter table WordToken add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/WordToken.csv' into table WordToken
+fields terminated by ',' ignore 1 lines;
+
+alter table WordToken add index(pos, cap);
+
+CALL log('WordToken', 'done');
+CALL log('OntologyMatch','begin');
+
+-- OntologyMatch: dictionary/regex hits with their source and matched value.
+drop table if exists OntologyMatch;
+create table OntologyMatch like Annotation;
+
+alter table OntologyMatch add column match_source varchar(50) not null;
+alter table OntologyMatch add column match_value varchar(50) not null;
+alter table OntologyMatch add column token varchar(100) not null;
+
+load data infile '/tmp/mysql/OntologyMatch.csv' into table OntologyMatch
+fields terminated by ',' ignore 1 lines;
+
+CALL log('OntologyMatch', 'done');
+CALL log('Calculation', 'begin');
+
+-- Calculation: same shape as OntologyMatch minus the surface token.
+drop table if exists Calculation;
+create table Calculation like OntologyMatch;
+
+alter table Calculation drop column token;
+
+load data infile '/tmp/mysql/Calculation.csv' into table Calculation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Calculation', 'done');
+CALL log('file_idx', 'begin');
+
+-- Derive a numeric file id from the file name on every staging table.
+-- NOTE(review): assumes filename_short looks like '<number>.txt'; the string
+-- produced by trim/replace is implicitly coerced to smallint -- confirm inputs.
+alter table Annotation add column file_idx smallint not null;
+alter table BaseToken add column file_idx smallint not null;
+alter table Sentence add column file_idx smallint not null;
+alter table WordToken add column file_idx smallint not null;
+alter table OntologyMatch add column file_idx smallint not null;
+alter table Calculation add column file_idx smallint not null;
+
+update Annotation set file_idx = trim(replace(filename_short, '.txt', ''));
+update BaseToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update Sentence set file_idx = trim(replace(filename_short, '.txt', ''));
+update WordToken set file_idx = trim(replace(filename_short, '.txt', ''));
+update OntologyMatch set file_idx = trim(replace(filename_short, '.txt', ''));
+update Calculation set file_idx = trim(replace(filename_short, '.txt', ''));
+
+CALL log('file_idx', 'done');
+CALL log('index', 'begin');
+
+-- Indexes supporting the span-containment and token_seq joins used by the
+-- downstream feature-extraction scripts.
+alter table WordToken add index (file_idx);
+alter table WordToken add index (file_idx, token_seq);
+alter table WordToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (filename_short);
+alter table BaseToken add index (file_idx, token_seq);
+alter table BaseToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (annotation_type, file_idx, token_seq);
+alter table Annotation add index (annotation_type);
+alter table Annotation add index (file_idx, start_idx, end_idx);
+
+alter table Calculation add index (file_idx, start_idx, end_idx);
+
+alter table OntologyMatch add index(match_source);
+alter table OntologyMatch add index(match_source, match_value);
+alter table OntologyMatch add index(file_idx, start_idx, end_idx);
+alter table OntologyMatch add index(match_source, match_value, file_idx, start_idx, end_idx);
+
+alter table Sentence add index(file_idx, start_idx, end_idx);
+alter table Sentence add index(file_idx, sentence_seq);
+
+CALL log('index', 'done');
+call log('BaseToken:sentence_seq', 'begin.');
+
+-- Tag each token with the per-file sequence number of its enclosing sentence
+-- (token span contained in sentence span, same file).
+alter table BaseToken add sentence_seq integer not null;
+
+update BaseToken b, Sentence s
+set b.sentence_seq = s.sentence_seq
+where b.file_idx = s.file_idx and
+ b.start_idx >= s.start_idx and
+ b.end_idx <= s.end_idx ;
+
+call log('BaseToken:sentence_seq', 'done.');
+call log('BaseToken:class', 'begin');
+
+-- Class lookup: the synthetic 'NA' (not PHI) class first, then one row per
+-- human-annotated PHI tag from the training gold standard.
+-- NOTE(review): AUTO_INCREMENT gives the 'NA' row class_idx 1 here, while
+-- export_matlab.sql assigns class_idx 0 to NA samples -- confirm the ids agree.
+drop table if exists Class;
+create table Class
+(
+ class_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+ class varchar(20),
+ cnt integer
+) engine = myisam;
+
+insert into Class (class, cnt) values ('NA', 0);
+
+insert into Class (class, cnt)
+select tag_name as class, count(*) as cnt from human_annotations_train group by tag_name;
+alter table Class add index (class);
+
+-- NOT NULL without a default: tokens outside any human annotation keep ''
+-- here; export_matlab.sql later rewrites unlabeled samples to 'NA'.
+alter table BaseToken add class varchar(20) not null;
+
+-- Copy the gold PHI tag onto every token inside an annotated span.
+-- NOTE(review): presumably subject_id carries the same value as
+-- filename_short (including any '.txt' suffix) -- verify against the loader.
+update BaseToken b, human_annotations_train phi
+set b.class = phi.tag_name
+where b.filename_short= phi.subject_id and
+ b.start_idx >= phi.start_idx and
+ b.end_idx <= phi.end_idx ;
+
+call log('BaseToken:class', 'done.');
+CALL log('load_csv_annotations.sql', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,38 @@
+
+-- pubs
+-- Term frequencies per (token, POS tag) from the publications corpus.
+-- Fixed: the original issued 36 near-identical INSERTs, one per leading
+-- character (a-z, 0-9), presumably to keep each GROUP BY small.  The
+-- partitions are disjoint (a token has exactly one first character), so a
+-- single grouped INSERT over the same token space yields the identical union.
+-- NOTE(review): with MySQL's default case-insensitive collation the original
+-- LIKE 'a%' filters also matched upper-case tokens, hence A-Z in the class.
+insert into lookup_term_frequency (cnt, token, pos)
+select count(*) as cnt, token, match_value
+from machine_annotations_pubs
+where match_source = 'pos'
+  and token regexp '^[a-zA-Z0-9]'
+group by token, match_value;
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,55 @@
+-- FeatureOnt: catalog of non-regex ontology features, one per
+-- (match_source, match_value) pair.
+call log('FeatureOnt', 'begin');
+drop table if exists FeatureOnt;
+create table FeatureOnt
+(
+ match_source varchar(50) not null,
+ match_value varchar(50) not null,
+ cnt int null
+) engine = myisam;
+
+insert into FeatureOnt
+select match_source,match_value, count(*) as cnt
+from OntologyMatch
+where match_source != 'regex'
+group by match_source,match_value
+order by match_source,match_value;
+
+alter table FeatureOnt add feature_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table FeatureOnt add index (match_source, match_value);
+alter table FeatureOnt add index (feature_idx, match_source, match_value);
+
+alter table FeatureOnt add column str text;
+
+-- Attach the UMLS string for readability.
+-- NOTE(review): joins match_source to the UMLS source vocabulary (sab) and
+-- match_value to the concept id (cui) -- confirm that is how OntologyMatch
+-- populates those columns for non-regex sources.
+update FeatureOnt f, lookup_umls u
+ set f.str = u.str
+where
+ f.match_source = u.sab and
+ f.match_value = u.cui ;
+
+call log('FeatureOnt', 'done');
+call log('FeatureMatrixOnt', 'start');
+call log('FeatureMatrixOnt', 'table');
+
+-- Placeholder matrix table; population is left to the commented-out bulk
+-- insert kept below for reference.
+drop table if exists FeatureMatrixOnt;
+
+create table FeatureMatrixOnt
+(
+ sample_idx MEDIUMINT unsigned ,
+ feature_idx smallint unsigned ,
+ match_source varchar(10)
+)
+engine = myisam;
+
+call log('FeatureMatrixOnt', 'table');
+
+-- NOTE(review): the commented insert selects (sample_idx, match_source,
+-- match_value), which does not line up with the three columns declared
+-- above -- reconcile before enabling it.  Its containment direction is also
+-- the opposite of export_matlab.sql's (match span COVERS the sample here).
+-- call log('FeatureMatrixOnt', 'slow bulk insert....');
+--
+-- create table FeatureMatrixOnt
+-- select s.sample_idx,o.match_source, o.match_value
+-- from Sample s, OntologyMatch o
+-- where o.match_source !='regex' and
+-- o.file_idx = s.file_idx and
+-- o.start_idx <= s.start_idx and
+-- o.end_idx >= s.end_idx ;
+--
+-- call log('FeatureMatrixOnt', 'done');
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,5 @@
+-- Reset the test-split staging tables before a fresh scrubber run.
+truncate human_annotations_test;
+
+truncate machine_annotations_test;
+
+truncate feature_matrix_test;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,5 @@
+-- Reset the training-split staging tables before a fresh scrubber run.
+truncate human_annotations_train;
+
+truncate machine_annotations_train;
+
+truncate feature_matrix_train;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql Sun Jul 7 19:39:00 2013
@@ -0,0 +1,10 @@
+-- Map fine-grained Penn Treebank POS tags onto the coarse bins used as
+-- features.  The tag lists are mutually exclusive, so a single CASE pass
+-- produces the same final state as the original per-bin UPDATE series;
+-- rows whose pos matches no list keep their current pos_bin.
+update pubs_pos
+set pos_bin = case
+ when pos is null
+ or pos in ('NULL',',','.',':','$','(',')','"',"'", "''", 'POS') then 'unknown'
+ when pos in ('CC','CT','DT','EX','IN','MD','PDT','RP','TO','UH','WDT') then 'com-dep-wd'
+ when pos in ('FW','SYM') then 'FW/Symb'
+ when pos in ('JJ','JJR','JJS') then 'Adjectives'
+ when pos in ('NN','NNS','NNP','NNPS') then 'Nouns'
+ when pos in ('WRB','RB','RBR','RBS') then 'Adverbs'
+ when pos in ('VB','VBD','VBG','VBN','VBP','VBZ') then 'Verbs'
+ when pos in ('PRP','PRP$','WP','WP$') then 'Pronouns'
+ when pos in ('CD', 'LS') then 'Numbers'
+ else pos_bin
+end;
+-- TODO: deprecated
\ No newline at end of file