You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:39:01 UTC

svn commit: r1500518 [2/2] - in /ctakes/sandbox/ctakes-scrubber-deid/sql: ./ CSVAnnotation/ CSVAnnotation/human/ CSVAnnotation/machine/ CSVAnnotation/schema/ tmp/

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/export_matlab.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,382 @@
+call log('Sentence', 'begin');
+
+drop table if exists t_sort ;
+create table t_sort like Sentence;
+
+insert into t_sort select * from Sentence order by file_idx, sentence_seq;
+alter table t_sort add sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+drop table Sentence;
+alter table t_sort rename to Sentence;
+
+alter table BaseToken add sentence_idx integer not null;
+
+update BaseToken b,    Sentence s
+set    b.sentence_idx  = s.sentence_idx
+where  b.file_idx      = s.file_idx      and
+       b.start_idx    >= s.start_idx     and
+       b.end_idx      <= s.end_idx         ;
+
+call log('Sentence', 'done');
+call log('Sample',   'begin');
+call log('Sample',   'BaseToken');
+
+drop table if exists Sample;
+
+create table Sample
+  select * from BaseToken where
+  annotation_type = 'NumToken' or
+  annotation_type = 'WordToken'
+  order by file_idx, token_seq asc;
+
+call log('Sample',   'BaseToken');
+call log('Sample',   'index');
+alter table Sample engine = myisam;
+alter table Sample add sample_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table Sample add class_idx  tinyint UNSIGNED NULL;
+alter table Sample add index (file_idx);
+alter table Sample add index (file_idx, token_seq);
+alter table Sample add index (file_idx, start_idx, end_idx);
+
+call log('Sample',   'index');
+call log('Sample',   'class');
+update Sample s, Class c
+set   s.class_idx = c.class_idx
+where s.class = c.class;
+update Sample set class = 'NA', class_idx = 0 where class_idx is null;
+
+call log('Sample',   'class');
+call log('Sample',   'sentence');
+
+-- alter table Sentence drop column sample_idx_start ;
+-- alter table Sentence drop column sample_idx_end   ;
+
+alter table Sentence add sample_idx_start int unsigned not null;
+alter table Sentence add sample_idx_end   int unsigned not null;
+
+create temporary table sentence_bounds select
+sentence_idx, min(sample_idx) sample_idx_start, max(sample_idx) sample_idx_end
+from Sample group by sentence_idx order by sentence_idx;
+
+update Sentence s,  sentence_bounds bounds
+set    s.sample_idx_start = bounds.sample_idx_start,
+       s.sample_idx_end   = bounds.sample_idx_end
+where  s.sentence_idx     = bounds.sentence_idx;
+
+call log('Sample',  'sentence');
+call log('Sample',  'done');
+call log('Feature', 'begin');
+
+drop table if exists Feature;
+create table Feature
+(
+  annotation_type    varchar(25) not null,
+  match_source       varchar(25) not null,
+  match_value        varchar(25) null,
+  cnt                int         null,
+  datatype           varchar(25) null
+) engine=myisam;
+
+alter table Feature add index (match_source);
+alter table Feature add index (match_value);
+alter table Feature add index (match_source, match_value);
+
+
+insert into Feature
+select 'WordToken', 'cap', cap, count(*) as cnt, 'nominal(4)'
+from WordToken group by cap  order by cnt desc;
+
+insert into Feature
+select annotation_type, 'BaseToken', null, count(*), 'bool'
+from BaseToken group by annotation_type order by count(*) desc;
+
+insert into Feature
+select annotation_type, 'Annotation', null, count(*), 'bool'
+from Annotation
+where annotation_type in
+(
+  'RomanNumeralAnnotation',
+  'FractionAnnotation',
+  'DateAnnotation',
+  'MeasurementAnnotation',
+  'RangeAnnotation',
+  'PersonTitleAnnotation',
+  'TimeAnnotation'
+)
+group by annotation_type
+order by count(*) desc;
+
+insert into Feature
+select 'bin', match_source, match_bin, count(*) as cnt, 'bool'
+from FeatureBin
+group by match_source, match_bin;
+
+insert into Feature
+select 'BaseToken', 'pos', pos, count(*) as cnt, 'bool'
+from BaseToken
+where annotation_type = 'WordToken' or annotation_type = 'NumToken'
+group by pos
+order by cnt desc;
+
+insert into Feature
+select annotation_type, match_source, match_value, count(*) as cnt, 'bool'
+from OntologyMatch
+where match_source  = 'regex'
+group by match_source, match_value
+order by count(*) desc;
+
+insert into Feature
+select annotation_type, match_source, null, count(*) as cnt, 'INTEGER'
+from OntologyMatch
+group by match_source
+order by cnt desc;
+
+ALTER TABLE Feature add feature_idx smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+ALTER TABLE Feature add index (annotation_type, match_source, match_value);
+
+call log('Feature',  'done');
+call log('FeatureMatrix', 'begin');
+
+drop table if exists FeatureMatrix;
+create table FeatureMatrix
+(
+  file_idx       smallint not null,
+  sample_idx     integer  not null,
+  feature_idx    smallint not null
+) engine=myisam;
+
+call log('FeatureMatrix', 'done');
+call log('FeatureMatrix', 'WordToken:cap');
+
+-- | annotation_type        | match_source | match_value              | datatype    | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+-------------+
+-- | WordToken              | cap          | 0                        |  nominal(4) |           1 |
+-- | WordToken              | cap          | 3                        |  nominal(4) |           2 |
+-- | WordToken              | cap          | 1                        |  nominal(4) |           3 |
+-- | WordToken              | cap          | 2                        |  nominal(4) |           4 |
+
+insert into FeatureMatrix
+select  s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, WordToken w
+where   f.match_source = 'cap'       and
+        f.match_value  = w.cap       and
+        w.file_idx     = s.file_idx  and
+        w.token_seq    = s.token_seq   ;
+
+call log('FeatureMatrix', 'WordToken:cap');
+call log('FeatureMatrix', 'BaseToken');
+
+-- | annotation_type        | match_source | match_value | datatype   | feature_idx |
+-- +------------------------+--------------+--------------------------+-------------+
+-- | WordToken              | BaseToken    | NULL        | bool       |           5 |
+-- | PunctuationToken       | BaseToken    | NULL        | bool       |           6 |
+-- | NumToken               | BaseToken    | NULL        | bool       |           7 |
+-- | SymbolToken            | BaseToken    | NULL        | bool       |           8 |
+-- | ContractionToken       | BaseToken    | NULL        | bool       |           9 |
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, BaseToken t
+where   f.match_source     = 'BaseToken'       and
+        f.annotation_type  = t.annotation_type and
+        t.file_idx         = s.file_idx        and
+        t.token_seq        = s.token_seq   ;
+
+call log('FeatureMatrix', 'BaseToken');
+call log('FeatureMatrix', 'Annotation');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type        | match_source | match_value | datatype   | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | RomanNumeralAnnotation | Annotation   | NULL        | bool       |          10 |
+-- | FractionAnnotation     | Annotation   | NULL        | bool       |          11 |
+-- | DateAnnotation         | Annotation   | NULL        | bool       |          12 |
+-- | MeasurementAnnotation  | Annotation   | NULL        | bool       |          13 |
+-- | RangeAnnotation        | Annotation   | NULL        | bool       |          14 |
+-- | PersonTitleAnnotation  | Annotation   | NULL        | bool       |          15 |
+-- | TimeAnnotation         | Annotation   | NULL        | bool       |          16 |
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, Annotation a
+where   f.match_source     = 'Annotation'      and
+        f.annotation_type  = a.annotation_type and
+        a.file_idx         = s.file_idx        and
+        a.start_idx       >= s.start_idx       and
+        a.end_idx         <= s.end_idx           ;
+
+call log('FeatureMatrix', 'Annotation');
+call log('FeatureMatrix', 'bin');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type        | match_source | match_value | datatype   | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | bin                    | pos          | Adjectives  | bool       |          17 |
+-- | bin                    | pos          | Adverbs     | bool       |          18 |
+-- | bin                    | pos          | com-dep-wd  | bool       |          19 |
+-- | bin                    | pos          | FW-Symb     | bool       |          20 |
+-- | bin                    | pos          | Nouns       | bool       |          21 |
+-- | bin                    | pos          | Numbers     | bool       |          22 |
+-- | bin                    | pos          | Pronouns    | bool       |          23 |
+-- | bin                    | pos          | Verbs       | bool       |          24 |
+-- | bin                    | regex        | address     | bool       |          25 |
+-- | bin                    | regex        | age         | bool       |          26 |
+-- | bin                    | regex        | date        | bool       |          27 |
+-- | bin                    | regex        | doctor      | bool       |          28 |
+-- | bin                    | regex        | hospital    | bool       |          29 |
+-- | bin                    | regex        | patient     | bool       |          30 |
+-- | bin                    | regex        | phone       | bool       |          31 |
+
+call log('FeatureMatrix', 'bin:pos');
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, FeatureBin b
+where   f.match_source = 'pos'           and
+        f.match_source = b.match_source  and
+        f.match_value  = b.match_bin     and
+        s.pos          = b.match_value     ;
+
+call log('FeatureMatrix', 'bin:pos');
+call log('FeatureMatrix', 'bin:regex');
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, FeatureBin b, OntologyMatch o
+where   f.annotation_type = 'bin'           and
+        f.match_source    = 'regex'         and
+        f.match_source    = b.match_source  and
+        f.match_source    = o.match_source  and
+        f.match_value     = b.match_bin     and
+        o.match_value     = b.match_value   and
+        o.file_idx        = s.file_idx      and
+        o.start_idx       >= s.start_idx    and
+        o.end_idx         <= s.end_idx         ;
+
+call log('FeatureMatrix', 'bin:regex');
+call log('FeatureMatrix', 'BaseToken:pos');
+
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | annotation_type        | match_source | match_value | datatype   | feature_idx |
+-- +------------------------+--------------+-------------+------------+-------------+
+-- | BaseToken              | pos          | NN          | bool       |          32 |
+-- | BaseToken              | pos          | JJ          | bool       |          33 |
+-- | BaseToken              | pos          | CD          | bool       |          34 |
+-- | BaseToken              | pos          | IN          | bool       |          35 |
+-- | BaseToken              | pos          | NNP         | bool       |          36 |
+-- | BaseToken              | pos          | DT          | bool       |          37 |
+-- | ...                    | ...          | ...         | ...        |          .. |
+-- | BaseToken              | pos          | pos_paren   | bool       |          71 |
+-- +------------------------+--------------+-------------+------------+-------------+
+
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s
+where
+        f.annotation_type  = 'BaseToken'       and
+        f.match_source     = 'pos'             and
+        f.match_value      = s.pos               ;
+
+
+call log('FeatureMatrix', 'BaseToken:pos');
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | annotation_type | match_source | match_value     | datatype | feature_idx |
+-- +-----------------+--------------+-----------------+----------+-------------+
+-- | OntologyMatch   | regex        | DATE22          | bool     |          72 |
+-- | OntologyMatch   | regex        | DATE21          | bool     |          73 |
+-- | OntologyMatch   | regex        | DATE17          | bool     |          74 |
+-- | OntologyMatch   | regex        | DATE18          | bool     |          75 |
+-- | OntologyMatch   | regex        | DATE_SEPARATORS | bool     |          76 |
+-- | OntologyMatch   | regex        | DATE1           | bool     |          77 |
+-- | OntologyMatch   | regex        | DATE4           | bool     |          78 |
+-- | OntologyMatch   | regex        | YEAR_CENTURY    | bool     |          79 |
+-- | OntologyMatch   | regex        | SUSPICIOUS_NUM  | bool     |          80 |
+-- | OntologyMatch   | regex        | MONTH           | bool     |          81 |
+-- | ...             | ...          | ...             | ...      |         ... |
+-- | OntologyMatch   | regex        | ADDRESS         | bool     |         148 |
+-- +-----------------+--------------+-----------------+----------+-------------+
+
+insert into FeatureMatrix
+select  distinct
+        s.file_idx, s.sample_idx, f.feature_idx
+from    Feature f, Sample s, OntologyMatch o
+where   f.annotation_type = 'OntologyMatch' and
+        f.match_source    = 'regex'         and
+        f.match_source    = o.match_source  and
+        f.match_value     = o.match_value   and
+        o.file_idx        = s.file_idx      and
+        o.start_idx       >= s.start_idx    and
+        o.end_idx         <= s.end_idx         ;
+
+call log('FeatureMatrix', 'OntologyMatch:regex');
+
+-- TODO refactor
+
+create table EmptySentence
+select * from Sentence where sample_idx_start = 0 and sample_idx_end = 0;
+
+delete from Sentence where sample_idx_start = 0 and sample_idx_end = 0;
+alter table Sentence drop column sentence_idx ;
+alter table Sentence add column sentence_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+
+update BaseToken b,    Sentence s
+set    b.sentence_idx  = s.sentence_idx
+where  b.file_idx      = s.file_idx      and
+       b.start_idx    >= s.start_idx     and
+       b.end_idx      <= s.end_idx         ;
+
+update Sample sample,    Sentence s
+set    sample.sentence_idx  = s.sentence_idx
+where  sample.file_idx      = s.file_idx      and
+       sample.start_idx    >= s.start_idx     and
+       sample.end_idx      <= s.end_idx         ;
+
+call log('FeatureMatrix', 'csv');
+
+select     s.file_idx, s.sample_idx, s.sentence_idx, m.feature_idx
+from       FeatureMatrix m, Sample s
+where      m.sample_idx = s.sample_idx
+order by   m.sample_idx, m.feature_idx
+into
+  outfile '/tmp/mysql/FeatureMatrix.csv'
+  FIELDS TERMINATED BY ','
+  LINES TERMINATED BY '\n';
+
+call log('FeatureMatrix', 'csv');
+call log('Sample',        'csv');
+
+select     file_idx, sample_idx, sentence_idx, start_idx, end_idx, token_seq, class_idx
+from       Sample
+order by   sample_idx
+into
+  outfile '/tmp/mysql/Sample.csv'
+  FIELDS TERMINATED BY ','
+  LINES TERMINATED BY '\n';
+
+call log('Sample',   'csv');
+call log('SentenceBoundary', 'csv');
+
+select file_idx, 0 as sample_idx, sentence_idx, sample_idx_start, sample_idx_end, sentence_seq
+from Sentence
+-- where sample_idx_start >0 and sample_idx_end > 0
+group by sentence_idx order by sentence_idx
+into
+  outfile '/tmp/mysql/SentenceBoundary.csv'
+  FIELDS TERMINATED BY ','
+  LINES TERMINATED BY '\n';
+
+call log('SentenceBoundary', 'csv');
+
+
+
+
+
+

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/load_csv_annotations.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,174 @@
+truncate etl_log;
+
+CALL log('load_csv_annotations.sql', 'begin');
+
+CALL log('Drop CSV tables', 'begin');
+drop table if exists Annotation, OntologyMatch, Calculation;
+drop table if exists BaseToken, WordToken;
+drop table if exists Sentence;
+
+CALL log('Drop CSV tables', 'done');
+CALL log('Annotation', 'begin');
+drop table if exists Annotation;
+
+create table Annotation
+(
+  annotation_type varchar(50),
+  filename_short  varchar(20),
+  start_idx       smallint,
+  end_idx         smallint
+) engine = myisam;
+
+load data infile '/tmp/mysql/Annotation.csv' into table Annotation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Annotation', 'done');
+CALL log('Sentence',   'begin');
+
+drop table if exists Sentence;
+create table Sentence like Annotation;
+alter table Sentence add  column sentence_seq smallint  not null;
+
+load data infile '/tmp/mysql/Sentence.csv' into table Sentence
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Sentence',     'done');
+CALL log('BaseToken',    'begin');
+
+drop table if exists BaseToken;
+create table BaseToken like Annotation;
+
+alter table BaseToken add column token_seq  smallint    not null;
+alter table BaseToken add column pos        varchar(25) not null;
+alter table BaseToken add column token      varchar(100) not null;
+
+load data infile '/tmp/mysql/BaseToken.csv' into table BaseToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('BaseToken',  'done');
+CALL log('NewLineToken',   'begin');
+
+drop table if exists NewLineToken;
+create table NewLineToken like BaseToken;
+
+load data infile '/tmp/mysql/NewLineToken.csv' into table NewLineToken
+fields terminated by ',' ignore 1 lines;
+
+CALL log('NewLineToken', 'done');
+CALL log('WordToken',  'begin');
+
+drop table if exists WordToken;
+create table WordToken like Annotation;
+
+alter table WordToken add column token_seq  smallint     not null;
+alter table WordToken add column pos        varchar(25)  not null;
+alter table WordToken add column cap        tinyint      not null;
+alter table WordToken add column token      varchar(100) not null;
+
+load data infile '/tmp/mysql/WordToken.csv' into table WordToken
+fields terminated by ',' ignore 1 lines;
+
+alter table WordToken add index(pos, cap);
+
+CALL log('WordToken',    'done');
+CALL log('OntologyMatch','begin');
+
+drop table if exists OntologyMatch;
+create table OntologyMatch like Annotation;
+
+alter table OntologyMatch add column match_source  varchar(50)  not null;
+alter table OntologyMatch add column match_value   varchar(50)  not null;
+alter table OntologyMatch add column token         varchar(100) not null;
+
+load data infile '/tmp/mysql/OntologyMatch.csv' into table OntologyMatch
+fields terminated by ',' ignore 1 lines;
+
+CALL log('OntologyMatch',  'done');
+CALL log('Calculation',    'begin');
+
+drop table if exists Calculation;
+create table Calculation like OntologyMatch;
+
+alter table Calculation drop column token;
+
+load data infile '/tmp/mysql/Calculation.csv' into table Calculation
+fields terminated by ',' ignore 1 lines;
+
+CALL log('Calculation',    'done');
+CALL log('file_idx',      'begin');
+
+alter table Annotation     add column file_idx smallint not null;
+alter table BaseToken      add column file_idx smallint not null;
+alter table Sentence       add column file_idx smallint not null;
+alter table WordToken      add column file_idx smallint not null;
+alter table OntologyMatch  add column file_idx smallint not null;
+alter table Calculation    add column file_idx smallint not null;
+
+update Annotation     set file_idx = trim(replace(filename_short, '.txt', ''));
+update BaseToken      set file_idx = trim(replace(filename_short, '.txt', ''));
+update Sentence       set file_idx = trim(replace(filename_short, '.txt', ''));
+update WordToken      set file_idx = trim(replace(filename_short, '.txt', ''));
+update OntologyMatch  set file_idx = trim(replace(filename_short, '.txt', ''));
+update Calculation    set file_idx = trim(replace(filename_short, '.txt', ''));
+
+CALL log('file_idx',   'done');
+CALL log('index',      'begin');
+
+alter table WordToken add index (file_idx);
+alter table WordToken add index (file_idx, token_seq);
+alter table WordToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (filename_short);
+alter table BaseToken add index (file_idx, token_seq);
+alter table BaseToken add index (file_idx, start_idx, end_idx);
+alter table BaseToken add index (annotation_type, file_idx, token_seq);
+alter table Annotation add index (annotation_type);
+alter table Annotation add index (file_idx, start_idx, end_idx);
+
+alter table Calculation add index (file_idx, start_idx, end_idx);
+
+alter table OntologyMatch add index(match_source);
+alter table OntologyMatch add index(match_source, match_value);
+alter table OntologyMatch add index(file_idx, start_idx, end_idx);
+alter table OntologyMatch add index(match_source, match_value, file_idx, start_idx, end_idx);
+
+alter table Sentence add index(file_idx, start_idx, end_idx);
+alter table Sentence add index(file_idx, sentence_seq);
+
+CALL log('index',      'done');
+call log('BaseToken:sentence_seq', 'begin.');
+
+alter table BaseToken add sentence_seq integer not null;
+
+update BaseToken b,    Sentence s
+set    b.sentence_seq  = s.sentence_seq
+where  b.file_idx      = s.file_idx      and
+       b.start_idx    >= s.start_idx     and
+       b.end_idx      <= s.end_idx         ;
+
+call log('BaseToken:sentence_seq', 'done.');
+call log('BaseToken:class',   'begin');
+
+drop table if exists Class;
+create table Class
+(
+  class_idx  smallint UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY,
+  class      varchar(20),
+  cnt        integer
+) engine = myisam;
+
+insert into Class (class, cnt) values ('NA', 0);
+
+insert into Class (class, cnt)
+select tag_name as class, count(*) as cnt from human_annotations_train group by tag_name;
+alter table  Class add index (class);
+
+alter table BaseToken add class varchar(20) not null;
+
+update BaseToken b,    human_annotations_train phi
+set    b.class         = phi.tag_name
+where  b.filename_short= phi.subject_id  and
+       b.start_idx    >= phi.start_idx   and
+       b.end_idx      <= phi.end_idx       ;
+
+call log('BaseToken:class',    'done.');
+CALL log('load_csv_annotations.sql', 'done');
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/populateTFTables.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,38 @@
+
+-- pubs
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'a%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'b%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'c%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'd%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'e%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'f%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'g%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'h%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'i%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'j%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'k%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'l%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'm%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'n%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'o%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'p%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'q%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'r%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 's%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 't%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'u%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'v%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'w%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'x%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'y%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like 'z%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '0%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '1%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '2%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '3%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '4%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '5%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '6%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '7%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '8%' group by token, match_value;
+insert into lookup_term_frequency (cnt, token, pos) select count(*) cnt, token, match_value from machine_annotations_pubs where match_source = 'pos' and token like '9%' group by token, match_value;

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/tmp/FeatureMatrixOnt.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,55 @@
+call log('FeatureOnt', 'begin');
+drop table if exists FeatureOnt;
+create table FeatureOnt
+(
+  match_source varchar(50) not null,
+  match_value  varchar(50) not null,
+  cnt          int         null
+) engine = myisam;
+
+insert into FeatureOnt
+select match_source,match_value, count(*) as cnt
+from OntologyMatch
+where match_source != 'regex'
+group by match_source,match_value
+order by match_source,match_value;
+
+alter table FeatureOnt add feature_idx integer UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY;
+alter table FeatureOnt add index (match_source, match_value);
+alter table FeatureOnt add index (feature_idx, match_source, match_value);
+
+alter table FeatureOnt add column str text;
+
+update FeatureOnt f, lookup_umls u
+ set f.str = u.str
+where
+    f.match_source = u.sab and
+    f.match_value  = u.cui   ;
+
+call log('FeatureOnt',        'done');
+call log('FeatureMatrixOnt', 'start');
+call log('FeatureMatrixOnt', 'table');
+
+drop table if exists FeatureMatrixOnt;
+
+create table FeatureMatrixOnt
+(
+  sample_idx   MEDIUMINT unsigned ,
+  feature_idx  smallint  unsigned ,
+  match_source varchar(10)
+)
+engine = myisam;
+
+call log('FeatureMatrixOnt', 'table');
+
+-- call log('FeatureMatrixOnt',  'slow bulk insert....');
+--
+-- create table FeatureMatrixOnt
+-- select  s.sample_idx,o.match_source, o.match_value
+-- from    Sample s, OntologyMatch o
+-- where   o.match_source   !='regex'          and
+--         o.file_idx        = s.file_idx      and
+--         o.start_idx       <= s.start_idx    and
+--         o.end_idx         >= s.end_idx        ;
+--
+-- call log('FeatureMatrixOnt', 'done');
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_test.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,5 @@
+truncate human_annotations_test;
+
+truncate machine_annotations_test;
+
+truncate feature_matrix_test;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/truncate_tables_train.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,5 @@
+truncate human_annotations_train;
+
+truncate machine_annotations_train;
+
+truncate feature_matrix_train;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql?rev=1500518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/sql/update_pos_bins.sql Sun Jul  7 19:39:00 2013
@@ -0,0 +1,10 @@
+update pubs_pos set pos_bin = 'unknown' where pos in ('NULL',',','.',':','$','(',')','"',"'", "''", 'POS') or pos is null;
+update pubs_pos set pos_bin = 'com-dep-wd' where pos in ('CC','CT','DT','EX','IN','MD','PDT','RP','TO','UH','WDT');
+update pubs_pos set pos_bin = 'FW/Symb' where pos in ('FW','SYM');
+update pubs_pos set pos_bin = 'Adjectives' where pos in ('JJ','JJR','JJS');
+update pubs_pos set pos_bin = 'Nouns' where pos in ('NN','NNS','NNP','NNPS');
+update pubs_pos set pos_bin = 'Adverbs' where pos in ('WRB','RB','RBR','RBS');
+update pubs_pos set pos_bin = 'Verbs' where pos in ('VB','VBD','VBG','VBN','VBP','VBZ');
+update pubs_pos set pos_bin = 'Pronouns' where pos in ('PRP','PRP$','WP','WP$');
+update pubs_pos set pos_bin = 'Numbers' where pos in ('CD', 'LS');
+-- TODO: deprecated
\ No newline at end of file