You are viewing a plain text version of this content. The canonical link for it is here.
Posted to by on 2013/07/20 18:39:46 UTC

svn commit: r1505167 [5/5] - in /ctakes/sandbox/ctakes-scrubber-deid/conf: ./ hospital_names.txt names.txt private_dict.txt regex_patterns.txt

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/names.txt
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt
--- ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt Sat Jul 20 16:39:46 2013
@@ -0,0 +1,7 @@
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt
--- ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt Sat Jul 20 16:39:46 2013
@@ -0,0 +1,533 @@
+//supposed to match email addresses. Taken from [source link missing]. Can an email start with a non-alphanumeric?
+//Take out any telephone numbers
+\s+[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}
+// Take out more telephone formats
+((\(\d{3}\) ?)|(\d{3}[- \.]))?\d{3}[- \.]\d{4}(\s(x\d+)?){0,1}
+//better telephone
+(([01][\.\- +]\(\d{3}\)[\.\- +]?)|([01][\.\- +]\d{3}[\.\- +])|(\(\d{3}\) ?)|(\d{3}[- \.]))?\d{3}[- \.]\d{4}
+//other telephone
+// extension number
+//matches IP addresses, except those of the form [1200.5.4.3], [abc.def.ghi.jkl], []. Taken from [source link missing]
+//To take out date of the form yyyy/m/d or yyyy/mm/dd or yyyy/mm/d or yyyy/m/dd
+// mm/dd/yy
+//take out dates of the form mm/yyyy 
+//take out dates of the form  YYYY-mm-dd
+// mm/yy
+// take out dates of the form MMM dd, yyyy format from Jan 1, 1600 to Dec 31, 9999.
+(?:(((Jan(uary)?|Ma(r(ch)?|y)|Jul(y)?|Aug(ust)?|Oct(ober)?|Dec(ember)?)\ 31)|((Jan(uary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sept|Nov|Dec)(ember)?)\ (0?[1-9]|([12]\d)|30))|(Feb(ruary)?\ (0?[1-9]|1\d|2[0-8]|(29(?=,\ ((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))))\,\ ((1[6-9]|[2-9]\d)\d{2}))
+// uppercase date
+(?:(((JAN(UARY)?|MA(R(CH)?|Y)|JUL(Y)?|AUG(UST)?|OCT(OBER)?|DEC(EMBER)?)\ 31)|((JAN(UARY)?|MA(R(CH)?|Y)|APR(IL)?|JU((LY?)|(NE?))|AUG(UST)?|OCT(OBER)?|(SEPT|NOV|DEC)(EMBER)?)\ (0?[1-9]|([12]\d)|30))|(FEB(RUARY)?\ (0?[1-9]|1\d|2[0-8]|(29(?=,\ ((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))))\,\ ((1[6-9]|[2-9]\d)\d{2}))
+// uppercase month
+((31(?! (FEB|APR|JUN|SEP|NOV)))|((30|29)(?! FEB))|(29(?= FEB (((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))|(0?[1-9])|1\d|2[0-8]) (JAN|FEB|MAR|MAY|APR|JUL|JUN|AUG|OCT|SEP|NOV|DEC) ((1[6-9]|[2-9]\d)\d{2})
+// MM dd 
+((?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)|(?:j(anuary|u(ne|ly))|february|ma(rch|y)|a(pril|ugust)|(((sept|nov|dec)em)|octo)ber)|(?:(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))|(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)))( |\,)(\d{2}|\d)
+// dd MM 
+((?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)|(?:j(anuary|u(ne|ly))|february|ma(rch|y)|a(pril|ugust)|(((sept|nov|dec)em)|octo)ber)|(?:(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))|(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)))( |\,)(\d{2}|\d)
+// dd MM yy?
+// ex: 9 July 1993 or 9 July of 1993
+((31(?!\ (Feb(ruary)?|Apr(il)?|June?|(Sep(?=\b|t)t?|Nov)(ember)?)))|((30|29)(?!\ Feb(ruary)?))|(29(?=\ Feb(ruary)?\ (((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))|(0?[1-9])|1\d|2[0-8])\ (Jan(uary)?|Feb(ruary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sep(?=\b|t)t?|Nov|Dec)(ember)?)\ ((1[6-9]|[2-9]\d)\d{2})
+// ex: 8/4/93
+//ex August 1995
+//ex: Jan of 2004
+// ex: Jan 1st, January 3rd, August 16th
+// ex: 2nd of November
+// ex: ii-05-99
+// ex:     DISCHARGED:     4/2/94 
+// ex: 10 March.
+//ex in 01-96
+// dd/yy
+// dd/mm
+//(regexp) Take out any dates in the format M/D/YY, M/D/YYYY, mm/dd/yyyy, mm/dd/yy, dd/mm/yy, dd/mm/yyyy. The separator can be any of the following characters: ".", "/", "-"
+//all ages 10-100 divisible by 10 (was writtenAge1)
+//all ages from 110 to 119 (was writtenAge2)
+//All ages from 20-99. (was writtenAge4)
+//All ages from 1-19 (was writtenAge5)
+//This should take out any mention of age followed by woman/man/male/female/m/f
+//this works to take out any mention of the age (or of this form -- "aged over 50")
+// The word Age followed by the age: ex AGE 77
+// Age followed by gender i.e. 86 year old woman
+//Allow for multiple non-numeric/non-alph characters between number and text
+//ex: age twenty - nine,
+//ex: 77 yo
+// note that these come after the other known identifiers like pt ssn, acc num, etc
+(?!000)([0-6]\d{2}|7([0-6]\d|7[012]))([ -]?)(?!00)\d\d\3(?!0000)\d{4}
+//Match all jr and sr
+//Match all roman numeral III
+//Patient's Name
+// this looks for key words to identify accession numbers by the word accession or any abbreviated form
+// accession number  KPNW format R98-87848 
+//accession number  KPNW format R98-87848 
+//record number KPNW
+// mrn for KPNW with spaces
+//suspicious number 4-15 digits
+//2 numbers followed by a dash followed by a number larger than three digits
+//take out any year of the form 19-- or 20-- or misspelled "1" for "l"
+// Single Month
+//end/beginning/middle of single month: end of May.
+// Surgeon ex: SURGEON: T. Todd, M.D.
+// Surgeon ex: SURGEON: Dr. Bernard
+// Surgeon ex: D. Perez, D.P.M.
+// Surgeon ex: SURGEON:                  F. Joseph, C.N.M.
+// Surgeon (Surgeon followed by any characters ending in MD) 
+// Surgeon ex SURGEON:                  J. Ray, D.O.   
+// Surgeon with Dr.
+// Surgeon ex: L. Dwight or single name
+// ASST: same as above
+//ASST3 ex: ASST: M. Chap, S.A.
+//PA ex Christine Fox, P.A.
+//CC: same as above
+//CC with RN or MD
+//Doctor 2-token-name. ex: Dr. Sam Smith. both names must be initialUppercase
+// New Doctor rule ex: Dr. F. Gelman.
+//New doctor rule ex: Dr. Antonioni 
+//New Doctor rule with ands: 
+//Dr followed by a single name and punctuation
+// MD2
+//DRS plural
+//Drs plural and
+//put the older Dr regex. This should be a safety net in case the above regex is not satisfied.
+// this line looks for DR|doctor followed by up to six "words"
+// ex: 3414 N. Kaiser Center Drive, Portland, OR 97227 -- dangerous regex may overmatch
+//the following regex does this --- any number, which may be followed by an alpha (to take care of 200A longwood ave), followed by a space, one or more words separated by spaces, and then one of the list of synonyms for street, which is followed by a space or a comma.
+// ex: 123 Anywhere Dr. Somewhere, ST 55789 
+[ \w]{3,}(\w+\.)?([ \w]*\#\d+)?(\r\n| )[ \w]{3,},\x20[A-Za-z]{2}\x20\d{5}(-\d{4})?
+//ex:SALEM OR  97310-1020
+//ex:2045 Fake Name South West STREET
+// ex:501 N. Graham, Suite 500
+//ex: 1211 SW 5TH
+//ex: PO BOX 8004
+// ROOM 680
+//Take out any 5 numbers-4 numbers since they are assumed to be zip codes
+// Discharged ex DISCHARGED:  JANUARY 12, 2001
+// Discharged ex. DISCHARGED:     08-12-95
+// Mr. Fitch
+// Ms. Fitch or Mrs. Fitch
+//doctor name found in subheadings. ex.) Attending : Fitch, Britt, <DISTINCTION>
+//where <DISTINCTION> is in (rn, md, cnm, np)
+//max length is 30 chars for name to prevent matching areas that are not actually subheadings.
+//match 4th floor, 2nd fl, etc...
+//match 4th floor, 2nd fl, etc...
+//match hospital names in initialUppercase ex.) Mass General Hospital, Children's hospital, Superman Memorial Regional Medical Center 
+//max length of 6 tokens to prevent runaway matching
+//may want to add in more hospital matching regex for better classification?
+//match hospital names in ALL Uppercase ex.) MASS GENERAL HOSPITAL 
+//to prevent runaway matching this will match unlimited times from a set of common building words for hospital names and 1 preceding word in ALL UPPERCASE
+//may want to add in more hospital matching regex for better classification?
+//id format for i2b2 smoking de-id data
+//id format for i2b2 smoking de-id data
+//id format for i2b2 smoking de-id data. ex.) TR: fooID (will ignore the token 'TR: ')
+//format date in the format: 'on the 22nd', 'on the 31st', etc... (will ignore the tokens 'on the')
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt
    svn:mime-type = text/plain