You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/20 18:39:46 UTC

svn commit: r1505167 [5/5] - in /ctakes/sandbox/ctakes-scrubber-deid/conf: ./ hospital_names.txt names.txt private_dict.txt regex_patterns.txt

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/names.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt?rev=1505167&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt Sat Jul 20 16:39:46 2013
@@ -0,0 +1,7 @@
+#private
+%org.spin.scrubber.type.OntologyMatch
+
+\bbritt\b
+\bfitch\b
+\bandy\b
+\bmcmurry\b
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/private_dict.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt?rev=1505167&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt Sat Jul 20 16:39:46 2013
@@ -0,0 +1,533 @@
+//supposed to match email addresses. Taken from Regexlib.com. Can an email start with a non-alphanumeric?
+%org.spin.scrubber.type.OntologyMatch
+#EMAIL_ADDRESS
+([a-zA-Z0-9_\-])+(\.([a-zA-Z0-9_\-])+)*@((\[(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5])))\.(((([0-1])?([0-9])?[0-9])|(2[0-4][0-9])|(2[0-5][0-5]))\]))|((([a-zA-Z0-9])+(([\-])+([a-zA-Z0-9])+)*\.)+([a-zA-Z])+(([\-])+([a-zA-Z0-9])+)*))
+
+//Take out any telephone numbers
+%org.spin.scrubber.type.OntologyMatch
+#TELEPHONE2
+\s+[\\(]{0,1}([0-9]){3}[\\)]{0,1}[ ]?([^0-1]){1}([0-9]){2}[ ]?[-]?[ ]?([0-9]){4}[ ]*((x){0,1}([0-9]){1,5}){0,1}
+
+// Take out more telephone formats
+%org.spin.scrubber.type.OntologyMatch
+#TELEPHONE3
+((\(\d{3}\) ?)|(\d{3}[- \.]))?\d{3}[- \.]\d{4}(\s(x\d+)?){0,1}
+
+//better telephone
+%org.spin.scrubber.type.OntologyMatch
+#TELEPHONE0
+(([01][\.\- +]\(\d{3}\)[\.\- +]?)|([01][\.\- +]\d{3}[\.\- +])|(\(\d{3}\) ?)|(\d{3}[- \.]))?\d{3}[- \.]\d{4}
+
+//other telephone
+%org.spin.scrubber.type.OntologyMatch
+#TELEPHONE1
+(1?(-?\d{3})-?)?(\d{3})(-?\d{4})
+
+// extension number
+%org.spin.scrubber.type.OntologyMatch
+#EXTENSION
+(([E|e][X|x][T|t]?[E|e]?[N|n]?[S|s]?[I|i]?[O|o]?[N|n]?)\s+\d+)
+
+//matches IP addresses, except those of the form [1200.5.4.3], [abc.def.ghi.jkl], [255.foo.bar.1]. Taken from regexlib.com
+%org.spin.scrubber.type.OntologyMatch
+#IP
+(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])
+
+//To take out date of the form yyyy/m/d or yyyy/mm/dd or yyyy/mm/d or yyyy/m/dd
+%org.spin.scrubber.type.OntologyMatch
+#DATE
+[^a-z^A-Z^0-9]?\d{4}[-\./][0-3]?[0-9][-\./][0-3]?[0-9]
+
+// mm/dd/yy
+%org.spin.scrubber.type.OntologyMatch
+#DATE1
+((0?[13578]|10|12)(-|\/)((0[0-9])|([12])([0-9]?)|(3[01]?))(-|\/)((\d{4})|(\d{2}))|(0?[2469]|11)(-|\/)((0[0-9])|([12])([0-9]?)|(3[0]?))(-|\/)((\d{4}|\d{2})))
+
+//take out dates of the form mm/yyyy 
+%org.spin.scrubber.type.OntologyMatch
+#DATE2
+((0[1-9])|(1[0-2]))\/(\d{4})
+
+//take out dates of the form  YYYY-mm-dd
+%org.spin.scrubber.type.OntologyMatch
+#DATE3
+^?[0-9]{4}-(((0[13578]|(10|12))-(0[1-9]|[1-2][0-9]|3[0-1]))|(02-(0[1-9]|[1-2][0-9]))|((0[469]|11)-(0[1-9]|[1-2][0-9]|30)))$?
+
+// mm/yy
+%org.spin.scrubber.type.OntologyMatch
+#DATE4
+((0[1-9])|(1[0-2]))\/(\d{2})
+
+// take out dates of the form MMM dd, yyyy format from Jan 1, 1600 to Dec 31, 9999.
+%org.spin.scrubber.type.OntologyMatch
+#DATE5
+(?:(((Jan(uary)?|Ma(r(ch)?|y)|Jul(y)?|Aug(ust)?|Oct(ober)?|Dec(ember)?)\ 31)|((Jan(uary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sept|Nov|Dec)(ember)?)\ (0?[1-9]|([12]\d)|30))|(Feb(ruary)?\ (0?[1-9]|1\d|2[0-8]|(29(?=,\ ((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))))\,\ ((1[6-9]|[2-9]\d)\d{2}))
+
+// uppercase date
+%org.spin.scrubber.type.OntologyMatch
+#DATE5_U
+(?:(((JAN(UARY)?|MA(R(CH)?|Y)|JUL(Y)?|AUG(UST)?|OCT(OBER)?|DEC(EMBER)?)\ 31)|((JAN(UARY)?|MA(R(CH)?|Y)|APR(IL)?|JU((LY?)|(NE?))|AUG(UST)?|OCT(OBER)?|(SEPT|NOV|DEC)(EMBER)?)\ (0?[1-9]|([12]\d)|30))|(FEB(RUARY)?\ (0?[1-9]|1\d|2[0-8]|(29(?=,\ ((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))))\,\ ((1[6-9]|[2-9]\d)\d{2}))
+
+// uppercase month
+%org.spin.scrubber.type.OntologyMatch
+#DATE6
+((31(?! (FEB|APR|JUN|SEP|NOV)))|((30|29)(?! FEB))|(29(?= FEB (((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))|(0?[1-9])|1\d|2[0-8]) (JAN|FEB|MAR|MAY|APR|JUL|JUN|AUG|OCT|SEP|NOV|DEC) ((1[6-9]|[2-9]\d)\d{2})
+
+// MM dd 
+%org.spin.scrubber.type.OntologyMatch
+#DATE7
+((?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)|(?:j(anuary|u(ne|ly))|february|ma(rch|y)|a(pril|ugust)|(((sept|nov|dec)em)|octo)ber)|(?:(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))|(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)))( |\,)(\d{2}|\d)
+
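+// dd MM (NOTE: the pattern below is identical to DATE7 and matches month followed by day, not day followed by month)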
+%org.spin.scrubber.type.OntologyMatch
+#DATE8
+((?:J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)|(?:j(anuary|u(ne|ly))|february|ma(rch|y)|a(pril|ugust)|(((sept|nov|dec)em)|octo)ber)|(?:(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))|(?:(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)))( |\,)(\d{2}|\d)
+
+// dd MM yy?
+%org.spin.scrubber.type.OntologyMatch
+#DATE9
+(3[0-1]|2[0-9]|1[0-9]|0[1-9])[\s{1}|\/|-]((Jan|JAN|Feb|FEB|Mar|MAR|Apr|APR|May|MAY|Jun|JUN|Jul|JUL|Aug|AUG|Sep|SEP|Oct|OCT|Nov|NOV|Dec|DEC)|(January|JANUARY|February|FEBRUARY|March|MARCH|April|APRIL|May|MAY|June|JUNE|July|JULY|August|AUGUST|September|SEPTEMBER|October|OCTOBER|November|NOVEMBER|December|DECEMBER)|(january|february|march|april|may|june|july|august|september|october|november|december)|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec))[\s{1}|\/|-](\d{4}|\d{2})?
+
+// ex: 9 July 1993 or 9 July of 1993
+%org.spin.scrubber.type.OntologyMatch
+#DATE11
+((31(?!\ (Feb(ruary)?|Apr(il)?|June?|(Sep(?=\b|t)t?|Nov)(ember)?)))|((30|29)(?!\ Feb(ruary)?))|(29(?=\ Feb(ruary)?\ (((1[6-9]|[2-9]\d)(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)))))|(0?[1-9])|1\d|2[0-8])\ (Jan(uary)?|Feb(ruary)?|Ma(r(ch)?|y)|Apr(il)?|Ju((ly?)|(ne?))|Aug(ust)?|Oct(ober)?|(Sep(?=\b|t)t?|Nov|Dec)(ember)?)\ ((1[6-9]|[2-9]\d)\d{2})
+
+// ex: 8/4/93
+%org.spin.scrubber.type.OntologyMatch
+#DATE12
+(((0?[1-9]|[12]\d|3[01])[\.\-\/](0?[13578]|1[02])[\.\-\/]((1[6-9]|[2-9]\d)?\d{2}|\d))|((0?[1-9]|[12]\d|30)[\.\-\/](0?[13456789]|1[012])[\.\-\/]((1[6-9]|[2-9]\d)?\d{2}|\d))|((0?[1-9]|1\d|2[0-8])[\.\-\/]0?2[\.\-\/]((1[6-9]|[2-9]\d)?\d{2}|\d))|(29[\.\-\/]0?2[\.\-\/]((1[6-9]|[2-9]\d)?(0[48]|[2468][048]|[13579][26])|((16|[2468][048]|[3579][26])00)|00|[048])))((\.|,))*
+
+//ex August 1995
+%org.spin.scrubber.type.OntologyMatch
+#DATE13
+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\,*\s\s*\d{4}|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\,*\s\d{4}|(January|February|March|April|May|June|July|August|September|October|November|December)\,*\s\d{4}|(january|february|march|april|may|june|july|august|september|october|november|december)\,*\s\d{4}|(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\,*\s\d{4}
+
+//ex: Jan of 2004
+%org.spin.scrubber.type.OntologyMatch
+#DATE14
+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)((\,*\s\s*)|\s+of\s+)(\d{4}|\d{2})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)((\,*\s\s*)|\s+of\s+)(\d{4}|\d{2})|(January|February|March|April|May|June|July|August|September|October|November|December)((\,*\s\s*)|\s+of\s+)(\d{4}|\d{2})|(january|february|march|april|may|june|july|august|september|october|november|december)((\,*\s\s*)|\s+of\s+)(\d{4}|\d{2})
+
+// ex: Jan 1st, January 3rd, August 16th
+%org.spin.scrubber.type.OntologyMatch
+#DATE15
+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)((\,*\s\s*)|\s+of\s+)(\d{4}|(\d+(st|nd|rd|th)))|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)((\,*\s\s*)|\s+of\s+)(\d{4}|((\d+(st|nd|rd|th))))|(January|February|March|April|May|June|July|August|September|October|November|December)((\,*\s\s*)|\s+of\s+)(\d{4}|(\d{4}|(\d+(st|nd|rd|th))))|(january|february|march|april|may|june|july|august|september|october|november|december)((\,*\s\s*)|\s+of\s+)(\d{4}|(\d{4}|(\d+(st|nd|rd|th))))
+
+// ex: 2nd of November
+%org.spin.scrubber.type.OntologyMatch
+#DATE16
+((\d+(st|nd|rd|th)))((\,*|\s|\s+)|(\s+of\s+))(January|February|March|April|May|June|July|August|September|October|November|December)|((\d+(st|nd|rd|th)))((\,*|\s|\s+)|(\s+of\s+))(january|february|march|april|may|june|july|august|september|october|november|december)|((\d+(st|nd|rd|th)))((\,*|\s|\s+)|(\s+of\s+))(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|((\d+(st|nd|rd|th)))((\,*|\s|\s+)|(\s+of\s+))(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)
+
+// ex: ii-05-99
+%org.spin.scrubber.type.OntologyMatch
+#DATE17
+([0-9lL]+?)[-,\/]([0-9iL]+?)[-,\/]([0-9iL]+?){2,4}
+
+// ex:     DISCHARGED:     4/2/94 
+%org.spin.scrubber.type.OntologyMatch
+#DATE18
+\d{1,2}(/|-)\d{1,2}(/|-)\d{1,2}(/.|,)*
+
+// ex: 10 March.
+%org.spin.scrubber.type.OntologyMatch
+#DATE19
+(\d{1,2}\s+)((January|February|March|April|June|July|August|September|October|November|December)|(january|february|march|april|june|july|august|september|october|november|december)|(Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)|(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER))
+
+//ex in 01-96
+%org.spin.scrubber.type.OntologyMatch
+#DATE20
+in\s+\d{1,2}-\d{2,4}(/.|,)*
+
+// dd/yy
+%org.spin.scrubber.type.OntologyMatch
+#DATE21
+((0[1-9])|(1[02])|\d)/\d{2}
+
+// dd/mm
+%org.spin.scrubber.type.OntologyMatch
+#DATE22
+\d{1,2}\/(1[012]|[1-9])
+
+//(regexp) Take out any dates in the format M/D/YY, M/D/YYYY, mm/dd/yyyy, mm/dd/yy, dd/mm/yy, dd/mm/yyyy. The separator can be any of the following characters: ".", "/", "-"
+%org.spin.scrubber.type.OntologyMatch
+#DATE_SEPARATORS
+[^a-z^A-Z^0-9][0-3]?[0-9][-\./][0-3]?[0-9][-\./][0-9]{2,4}[^a-z^A-Z^%]+
+
+//all ages 10-100 divisible by 10 (was writtenAge1)
+%org.spin.scrubber.type.OntologyMatch
+#WRITTEN_AGE_10_100_DIV10
+(\s)+([Tt][Ee][Nn]|[Tt][Ww][Ee][Nn][Tt][Yy]|[Tt][hH][iI][Rr][Tt][Yy]|[Ff][Oo][Rr][Tt][Yy]|[Ff][Ii][Ff][Tt][Yy]|[Ss][Ii][Xx][Tt][Yy]|[Ss][Ee][Vv][Ee][Nn][Tt][Yy]|[Ee][Ii][Gg][Hh][Tt][Yy]|[Nn][Ii][Nn][Ee]?[Tt][Yy]|[Hh][Uu][Nn][Dd][Rr][Ee][Dd])[-\s]?((y(ea)?rs?)|(months?)|(days?)|(weeks?)|(wks?\.?))(\s+((old)|(of\s+age)))?\s
+
+//all ages from 110 to 119 (was writtenAge2)
+%org.spin.scrubber.type.OntologyMatch
+#WRITTEN_AGE_110_TO_119
+\s+([Oo][Nn][Ee])?\s+[Hh][Uu][Nn][Dd][Rr][Ee][Dd][-\s]?([Tt][Ee][Nn]|[Ee][Ll][Ee][Vv][Ee][Nn]|[Tt][Ww][Ee][Ll][Vv][Ee]|[Tt][Hh][Ii][Rr][Tt][Ee][Ee][Nn]|[Ff][Oo][Uu][Rr][Tt][Ee][Ee][Nn]|[Ff][Ii][Ff][Tt][Ee][Ee][Nn]|[sS][Ii][Xx][Tt][Ee][Ee][Nn]|[Ss][Ee][Vv][Ee][Nn][Tt][Ee][Ee][Nn]|[Ee][Ii][Gg][Hh][Tt][Ee][Ee][Nn]|[Nn][Ii][Nn][Ee][Tt][Ee][Ee][Nn])?[-\s]?((y(ea)?rs?)|(months?)|(days?)|(weeks?)|(wks?\.?))(\s+((old)|(of\s+age)))?\s
+
+//All ages from 20-99. (was writtenAge4)
+%org.spin.scrubber.type.OntologyMatch
+#WRITTEN_AGE_20_TO_99
+(\s)(([Tt][Ww][Ee][Nn][Tt][Yy])|([Tt][hH][iI][Rr][Tt][Yy])|[Ff]orty|[Ff]ifty|[Ss]ixty|[Ss]eventy|[Ee]ighty|[Nn]ine?ty)[-\s]?([Oo][Nn][Ee]|[Tt][Ww][Oo]|[Tt][Hh][Rr][Ee][Ee]|[Ff][Oo][Uu][Rr]|[Ff][Ii][Vv][Ee]|[sS][Ii][Xx]|[Ss][Ee][Vv][Ee][Nn]|[Ee][Ii][Gg][Hh][Tt]|[Nn][Ii][Nn][Ee])?[-\s]?(([yY](ea)?rs?)|([Mm]onths?)|([Dd]ays?)|([Ww]eeks?)|([Ww]ks?\.?))(\s+(([Oo]ld)|([Oo]f\s+[Aa]ge)))?\s
+
+//All ages from 1-19 (was writtenAge5)
+%org.spin.scrubber.type.OntologyMatch
+#WRITTEN_AGE_1_TO_19
+\s((aged|AGED)?(\s+)?([Oo][Nn][Ee]|[Tt][Ww][Oo]|[Tt][Hh][Rr][Ee][Ee]|[Ff][Oo][Uu][Rr]|[Ff][Ii][Vv][Ee]|[sS][Ii][Xx]|[Ss][Ee][Vv][Ee][Nn]|[Ee][Ii][Gg][Hh][Tt]|[Nn][Ii][Nn][Ee]|[Tt][Ee][Nn]|[Ee][Ll][Ee][Vv][Ee][Nn]|[Tt][Ww][Ee][Ll][Vv][Ee]|[Tt][Hh][Ii][Rr][Tt][Ee][Ee][Nn]|[Ff][Oo][Uu][Rr][Tt][Ee][Ee][Nn]|[Ff][Ii][Ff][Tt][Ee][Ee][Nn]|[sS][Ii][Xx][Tt][Ee][Ee][Nn]|[Ss][Ee][Vv][Ee][Nn][Tt][Ee][Ee][Nn]|[Ee][Ii][Gg][Hh][Tt][Ee][Ee][Nn]|[Nn][Ii][Nn][Ee][Tt][Ee][Ee][Nn]))[-\s]?((y(ea)?rs?)|(months?)|(days?)|(weeks?)|(wks?\.?))(\s+((old)|(of\s+age)))?\s
+
+//This should take out any mention of age followed by woman/man/male/female/m/f
+//([Mm](ale)?|[Ff](emale)?|[Ww](oman)?|[Mm](an)?)
+%org.spin.scrubber.type.OntologyMatch
+#AGE
+[^a-z^A-Z^0-9]([0-9]{1,3}[^a-z^A-Z^0-9]+([Mm](ale)?|[Ff](emale)?|[Ww](oman)?|[Mm](an)?|([Yy]/[Oo]))[^a-z^A-Z^0-9])
+
+//this works to take out any mention of the age (or of this form -- "aged over 50")
+%org.spin.scrubber.type.OntologyMatch
+#AGED_OVER
+[^a-z^A-Z^0-9](([Aa]ged?|AGED?|AGE)([^a-z^A-Z^0-9](over)[^a-z^A-Z^0-9]+)?[0-9]{1,3}[^a-z^A-Z^0-9])
+
+// The word Age followed by the age: ex AGE 77
+%org.spin.scrubber.type.OntologyMatch
+#AGE4
+(([Aa]ged?|AGED?|AGE)(:)*\s+[0-9]{1,3})
+
+// Age followed by gender, e.g. 86 year old woman
+%org.spin.scrubber.type.OntologyMatch
+#AGE5
+(\d+)(\s+|-)([Y|y][E|e][A|a][R|r]([S|s]*))(\s+|-)(([O|o][L|l][D|d])*)(\s+|-)((man|MAN|[Mm](ale)?|[Ff](emale)?|([Ww](oman))?))
+
+//Allow for multiple non-numeric/non-alph characters between number and text
+%org.spin.scrubber.type.OntologyMatch
+#AGE6
+[^a-z^A-Z^0-9]([0-9]{1,3}[^a-z^A-Z^0-9]+(([yY](ea)?rs?)|([Ww]eeks?)|([Dd]ays?)|([Mm]onths?)|(MONTHS?)|(Y(EA)?RS?)|(WEEKS?)|(DAYS?)))
+
+//ex: age twenty - nine,
+%org.spin.scrubber.type.OntologyMatch
+#AGE7
+((age|aged|AGED)(\s+)(\w+)(\s+)(-|,)*(\s+)\w+)
+
+//ex: 77 yo
+%org.spin.scrubber.type.OntologyMatch
+#AGE8
+(\d{1,2})\s(yo)
+
+// note that these come after the other known identifiers like pt ssn, acc num, etc
+%org.spin.scrubber.type.OntologyMatch
+#SSN
+(?!000)([0-6]\d{2}|7([0-6]\d|7[012]))([ -]?)(?!00)\d\d\3(?!0000)\d{4}
+
+//Match all jr and sr
+%org.spin.scrubber.type.OntologyMatch
+#TITLES
+([Jj][rR]|[Ss][rR])\.?
+
+//Match all roman numeral III
+%org.spin.scrubber.type.OntologyMatch
+#TITLE_THIRD
+[^a-z^A-Z^0-9]+(^[Pp]art|^[Gg]rade):?\s+III\.?
+
+//Patient's Name
+%org.spin.scrubber.type.OntologyMatch
+#PATIENT_NAME
+[^a-z^A-Z^0-9]+(((patient(')?(s)?)\s+(name))[^a-z^A-Z^0-9^\"]+(is)?\"\w+[^a-z^A-Z]+\w+[^a-z^A-Z]+)
+
+// this looks for key words to identify accession numbers by the word accession or any abbreviated form
+%org.spin.scrubber.type.OntologyMatch
+#ACCESSION
+(([Aa][Cc][Cc]\.?)|([Aa][Cc][Cc][Ee][Ss][Ss][Ii][Oo][Nn])|([Cc][Aa][Ss][Ee]))[^A-Z^a-z^0-9]+(([Nn][Uu][Mm]\.?)|(#)|[Nn]number|NUMBER)[^A-Z^a-z^0-9]+[a-zA-Z]{1,3}[0-9]{1,12}[a-zA-Z]{0,3}[^A-Z^a-z^0-9]
+
+// accession number  KPNW format R98-87848 
+%org.spin.scrubber.type.OntologyMatch
+#ACCESSION_KP
+((\w\d{2}-(\d+)))
+
+//accession number  KPNW format R98-87848 
+%org.spin.scrubber.type.OntologyMatch
+#ACCESSION_KP2
+(\d{2}-\w\d{4})
+
+//record number KPNW
+%org.spin.scrubber.type.OntologyMatch
+#RN_KPNW
+\w\d{2}-\d{4,6}
+
+// mrn for KPNW with spaces
+%org.spin.scrubber.type.OntologyMatch
+#MRN_KPNW
+\d{4}\s+\d{2}\s+\d{2}
+
+//suspicious number 5-15 digits
+%org.spin.scrubber.type.OntologyMatch
+#SUSPICIOUS_NUM
+[0-9]{5,15}
+
+//2 or more digits followed by one or more groups of a dash and 3 or more digits
+%org.spin.scrubber.type.OntologyMatch
+#SUSPICIOUS_NUM2
+(\d{2,}?)(-\d{3,})+?
+
+//take out any year of the form 19-- or 20--, or with the "1" mistyped as "l" (e.g. l9--)
+%org.spin.scrubber.type.OntologyMatch
+#YEAR_CENTURY
+(19|20|l9)[0-9]{2}
+
+// Single Month
+%org.spin.scrubber.type.OntologyMatch
+#MONTH
+((January|February|March|April|June|July|August|September|October|November|December)|(january|february|march|april|june|july|august|september|october|november|december)|(Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(jan|feb|mar|apr|jun|jul|aug|sep|oct|nov|dec)|(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER))
+
+//end/beginning/middle of single month: end of May.
+%org.spin.scrubber.type.OntologyMatch
+#MONTH2
+((((e|E)nd)|((B|b)eginning)|((M|m)iddle))\s+of\s+)((January|February|March|April|May|June|July|August|September|October|November|December)|(january|february|march|april|may|june|july|august|september|october|november|december)|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)|(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER))(\.|,)
+
+// Surgeon ex: SURGEON: T. Todd, M.D.
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((M|m)(\.)*(\s*)(D|d)(\.)*)(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+// Surgeon ex: SURGEON: Dr. Bernard
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON2
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon)(:)*\s+(D|d)(R|r)((\.)*)\s+\w+\s+
+
+// Surgeon ex: D. Perez, D.P.M.
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON3
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon|ASST|asst|Asst)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((D|d)(\.)*(\s*)(P|p)(\.)*)(M|m)(\.)*(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+// Surgeon ex: SURGEON:                  F. Joseph, C.N.M.
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON4
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon|ASST|asst|Asst)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((C|c)(\.)*(\s*)(N|n)(\.)*)(M|m)(\.)*(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+// Surgeon (Surgeon followed by any characters ending in MD)
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON5
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon)(:)(\s+)*(.+)(M|m)(\.)*(D|d)(\.)*
+
+// Surgeon ex SURGEON:                  J. Ray, D.O.   
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON6
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon)(:)(\s+)*(.+)(D|d)(\.)*(O|o)(\.)*
+
+// Surgeon with Dr.
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON7
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon|ASST|asst|Asst)(:)*\s+(D|d)(\.)*(R|r)(\.)*(.+)
+
+// Surgeon ex: L. Dwight or single name
+%org.spin.scrubber.type.OntologyMatch
+#SURGEON8
+(SURGEON\(S\)|SURGEON|SUREGON|Surgeon|surgeon|ASST|asst|Asst)(:)*\s+((\w+)(\.)*(\s+)(\w+)|(\w+))
+
+// ASST: same as above
+%org.spin.scrubber.type.OntologyMatch
+#ASST
+(ASST?|asst?|assistant?|Assistant?|ASSISTANT\(S\)?)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((M|m)(\.)*(\s*)(D|d)(\.)*)(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+//ASST2
+%org.spin.scrubber.type.OntologyMatch
+#ASST2
+(ASST?|asst?|assistant?|Assistant?|ASSISTANT\(S\)?)(:)*\s+\w+\W\s*\w+\W\s+(((M|m)(\.)*(\s*)(D|d)(\.)*))*
+
+//ASST3 ex: ASST: M. Chap, S.A.
+%org.spin.scrubber.type.OntologyMatch
+#ASST3
+(ASST?|asst?|assistant?|Assistant?|ASSISTANT\(S\)?)(:)*\s+\w+\W\s*\w+\W\s+(((S|s)(\.)*(\s*)(A|a)(\.)*))*
+
+//PA ex Christine Fox, P.A.
+%org.spin.scrubber.type.OntologyMatch
+#PA
+(\w+)(\s+)(\w+)(,)*\s((P|p)(\.)+(A|a)(\.)+)
+
+//CC: same as above
+%org.spin.scrubber.type.OntologyMatch
+#CC
+(cc|CC)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((M|m)(\.)*(\s*)(D|d)(\.)*)(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+//CC with RN or MD
+%org.spin.scrubber.type.OntologyMatch
+#CC2
+(cc|CC)(:)*\s+(.+)(((R|r)(\.)*(N|n)(\.)*)|(M|m)(\.)*(D|d)(\.)*)
+
+//FROM:
+%org.spin.scrubber.type.OntologyMatch
+#FROM
+(FROM|From)(:)\s+(\w+)(\.)*\s+\w+(((M|m)(\.)*(\s*)(D|d)(\.)*))*
+
+//CONSULTATION
+%org.spin.scrubber.type.OntologyMatch
+#CONSULTATION
+(CONSULTATION)(:)*\s+(\w+)(\.)*\s+\w+\W+(\s*)((M|m)(\.)*(\s*)(D|d)(\.)*)(((\W*\s+\w+\W\s+\w+\W)\s+(((M|m)(\.)*(\s*)(D|d)(\.)*)))*)
+
+//TECHNOLOGIST
+%org.spin.scrubber.type.OntologyMatch
+#TECH
+(TECHNOLOGIST)(:)*\s+(\w+)(\.)*
+
+//Doctor 2-token-name. ex: Dr. Sam Smith. both names must be initialUppercase
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR0_00
+(([dD][Rr]([Ss])?))(\.|,)*(\s+[^\Wa-z0-9_][^\WA-Z0-9_]*\b)(\s+[^\Wa-z0-9_][^\WA-Z0-9_]*\b)
+
+// New Doctor rule ex: Dr. F. Gelman.
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR0_0
+(([dD][Rr]([Ss])?))(\.)*\s(\w|w+)(\.)*\s(\w+)
+
+//New doctor rule ex: Dr. Antonioni 
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR0_1
+(([dD][Rr]([Ss])?))(\.|,)*\s+\w+
+
+//New Doctor rule with ands: 
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR0_2
+(([dD][Rr]([Ss])?))(\.|,)*\s\w+\s(and|AND)\s(\w+)
+
+//Doctor
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR0
+(([dD][Rr]([Ss])?))(\.|,)*(\W\w+)(\W\w+)
+
+//Dr followed by a single name and punctuation
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR1
+(([dD][Rr])|([Dd]octor)|(DOCTOR))(\.|,)*(\s+)(\w+)(\.|,)
+
+// MD2
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR2
+\w+\W\s*\w+\W*(((M|m)(\.)*(\s*)(D|d)(\.)*))
+
+//DRS plural
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR3
+(Drs|DRS)(\.)*(.+)
+
+//Drs plural and
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR4
+(([dD][Rr]([Ss])?))(\.|,)*(\W\w+)(\W(?i)(and))(\W\w+)
+
+//keep the older Dr regex as a safety net in case the regexes above do not match.
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR_OLDER
+(([dD][Rr]([Ss])?)|([Dd]octor)s?|(DOCTORS?))[^a-zA-Z^0-9]+(\w+[^a-z^A-Z^0-9]+){1,3}
+
+// this line looks for DR|doctor followed by up to eight "words"
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR_GEN
+(([dD][Rr]([Ss])?)|([Dd]octor)s?|(DOCTORS?))[\.](\s+)?([A-Z]([a-zA-Z]+)?([^a-z^A-Z^0-9]+|(and)|(AND))+){1,8}
+
+// ex: 3414 N. Kaiser Center Drive, Portland, OR 97227 -- dangerous regex may overmatch
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS
+\d+\s+((.+)(,|\s))((.+)(,|\s))(\s*)(?-i:A[LKSZRAEP]|C[AOT]|D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])+(\s+|(,|\s))(\d{5}|(\d{5}-\d{4}))+
+
+//the following regex does this --- any number, optionally followed by an alpha (to take care of 200A longwood ave), followed by a space, one or more words separated by spaces, and then one of the list of synonyms for street, which is followed by a space or a comma.
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS2
+[^a-z^A-Z][0-9]{1,6}(\w+)?\s(\w+\s)+([sS]t|[sS]treet|[aA]venue|[aA]ve|[Bb]lvd|[bB]oulevard|[sS]uite|[pP]ark|[dD]rive|[dD]r|[lL]ane|[lL]n|[Ww]ay|[Pp]ky|[pP]arkway|[Rr]oute|rt|RT|Rt|[rR]oad|[rR]d|[pP]ass|Square|Sq|[Pp]laza|[lL]ink|[bB]end|[gG]ardens|[cC]ircle|[rR]ow|[tT]urn|[hH]wy|[hH]ighway|[cC]ir|[cC]ourt|[cC]rossing|[tT]rail|[rR]un|[pP]ike|[tT]errace|Place|[pP]l|[lL]oop|[pP]arade|[aA]lley)[^A-Za-z]?(\.)?[\s,]?([A-Z][a-zA-Z]+[^a-z^A-Z^0-9]+){1,5}((\d{5}(-)\d{4})|(\d{5}))?[^a-z^A-Z^0-9]?
+
+// ex: 123 Anywhere Dr. Somewhere, ST 55789 
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS4
+[ \w]{3,}(\w+\.)?([ \w]*\#\d+)?(\r\n| )[ \w]{3,},\x20[A-Za-z]{2}\x20\d{5}(-\d{4})?
+
+//ex:SALEM OR  97310-1020
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS5
+\w+(,)*\s+(?-i:A[LKSZRAEP]|C[AOT]|D[EC]|F[LM]|G[AU]|HI|I[ADLN]|K[SY]|LA|M[ADEHINOPST]|N[CDEHJMVY]|O[HKR]|P[ARW]|RI|S[CD]|T[NX]|UT|V[AIT]|W[AIVY])\s+\d{5}(-\d{4})*
+
+//ex:2045 Fake Name South West STREET
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS6
+\d+(\s+\w+){1,6}\s+(STREET|ST|AVENUE|AVE|BBLVD|BOULEVARD|SUITE|PARK|DRIVE|DR|LNE|LN|WAY|PKY|PARKWAY|ROUTE|RT|ROAD|RD|PASS|SQUARE|SQ|PLAZA|LINK|GARDENS|CIRCLE|ROW|TURN|HWY|HIGHWAY|CIR|COURT|CROSSING|TRAIL|RUN|PIKE|TERRACE|PLACE|PPL|LOOP|PARADE|ALLEY)(\.)*\s+([Nn]orth|[Ss]outh|[Ee]ast|[Ww]est|NE|ne|SE|se|NW|nw|SW|sw)*
+
+// ex:501 N. Graham, Suite 500
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS7
+\d+(.*)(?i)\W(STREET|ST|AVENUE|AVE|BBLVD|BOULEVARD|SUITE|PARK|DRIVE|DR|LNE|LN|WAY|PKY|PARKWAY|ROUTE|RT|ROAD|RD|PASS|SQUARE|SQ|PLAZA|LINK|GARDENS|CIRCLE|ROW|TURN|HWY|HIGHWAY|CIR|COURT|CROSSING|TRAIL|RUN|PIKE|TERRACE|PLACE|PPL|LOOP|PARADE|ALLEY)(\W\d+)*
+
+//ex: 1211 SW 5TH
+%org.spin.scrubber.type.OntologyMatch
+#ADDRESS8
+(\d+?)\W[NnSsEeWw\.]+\W\w+
+
+//ex: PO BOX 8004
+%org.spin.scrubber.type.OntologyMatch
+#POBOX
+(?i)[PO\.]+\W(?i)[BOX\.]+\W\d+
+
+// ROOM 680
+%org.spin.scrubber.type.OntologyMatch
+#ROOM
+(?i)(ROOM)\W(.*?)\d+
+
+//Take out any 5 numbers-4 numbers since they are assumed to be zip codes
+%org.spin.scrubber.type.OntologyMatch
+#ZIP_CODE
+(\d{5}(-)\d{4})
+
+// Discharged ex DISCHARGED:  JANUARY 12, 2001
+%org.spin.scrubber.type.OntologyMatch
+#DISCHARGED
+(DISCHARGED|Discharged|discharged)(:)\s+\w+\s+(\d{2}|\d)(\,|\s+)(\d{4}|\d{2})
+
+// Discharged ex. DISCHARGED:     08-12-95
+%org.spin.scrubber.type.OntologyMatch
+#DISCHARGED2
+(DISCHARGED|Discharged|discharged)(:)\s+(\d{1,})(-)(\d{1,})(-)(\d{1,})
+
+// Mr. Fitch
+%org.spin.scrubber.type.OntologyMatch
+#MR
+(?i)(MR)(\.)*(\W\w+)
+
+// Ms. Fitch or Mrs. Fitch
+%org.spin.scrubber.type.OntologyMatch
+#MS
+(?i)(MS|MRS)(\.)*(\W\w+)
+
+//doctor name found in subheadings. ex.) Attending : Fitch, Britt, <DISTINCTION>
+//where <DISTINCTION> is in (rn, md, cnm, np)
+//max length is 30 chars for name to prevent matching areas that are not actually subheadings.
+%org.spin.scrubber.type.OntologyMatch
+#DOCTOR_SUBHEAD
+(:)+\s+(.){1,30}\s+(((R|r)(\.)*(N|n)(\.)*)|(M|m)(\.)*(D|d)(\.)*|(C|c)(\.)*(N|n)(\.)*(M|m)(\.)*|(N|n)(\.)*(P|p)(\.)*)\s+
+
+//match 4th floor, 2nd fl, etc...
+#LOCATION_FLOOR_1
+(?i)(\d{1,3})(nd|rd|st|th)\s+(floor|flr|fl)
+
+//match 4th floor, 2nd fl, etc...
+#LOCATION_FLOOR_2
+(?i)(floor|flr|fl)(\.)*\s+(\d{1,3})
+
+//match hospital names in initialUppercase ex.) Mass General Hospital, Children's hospital, Superman Memorial Regional Medical Center 
+//max length of 6 tokens to prevent runaway matching
+//may want to add in more hospital matching regex for better classification?
+#HOSPITAL_1
+((\s+|\b)[^\Wa-z0-9_][^\WA-Z0-9_]*\b(\.|'s)*){1,5}\s+((?i)(hospital|hosp|center|centre|cntr|ctr|clinic))
+
+//match hospital names in ALL Uppercase ex.) MASS GENERAL HOSPITAL 
+//to prevent runaway matching this will match unlimited times from a set of common building words for hospital names and 1 preceding word in ALL UPPERCASE
+//may want to add in more hospital matching regex for better classification?
+#HOSPITAL_2
+((\s+|\b)[A-Z0-9_-]*\b(\.|'[s|S])*){1}\s+((MEMORIAL|MEM|COUNTY|REGIONAL|REG|MEDICAL|MED|HEALTH|DISTRICT|DIST|REHABILITATION|REHAB|GENERAL|GEN|COMMUNITY|COMM|UNIVERSITY|UNIV)+(\.)*\s+)+(HOSPITAL|HOSP|CENTER|CENTRE|CNTR|CTR|CLINIC)
+
+//id format for i2b2 smoking de-id data
+#ID_I2B2_SMOK
+(\d{3})-(\d{2})-(\d{2})-(\d{1})
+
+//id format for i2b2 smoking de-id data
+#ID_I2B2_SMOK_2
+(\d{3})-(\d{2})-(\d{2})-(\d{1})\s+[A-Za-z0-9]{3}\b
+
+//id format for i2b2 smoking de-id data. ex.) TR: fooID (will ignore the token 'TR: ')
+#ID_I2B2_SMOK_3
+(?<=((?i)\bTR\s?:\s?))([A-Za-z0-9]*\b)
+
+//match dates of the form: 'on the 22nd', 'on the 31st', etc... (will ignore the tokens 'on the')
+#DATE_23
+(?<=(\bon\s?the\s?))(\d+(?i)(st|nd|rd|th))
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-scrubber-deid/conf/regex_patterns.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain