You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by pk...@apache.org on 2016/06/06 11:56:42 UTC
svn commit: r1746984 [3/3] - in /ctakes/sandbox/ctakes-clinical-deid/src:
main/resources/wordlists/ main/ruta/org/apache/ctakes/deid/
test/java/org/apache/ctakes/deid/
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt Mon Jun 6 11:56:42 2016
@@ -31,7 +31,7 @@ Burundian-speaking
Cambodian-speaking
Cameroonian-speaking
Canadian-speaking
-Cape Verdean-speaking
+CapeVerdean-speaking
Chadian-speaking
Chilean-speaking
Chinese-speaking
@@ -130,14 +130,14 @@ Senegalese-speaking
Serb-speaking
Serbian-speaking
Seychellois-speaking
-Sierra Leonian-speaking
+SierraLeonian-speaking
Singaporean-speaking
Slovak-speaking
Slovene-speaking
Slovenian-speaking
Somali-speaking
Spanish-speaking
-Sri Lankan-speaking
+SriLankan-speaking
Sudanese-speaking
Surinamese-speaking
Swazi-speaking
@@ -165,7 +165,7 @@ Vanuatuan-speaking
Venezuelan-speaking
Vietnamese-speaking
Welsh-speaking
-Western Samoan-speaking
+WesternSamoan-speaking
Yemeni-speaking
Yugoslav-speaking
Zairean-speaking
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt Mon Jun 6 11:56:42 2016
@@ -1,14 +1,14 @@
ALABAMA
ALASKA
-AMERICAN SAMOA
+AMERICANSAMOA
ARIZONA
ARKANSAS
CALIFORNIA
COLORADO
CONNECTICUT
DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
+DISTRICTOFCOLUMBIA
+FEDERATEDSTATESOFMICRONESIA
FLORIDA
GEORGIA
GUAM
@@ -21,7 +21,7 @@ KANSAS
KENTUCKY
LOUISIANA
MAINE
-MARSHALL ISLANDS
+MARSHALLISLANDS
MARYLAND
MASSACHUSETTS
MICHIGAN
@@ -31,29 +31,29 @@ MISSOURI
MONTANA
NEBRASKA
NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
+NEWHAMPSHIRE
+NEWJERSEY
+NEWMEXICO
+NEWYORK
+NORTHCAROLINA
+NORTHDAKOTA
+NORTHERNMARIANAISLANDS
OHIO
OKLAHOMA
OREGON
PALAU
PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
+PUERTORICO
+RHODEISLAND
+SOUTHCAROLINA
+SOUTHDAKOTA
TENNESSEE
TEXAS
UTAH
VERMONT
VIRGINIA
-VIRGIN ISLANDS
+VIRGINISLANDS
WASHINGTON
-WEST VIRGINIA
+WESTVIRGINIA
WISCONSIN
WYOMING
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt Mon Jun 6 11:56:42 2016
@@ -52,7 +52,7 @@ WI
WV
WY
Ala
-Amer. Samoa
+Amer.Samoa
Ariz
Ark
Calif
@@ -72,13 +72,13 @@ Mo
Mont
N.C.
N.Dak
-N. Dak
+N.Dak
Nebr
Nev
N.H.
N.J.
N.Mex
-N. Mex
+N.Mex
N.Y.
Ohio
Okla
@@ -89,7 +89,7 @@ P.R.
R.I.
S.C.
S.Dak
-S. Dak
+S.Dak
Tenn
Tex
Utah
@@ -100,5 +100,5 @@ Wash
Wis
Wisc
W.Va
-W. Va
+W.Va
Wyo
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta Mon Jun 6 11:56:42 2016
@@ -11,5 +11,10 @@ RETAINTYPE(WS);
RETAINTYPE;
(MonthInd{-PARTOF(deid.Date)} Num4{-PARTOF(deid.Date),REGEXP("19..|20..")}){-> deid.Date};
-Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date};
-MonthInd{-PARTOF(deid.Date), -PARTOF(deid.DeidEntity)-> deid.Date};
+
+RETAINTYPE(WS);
+WS @Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date};
+WS @MonthInd{-PARTOF(deid.Date), -PARTOF(deid.DeidEntity)-> deid.Date};
+RETAINTYPE;
+
+deid.Date.ct=="may"{-> UNMARK(deid.Date)};
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Mon Jun 6 11:56:42 2016
@@ -53,6 +53,7 @@ CALL(Date);
Age{PARTOF(deid.Date) -> UNMARK(Age)};
+Fax{PARTOF(Phone) -> UNMARK(Fax)};
Email{-> Contact, Contact.entityType = "EMAIL"};
Url{-> Contact, Contact.entityType = "URL"};
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Mon Jun 6 11:56:42 2016
@@ -31,6 +31,8 @@ TRIE(
"dr_prefix2.txt" = DrPrefixInd2,
trie, true, 4, false, 0, "-");
+KeywordInd-> {KeywordInd{-> UNMARK(KeywordInd)} ANY; ANY KeywordInd{-> UNMARK(KeywordInd)};};
+
DECLARE Url, Email;
"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta Mon Jun 6 11:56:42 2016
@@ -26,15 +26,15 @@ W{REGEXP("Drs?", true)} PERIOD?
(CAP{-REGEXP("DR")} (COMMA| (CW PERIOD))? CAP CAP? CW?){-> Doctor} COMMA? DrPostfixInd;
DrPrefixInd1{-> SHIFT(DrPrefixInd1,1,4)} W{REGEXP("(?i)by|for")} W{REGEXP("(?i)physician")} COLON?;
-DrPrefixInd1 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor};
-DrPrefixInd1 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor};
-DrPrefixInd1 (CAP{-REGEXP("DR")} ANY{PARTOF({COMMA,CW,PERIOD})} CAP PERIOD? CAP CAP?){-> Doctor};
+DrPrefixInd1 (CW{-REGEXP("Dr")} CW PERIOD? CW){-PARTOF(Doctor) -> Doctor};
+DrPrefixInd1 (CW{-REGEXP("Dr")} COMMA? CW CW?){-PARTOF(Doctor) -> Doctor};
+DrPrefixInd1 (CAP{-REGEXP("DR")} ANY{PARTOF({COMMA,CW,PERIOD})} CAP PERIOD? CAP CAP?){-PARTOF(Doctor) -> Doctor};
Split (W{-> Doctor} ANY?{PARTOF({Slash,COLON,Bar})})+ SEMICOLON? (Num2 Slash)? Num69 PERIOD "doc";
-DrPrefixInd2{-> SHIFT(DrPrefixInd2,1,3)} W?{REGEXP("(?i)physician")} COLON;
-DrPrefixInd2 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor};
-DrPrefixInd2 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor};
-DrPrefixInd1 (CAP{-REGEXP("DR")} COMMA? CAP){-> Doctor};
-DrPrefixInd1 (CW{-REGEXP("Dr")}){-> Doctor};
+DrPrefixInd2 (CW{-REGEXP("Dr")} CW PERIOD? CW){-PARTOF(Doctor) -> Doctor};
+DrPrefixInd2 (CW{-REGEXP("Dr")} COMMA? CW CW?){-PARTOF(Doctor) -> Doctor};
+DrPrefixInd2 (CAP{-REGEXP("DR")} COMMA? CAP){-PARTOF(Doctor) -> Doctor};
+DrPrefixInd2 (CW{-REGEXP("Dr")}){-> Doctor};
+Doctor->{ANY @Doctor{-> UNMARK(Doctor)}; @Doctor{-> UNMARK(Doctor)} ANY;};
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta Mon Jun 6 11:56:42 2016
@@ -11,5 +11,6 @@ DECLARE FaxPattern;
(Num3 Num3 Num4){-> FaxPattern};
(Num3 Num4 Num3){-> FaxPattern};
-W?{REGEXP("(?i)fax")} W?{REGEXP("(?i)No|Num|Number")} ANY?{PARTOF({COLON,Hash,PERIOD})}
- @FaxPattern{-> Fax};
\ No newline at end of file
+(W{REGEXP("(?i)fax")} W?{REGEXP("(?i)No|Num|Number")})? ANY?{PARTOF({COLON,Hash,PERIOD})}
+ @FaxPattern{-> Fax};
+
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta Mon Jun 6 11:56:42 2016
@@ -4,65 +4,66 @@ TYPESYSTEM org.apache.ctakes.deid.Dictio
DECLARE Patient;
+NamePrefixInd COLON?{-ENDSWITH(SplitExternal)}
+ (
+ CW{-PARTOF(NamePrefixInd)}
+ COMMA
+ CW (CW{REGEXP(".")} PERIOD? CW?)?
+ ){-PARTOF(Patient) -> Patient};
NamePrefixInd COLON?{-ENDSWITH(SplitExternal)}
(
CW{-PARTOF(NamePrefixInd), -REGEXP(".")}
CW[0,3]{-REGEXP("Done")}
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
NamePrefixInd COLON?{-ENDSWITH(SplitExternal)}
(
CW{-PARTOF(NamePrefixInd)}
PERIOD{-ENDSWITH(Split)}
CW CW? PERIOD?{-ENDSWITH(Split)}
- ){-> Patient};
-
-NamePrefixInd COLON?{-ENDSWITH(SplitExternal)}
- (
- CW{-PARTOF(NamePrefixInd)}
- COMMA
- CW (CW{REGEXP(".")} PERIOD?)?
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
NamePrefixInd COLON?{-ENDSWITH(SplitExternal)}
(
CAP COMMA? CAP (CW{REGEXP(".")} PERIOD?)?
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
+
+PatientPrefixInd COLON
+ (
+ CW{-PARTOF(NamePrefixInd)}
+ COMMA
+ CW (CW{REGEXP(".")} PERIOD? CW?)?
+ ){-PARTOF(Patient) -> Patient};
PatientPrefixInd COLON
(
CW{-PARTOF(NamePrefixInd), -REGEXP(".")}
CW[0,3]{-REGEXP("Done")}
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
PatientPrefixInd COLON
(
CW{-PARTOF(NamePrefixInd)}
PERIOD{-ENDSWITH(Split)}
CW CW? PERIOD?{-ENDSWITH(Split)}
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
+
-PatientPrefixInd COLON
- (
- CW{-PARTOF(NamePrefixInd)}
- COMMA
- CW (CW{REGEXP(".")} PERIOD?)?
- ){-> Patient};
PatientPrefixInd COLON
(
CAP COMMA? CAP (CW{REGEXP(".")} PERIOD?)?
- ){-> Patient};
+ ){-PARTOF(Patient) -> Patient};
W.ct=="seeing"
(CW{-REGEXP("(?i)Done|Dr|Mr|Mrs|Miss|Ms|Pt|Patients")}
- CW?{-REGEXP("Done")}){-> Patient};
+ CW?{-REGEXP("Done")}){-PARTOF(Patient) -> Patient};
RETAINTYPE(BREAK);
-BREAK BREAK (@CAP COMMA? CAP ((CW PERIOD)|(CW COMMA? CW))?){-> Patient}
+BREAK BREAK (@CAP COMMA? CAP ((CW PERIOD)|(CW COMMA? CW))?){-PARTOF(Patient) -> Patient}
BREAK BREAK? Num78 BREAK;
RETAINTYPE;
-W{REGEXP("Mr|Mrs|Ms|Miss")} PERIOD? W{-REGEXP("(?i)take|pt") -> Patient};
\ No newline at end of file
+W{REGEXP("Mr|Mrs|Ms|Miss")} PERIOD? W{-REGEXP("(?i)take|pt"), -PARTOF(Patient) -> Patient};
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta Mon Jun 6 11:56:42 2016
@@ -10,16 +10,16 @@ State CW{->UNMARK(State)};
State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
//getSTATE3
-"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-PARTOF(State) -> State};
// TODO refactor to wordlist?
W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
- W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+ W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{-PARTOF(State) -> State};
//getSTATE4
-CW (CW? COMMA) @StateContext{-> State};
+CW (CW? COMMA) @StateContext{-PARTOF(State) -> State};
//getSTATE6
RETAINTYPE(BREAK);
-BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR"), -PARTOF(State) -> State} NUM{REGEXP(".{5}")};
RETAINTYPE;
Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java?rev=1746984&r1=1746983&r2=1746984&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Mon Jun 6 11:56:42 2016
@@ -32,7 +32,7 @@ import org.apache.uima.ruta.engine.RutaE
public class I2B2Evaluation {
- private static final boolean DEBUG = true;
+ private static final boolean DEBUG = false;
public static void main(String[] args)
throws ResourceInitializationException, UIMAException, IOException {