You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ch...@apache.org on 2016/02/02 15:56:37 UTC

svn commit: r1728143 [3/3] - in /ctakes/sandbox/ctakes-clinical-deid/src/main: resources/wordlists/ ruta/org/apache/ctakes/deid/

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt Tue Feb  2 14:56:36 2016
@@ -1,54 +1,3 @@
 street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
-way
-street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
-way
-street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
-way
-street
 road
 avenue
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt Tue Feb  2 14:56:36 2016
@@ -14,38 +14,4 @@ boulevard
 terrace
 circle
 place
-way
-street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
-way
-street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
 way
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt Tue Feb  2 14:56:36 2016
@@ -56,181 +56,4 @@ VIRGIN ISLANDS
 WASHINGTON
 WEST VIRGINIA
 WISCONSIN
-WYOMING
-ALABAMA
-ALASKA
-AMERICAN SAMOA
-ARIZONA
-ARKANSAS
-CALIFORNIA
-COLORADO
-CONNECTICUT
-DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
-FLORIDA
-GEORGIA
-GUAM
-HAWAII
-IDAHO
-ILLINOIS
-INDIANA
-IOWA
-KANSAS
-KENTUCKY
-LOUISIANA
-MAINE
-MARSHALL ISLANDS
-MARYLAND
-MASSACHUSETTS
-MICHIGAN
-MINNESOTA
-MISSISSIPPI
-MISSOURI
-MONTANA
-NEBRASKA
-NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
-OHIO
-OKLAHOMA
-OREGON
-PALAU
-PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
-TENNESSEE
-TEXAS
-UTAH
-VERMONT
-VIRGINIA
-VIRGIN ISLANDS
-WASHINGTON
-WEST VIRGINIA
-WISCONSIN
-WYOMING
-ALABAMA
-ALASKA
-AMERICAN SAMOA
-ARIZONA
-ARKANSAS
-CALIFORNIA
-COLORADO
-CONNECTICUT
-DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
-FLORIDA
-GEORGIA
-GUAM
-HAWAII
-IDAHO
-ILLINOIS
-INDIANA
-IOWA
-KANSAS
-KENTUCKY
-LOUISIANA
-MAINE
-MARSHALL ISLANDS
-MARYLAND
-MASSACHUSETTS
-MICHIGAN
-MINNESOTA
-MISSISSIPPI
-MISSOURI
-MONTANA
-NEBRASKA
-NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
-OHIO
-OKLAHOMA
-OREGON
-PALAU
-PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
-TENNESSEE
-TEXAS
-UTAH
-VERMONT
-VIRGINIA
-VIRGIN ISLANDS
-WASHINGTON
-WEST VIRGINIA
-WISCONSIN
-WYOMING
-ALABAMA
-ALASKA
-AMERICAN SAMOA
-ARIZONA
-ARKANSAS
-CALIFORNIA
-COLORADO
-CONNECTICUT
-DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
-FLORIDA
-GEORGIA
-GUAM
-HAWAII
-IDAHO
-ILLINOIS
-INDIANA
-IOWA
-KANSAS
-KENTUCKY
-LOUISIANA
-MAINE
-MARSHALL ISLANDS
-MARYLAND
-MASSACHUSETTS
-MICHIGAN
-MINNESOTA
-MISSISSIPPI
-MISSOURI
-MONTANA
-NEBRASKA
-NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
-OHIO
-OKLAHOMA
-OREGON
-PALAU
-PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
-TENNESSEE
-TEXAS
-UTAH
-VERMONT
-VIRGINIA
-VIRGIN ISLANDS
-WASHINGTON
-WEST VIRGINIA
-WISCONSIN
 WYOMING
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt Tue Feb  2 14:56:36 2016
@@ -101,212 +101,4 @@ Wis
 Wisc
 W.Va
 W. Va
-Wyo
-AK
-AL
-AR
-AZ
-CA
-CO
-CT
-DC
-FL
-FM
-GA
-GU
-IA
-ID
-IL
-KS
-KY
-LA
-MA
-ME
-MH
-MI
-MN
-MO
-MP
-MS
-MT
-NC
-ND
-NE
-NH
-NJ
-NM
-NV
-NY
-OH
-OK
-PA
-PR
-PW
-RI
-SC
-SD
-TN
-TX
-UT
-VA
-VI
-VT
-WA
-WI
-WV
-WY
-Ala
-Amer. Samoa
-Ariz
-Ark
-Calif
-Colo
-Conn
-C.Z.
-D.C.
-Del
-Fla
-Ill
-Ind
-Kans
-Mass
-Mich
-Minn
-Mo
-Mont
-N.C.
-N.Dak
-N. Dak
-Nebr
-Nev
-N.H.
-N.J.
-N.Mex
-N. Mex
-N.Y.
-Ohio
-Okla
-Ore
-Oreg
-Pa
-P.R.
-R.I.
-S.C.
-S.Dak
-S. Dak
-Tenn
-Tex
-Utah
-Va.
-V.I.
-Vt
-Wash
-Wis
-Wisc
-W.Va
-W. Va
-Wyo
-AK
-AL
-AR
-AZ
-CA
-CO
-CT
-DC
-FL
-FM
-GA
-GU
-IA
-ID
-IL
-KS
-KY
-LA
-MA
-ME
-MH
-MI
-MN
-MO
-MP
-MS
-MT
-NC
-ND
-NE
-NH
-NJ
-NM
-NV
-NY
-OH
-OK
-PA
-PR
-PW
-RI
-SC
-SD
-TN
-TX
-UT
-VA
-VI
-VT
-WA
-WI
-WV
-WY
-Ala
-Amer. Samoa
-Ariz
-Ark
-Calif
-Colo
-Conn
-C.Z.
-D.C.
-Del
-Fla
-Ill
-Ind
-Kans
-Mass
-Mich
-Minn
-Mo
-Mont
-N.C.
-N.Dak
-N. Dak
-Nebr
-Nev
-N.H.
-N.J.
-N.Mex
-N. Mex
-N.Y.
-Ohio
-Okla
-Ore
-Oreg
-Pa
-P.R.
-R.I.
-S.C.
-S.Dak
-S. Dak
-Tenn
-Tex
-Utah
-Va.
-V.I.
-Vt
-Wash
-Wis
-Wisc
-W.Va
-W. Va
 Wyo
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Tue Feb  2 14:56:36 2016
@@ -18,23 +18,3 @@ DECLARE Url, Email;
 "[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
 "(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
 
-PACKAGE org.apache.ctakes.deid;
-
-WORDLIST trie = 'generated.mtwl';
-DECLARE KeywordInd; 
-DECLARE KeywordInd Profession, StateContext;
-DECLARE KeywordInd StreetInd, StreetFullInd;
-
-TRIE(
-    "profession.txt" = Profession,
-    "us_state.txt" = StateContext,
-    "us_state_acronym_abbreviation.txt" = StateContext,
-    "street_ind.txt" = StreetInd,
-    "street_full_ind.txt" = StreetFullInd,
-    trie, true, 4, false, 0, "-");
-    
-
-DECLARE Url, Email;
-"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
-"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
-

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta Tue Feb  2 14:56:36 2016
@@ -14,38 +14,4 @@ Street PERIOD{-> SHIFT(Street,1,2)};
 
 //getSTREET5
 //(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
-((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-
-DECLARE Street;
-
-//getSTREET1-4
-(NUM{REGEXP(".{1,4}")} (CW|CAP) @StreetInd){-PARTOF(Street)-> Street};
-(NUM{REGEXP(".{1,4}")} CW CW @StreetInd){-PARTOF(Street)-> Street};
-Street PERIOD{-> SHIFT(Street,1,2)};
-
-//remove STREET annotation if the annotation is followed by CW on the same line/sentence.
-//STREET CW{->UNMARK(STREET,1)};
-
-//getSTREET5
-//(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
-((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-
-DECLARE Street;
-
-//getSTREET1-4
-(NUM{REGEXP(".{1,4}")} (CW|CAP) @StreetInd){-PARTOF(Street)-> Street};
-(NUM{REGEXP(".{1,4}")} CW CW @StreetInd){-PARTOF(Street)-> Street};
-Street PERIOD{-> SHIFT(Street,1,2)};
-
-//remove STREET annotation if the annotation is followed by CW on the same line/sentence.
-//STREET CW{->UNMARK(STREET,1)};
-
-//getSTREET5
-//(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
 ((CW|CAP) @StreetFullInd){-PARTOF(Street) -> Street};

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta Tue Feb  2 14:56:36 2016
@@ -14,35 +14,3 @@ DECLARE MDInd;
 MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
 MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)};
 W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)};
-PACKAGE org.apache.ctakes.deid;
-
-DECLARE UserName;
-//getUSERNAME 1
-RETAINTYPE(WS);
-SPECIAL.ct=="[" 
-	(W{REGEXP(".{2,3}")} @NUM{REGEXP(".{1,3}")}){-> UserName} 
-	SPECIAL.ct=="]" ;
-RETAINTYPE;
-//getUSERNAME2
-
-DECLARE MDInd;
-"M\\.D\\."-> MDInd;
-MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
-MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)};
-W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)};
-PACKAGE org.apache.ctakes.deid;
-
-DECLARE UserName;
-//getUSERNAME 1
-RETAINTYPE(WS);
-SPECIAL.ct=="[" 
-	(W{REGEXP(".{2,3}")} @NUM{REGEXP(".{1,3}")}){-> UserName} 
-	SPECIAL.ct=="]" ;
-RETAINTYPE;
-//getUSERNAME2
-
-DECLARE MDInd;
-"M\\.D\\."-> MDInd;
-MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
-MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)};
-W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)};

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta?rev=1728143&r1=1728142&r2=1728143&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta Tue Feb  2 14:56:36 2016
@@ -27,61 +27,3 @@ CW (CW? COMMA) @StateContext{-> State};
 RETAINTYPE(BREAK);
 BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
 RETAINTYPE;
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-
-DECLARE State, Zip;
-
-//getZIP + getSTATE1
-StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
-Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")};
-
-//getSTATE2/getSTATE5: remove if has given context
-NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
-State CW{->UNMARK(State)};
-// TODO refactor to wordlist?
-State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
-
-//getSTATE3 
-"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
-// TODO refactor to wordlist?
-W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
-    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
-
-//getSTATE4
-CW (CW? COMMA) @StateContext{-> State};
-
-//getSTATE6
-RETAINTYPE(BREAK);
-BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
-RETAINTYPE;
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-
-DECLARE State, Zip;
-
-//getZIP + getSTATE1
-StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
-Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")};
-
-//getSTATE2/getSTATE5: remove if has given context
-NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
-State CW{->UNMARK(State)};
-// TODO refactor to wordlist?
-State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
-
-//getSTATE3 
-"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
-// TODO refactor to wordlist?
-W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
-    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
-
-//getSTATE4
-CW (CW? COMMA) @StateContext{-> State};
-
-//getSTATE6
-RETAINTYPE(BREAK);
-BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
-RETAINTYPE;