You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ch...@apache.org on 2016/01/30 15:08:08 UTC

svn commit: r1727698 [3/3] - in /ctakes/sandbox/ctakes-clinical-deid: ./ src/main/resources/META-INF/org.apache.uima.fit/ src/main/resources/wordlists/ src/main/ruta/org/apache/ctakes/deid/ src/test/java/org/apache/ctakes/deid/

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_full_ind.txt Sat Jan 30 14:08:08 2016
@@ -50,19 +50,5 @@ circle
 place
 way
 street
-st
-drive
-dr
 road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
-way
\ No newline at end of file
+avenue
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/street_ind.txt Sat Jan 30 14:08:08 2016
@@ -48,21 +48,4 @@ boulevard
 terrace
 circle
 place
-way
-street
-st
-drive
-dr
-road
-rd
-lane
-ln
-avenue
-ave
-court
-ct
-boulevard
-terrace
-circle
-place
 way
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state.txt Sat Jan 30 14:08:08 2016
@@ -233,122 +233,4 @@ VIRGIN ISLANDS
 WASHINGTON
 WEST VIRGINIA
 WISCONSIN
-WYOMING
-ALABAMA
-ALASKA
-AMERICAN SAMOA
-ARIZONA
-ARKANSAS
-CALIFORNIA
-COLORADO
-CONNECTICUT
-DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
-FLORIDA
-GEORGIA
-GUAM
-HAWAII
-IDAHO
-ILLINOIS
-INDIANA
-IOWA
-KANSAS
-KENTUCKY
-LOUISIANA
-MAINE
-MARSHALL ISLANDS
-MARYLAND
-MASSACHUSETTS
-MICHIGAN
-MINNESOTA
-MISSISSIPPI
-MISSOURI
-MONTANA
-NEBRASKA
-NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
-OHIO
-OKLAHOMA
-OREGON
-PALAU
-PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
-TENNESSEE
-TEXAS
-UTAH
-VERMONT
-VIRGINIA
-VIRGIN ISLANDS
-WASHINGTON
-WEST VIRGINIA
-WISCONSIN
-WYOMING
-ALABAMA
-ALASKA
-AMERICAN SAMOA
-ARIZONA
-ARKANSAS
-CALIFORNIA
-COLORADO
-CONNECTICUT
-DELAWARE
-DISTRICT OF COLUMBIA
-FEDERATED STATES OF MICRONESIA
-FLORIDA
-GEORGIA
-GUAM
-HAWAII
-IDAHO
-ILLINOIS
-INDIANA
-IOWA
-KANSAS
-KENTUCKY
-LOUISIANA
-MAINE
-MARSHALL ISLANDS
-MARYLAND
-MASSACHUSETTS
-MICHIGAN
-MINNESOTA
-MISSISSIPPI
-MISSOURI
-MONTANA
-NEBRASKA
-NEVADA
-NEW HAMPSHIRE
-NEW JERSEY
-NEW MEXICO
-NEW YORK
-NORTH CAROLINA
-NORTH DAKOTA
-NORTHERN MARIANA ISLANDS
-OHIO
-OKLAHOMA
-OREGON
-PALAU
-PENNSYLVANIA
-PUERTO RICO
-RHODE ISLAND
-SOUTH CAROLINA
-SOUTH DAKOTA
-TENNESSEE
-TEXAS
-UTAH
-VERMONT
-VIRGINIA
-VIRGIN ISLANDS
-WASHINGTON
-WEST VIRGINIA
-WISCONSIN
-WYOMING
+WYOMING
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/us_state_acronym_abbreviation.txt Sat Jan 30 14:08:08 2016
@@ -309,316 +309,4 @@ Wis
 Wisc
 W.Va
 W. Va
-Wyo
-AK
-AL
-AR
-AZ
-CA
-CO
-CT
-DC
-FL
-FM
-GA
-GU
-IA
-ID
-IL
-KS
-KY
-LA
-MA
-ME
-MH
-MI
-MN
-MO
-MP
-MS
-MT
-NC
-ND
-NE
-NH
-NJ
-NM
-NV
-NY
-OH
-OK
-PA
-PR
-PW
-RI
-SC
-SD
-TN
-TX
-UT
-VA
-VI
-VT
-WA
-WI
-WV
-WY
-Ala
-Amer. Samoa
-Ariz
-Ark
-Calif
-Colo
-Conn
-C.Z.
-D.C.
-Del
-Fla
-Ill
-Ind
-Kans
-Mass
-Mich
-Minn
-Mo
-Mont
-N.C.
-N.Dak
-N. Dak
-Nebr
-Nev
-N.H.
-N.J.
-N.Mex
-N. Mex
-N.Y.
-Ohio
-Okla
-Ore
-Oreg
-Pa
-P.R.
-R.I.
-S.C.
-S.Dak
-S. Dak
-Tenn
-Tex
-Utah
-Va.
-V.I.
-Vt
-Wash
-Wis
-Wisc
-W.Va
-W. Va
-Wyo
-AK
-AL
-AR
-AZ
-CA
-CO
-CT
-DC
-FL
-FM
-GA
-GU
-IA
-ID
-IL
-KS
-KY
-LA
-MA
-ME
-MH
-MI
-MN
-MO
-MP
-MS
-MT
-NC
-ND
-NE
-NH
-NJ
-NM
-NV
-NY
-OH
-OK
-PA
-PR
-PW
-RI
-SC
-SD
-TN
-TX
-UT
-VA
-VI
-VT
-WA
-WI
-WV
-WY
-Ala
-Amer. Samoa
-Ariz
-Ark
-Calif
-Colo
-Conn
-C.Z.
-D.C.
-Del
-Fla
-Ill
-Ind
-Kans
-Mass
-Mich
-Minn
-Mo
-Mont
-N.C.
-N.Dak
-N. Dak
-Nebr
-Nev
-N.H.
-N.J.
-N.Mex
-N. Mex
-N.Y.
-Ohio
-Okla
-Ore
-Oreg
-Pa
-P.R.
-R.I.
-S.C.
-S.Dak
-S. Dak
-Tenn
-Tex
-Utah
-Va.
-V.I.
-Vt
-Wash
-Wis
-Wisc
-W.Va
-W. Va
-Wyo
-AK
-AL
-AR
-AZ
-CA
-CO
-CT
-DC
-FL
-FM
-GA
-GU
-IA
-ID
-IL
-KS
-KY
-LA
-MA
-ME
-MH
-MI
-MN
-MO
-MP
-MS
-MT
-NC
-ND
-NE
-NH
-NJ
-NM
-NV
-NY
-OH
-OK
-PA
-PR
-PW
-RI
-SC
-SD
-TN
-TX
-UT
-VA
-VI
-VT
-WA
-WI
-WV
-WY
-Ala
-Amer. Samoa
-Ariz
-Ark
-Calif
-Colo
-Conn
-C.Z.
-D.C.
-Del
-Fla
-Ill
-Ind
-Kans
-Mass
-Mich
-Minn
-Mo
-Mont
-N.C.
-N.Dak
-N. Dak
-Nebr
-Nev
-N.H.
-N.J.
-N.Mex
-N. Mex
-N.Y.
-Ohio
-Okla
-Ore
-Oreg
-Pa
-P.R.
-R.I.
-S.C.
-S.Dak
-S. Dak
-Tenn
-Tex
-Utah
-Va.
-V.I.
-Vt
-Wash
-Wis
-Wisc
-W.Va
-W. Va
-Wyo
+Wyo
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Sat Jan 30 14:08:08 2016
@@ -1,10 +1,22 @@
 PACKAGE org.apache.ctakes.deid;
 
 TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
+SCRIPT org.apache.ctakes.deid.Dictionaries;
+SCRIPT org.apache.ctakes.deid.ZipState;
+SCRIPT org.apache.ctakes.deid.Street;
+SCRIPT org.apache.ctakes.deid.UserName;
 
-CW{-> IdentifiedAnnotation};
-PACKAGE org.apache.ctakes.deid;
+CALL(Dictionaries);
+CALL(ZipState);
+CALL(Street);
+CALL(UserName);
 
-TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
+// map types of ruta scripts to cTAKES types
+// TODO select the correct types and fill the features
+Zip{-> IdentifiedAnnotation};
+State{-> IdentifiedAnnotation};
+Email{-> IdentifiedAnnotation};
+Url{-> IdentifiedAnnotation};
+Street{-> IdentifiedAnnotation};
+UserName{-> IdentifiedAnnotation};
 
-CW{-> IdentifiedAnnotation};
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig Sat Jan 30 14:08:08 2016
@@ -1,22 +0,0 @@
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
-SCRIPT org.apache.ctakes.deid.Dictionaries;
-SCRIPT org.apache.ctakes.deid.ZipState;
-SCRIPT org.apache.ctakes.deid.Street;
-SCRIPT org.apache.ctakes.deid.UserName;
-
-CALL(Dictionaries);
-CALL(ZipState);
-CALL(Street);
-CALL(UserName);
-
-// map types of ruta scripts to cTAKES types
-// TODO select the correct types and fill the features
-Zip{-> IdentifiedAnnotation};
-State{-> IdentifiedAnnotation};
-Email{-> IdentifiedAnnotation};
-Url{-> IdentifiedAnnotation};
-Street{-> IdentifiedAnnotation};
-UserName{-> IdentifiedAnnotation};
-

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Sat Jan 30 14:08:08 2016
@@ -38,39 +38,3 @@ DECLARE Url, Email;
 "[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
 "(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
 
-PACKAGE org.apache.ctakes.deid;
-
-WORDLIST trie = 'generated.mtwl';
-DECLARE KeywordInd; 
-DECLARE KeywordInd Profession, StateContext;
-
-TRIE(
-    "profession.txt" = Profession,
-    "us_state.txt" = StateContext,
-    "us_state_acronym_abbreviation.txt" = StateContext,
-    trie, true, 4, false, 0, "-");
-    
-
-DECLARE Url, Email;
-"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
-"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
-PACKAGE org.apache.ctakes.deid;
-
-WORDLIST trie = 'generated.mtwl';
-DECLARE KeywordInd; 
-DECLARE KeywordInd Profession, StateContext;
-DECLARE KeywordInd StreetInd, StreetFullInd;
-
-TRIE(
-    "profession.txt" = Profession,
-    "us_state.txt" = StateContext,
-    "us_state_acronym_abbreviation.txt" = StateContext,
-    "street_ind.txt" = StreetInd,
-    "street_full_ind.txt" = StreetFullInd,
-    trie, true, 4, false, 0, "-");
-    
-
-DECLARE Url, Email;
-"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
-"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
-

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta Sat Jan 30 14:08:08 2016
@@ -48,4 +48,4 @@ Street PERIOD{-> SHIFT(Street,1,2)};
 
 //getSTREET5
 //(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
-((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};
+((CW|CAP) @StreetFullInd){-PARTOF(Street) -> Street};

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta Sat Jan 30 14:08:08 2016
@@ -69,35 +69,6 @@ Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".
 //getSTATE2/getSTATE5: remove if has given context
 NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
 State CW{->UNMARK(State)};
-// TODO refactor to worlist?
-State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
-
-//getSTATE3 
-"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
-// TODO refactor to worlist?
-W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
-    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
-
-//getSTATE4
-CW (CW? COMMA) @StateContext{-> State};
-
-//getSTATE6
-RETAINTYPE(BREAK);
-BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
-RETAINTYPE;
-PACKAGE org.apache.ctakes.deid;
-
-TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-
-DECLARE State, Zip;
-
-//getZIP + getSTATE1
-StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
-Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")};
-
-//getSTATE2/getSTATE5: remove if has given context
-NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
-State CW{->UNMARK(State)};
 // TODO refactor to wordlist?
 State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
 

Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/DeidPipelineTest.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/DeidPipelineTest.java?rev=1727698&r1=1727697&r2=1727698&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/DeidPipelineTest.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/DeidPipelineTest.java Sat Jan 30 14:08:08 2016
@@ -18,6 +18,9 @@
  */
 package org.apache.ctakes.deid;
 
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URL;
 import java.util.Collection;
 
 import junit.framework.Assert;