You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by pk...@apache.org on 2016/06/02 16:36:19 UTC
svn commit: r1746604 - in /ctakes/sandbox/ctakes-clinical-deid/src:
main/java/org/apache/ctakes/deid/
main/resources/org/apache/ctakes/deid/types/ main/resources/wordlists/
main/ruta/org/apache/ctakes/deid/ test/java/org/apache/ctakes/deid/
Author: pkluegl
Date: Thu Jun 2 16:36:19 2016
New Revision: 1746604
URL: http://svn.apache.org/viewvc?rev=1746604&view=rev
Log:
CTAKES-384
- refactoring of scripts
- added more gazetteers
- fixed trie call
- added some rules
Added:
ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt (with props)
ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt (with props)
ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt (with props)
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta
Removed:
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta
Modified:
ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml
ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta
ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java Thu Jun 2 16:36:19 2016
@@ -48,6 +48,8 @@ public class SimpleDeidEntityComparator
public static final String PARAM_CREATE_RUTA_EVAL_ANNOTATIONS = "createRutaEvalAnnotations";
+ private static final boolean PRINT_ANNOTATIONS = true;
+
@ConfigurationParameter(name = PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, mandatory = true, defaultValue = "false")
private Boolean createRutaEvalAnnotations;
@@ -83,7 +85,7 @@ public class SimpleDeidEntityComparator
for (DeidEntity processAnnotation : allProcess) {
if (equals(goldAnnotation, processAnnotation)) {
tp.add(processAnnotation);
- inc(type2tp, processAnnotation);
+ inc(type2tp, processAnnotation, "TP");
found = true;
break;
}
@@ -91,7 +93,7 @@ public class SimpleDeidEntityComparator
if (!found) {
DeidEntity copyFs = (DeidEntity) cc.copyFs(goldAnnotation);
fn.add(copyFs);
- inc(type2fn, copyFs);
+ inc(type2fn, copyFs, "FN");
}
}
@@ -105,7 +107,7 @@ public class SimpleDeidEntityComparator
}
if (!found) {
fp.add(processAnnotation);
- inc(type2fp, processAnnotation);
+ inc(type2fp, processAnnotation, "FP");
}
}
@@ -184,7 +186,7 @@ public class SimpleDeidEntityComparator
System.out.println();
}
- private void inc(Map<String, Integer> map, DeidEntity annotation) {
+ private void inc(Map<String, Integer> map, DeidEntity annotation, String type) {
String name = annotation.getType().getShortName();
Integer count = map.get(name);
if(count == null) {
@@ -192,6 +194,11 @@ public class SimpleDeidEntityComparator
} else {
map.put(name, count + 1);
}
+ if(PRINT_ANNOTATIONS) {
+// if(name.equals("Location")) {
+ System.out.printf(Locale.ENGLISH,"%-5s %-10s %-10s %-50s\n", type, name,annotation.getEntityType(), annotation.getCoveredText());
+// }
+ }
}
}
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml Thu Jun 2 16:36:19 2016
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<name>org.apache.ctakes.deid.types.TypeSystem</name>
- <description>This is a Apache cTAKES Type System for clinical deidentificastion.</description>
+ <description>This is a Apache cTAKES Type System for clinical deidentification.</description>
<version>1.0</version>
<vendor>Apache cTAKES</vendor>
<types>
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt Thu Jun 2 16:36:19 2016
@@ -0,0 +1,191 @@
+Afghanistan
+Africa
+Albania
+Algeria
+Andorra
+Angola
+Arabia
+Argentina
+Armenia
+Australia
+Austria
+Bahama
+Bahrain
+Bangladesh
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bissau
+Bosnia
+Botswana
+Brazil
+Britain
+Brunei
+Bulgaria
+Burma
+Burundi
+Cambodia
+Cameroon
+Canada
+Chad
+Chile
+China
+Colombia
+Columbia
+Congo
+Croatia
+Cuba
+Cyprus
+Czech
+Denmark
+Djibouti
+Dominican
+Ecuador
+Egypt
+Emirates
+Eritrea
+Estonia
+Ethiopia
+Fiji
+Finland
+France
+Gabon
+Gambia
+Germany
+Georgia
+Ghana
+Greece
+Grenada
+Guatemala
+Guiana
+Guinea
+Guyana
+Haiti
+Holland
+Honduras
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Italy
+Jamaica
+Japan
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kosovo
+Kuwait
+Kyrgyzstan
+Lanka
+Laos
+Latvia
+Lebanon
+Leone
+Libya
+Liechtenstein
+Lithuania
+Luxembourg
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Mauritania
+Mauritia
+Mexico
+Micronesia
+Moldovia
+Mongolia
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nepal
+Netherland
+Netherlands
+Nicaragua
+Niger
+Nigeria
+Norway
+Oman
+Pakistan
+Palau
+Palestine
+Panama
+Papua
+Paraguay
+Peru
+Philippines
+Portugal
+Poland
+Qatar
+Rica
+Romania
+Russia
+Rwanda
+Salvador
+Scotland
+Senegal
+Serbia
+Sierra
+Singapore
+Slovakia
+Slovenia
+Somalia
+Spain
+Sudan
+Suriname
+Swaziland
+Sweden
+Swedish
+Switzerland
+Syria
+Taiwan
+Tajikstan
+Tajikistan
+Tanzania
+Thailand
+Tibet
+Timor
+Tonga
+Tunisia
+Turkey
+Turkmenistan
+Tuvalua
+Uganda
+Ukrain
+Ukraine
+Uruguay
+Uzbekistan
+Venezuela
+Verde
+Vietnam
+Wales
+Yemen
+Zaire
+Zambia
+Zealand
+Zimbabwe
+Israel
+Herzegovina
+England
+America
+Puerto Rico
+Sri Lanka
+Costa Rica
+United Kingdom
+UK
+United States
+Ivory Coast
+Saudi Arabia
+South Korea
+North Korea
+Trinidad and Tobago
Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt Thu Jun 2 16:36:19 2016
@@ -0,0 +1,228 @@
+Afghan
+African-American
+Albanian
+Algerian
+Andorran
+Angolan
+Antiguans
+Argentinean
+Argentinian
+Armenian
+Australian
+Austrian
+Azerbaijani
+Bahamian
+Bahraini
+Bangladeshi
+Barbadian
+Barbudans
+Batswana
+Belarusan
+Belarusian
+Belgian
+Belizean
+Beninese
+Bhutanese
+Bolivian
+Bosnian
+Botswanan
+Brazilian
+British
+Bruneian
+Bulgarian
+Burkinabe
+Burkinese
+Burmese
+Burundian
+Cambodian
+Cameroonian
+Canadian
+Chadian
+Chilean
+Chinese
+Colombian
+Columbian
+Comoran
+Congolese
+Croatian
+Cuban
+Cypriot
+Czech
+Danish
+Djibouti
+Djiboutian
+Dominican
+Dutch
+East Timorese
+Ecuadorean
+Ecuadorian
+Egyptian
+Emirati
+Emirian
+Equadorian
+Eritrean
+Estonian
+Ethiopian
+Fijian
+Filipino
+Finnish
+French
+Gabonese
+Gambian
+Georgian
+German
+Ghanaian
+Greek
+Grenadian
+Guatemalan
+Guinea-Bissauan
+Guinean
+Guyanese
+Haitian
+Herzegovinian
+Honduran
+Hungarian
+Icelander
+Icelandic
+I-Kiribati
+Indian
+Indonesian
+Iranian
+Iraqi
+Irish
+Israeli
+Italian
+Ivorian
+Jamaican
+Japanese
+Jordanian
+Kazakh
+Kazakhstani
+Khazakhstani
+Kenyan
+Kittian
+Nevisian
+Kuwaiti
+Kyrgyz
+Laotian
+Latvian
+Lebanese
+Liberian
+Libyan
+Liechtensteiner
+Lithuanian
+Luxembourger
+Macedonian
+Madagascan
+Malagasy
+Malawian
+Malaysian
+Maldivan
+Maldivian
+Malian
+Maltese
+Marshallese
+Mauritanian
+Mauritian
+Mexican
+Micronesian
+Moldovan
+Monacan
+Mongolian
+Montenegrin
+Moroccan
+Mosotho
+Motswana
+Mozambican
+Namibian
+Nauruan
+Nepalese
+New Zealander
+Nicaraguan
+Nigerian
+Nigerien
+Northern Irish
+North Korean
+Norwegian
+Omani
+Pakistani
+Palauan
+Panamanian
+Paraguayan
+Peruvian
+Philippine
+Polish
+Portuguese
+Qatari
+Romanian
+Russian
+Rwandan
+Salvadoran
+Salvadorean
+Samoan
+Scottish
+Senegalese
+Serb
+Serbian
+Seychellois
+Singaporean
+Slovak
+Slovakian
+Slovene
+Slovenian
+Solomon Islander
+Somali
+South African
+South Korean
+Spanish
+Sri Lankan
+Sudanese
+Surinamer
+Surinamese
+Swazi
+Swedish
+Swiss
+Syrian
+Tadjik
+Tadjiki
+Taiwanese
+Tajikistani
+Tajik
+Tajiki
+Tanzanian
+Thai
+Tobagonian
+Togolese
+Tongan
+Trinidadian
+Tunisian
+Turkish
+Turkmen
+Turkoman
+Tuvaluan
+Ugandan
+Ukrainian
+Uruguayan
+Uzbek
+Uzbeki
+Uzbekistani
+Vanuatuan
+Venezuelan
+Vietnamese
+Welsh
+Yemeni
+Yemenite
+Yugoslav
+Zairean
+Zambian
+Zimbabwean
+English
+San Marinese
+Sao Tomean
+Papua New Guinean
+Western Samoan
+Saint Lucian
+Sierra Leonean
+Sierra Leonian
+Equatorial Guinean
+
Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt
------------------------------------------------------------------------------
svn:eol-style = native
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt Thu Jun 2 16:36:19 2016
@@ -2,4 +2,6 @@ tele
telephone
tel
phone
-contact
\ No newline at end of file
+contact
+beeper
+pager
\ No newline at end of file
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt Thu Jun 2 16:36:19 2016
@@ -0,0 +1,174 @@
+Afghan-speaking
+Albanian-speaking
+Algerian-speaking
+Andorran-speaking
+Angolan-speaking
+Argentinian-speaking
+Armenian-speaking
+Australian-speaking
+Austrian-speaking
+Azerbaijani-speaking
+Bahamian-speaking
+Bahraini-speaking
+Bangladeshi-speaking
+Barbadian-speaking
+Belarusian-speaking
+Belarusan-speaking
+Belgian-speaking
+Belizean-speaking
+Beninese-speaking
+Bhutanese-speaking
+Bolivian-speaking
+Bosnian-speaking
+Botswanan-speaking
+Brazilian-speaking
+British-speaking
+Bruneian-speaking
+Bulgarian-speaking
+Burkinese-speaking
+Burmese-speaking
+Burundian-speaking
+Cambodian-speaking
+Cameroonian-speaking
+Canadian-speaking
+Cape Verdean-speaking
+Chadian-speaking
+Chilean-speaking
+Chinese-speaking
+Colombian-speaking
+Congolese-speaking
+Croatian-speaking
+Cuban-speaking
+Cypriot-speaking
+Czech-speaking
+Danish-speaking
+Djiboutian-speaking
+Dominican-speaking
+Dominican-speaking
+Ecuadorean-speaking
+English-speaking
+Egyptian-speaking
+Salvadorean-speaking
+Eritrean-speaking
+Estonian-speaking
+Ethiopian-speaking
+Fijian-speaking
+Finnish-speaking
+Gabonese-speaking
+Gambian-speaking
+Georgian-speaking
+German-speaking
+Ghanaian-speaking
+Greek-speaking
+Grenadian-speaking
+Guatemalan-speaking
+Guinean-speaking
+Guyanese-speaking
+Haitian-speaking
+Dutch-speaking
+Honduran-speaking
+Hungarian-speaking
+Icelandic-speaking
+Indian-speaking
+Indonesian-speaking
+Iranian-speaking
+Iraqi-speaking
+Irish-speaking
+Italian-speaking
+Jamaican-speaking
+Japanese-speaking
+Jordanian-speaking
+Kazakh-speaking
+Kenyan-speaking
+Kuwaiti-speaking
+Laotian-speaking
+Latvian-speaking
+Lebanese-speaking
+Liberian-speaking
+Libyan-speaking
+Lithuanian-speaking
+Macedonian-speaking
+Madagascan-speaking
+Malawian-speaking
+Malaysian-speaking
+Maldivian-speaking
+Malian-speaking
+Maltese-speaking
+Mauritanian-speaking
+Mauritian-speaking
+Mexican-speaking
+Moldovan-speaking
+Monacan-speaking
+Mongolian-speaking
+Montenegrin-speaking
+Moroccan-speaking
+Mozambican-speaking
+Namibian-speaking
+Nepalese-speaking
+Dutch-speaking
+Nicaraguan-speaking
+Nigerien-speaking
+Nigerian-speaking
+Norwegian-speaking
+Omani-speaking
+Pakistani-speaking
+Panamanian-speaking
+Guinean-speaking
+Paraguayan-speaking
+Peruvian-speaking
+Persian-speaking
+Philippine-speaking
+Polish-speaking
+Portuguese-speaking
+Qatari-speaking
+Romanian-speaking
+Russian-speaking
+Rwandan-speaking
+Saudi-speaking
+Scottish-speaking
+Senegalese-speaking
+Serb-speaking
+Serbian-speaking
+Seychellois-speaking
+Sierra Leonian-speaking
+Singaporean-speaking
+Slovak-speaking
+Slovene-speaking
+Slovenian-speaking
+Somali-speaking
+Spanish-speaking
+Sri Lankan-speaking
+Sudanese-speaking
+Surinamese-speaking
+Swazi-speaking
+Swedish-speaking
+Swiss-speaking
+Syrian-speaking
+Taiwanese-speaking
+Tajik-speaking
+Tadjik-speaking
+Tanzanian-speaking
+Thai-speaking
+Togolese-speaking
+Tobagonian-speaking
+Turkish-speaking
+Turkoman-speaking
+Turkmen-speaking
+Tuvaluan-speaking
+Ugandan-speaking
+Ukrainian-speaking
+Emirati-speaking
+British-speaking
+Uruguayan-speaking
+Uzbek-speaking
+Vanuatuan-speaking
+Venezuelan-speaking
+Vietnamese-speaking
+Welsh-speaking
+Western Samoan-speaking
+Yemeni-speaking
+Yugoslav-speaking
+Zairean-speaking
+Zambian-speaking
+Zimbabwean-speaking
+Equadorian-speaking
Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,21 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+
+DECLARE Country;
+
+ANY{-PARTOF(Split)} @CountryInd{-> Country};
+
+
+ANY{-REGEXP("(?i)speaks?|some")}
+ @NationalityInd{-> Country}
+ ANY{-REGEXP("(?i)general|hospital|clinic|city|restaurant|area|street|road")} ;
+
+// TODO
+//Rule: getSpokenLanguage
+//(
+// {Lookup.minorType=="nationality"}
+// ):label
+//-->
+//:label
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta Thu Jun 2 16:36:19 2016
@@ -1,11 +1,13 @@
PACKAGE org.apache.ctakes.deid;
TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+//TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+IMPORT PACKAGE * FROM org.apache.ctakes.deid.types.TypeSystem AS deid;
RETAINTYPE(WS);
-(Num4{-PARTOF(Date),REGEXP("19..|20..")} Dash Num2 Dash Num2){-> Date};
-(Num12{-PARTOF(Date)} Slash (Num12 Slash)? Num2{REGEXP("[123456789].")}){-> Date};
-Num4{-PARTOF(Date),REGEXP("19..|20..")-> Date};
-MonthInd{-PARTOF(Date)-> Date};
+(Num4{-PARTOF(deid.Date),REGEXP("19..|20..")} Dash Num2 Dash Num2){-> deid.Date};
+(Num12{-PARTOF(deid.Date)} Slash Num12 Slash Num4{REGEXP("19..|20..")}){-> deid.Date};
+(Num12{-PARTOF(deid.Date)} Slash (Num12 Slash)? Num2{REGEXP("[123456789].")}){-> deid.Date};
+Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date};
+MonthInd{-PARTOF(deid.Date)-> deid.Date};
RETAINTYPE;
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Thu Jun 2 16:36:19 2016
@@ -1,46 +1,70 @@
PACKAGE org.apache.ctakes.deid;
//TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
-TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+//TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+IMPORT PACKAGE * FROM org.apache.ctakes.deid.types.TypeSystem AS deid;
// UIMA-4833
-TYPESYSTEM org.apache.ctakes.deid.ZipStateRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.StreetRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.AgeRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.DoctorRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.UserNameRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.PhoneRutaTypeSystem;
TYPESYSTEM org.apache.ctakes.deid.DateRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.ZipRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.StateRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.CountryRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.FaxRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.PatientRutaTypeSystem;
SCRIPT org.apache.ctakes.deid.Dictionaries;
SCRIPT org.apache.ctakes.deid.Age;
SCRIPT org.apache.ctakes.deid.Doctor;
-SCRIPT org.apache.ctakes.deid.ZipState;
+SCRIPT org.apache.ctakes.deid.State;
+SCRIPT org.apache.ctakes.deid.Zip;
SCRIPT org.apache.ctakes.deid.Street;
SCRIPT org.apache.ctakes.deid.UserName;
SCRIPT org.apache.ctakes.deid.Phone;
SCRIPT org.apache.ctakes.deid.Date;
+SCRIPT org.apache.ctakes.deid.Country;
+SCRIPT org.apache.ctakes.deid.Fax;
+SCRIPT org.apache.ctakes.deid.IDNum;
+SCRIPT org.apache.ctakes.deid.MedicalRecNum;
+SCRIPT org.apache.ctakes.deid.Patient;
CALL(Dictionaries);
-CALL(ZipState);
-CALL(Street);
-CALL(UserName);
-CALL(Date);
+
CALL(Age);
+CALL(Country);
CALL(Doctor);
+CALL(Fax);
+CALL(IDNum);
+CALL(MedicalRecNum);
+CALL(Patient);
CALL(Phone);
+CALL(State);
+CALL(Street);
+CALL(UserName);
+CALL(Zip);
+CALL(Date);
-Zip{-> Location, Location.entityType = "ZIP"};
-State{-> Location, Location.entityType= "STATE"};
Email{-> Contact, Contact.entityType = "EMAIL"};
-ProfessionInd{-> Profession, Profession.entityType = "PROFESSION"};
Url{-> Contact, Contact.entityType = "URL"};
-Street{-> Location, Location.entityType= "STREET"};
-UserName{-> Name, Name.entityType = "USERNAME"};
+
Age{-> Age.entityType = "AGE"};
+Country{-> Location, Location.entityType = "COUNTRY"};
Doctor{-> Name, Name.entityType = "DOCTOR"};
+Fax{-> Contact, Contact.entityType = "FAX"};
+//IDNum{-> ID, ID.entityType = "IDNUM"};
+//MedicalRecNum{-> ID, ID.entityType = "MEDICALRECNUM"};
+Patient{-> Name, Name.entityType = "PATIENT"};
Phone{-> Contact, Contact.entityType = "PHONE"};
-Date{-> Date.entityType = "DATE"};
+State{-> Location, Location.entityType= "STATE"};
+Street{-> Location, Location.entityType= "STREET"};
+UserName{-> Name, Name.entityType = "USERNAME"};
+ProfessionInd{-> Profession, Profession.entityType = "PROFESSION"};
+Zip{-> Location, Location.entityType = "ZIP"};
+deid.Date{-> deid.Date.entityType = "DATE"};
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Thu Jun 2 16:36:19 2016
@@ -1,9 +1,12 @@
PACKAGE org.apache.ctakes.deid;
+TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
+
WORDLIST trie = 'generated.mtwl';
DECLARE KeywordInd;
DECLARE KeywordInd ProfessionInd, StateContext, DeceasedInd, FamilyInd, MonthInd;
DECLARE KeywordInd StreetInd, StreetFullInd, AgePostInd, AgePreInd, PhonePreInd;
+DECLARE KeywordInd NationalityInd, SpokenLanguageInd, CountryInd;
TRIE(
"profession.txt" = ProfessionInd,
@@ -14,9 +17,12 @@ TRIE(
"age_post_ind.txt" = AgePostInd,
"age_pre_ind.txt" = AgePreInd,
"deceased_ind.txt" = DeceasedInd,
- "family_ind" = FamilyInd,
- "phone_pre_ind" = PhonePreInd,
- "month_ind" = MonthInd,
+ "family_ind.txt" = FamilyInd,
+ "phone_pre_ind.txt" = PhonePreInd,
+ "month_ind.txt" = MonthInd,
+ "nationality.txt" = NationalityInd,
+ "spoken-language.txt" = SpokenLanguageInd,
+ "country.txt" = CountryInd,
trie, true, 4, false, 0, "-");
DECLARE Url, Email;
@@ -26,7 +32,7 @@ DECLARE Url, Email;
DECLARE MDInd;
"M\\.D\\."-> MDInd;
-DECLARE Num1, Num12, Num2, Num3, Num34, Num4, Num5;
+DECLARE Num1, Num12, Num2, Num3, Num34, Num4, Num5, Num6, Num7, Num8;
NUM->{
Document{REGEXP(".")-> Num1};
@@ -35,16 +41,28 @@ NUM->{
Document{REGEXP("...")-> Num3};
Document{REGEXP("....?")-> Num34};
Document{REGEXP("....")-> Num4};
- Document{REGEXP(".....")-> Num5};
+ Document{REGEXP("......")-> Num5};
+ Document{REGEXP(".......")-> Num6};
+ Document{REGEXP(".......")-> Num7};
+ Document{REGEXP("........")-> Num8};
};
-DECLARE LParen, RParen, Dash, Slash;
+DECLARE LParen, RParen, Dash, Slash, Hash;
SPECIAL-> {
Document.ct=="("{-> LParen};
Document.ct==")"{-> RParen};
Document.ct=="-"{-> Dash};
Document.ct=="/"{-> Slash};
+ Document.ct=="#"{-> Hash};
};
DECLARE ApoInd;
-(SPECIAL.ct=="'" SW.ct=="s"){-> ApoInd};
\ No newline at end of file
+(SPECIAL.ct=="'" SW.ct=="s"){-> ApoInd};
+
+DECLARE Split;
+DECLARE Split SplitInternal, SplitExternal;
+
+Sentence{-> MARKLAST(SplitInternal)};
+RETAINTYPE(BREAK);
+ANY{-PARTOF(WS)-> SplitExternal} BREAK;
+RETAINTYPE;
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta Thu Jun 2 16:36:19 2016
@@ -6,4 +6,13 @@ TYPESYSTEM org.apache.ctakes.deid.Dictio
DECLARE Doctor;
-MDInd W{-PARTOF(UserName)-> Doctor} (SPECIAL.ct=="/" W{-> Doctor})*;
\ No newline at end of file
+RETAINTYPE(BREAK);
+MDInd BREAK[2,4] W{-PARTOF(UserName)-> Doctor} (SPECIAL.ct=="/" W{-> Doctor})* BREAK BREAK;
+RETAINTYPE;
+
+SPECIAL.ct=="^" (CAP COMMA? CAP{ENDSWITH(Split)}){-> Doctor};
+
+//W{REGEXP("Drs?", true)} PERIOD?
+// @CW{-REGEXP("Done|Take|PO", true)}
+// CW CW?
+// ;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE Fax;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE IDNum;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE MedicalRecNum;
\ No newline at end of file
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE Patient;
Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta Thu Jun 2 16:36:19 2016
@@ -8,8 +8,29 @@ DECLARE DashOrPeriod;
PERIOD{-> DashOrPeriod};
Dash{-> DashOrPeriod};
-PhonePreInd W?{OR(PARTOF({PERIOD, COLON}), W.ct=="#")}
+PhonePreInd ANY?{OR(PARTOF({PERIOD, COLON}), W.ct=="#")}
(
(LParen Num3 RParen)? Dash?
Num34 (DashOrPeriod Num34)+
- ){-> Phone};
\ No newline at end of file
+ ){-> Phone};
+
+PhonePreInd W[0,3] (ANY{PARTOF({COLON, Hash})} COLON?)?
+ (Num5 | (Num1 Dash Num4)){-> Phone};
+
+(LParen? @Num3 RParen? Num3 Dash? Num34 Dash Num4 (Dash Num3)?){-> Phone} ANY{-REGEXP("cc|Units?", true)};
+
+LParen CW? @Num8{->Phone} RParen;
+
+ANY{-REGEXP("Accession|[0-9]{2,5}", true)} ANY{REGEXP("#|B|b|X|x|P|pgr?")} PERIOD? @Num5{-> Phone};
+ANY{-REGEXP("Accession|[0-9]{2,5}", true)} ANY{REGEXP("#|B|b|X|x|P|pgr?")} PERIOD? (@Num1 Dash Num4){-> Phone};
+
+// TODO getPHONE7
+//@Num5{-> Phone}; //TODO ENDSWITH(Sentence)
+//(@Num1 Dash Num4){-> Phone};//TODO ENDSWITH(Sentence)
+
+
+// TODO Split is a LineBreak or a Sentence?
+ANY{ENDSWITH(Split), -IS(CAP)} @Num5{ENDSWITH(Split)-> Phone};
+ANY{ENDSWITH(Split), -IS(CAP)} (@Num1 Dash Num4){ENDSWITH(Split)-> Phone};
+
+
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,25 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+
+DECLARE State;
+//getSTATE2/getSTATE5: remove if has given context
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
+State CW{->UNMARK(State)};
+// TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+//getSTATE3
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+ W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+//getSTATE4
+CW (CW? COMMA) @StateContext{-> State};
+
+//getSTATE6
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE;
+
Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta Thu Jun 2 16:36:19 2016
@@ -0,0 +1,9 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.StateRutaTypeSystem;
+DECLARE Zip;
+
+//getZIP + getSTATE1
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")};
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Thu Jun 2 16:36:19 2016
@@ -33,11 +33,15 @@ public class I2B2Evaluation {
public static void main(String[] args)
throws ResourceInitializationException, UIMAException, IOException {
+ String testData = "C:/data/i2b2/2014/training-PHI-Gold-Set1/";
+ if(args.length >=1) {
+ testData = args[0];
+ }
+
SimplePipeline.runPipeline(
CollectionReaderFactory.createReader(I2B2DeidCollectionReader.class,
I2B2DeidCollectionReader.PARAM_INPUT_DIRECTORY,
-// "C:/data/i2b2/2014/training-PHI-Gold-Set1/",
- "C:/data/i2b2/2014/PHI-test/",
+ testData,
I2B2DeidCollectionReader.PARAM_GOLD_VIEW, "gold"),
AnalysisEngineFactory.createEngine("org.apache.ctakes.deid.DeidRutaAnnotator"),
AnalysisEngineFactory.createEngine(SimpleDeidEntityComparator.class,