You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by pk...@apache.org on 2016/06/02 16:36:19 UTC

svn commit: r1746604 - in /ctakes/sandbox/ctakes-clinical-deid/src: main/java/org/apache/ctakes/deid/ main/resources/org/apache/ctakes/deid/types/ main/resources/wordlists/ main/ruta/org/apache/ctakes/deid/ test/java/org/apache/ctakes/deid/

Author: pkluegl
Date: Thu Jun  2 16:36:19 2016
New Revision: 1746604

URL: http://svn.apache.org/viewvc?rev=1746604&view=rev
Log:
CTAKES-384
- refactoring of scripts
- added more gazetteers
- fixed trie call
- added some rules

Added:
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt   (with props)
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt   (with props)
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt   (with props)
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta
Removed:
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta
Modified:
    ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java Thu Jun  2 16:36:19 2016
@@ -48,6 +48,8 @@ public class SimpleDeidEntityComparator
 
   public static final String PARAM_CREATE_RUTA_EVAL_ANNOTATIONS = "createRutaEvalAnnotations";
 
+  private static final boolean PRINT_ANNOTATIONS = true;
+
   @ConfigurationParameter(name = PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, mandatory = true, defaultValue = "false")
   private Boolean createRutaEvalAnnotations;
 
@@ -83,7 +85,7 @@ public class SimpleDeidEntityComparator
       for (DeidEntity processAnnotation : allProcess) {
         if (equals(goldAnnotation, processAnnotation)) {
           tp.add(processAnnotation);
-          inc(type2tp, processAnnotation);
+          inc(type2tp, processAnnotation, "TP");
           found = true;
           break;
         }
@@ -91,7 +93,7 @@ public class SimpleDeidEntityComparator
       if (!found) {
         DeidEntity copyFs = (DeidEntity) cc.copyFs(goldAnnotation);
         fn.add(copyFs);
-        inc(type2fn, copyFs);
+        inc(type2fn, copyFs, "FN");
       }
     }
 
@@ -105,7 +107,7 @@ public class SimpleDeidEntityComparator
       }
       if (!found) {
         fp.add(processAnnotation);
-        inc(type2fp, processAnnotation);
+        inc(type2fp, processAnnotation, "FP");
       }
     }
 
@@ -184,7 +186,7 @@ public class SimpleDeidEntityComparator
     System.out.println();
   }
 
-  private void inc(Map<String, Integer> map, DeidEntity annotation) {
+  private void inc(Map<String, Integer> map, DeidEntity annotation, String type) {
     String name = annotation.getType().getShortName();
     Integer count = map.get(name);
     if(count == null) {
@@ -192,6 +194,11 @@ public class SimpleDeidEntityComparator
     } else {
       map.put(name, count + 1);
     }
+    if(PRINT_ANNOTATIONS) {
+//      if(name.equals("Location")) {
+        System.out.printf(Locale.ENGLISH,"%-5s %-10s %-10s %-50s\n", type, name,annotation.getEntityType(),  annotation.getCoveredText());
+//      }
+    }
   }
   
 }

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/org/apache/ctakes/deid/types/TypeSystem.xml Thu Jun  2 16:36:19 2016
@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
     <name>org.apache.ctakes.deid.types.TypeSystem</name>
-	  <description>This is a Apache cTAKES Type System for clinical deidentificastion.</description>
+	  <description>This is a Apache cTAKES Type System for clinical deidentification.</description>
 	  <version>1.0</version>
 	  <vendor>Apache cTAKES</vendor>
   <types>

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt Thu Jun  2 16:36:19 2016
@@ -0,0 +1,191 @@
+Afghanistan
+Africa
+Albania
+Algeria
+Andorra
+Angola
+Arabia
+Argentina
+Armenia
+Australia
+Austria
+Bahama
+Bahrain
+Bangladesh
+Belarus
+Belgium
+Belize
+Benin
+Bermuda
+Bhutan
+Bissau
+Bosnia
+Botswana
+Brazil
+Britain
+Brunei
+Bulgaria
+Burma
+Burundi
+Cambodia
+Cameroon
+Canada
+Chad
+Chile
+China
+Colombia
+Columbia
+Congo
+Croatia
+Cuba
+Cyprus
+Czech
+Denmark
+Djibouti
+Dominican
+Ecuador
+Egypt
+Emirates
+Eritrea
+Estonia
+Ethiopia
+Fiji
+Finland
+France
+Gabon
+Gambia
+Germany
+Georgia
+Ghana
+Greece
+Grenada
+Guatemala
+Guiana
+Guinea
+Guyana
+Haiti
+Holland
+Honduras
+Hungary
+Iceland
+India
+Indonesia
+Iran
+Iraq
+Ireland
+Italy
+Jamaica
+Japan
+Jordan
+Kazakhstan
+Kenya
+Kiribati
+Korea
+Kosovo
+Kuwait
+Kyrgyzstan
+Lanka
+Laos
+Latvia
+Lebanon
+Leone
+Libya
+Liechtenstein
+Lithuania
+Luxembourg
+Malawi
+Malaysia
+Maldives
+Mali
+Malta
+Mauritania
+Mauritia
+Mexico
+Micronesia
+Moldovia
+Mongolia
+Morocco
+Mozambique
+Myanmar
+Namibia
+Nepal
+Netherland
+Netherlands
+Nicaragua
+Niger
+Nigeria
+Norway
+Oman
+Pakistan
+Palau
+Palestine
+Panama
+Papua
+Paraguay
+Peru
+Philippines
+Portugal
+Poland
+Qatar
+Rica
+Romania
+Russia
+Rwanda
+Salvador
+Scotland
+Senegal
+Serbia
+Sierra
+Singapore
+Slovakia
+Slovenia
+Somalia
+Spain
+Sudan
+Suriname
+Swaziland
+Sweden
+Swedish
+Switzerland
+Syria
+Taiwan
+Tajikstan
+Tajikistan
+Tanzania
+Thailand
+Tibet
+Timor
+Tonga
+Tunisia
+Turkey
+Turkmenistan
+Tuvalua
+Uganda
+Ukrain
+Ukraine
+Uruguay
+Uzbekistan
+Venezuela
+Verde
+Vietnam
+Wales
+Yemen
+Zaire
+Zambia
+Zealand
+Zimbabwe
+Israel
+Herzegovina
+England
+America
+Puerto Rico
+Sri Lanka
+Costa Rica
+United Kingdom
+UK
+United States
+Ivory Coast
+Saudi Arabia
+South Korea
+North Korea
+Trinidad and Tobago

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/country.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt Thu Jun  2 16:36:19 2016
@@ -0,0 +1,228 @@
+Afghan
+African-American
+Albanian
+Algerian
+Andorran
+Angolan
+Antiguans
+Argentinean
+Argentinian
+Armenian
+Australian
+Austrian
+Azerbaijani
+Bahamian
+Bahraini
+Bangladeshi
+Barbadian
+Barbudans
+Batswana
+Belarusan
+Belarusian
+Belgian
+Belizean
+Beninese
+Bhutanese
+Bolivian
+Bosnian
+Botswanan
+Brazilian
+British
+Bruneian
+Bulgarian
+Burkinabe
+Burkinese
+Burmese
+Burundian
+Cambodian
+Cameroonian
+Canadian
+Chadian
+Chilean
+Chinese
+Colombian
+Columbian
+Comoran
+Congolese
+Croatian
+Cuban
+Cypriot
+Czech
+Danish
+Djibouti
+Djiboutian
+Dominican
+Dutch
+East Timorese
+Ecuadorean
+Ecuadorian
+Egyptian
+Emirati
+Emirian
+Equadorian
+Eritrean
+Estonian
+Ethiopian
+Fijian
+Filipino
+Finnish
+French
+Gabonese
+Gambian
+Georgian
+German
+Ghanaian
+Greek
+Grenadian
+Guatemalan
+Guinea-Bissauan
+Guinean
+Guyanese
+Haitian
+Herzegovinian
+Honduran
+Hungarian
+Icelander
+Icelandic
+I-Kiribati
+Indian
+Indonesian
+Iranian
+Iraqi
+Irish
+Israeli
+Italian
+Ivorian
+Jamaican
+Japanese
+Jordanian
+Kazakh
+Kazakhstani
+Khazakhstani
+Kenyan
+Kittian
+Nevisian
+Kuwaiti
+Kyrgyz
+Laotian
+Latvian
+Lebanese
+Liberian
+Libyan
+Liechtensteiner
+Lithuanian
+Luxembourger
+Macedonian
+Madagascan
+Malagasy
+Malawian
+Malaysian
+Maldivan
+Maldivian
+Malian
+Maltese
+Marshallese
+Mauritanian
+Mauritian
+Mexican
+Micronesian
+Moldovan
+Monacan
+Mongolian
+Montenegrin
+Moroccan
+Mosotho
+Motswana
+Mozambican
+Namibian
+Nauruan
+Nepalese
+New Zealander
+Nicaraguan
+Nigerian
+Nigerien
+Northern Irish
+North Korean
+Norwegian
+Omani
+Pakistani
+Palauan
+Panamanian
+Paraguayan
+Peruvian
+Philippine
+Polish
+Portuguese
+Qatari
+Romanian
+Russian
+Rwandan
+Salvadoran
+Salvadorean
+Samoan
+Scottish
+Senegalese
+Serb
+Serbian
+Seychellois
+Singaporean
+Slovak
+Slovakian
+Slovene
+Slovenian
+Solomon Islander
+Somali
+South African
+South Korean
+Spanish
+Sri Lankan
+Sudanese
+Surinamer
+Surinamese
+Swazi
+Swedish
+Swiss
+Syrian
+Tadjik
+Tadjiki
+Taiwanese
+Tajikistani
+Tajik
+Tajiki
+Tanzanian
+Thai
+Tobagonian
+Togolese
+Tongan
+Trinidadian
+Tunisian
+Turkish
+Turkmen
+Turkoman
+Tuvaluan
+Ugandan
+Ukrainian
+Uruguayan
+Uzbek
+Uzbeki
+Uzbekistani
+Vanuatuan
+Venezuelan
+Vietnamese
+Welsh
+Yemeni
+Yemenite
+Yugoslav
+Zairean
+Zambian
+Zimbabwean
+English
+San Marinese
+Sao Tomean
+Papua New Guinean
+Western Samoan
+Saint Lucian
+Sierra Leonean
+Sierra Leonian
+Equatorial Guinean
+

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/nationality.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/phone_pre_ind.txt Thu Jun  2 16:36:19 2016
@@ -2,4 +2,6 @@ tele
 telephone
 tel
 phone
-contact
\ No newline at end of file
+contact
+beeper
+pager
\ No newline at end of file

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt Thu Jun  2 16:36:19 2016
@@ -0,0 +1,174 @@
+Afghan-speaking
+Albanian-speaking
+Algerian-speaking
+Andorran-speaking
+Angolan-speaking
+Argentinian-speaking
+Armenian-speaking
+Australian-speaking
+Austrian-speaking
+Azerbaijani-speaking
+Bahamian-speaking
+Bahraini-speaking
+Bangladeshi-speaking
+Barbadian-speaking
+Belarusian-speaking
+Belarusan-speaking
+Belgian-speaking
+Belizean-speaking
+Beninese-speaking
+Bhutanese-speaking
+Bolivian-speaking
+Bosnian-speaking
+Botswanan-speaking
+Brazilian-speaking
+British-speaking
+Bruneian-speaking
+Bulgarian-speaking
+Burkinese-speaking
+Burmese-speaking
+Burundian-speaking
+Cambodian-speaking
+Cameroonian-speaking
+Canadian-speaking
+Cape Verdean-speaking
+Chadian-speaking
+Chilean-speaking
+Chinese-speaking
+Colombian-speaking
+Congolese-speaking
+Croatian-speaking
+Cuban-speaking
+Cypriot-speaking
+Czech-speaking
+Danish-speaking
+Djiboutian-speaking
+Dominican-speaking
+Dominican-speaking
+Ecuadorean-speaking
+English-speaking
+Egyptian-speaking
+Salvadorean-speaking
+Eritrean-speaking
+Estonian-speaking
+Ethiopian-speaking
+Fijian-speaking
+Finnish-speaking
+Gabonese-speaking
+Gambian-speaking
+Georgian-speaking
+German-speaking
+Ghanaian-speaking
+Greek-speaking
+Grenadian-speaking
+Guatemalan-speaking
+Guinean-speaking
+Guyanese-speaking
+Haitian-speaking
+Dutch-speaking
+Honduran-speaking
+Hungarian-speaking
+Icelandic-speaking
+Indian-speaking
+Indonesian-speaking
+Iranian-speaking
+Iraqi-speaking
+Irish-speaking
+Italian-speaking
+Jamaican-speaking
+Japanese-speaking
+Jordanian-speaking
+Kazakh-speaking
+Kenyan-speaking
+Kuwaiti-speaking
+Laotian-speaking
+Latvian-speaking
+Lebanese-speaking
+Liberian-speaking
+Libyan-speaking
+Lithuanian-speaking
+Macedonian-speaking
+Madagascan-speaking
+Malawian-speaking
+Malaysian-speaking
+Maldivian-speaking
+Malian-speaking
+Maltese-speaking
+Mauritanian-speaking
+Mauritian-speaking
+Mexican-speaking
+Moldovan-speaking
+Monacan-speaking
+Mongolian-speaking
+Montenegrin-speaking
+Moroccan-speaking
+Mozambican-speaking
+Namibian-speaking
+Nepalese-speaking
+Dutch-speaking
+Nicaraguan-speaking
+Nigerien-speaking
+Nigerian-speaking
+Norwegian-speaking
+Omani-speaking
+Pakistani-speaking
+Panamanian-speaking
+Guinean-speaking
+Paraguayan-speaking
+Peruvian-speaking
+Persian-speaking
+Philippine-speaking
+Polish-speaking
+Portuguese-speaking
+Qatari-speaking
+Romanian-speaking
+Russian-speaking
+Rwandan-speaking
+Saudi-speaking
+Scottish-speaking
+Senegalese-speaking
+Serb-speaking
+Serbian-speaking
+Seychellois-speaking
+Sierra Leonian-speaking
+Singaporean-speaking
+Slovak-speaking
+Slovene-speaking
+Slovenian-speaking
+Somali-speaking
+Spanish-speaking
+Sri Lankan-speaking
+Sudanese-speaking
+Surinamese-speaking
+Swazi-speaking
+Swedish-speaking
+Swiss-speaking
+Syrian-speaking
+Taiwanese-speaking
+Tajik-speaking
+Tadjik-speaking
+Tanzanian-speaking
+Thai-speaking
+Togolese-speaking
+Tobagonian-speaking
+Turkish-speaking
+Turkoman-speaking
+Turkmen-speaking
+Tuvaluan-speaking
+Ugandan-speaking
+Ukrainian-speaking
+Emirati-speaking
+British-speaking
+Uruguayan-speaking
+Uzbek-speaking
+Vanuatuan-speaking
+Venezuelan-speaking
+Vietnamese-speaking
+Welsh-speaking
+Western Samoan-speaking
+Yemeni-speaking
+Yugoslav-speaking
+Zairean-speaking
+Zambian-speaking
+Zimbabwean-speaking
+Equadorian-speaking

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/spoken_language.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Country.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,21 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+
+DECLARE Country;
+
+ANY{-PARTOF(Split)} @CountryInd{-> Country};
+
+
+ANY{-REGEXP("(?i)speaks?|some")} 
+    @NationalityInd{-> Country}
+    ANY{-REGEXP("(?i)general|hospital|clinic|city|restaurant|area|street|road")} ;
+
+// TODO
+//Rule: getSpokenLanguage
+//(
+//    {Lookup.minorType=="nationality"}
+//    ):label
+//-->
+//:label

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta Thu Jun  2 16:36:19 2016
@@ -1,11 +1,13 @@
 PACKAGE org.apache.ctakes.deid;
 
 TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
-TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+//TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+IMPORT PACKAGE * FROM org.apache.ctakes.deid.types.TypeSystem AS deid;
 
 RETAINTYPE(WS);
-(Num4{-PARTOF(Date),REGEXP("19..|20..")} Dash Num2 Dash Num2){-> Date};
-(Num12{-PARTOF(Date)} Slash (Num12 Slash)? Num2{REGEXP("[123456789].")}){-> Date};
-Num4{-PARTOF(Date),REGEXP("19..|20..")-> Date};
-MonthInd{-PARTOF(Date)-> Date};
+(Num4{-PARTOF(deid.Date),REGEXP("19..|20..")} Dash Num2 Dash Num2){-> deid.Date};
+(Num12{-PARTOF(deid.Date)} Slash Num12 Slash Num4{REGEXP("19..|20..")}){-> deid.Date};
+(Num12{-PARTOF(deid.Date)} Slash (Num12 Slash)? Num2{REGEXP("[123456789].")}){-> deid.Date};
+Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date};
+MonthInd{-PARTOF(deid.Date)-> deid.Date};
 RETAINTYPE;

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Thu Jun  2 16:36:19 2016
@@ -1,46 +1,70 @@
 PACKAGE org.apache.ctakes.deid;
 
 //TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
-TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+//TYPESYSTEM org.apache.ctakes.deid.types.TypeSystem;
+IMPORT PACKAGE * FROM org.apache.ctakes.deid.types.TypeSystem AS deid;
 
 // UIMA-4833
-TYPESYSTEM org.apache.ctakes.deid.ZipStateRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.StreetRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.AgeRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.DoctorRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.UserNameRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.PhoneRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.DateRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.ZipRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.StateRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.CountryRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.FaxRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.PatientRutaTypeSystem;
 
 SCRIPT org.apache.ctakes.deid.Dictionaries;
 SCRIPT org.apache.ctakes.deid.Age;
 SCRIPT org.apache.ctakes.deid.Doctor;
-SCRIPT org.apache.ctakes.deid.ZipState;
+SCRIPT org.apache.ctakes.deid.State;
+SCRIPT org.apache.ctakes.deid.Zip;
 SCRIPT org.apache.ctakes.deid.Street;
 SCRIPT org.apache.ctakes.deid.UserName;
 SCRIPT org.apache.ctakes.deid.Phone;
 SCRIPT org.apache.ctakes.deid.Date;
+SCRIPT org.apache.ctakes.deid.Country;
+SCRIPT org.apache.ctakes.deid.Fax;
+SCRIPT org.apache.ctakes.deid.IDNum;
+SCRIPT org.apache.ctakes.deid.MedicalRecNum;
+SCRIPT org.apache.ctakes.deid.Patient;
 
 CALL(Dictionaries);
-CALL(ZipState);
-CALL(Street);
-CALL(UserName);
-CALL(Date);
+
 CALL(Age);
+CALL(Country);
 CALL(Doctor);
+CALL(Fax);
+CALL(IDNum);
+CALL(MedicalRecNum);
+CALL(Patient);
 CALL(Phone);
+CALL(State);
+CALL(Street);
+CALL(UserName);
+CALL(Zip);
+CALL(Date);
 
-Zip{-> Location, Location.entityType = "ZIP"};
-State{-> Location, Location.entityType= "STATE"};
 Email{-> Contact, Contact.entityType = "EMAIL"};
-ProfessionInd{-> Profession, Profession.entityType = "PROFESSION"};
 Url{-> Contact, Contact.entityType = "URL"};
-Street{-> Location, Location.entityType= "STREET"};
-UserName{-> Name, Name.entityType = "USERNAME"};
+
 Age{-> Age.entityType = "AGE"};
+Country{-> Location, Location.entityType = "COUNTRY"};
 Doctor{-> Name, Name.entityType = "DOCTOR"};
+Fax{-> Contact, Contact.entityType = "FAX"};
+//IDNum{-> ID, ID.entityType = "IDNUM"};
+//MedicalRecNum{-> ID, ID.entityType = "MEDICALRECNUM"};
+Patient{-> Name, Name.entityType = "PATIENT"};
 Phone{-> Contact, Contact.entityType = "PHONE"};
-Date{-> Date.entityType = "DATE"};
+State{-> Location, Location.entityType= "STATE"};
+Street{-> Location, Location.entityType= "STREET"};
+UserName{-> Name, Name.entityType = "USERNAME"};
+ProfessionInd{-> Profession, Profession.entityType = "PROFESSION"};
+Zip{-> Location, Location.entityType = "ZIP"};
+deid.Date{-> deid.Date.entityType = "DATE"};
 
 
 

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Thu Jun  2 16:36:19 2016
@@ -1,9 +1,12 @@
 PACKAGE org.apache.ctakes.deid;
 
+TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
+
 WORDLIST trie = 'generated.mtwl';
 DECLARE KeywordInd; 
 DECLARE KeywordInd ProfessionInd, StateContext, DeceasedInd, FamilyInd, MonthInd;
 DECLARE KeywordInd StreetInd, StreetFullInd, AgePostInd, AgePreInd, PhonePreInd;
+DECLARE KeywordInd NationalityInd, SpokenLanguageInd, CountryInd;
 
 TRIE(
     "profession.txt" = ProfessionInd,
@@ -14,9 +17,12 @@ TRIE(
     "age_post_ind.txt" = AgePostInd,
     "age_pre_ind.txt" = AgePreInd,
     "deceased_ind.txt" = DeceasedInd,
-    "family_ind" = FamilyInd,
-    "phone_pre_ind" = PhonePreInd,
-    "month_ind" = MonthInd,
+    "family_ind.txt" = FamilyInd,
+    "phone_pre_ind.txt" = PhonePreInd,
+    "month_ind.txt" = MonthInd,
+    "nationality.txt" = NationalityInd,
+    "spoken_language.txt" = SpokenLanguageInd,
+    "country.txt" = CountryInd,
     trie, true, 4, false, 0, "-");
 
 DECLARE Url, Email;
@@ -26,7 +32,7 @@ DECLARE Url, Email;
 DECLARE MDInd;
 "M\\.D\\."-> MDInd;
 
-DECLARE Num1, Num12, Num2, Num3, Num34, Num4, Num5;
+DECLARE Num1, Num12, Num2, Num3, Num34, Num4, Num5, Num6, Num7, Num8;
 
 NUM->{
     Document{REGEXP(".")-> Num1};
@@ -35,16 +41,28 @@ NUM->{
 	Document{REGEXP("...")-> Num3};
 	Document{REGEXP("....?")-> Num34};
 	Document{REGEXP("....")-> Num4};
-	Document{REGEXP(".....")-> Num5};
+	Document{REGEXP(".....")-> Num5};
+	Document{REGEXP("......")-> Num6};
+	Document{REGEXP(".......")-> Num7};
+	Document{REGEXP("........")-> Num8};
 };
 
-DECLARE LParen, RParen, Dash, Slash;
+DECLARE LParen, RParen, Dash, Slash, Hash;
 SPECIAL-> {
     Document.ct=="("{-> LParen};
     Document.ct==")"{-> RParen};
     Document.ct=="-"{-> Dash};
     Document.ct=="/"{-> Slash};
+    Document.ct=="#"{-> Hash};
 };
 
 DECLARE ApoInd;
-(SPECIAL.ct=="'" SW.ct=="s"){-> ApoInd};
\ No newline at end of file
+(SPECIAL.ct=="'" SW.ct=="s"){-> ApoInd};
+
+DECLARE Split;
+DECLARE Split SplitInternal, SplitExternal;
+
+Sentence{-> MARKLAST(SplitInternal)};
+RETAINTYPE(BREAK);
+ANY{-PARTOF(WS)-> SplitExternal} BREAK;
+RETAINTYPE;

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta Thu Jun  2 16:36:19 2016
@@ -6,4 +6,13 @@ TYPESYSTEM org.apache.ctakes.deid.Dictio
 
 DECLARE Doctor;
 
-MDInd W{-PARTOF(UserName)-> Doctor} (SPECIAL.ct=="/" W{-> Doctor})*;
\ No newline at end of file
+RETAINTYPE(BREAK);
+MDInd BREAK[2,4] W{-PARTOF(UserName)-> Doctor} (SPECIAL.ct=="/" W{-> Doctor})* BREAK BREAK;
+RETAINTYPE;
+
+SPECIAL.ct=="^" (CAP COMMA? CAP{ENDSWITH(Split)}){-> Doctor};
+
+//W{REGEXP("Drs?", true)} PERIOD? 
+//    @CW{-REGEXP("Done|Take|PO", true)}
+//    CW CW?
+//    ;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Fax.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE Fax;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/IDNum.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE IDNum;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/MedicalRecNum.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE MedicalRecNum;
\ No newline at end of file

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,3 @@
+PACKAGE org.apache.ctakes.deid;
+
+DECLARE Patient;

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Phone.ruta Thu Jun  2 16:36:19 2016
@@ -8,8 +8,29 @@ DECLARE DashOrPeriod;
 PERIOD{-> DashOrPeriod};
 Dash{-> DashOrPeriod};
 
-PhonePreInd W?{OR(PARTOF({PERIOD, COLON}), W.ct=="#")} 
+PhonePreInd ANY?{OR(PARTOF({PERIOD, COLON}), W.ct=="#")} 
     (
     (LParen Num3 RParen)? Dash?
     Num34 (DashOrPeriod Num34)+
-    ){-> Phone};
\ No newline at end of file
+    ){-> Phone};
+
+PhonePreInd W[0,3] (ANY{PARTOF({COLON, Hash})} COLON?)?
+    (Num5 | (Num1 Dash Num4)){-> Phone};
+
+(LParen? @Num3 RParen? Num3 Dash? Num34 Dash Num4 (Dash Num3)?){-> Phone} ANY{-REGEXP("cc|Units?", true)};
+
+LParen CW? @Num8{->Phone} RParen;
+
+ANY{-REGEXP("Accession|[0-9]{2,5}", true)} ANY{REGEXP("#|B|b|X|x|P|pgr?")} PERIOD? @Num5{-> Phone};
+ANY{-REGEXP("Accession|[0-9]{2,5}", true)} ANY{REGEXP("#|B|b|X|x|P|pgr?")} PERIOD? (@Num1 Dash Num4){-> Phone};
+
+// TODO getPHONE7
+//@Num5{-> Phone}; //TODO ENDSWITH(Sentence)
+//(@Num1 Dash Num4){-> Phone};//TODO ENDSWITH(Sentence)
+
+
+// TODO Split is a LineBreak or a Sentence?
+ANY{ENDSWITH(Split), -IS(CAP)} @Num5{ENDSWITH(Split)-> Phone};
+ANY{ENDSWITH(Split), -IS(CAP)} (@Num1 Dash Num4){ENDSWITH(Split)-> Phone};
+
+

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/State.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,25 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+
+DECLARE State;
+//getSTATE2/getSTATE5: remove if has given context
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)};
+State CW{->UNMARK(State)};
+// TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+//getSTATE3 
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+//getSTATE4
+CW (CW? COMMA) @StateContext{-> State};
+
+//getSTATE6
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE;
+

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta?rev=1746604&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Zip.ruta Thu Jun  2 16:36:19 2016
@@ -0,0 +1,9 @@
+PACKAGE org.apache.ctakes.deid;
+
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.StateRutaTypeSystem;
+DECLARE Zip;
+
+//getZIP + getSTATE1
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")};
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java?rev=1746604&r1=1746603&r2=1746604&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Thu Jun  2 16:36:19 2016
@@ -33,11 +33,15 @@ public class I2B2Evaluation {
   public static void main(String[] args)
           throws ResourceInitializationException, UIMAException, IOException {
 
+    String testData = "C:/data/i2b2/2014/training-PHI-Gold-Set1/";
+    if(args.length >=1) {
+      testData = args[0];
+    }
+    
     SimplePipeline.runPipeline(
             CollectionReaderFactory.createReader(I2B2DeidCollectionReader.class,
                     I2B2DeidCollectionReader.PARAM_INPUT_DIRECTORY,
-//                    "C:/data/i2b2/2014/training-PHI-Gold-Set1/",
-                    "C:/data/i2b2/2014/PHI-test/",
+                    testData,
                     I2B2DeidCollectionReader.PARAM_GOLD_VIEW, "gold"),
             AnalysisEngineFactory.createEngine("org.apache.ctakes.deid.DeidRutaAnnotator"),
             AnalysisEngineFactory.createEngine(SimpleDeidEntityComparator.class,