You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ch...@apache.org on 2016/01/28 16:09:22 UTC

svn commit: r1727380 - in /ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid: Deid.ruta.orig Dictionaries.ruta Street.ruta UserName.ruta ZipState.ruta

Author: chenpei
Date: Thu Jan 28 15:09:22 2016
New Revision: 1727380

URL: http://svn.apache.org/viewvc?rev=1727380&view=rev
Log:
CTAKES-384 Applying patch. Thanks Peter Klugl.

Added:
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig?rev=1727380&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta.orig Thu Jan 28 15:09:22 2016
@@ -0,0 +1,22 @@
+PACKAGE org.apache.ctakes.deid;
+// Entry script of the de-identification rules: imports the helper scripts, runs them, then maps their result types.
+TYPESYSTEM org.apache.ctakes.typesystem.types.TypeSystem;
+SCRIPT org.apache.ctakes.deid.Dictionaries;
+SCRIPT org.apache.ctakes.deid.ZipState;
+SCRIPT org.apache.ctakes.deid.Street;
+SCRIPT org.apache.ctakes.deid.UserName;
+// Execute the imported scripts in this order (Dictionaries first; ZipState and Street use types it declares).
+CALL(Dictionaries);
+CALL(ZipState);
+CALL(Street);
+CALL(UserName);
+
+// Map the types created by the Ruta scripts to cTAKES types: each rule below creates an
+// IdentifiedAnnotation for every annotation of the matched type. TODO select the correct types and fill the features
+Zip{-> IdentifiedAnnotation};
+State{-> IdentifiedAnnotation};
+Email{-> IdentifiedAnnotation};
+Url{-> IdentifiedAnnotation};
+Street{-> IdentifiedAnnotation};
+UserName{-> IdentifiedAnnotation};
+

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1727380&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Thu Jan 28 15:09:22 2016
@@ -0,0 +1,76 @@
+PACKAGE org.apache.ctakes.deid;
+// Dictionary lookups (professions, US states, street indicators) plus regex-based Email/Url detection.
+WORDLIST trie = 'generated.mtwl'; // word list resource bound to the variable `trie`, consumed by TRIE below
+DECLARE KeywordInd; // common parent type of the dictionary-hit types declared next
+DECLARE KeywordInd Profession, StateContext; // both declared with parent type KeywordInd
+DECLARE KeywordInd StreetInd, StreetFullInd; // both declared with parent type KeywordInd
+// TRIE lookup: each "<list>.txt" = Type pair names the type used to annotate hits of that list.
+TRIE(
+    "profession.txt" = Profession,
+    "us_state.txt" = StateContext,
+    "us_state_acronym_abbreviation.txt" = StateContext,
+    "street_ind.txt" = StreetInd,
+    "street_full_ind.txt" = StreetFullInd,
+    trie, true, 4, false, 0, "-");
+    // ^ list variable followed by lookup options; presumably ignoreCase, ignoreLength, edit, distance, ignoreChar — confirm against the Ruta TRIE reference
+// Regex rules below: each span of document text matching the quoted pattern gets the type after "->".
+DECLARE Url, Email; // NOTE(review): the Email pattern allows a space inside "[_A-Za-z0-9- ]" and several "." in the Url pattern are unescaped (match any character) — confirm this is intended
+"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
+"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
+
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Dictionary lookups (professions, US states, street indicators) plus regex-based Email/Url detection.
+WORDLIST trie = 'generated.mtwl'; // word list resource bound to the variable `trie`, consumed by TRIE below
+DECLARE KeywordInd; // common parent type of the dictionary-hit types declared next
+DECLARE KeywordInd Profession, StateContext; // both declared with parent type KeywordInd
+DECLARE KeywordInd StreetInd, StreetFullInd; // both declared with parent type KeywordInd
+// TRIE lookup: each "<list>.txt" = Type pair names the type used to annotate hits of that list.
+TRIE(
+    "profession.txt" = Profession,
+    "us_state.txt" = StateContext,
+    "us_state_acronym_abbreviation.txt" = StateContext,
+    "street_ind.txt" = StreetInd,
+    "street_full_ind.txt" = StreetFullInd,
+    trie, true, 4, false, 0, "-");
+    // ^ list variable followed by lookup options; presumably ignoreCase, ignoreLength, edit, distance, ignoreChar — confirm against the Ruta TRIE reference
+// Regex rules below: each span of document text matching the quoted pattern gets the type after "->".
+DECLARE Url, Email; // NOTE(review): the Email pattern allows a space inside "[_A-Za-z0-9- ]" and several "." in the Url pattern are unescaped (match any character) — confirm this is intended
+"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
+"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
+
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Dictionary lookups (professions, US states) plus regex-based Email/Url detection.
+WORDLIST trie = 'generated.mtwl'; // word list resource bound to the variable `trie`, consumed by TRIE below
+DECLARE KeywordInd; // common parent type of the dictionary-hit types declared next
+DECLARE KeywordInd Profession, StateContext; // both declared with parent type KeywordInd
+// TRIE lookup: each "<list>.txt" = Type pair names the type for hits of that list. NOTE(review): unlike the other copies in this file, this one has no StreetInd/StreetFullInd entries.
+TRIE(
+    "profession.txt" = Profession,
+    "us_state.txt" = StateContext,
+    "us_state_acronym_abbreviation.txt" = StateContext,
+    trie, true, 4, false, 0, "-");
+    // ^ list variable followed by lookup options; presumably ignoreCase, ignoreLength, edit, distance, ignoreChar — confirm against the Ruta TRIE reference
+// Regex rules below: each span of document text matching the quoted pattern gets the type after "->".
+DECLARE Url, Email; // NOTE(review): the Email pattern allows a space inside "[_A-Za-z0-9- ]" and several "." in the Url pattern are unescaped (match any character) — confirm this is intended
+"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
+"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Dictionary lookups (professions, US states, street indicators) plus regex-based Email/Url detection.
+WORDLIST trie = 'generated.mtwl'; // word list resource bound to the variable `trie`, consumed by TRIE below
+DECLARE KeywordInd; // common parent type of the dictionary-hit types declared next
+DECLARE KeywordInd Profession, StateContext; // both declared with parent type KeywordInd
+DECLARE KeywordInd StreetInd, StreetFullInd; // both declared with parent type KeywordInd
+// TRIE lookup: each "<list>.txt" = Type pair names the type used to annotate hits of that list.
+TRIE(
+    "profession.txt" = Profession,
+    "us_state.txt" = StateContext,
+    "us_state_acronym_abbreviation.txt" = StateContext,
+    "street_ind.txt" = StreetInd,
+    "street_full_ind.txt" = StreetFullInd,
+    trie, true, 4, false, 0, "-");
+    // ^ list variable followed by lookup options; presumably ignoreCase, ignoreLength, edit, distance, ignoreChar — confirm against the Ruta TRIE reference
+// Regex rules below: each span of document text matching the quoted pattern gets the type after "->".
+DECLARE Url, Email; // NOTE(review): the Email pattern allows a space inside "[_A-Za-z0-9- ]" and several "." in the Url pattern are unescaped (match any character) — confirm this is intended
+"[_A-Za-z0-9-\\+]+(\\.[_A-Za-z0-9- ]+)*@[A-Za-z0-9-]+(\\.[A-Za-z0-9]+)*(\\.(com|org|edu|gov|mil|co\\.uk))" -> Email;
+"(https?://)?(www.)([a-zA-Z0-9]+).[a-zA-Z0-9]*.[a-z]{3}.?([a-z]+)?(/[a-zA-Z0-9]+)?|(https?|ftp)://[^\\s/$.?#].[^\\s]*|www.[^\\s/$.?#].[^\\s]*" -> Url;
+

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta?rev=1727380&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Street.ruta Thu Jan 28 15:09:22 2016
@@ -0,0 +1,51 @@
+PACKAGE org.apache.ctakes.deid;
+// Street detection built on the StreetInd / StreetFullInd dictionary hits.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StreetInd, StreetFullInd) — confirm
+
+DECLARE Street;
+
+// getSTREET1-4: a number restricted by REGEXP(".{1,4}"), one word (CW or CAP) or two CW words, then a StreetInd hit; only if the span is not already part of a Street.
+(NUM{REGEXP(".{1,4}")} (CW|CAP) @StreetInd){-PARTOF(Street)-> Street};
+(NUM{REGEXP(".{1,4}")} CW CW @StreetInd){-PARTOF(Street)-> Street};
+Street PERIOD{-> SHIFT(Street,1,2)}; // targets the span of rule elements 1-2 (Street plus the following period); NOTE(review): the action sits on PERIOD, not on Street — confirm SHIFT still resolves the Street annotation
+
+//remove STREET annotation if the annotation is followed by CW on the same line/sentence.
+//STREET CW{->UNMARK(STREET,1)};
+
+// getSTREET5: one word (CW or CAP) followed by a StreetFullInd hit; only if the span is not already part of a Street.
+//(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
+((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Street detection built on the StreetInd / StreetFullInd dictionary hits.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StreetInd, StreetFullInd) — confirm
+
+DECLARE Street;
+
+// getSTREET1-4: a number restricted by REGEXP(".{1,4}"), one word (CW or CAP) or two CW words, then a StreetInd hit; only if the span is not already part of a Street.
+(NUM{REGEXP(".{1,4}")} (CW|CAP) @StreetInd){-PARTOF(Street)-> Street};
+(NUM{REGEXP(".{1,4}")} CW CW @StreetInd){-PARTOF(Street)-> Street};
+Street PERIOD{-> SHIFT(Street,1,2)}; // targets the span of rule elements 1-2 (Street plus the following period); NOTE(review): the action sits on PERIOD, not on Street — confirm SHIFT still resolves the Street annotation
+
+//remove STREET annotation if the annotation is followed by CW on the same line/sentence.
+//STREET CW{->UNMARK(STREET,1)};
+
+// getSTREET5: one word (CW or CAP) followed by a StreetFullInd hit; only if the span is not already part of a Street.
+//(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
+((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Street detection built on the StreetInd / StreetFullInd dictionary hits.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StreetInd, StreetFullInd) — confirm
+
+DECLARE Street;
+
+// getSTREET1-4: a number restricted by REGEXP(".{1,4}"), one word (CW or CAP) or two CW words, then a StreetInd hit; only if the span is not already part of a Street.
+(NUM{REGEXP(".{1,4}")} (CW|CAP) @StreetInd){-PARTOF(Street)-> Street};
+(NUM{REGEXP(".{1,4}")} CW CW @StreetInd){-PARTOF(Street)-> Street};
+Street PERIOD{-> SHIFT(Street,1,2)}; // targets the span of rule elements 1-2 (Street plus the following period); NOTE(review): the action sits on PERIOD, not on Street — confirm SHIFT still resolves the Street annotation
+
+//remove STREET annotation if the annotation is followed by CW on the same line/sentence.
+//STREET CW{->UNMARK(STREET,1)};
+
+// getSTREET5: one word (CW or CAP) followed by a StreetFullInd hit; only if the span is not already part of a Street.
+//(CW|CAP) W{REGEXP("street|road|avenue", true)->MARK(Street,1,2)};
+((CW|CAP) @StreetFullInd){-PARTOF(Street)-> Street};

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta?rev=1727380&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/UserName.ruta Thu Jan 28 15:09:22 2016
@@ -0,0 +1,48 @@
+PACKAGE org.apache.ctakes.deid;
+// User-name detection: a short word followed by a short number, either inside square brackets or after "M.D.".
+DECLARE UserName;
+// getUSERNAME 1: "[" word number "]" -> the word+number span becomes UserName (REGEXP presumably has to match the whole covered text, i.e. 2-3 and 1-3 characters — confirm).
+RETAINTYPE(WS); // make whitespace visible to the next rule, so whitespace between the elements is no longer skipped
+SPECIAL.ct=="[" // opening bracket, compared via the covered-text feature `ct`
+	(W{REGEXP(".{2,3}")} @NUM{REGEXP(".{1,3}")}){-> UserName} // matching is anchored (@) at the number
+	SPECIAL.ct=="]" ;
+RETAINTYPE; // no argument: back to the default filtering
+// getUSERNAME2: MDInd ("M.D." literal) followed by word and number -> elements 2-3 become UserName.
+
+DECLARE MDInd;
+"M\\.D\\."-> MDInd;
+MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
+MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)}; // undo the mark when the word is "On"/"on"
+W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)}; // same removal for any "On"/"on" + number, with or without MDInd in front
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// User-name detection: a short word followed by a short number, either inside square brackets or after "M.D.".
+DECLARE UserName;
+// getUSERNAME 1: "[" word number "]" -> the word+number span becomes UserName (REGEXP presumably has to match the whole covered text, i.e. 2-3 and 1-3 characters — confirm).
+RETAINTYPE(WS); // make whitespace visible to the next rule, so whitespace between the elements is no longer skipped
+SPECIAL.ct=="[" // opening bracket, compared via the covered-text feature `ct`
+	(W{REGEXP(".{2,3}")} @NUM{REGEXP(".{1,3}")}){-> UserName} // matching is anchored (@) at the number
+	SPECIAL.ct=="]" ;
+RETAINTYPE; // no argument: back to the default filtering
+// getUSERNAME2: MDInd ("M.D." literal) followed by word and number -> elements 2-3 become UserName.
+
+DECLARE MDInd;
+"M\\.D\\."-> MDInd;
+MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
+MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)}; // undo the mark when the word is "On"/"on"
+W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)}; // same removal for any "On"/"on" + number, with or without MDInd in front
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// User-name detection: a short word followed by a short number, either inside square brackets or after "M.D.".
+DECLARE UserName;
+// getUSERNAME 1: "[" word number "]" -> the word+number span becomes UserName (REGEXP presumably has to match the whole covered text, i.e. 2-3 and 1-3 characters — confirm).
+RETAINTYPE(WS); // make whitespace visible to the next rule, so whitespace between the elements is no longer skipped
+SPECIAL.ct=="[" // opening bracket, compared via the covered-text feature `ct`
+	(W{REGEXP(".{2,3}")} @NUM{REGEXP(".{1,3}")}){-> UserName} // matching is anchored (@) at the number
+	SPECIAL.ct=="]" ;
+RETAINTYPE; // no argument: back to the default filtering
+// getUSERNAME2: MDInd ("M.D." literal) followed by word and number -> elements 2-3 become UserName.
+
+DECLARE MDInd;
+"M\\.D\\."-> MDInd;
+MDInd W{REGEXP(".{2,3}")} NUM{REGEXP(".{1,3}")->MARK(UserName,2,3)};
+MDInd W{REGEXP("[Oo]n")} NUM{REGEXP(".{1,3}")->UNMARK(UserName,2,3)}; // undo the mark when the word is "On"/"on"
+W{REGEXP("[Oo]n")} @NUM{REGEXP(".{1,3}")->UNMARK(UserName,1,2)}; // same removal for any "On"/"on" + number, with or without MDInd in front

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta?rev=1727380&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/ZipState.ruta Thu Jan 28 15:09:22 2016
@@ -0,0 +1,116 @@
+PACKAGE org.apache.ctakes.deid;
+// Derives State and Zip annotations from StateContext dictionary hits and the tokens around them.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StateContext) — confirm
+
+DECLARE State, Zip;
+
+// getZIP + getSTATE1: a StateContext hit, an optional comma and a number passing REGEXP(".{5}") -> State and Zip.
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")}; // stretch Zip over rule elements 1-3: the zip, "-", and the number passing REGEXP(".{4}")
+
+// getSTATE2/getSTATE5: remove State again when it occurs in one of the contexts below.
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)}; // State preceded by a number passing REGEXP(".{1,4}")
+State CW{->UNMARK(State)}; // State followed by a CW; NOTE(review): UNMARK sits on CW, not on State — confirm it removes the State
+// State followed by one of the listed words (second REGEXP argument true: presumably ignore case). TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+// getSTATE3: "originally" "from", up to three words failing REGEXP("from"), then a StateContext hit -> State.
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// Same idea: one of the listed words, up to three words, "in", up to three words failing REGEXP("in"), then StateContext. TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+// getSTATE4: a CW, an optional second CW and a comma, then a StateContext hit -> State.
+CW (CW? COMMA) @StateContext{-> State};
+
+// getSTATE6: with BREAK retained (visible to rules): BREAK, CW, comma, a word passing the abbreviation REGEXP -> State, then a number passing REGEXP(".{5}").
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE; // no argument: back to the default filtering
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Derives State and Zip annotations from StateContext dictionary hits and the tokens around them.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StateContext) — confirm
+
+DECLARE State, Zip;
+
+// getZIP + getSTATE1: a StateContext hit, an optional comma and a number passing REGEXP(".{5}") -> State and Zip.
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")}; // stretch Zip over rule elements 1-3: the zip, "-", and the number passing REGEXP(".{4}")
+
+// getSTATE2/getSTATE5: remove State again when it occurs in one of the contexts below.
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)}; // State preceded by a number passing REGEXP(".{1,4}")
+State CW{->UNMARK(State)}; // State followed by a CW; NOTE(review): UNMARK sits on CW, not on State — confirm it removes the State
+// State followed by one of the listed words (second REGEXP argument true: presumably ignore case). TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+// getSTATE3: "originally" "from", up to three words failing REGEXP("from"), then a StateContext hit -> State.
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// Same idea: one of the listed words, up to three words, "in", up to three words failing REGEXP("in"), then StateContext. TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+// getSTATE4: a CW, an optional second CW and a comma, then a StateContext hit -> State.
+CW (CW? COMMA) @StateContext{-> State};
+
+// getSTATE6: with BREAK retained (visible to rules): BREAK, CW, comma, a word passing the abbreviation REGEXP -> State, then a number passing REGEXP(".{5}").
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE; // no argument: back to the default filtering
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Derives State and Zip annotations from StateContext dictionary hits and the tokens around them.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StateContext) — confirm
+
+DECLARE State, Zip;
+
+// getZIP + getSTATE1: a StateContext hit, an optional comma and a number passing REGEXP(".{5}") -> State and Zip.
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")}; // stretch Zip over rule elements 1-3: the zip, "-", and the number passing REGEXP(".{4}")
+
+// getSTATE2/getSTATE5: remove State again when it occurs in one of the contexts below.
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)}; // State preceded by a number passing REGEXP(".{1,4}")
+State CW{->UNMARK(State)}; // State followed by a CW; NOTE(review): UNMARK sits on CW, not on State — confirm it removes the State
+// State followed by one of the listed words (second REGEXP argument true: presumably ignore case). TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+// getSTATE3: "originally" "from", up to three words failing REGEXP("from"), then a StateContext hit -> State.
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// Same idea: one of the listed words, up to three words, "in", up to three words failing REGEXP("in"), then StateContext. TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+// getSTATE4: a CW, an optional second CW and a comma, then a StateContext hit -> State.
+CW (CW? COMMA) @StateContext{-> State};
+
+// getSTATE6: with BREAK retained (visible to rules): BREAK, CW, comma, a word passing the abbreviation REGEXP -> State, then a number passing REGEXP(".{5}").
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE; // no argument: back to the default filtering
+PACKAGE org.apache.ctakes.deid; // NOTE(review): starts a repeated copy of the script text already present above in this added file — confirm and de-duplicate
+// Derives State and Zip annotations from StateContext dictionary hits and the tokens around them.
+TYPESYSTEM org.apache.ctakes.deid.DictionariesRutaTypeSystem; // presumably the type system generated for Dictionaries.ruta (StateContext) — confirm
+
+DECLARE State, Zip;
+
+// getZIP + getSTATE1: a StateContext hit, an optional comma and a number passing REGEXP(".{5}") -> State and Zip.
+StateContext{-> State} COMMA? NUM{REGEXP(".{5}") -> Zip};
+Zip{-> SHIFT(Zip,1,3)} "-" NUM{REGEXP(".{4}")}; // stretch Zip over rule elements 1-3: the zip, "-", and the number passing REGEXP(".{4}")
+
+// getSTATE2/getSTATE5: remove State again when it occurs in one of the contexts below.
+NUM{REGEXP(".{1,4}")} @State{->UNMARK(State)}; // State preceded by a number passing REGEXP(".{1,4}")
+State CW{->UNMARK(State)}; // State followed by a CW; NOTE(review): UNMARK sits on CW, not on State — confirm it removes the State
+// State followed by one of the listed words (second REGEXP argument true: presumably ignore case). TODO refactor to wordlist?
+State{-> UNMARK(State)} W{REGEXP("hospital|heart|association|medical|care|nursing|avenue|street|road|drive|boulevard", true)};
+
+// getSTATE3: "originally" "from", up to three words failing REGEXP("from"), then a StateContext hit -> State.
+"originally" "from" W[0,3]{-REGEXP("from")} @StateContext{-> State};
+// Same idea: one of the listed words, up to three words, "in", up to three words failing REGEXP("in"), then StateContext. TODO refactor to wordlist?
+W{REGEXP("home|son|daughter|mother|father|raised|grew|lived|lives", true)} W[0,3]?
+    W{REGEXP("in")} W[0,3]{-REGEXP("in")} @StateContext{->State};
+
+// getSTATE4: a CW, an optional second CW and a comma, then a StateContext hit -> State.
+CW (CW? COMMA) @StateContext{-> State};
+
+// getSTATE6: with BREAK retained (visible to rules): BREAK, CW, comma, a word passing the abbreviation REGEXP -> State, then a number passing REGEXP(".{5}").
+RETAINTYPE(BREAK);
+BREAK CW COMMA W{REGEXP("MD|IN|AS|DE|HI|OR") -> State} NUM{REGEXP(".{5}")};
+RETAINTYPE; // no argument: back to the default filtering