You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2014/11/04 17:19:21 UTC
svn commit: r1636633 - in
/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool:
CodeMapCreator.java DictionaryCreator.java DictionaryCreator2.java
reader/UmlsTextsForCuisReader.java util/CreatorProperties.java
util/UmlsTermUtil.java
Author: seanfinan
Date: Tue Nov 4 16:19:20 2014
New Revision: 1636633
URL: http://svn.apache.org/r1636633
Log:
Added DictionaryCreator2.java, which can favor anatomical site term text over identical text for sign/symptom, disease/disorder, and procedure terms. Medication term text is handled separately, preserving drug names such as "liver" (live-er?).
Various static public methods added elsewhere to support DictionaryCreator2 functionality.
UmlsTermUtil is now stricter about function texts such as ([liver disease]&/or[liver toxicity]): text containing any trigger from the new RemovalFunctionTriggers.txt data list is rejected.
UmlsTermUtil also no longer handles template/form terms (e.g. "heartbeat (___bpm)") automatically in code; they are instead handled according to the list files in the data/ directory.
Added:
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
- copied, changed from r1625577, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
Modified:
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java Tue Nov 4 16:19:20 2014
@@ -32,11 +32,12 @@ public class CodeMapCreator {
static private final Logger LOGGER = Logger.getLogger( "CodeMapCreator" );
- static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+ static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
"-db",
- "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
- "-tbl", "kludge"
+ "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
+ "-tbl", "kludge",
// "-ol", "C:/Spiffy/Output/DictionaryToolTest/CodeMap_sno_rx.bsv"
+ "-fd", "./data/tiny"
};
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java Tue Nov 4 16:19:20 2014
@@ -35,19 +35,19 @@ public class DictionaryCreator {
static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator" );
- static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf - clears ~850 2 char terms
+// static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf - clears ~850 2 char terms
+ static private final int MIN_SNOMED_TERM_LENGTH = 2; // changed back to 2, let the dictionary lookup module cull
static private final int MIN_RXNORM_TERM_LENGTH = 1;
- static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+ static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
"-db",
- "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+ "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
"-tbl", "CUI_TERMS",
- // "-ol", "C:/Spiffy/Output/DictionaryToolTest/Terms_sno_rx.bsv",
+// "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx.bsv",
// "-fw",
- // "-mtui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesDrugTuis.txt"
- // "-tui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesSnomedTuis.txt"
+ "-fd", "./data/tiny"
};
Copied: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java (from r1625577, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java?p2=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java&p1=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java&r1=1625577&r2=1636633&rev=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java Tue Nov 4 16:19:20 2014
@@ -13,41 +13,44 @@ import org.apache.ctakes.dictionarytool.
import java.util.Arrays;
import java.util.Collection;
+import java.util.HashSet;
import java.util.logging.Logger;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_BASE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_TABLE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.MED_TUI_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.UMLS_ROOT;
+import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.*;
import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
/**
+ * Because of the manner in which different umls source text is structured, and
+ * because of the broad-based automatic cleanup routines,
+ * DictionaryCreator2 works better than the original DictionaryCreator in handling anatomical sites and diseases.
+ *
+ * thanks go to Tim Miller for originally finding the bug and bringing it to my attention.
+ *
+ *
* Author: SPF
* Affiliation: CHIP-NLP
- * Date: 2/27/14
+ * Date: 11/3/14
*/
-public class DictionaryCreator {
+public class DictionaryCreator2 {
- static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator" );
+ static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator2" );
- static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf - clears ~850 2 char terms
+// static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf - clears ~850 2 char terms
+ static private final int MIN_SNOMED_TERM_LENGTH = 2; // changed back to 2, let the dictionary lookup module cull
static private final int MIN_RXNORM_TERM_LENGTH = 1;
- static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+ static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
"-db",
- "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+ "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
"-tbl", "CUI_TERMS",
- // "-ol", "C:/Spiffy/Output/DictionaryToolTest/Terms_sno_rx.bsv",
+// "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx.bsv",
// "-fw",
- // "-mtui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesDrugTuis.txt"
- // "-tui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesSnomedTuis.txt"
+ "-fd", "./data/tiny",
+ "-atui", "./data/tiny/CtakesAnatTuis.txt",
+ "-tui", "./data/tiny/CtakesSnomedTuis.txt"
};
@@ -57,14 +60,44 @@ public class DictionaryCreator {
// final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
// Set up the term utility
final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
+ // Write the anatomical site terms
+ final Collection<String> anats = writeAnat( umlsTermUtil, properties.isRareWordIndex() );
// Write the non-medication terms
- writeSnomed( umlsTermUtil, properties.isRareWordIndex() );
+ writeSnomed( umlsTermUtil, anats, properties.isRareWordIndex() );
// Write the medication terms
writeRxNorm( umlsTermUtil, properties.isRareWordIndex() );
}
+ static private Collection<String> writeAnat( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
+ // Read wanted Sources
+ final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes( SOURCE.getValue() );
+ // Read wanted Tuis
+ final Collection<Integer> wantedTuis = TuiListReader.readTuiList( ANAT_TUI_LIST.getValue() );
+ if ( wantedTuis == null || wantedTuis.isEmpty() ) {
+ LOGGER.severe( "No valid TUI codes found in " + ANAT_TUI_LIST.getValue() );
+ System.exit( 1 );
+ }
+ // get the valid Cuis for all wanted Tuis
+ final HashSetMap<Long, Integer> validCuisAndTuis
+ = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis );
+ // Get the texts for all cuis
+ // Term Types are not usable for Snomed. ObsoletePreferredname IS (obsolete Synonym) PreferredTerm SYnonym
+ // PreferredTermGreatBritain SYnonymGreatBritain OB (spelling variation?) MTH_* MTH version
+ final HashSetMap<Long, String> cuiTexts
+ = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+ validCuisAndTuis.keySet(), umlsTermUtil, false, true,
+ MIN_SNOMED_TERM_LENGTH, 7 );
+ writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
+ LOGGER.info( "Done Writing Non-Medication Terms" );
+ final Collection<String> allAnatTerms = new HashSet<>( 10000 );
+ for ( Collection<String> texts : cuiTexts.values() ) {
+ allAnatTerms.addAll( texts );
+ }
+ return allAnatTerms;
+ }
- static private void writeSnomed( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
+ static private void writeSnomed( final UmlsTermUtil umlsTermUtil, final Collection<String> anats,
+ final boolean isRareWordIndex ) {
// Read wanted Sources
final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes( SOURCE.getValue() );
// Read wanted Tuis
@@ -81,7 +114,7 @@ public class DictionaryCreator {
// PreferredTermGreatBritain SYnonymGreatBritain OB (spelling variation?) MTH_* MTH version
final HashSetMap<Long, String> cuiTexts
= UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
- validCuisAndTuis.keySet(), umlsTermUtil, false, true,
+ validCuisAndTuis.keySet(), umlsTermUtil, anats, false, true,
MIN_SNOMED_TERM_LENGTH, 7 );
writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
LOGGER.info( "Done Writing Non-Medication Terms" );
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java Tue Nov 4 16:19:20 2014
@@ -8,7 +8,9 @@ import org.apache.ctakes.dictionarytool.
import java.io.BufferedReader;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.List;
import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
@@ -31,7 +33,8 @@ final public class UmlsTextsForCuisReade
static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
final Collection<Long> wantedCuis,
final UmlsTermUtil umlsTermUtil ) {
- return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, false, true, 1, Integer.MAX_VALUE );
+ return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil,
+ false, true, 1, Integer.MAX_VALUE );
}
static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
@@ -41,6 +44,19 @@ final public class UmlsTextsForCuisReade
final boolean extractAbbreviations,
final int minWordLength,
final int maxWordCount ) {
+ return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, new ArrayList<String>(0),
+ preferredOnly, extractAbbreviations, minWordLength, maxWordCount );
+ }
+
+
+ static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
+ final Collection<Long> wantedCuis,
+ final UmlsTermUtil umlsTermUtil,
+ final Collection<String> unwantedTexts,
+ final boolean preferredOnly,
+ final boolean extractAbbreviations,
+ final int minWordLength,
+ final int maxWordCount ) {
System.out.println( "Compiling map of Umls Cuis and Texts" );
long lineCount = 0;
long textCount = 0;
@@ -63,6 +79,7 @@ final public class UmlsTextsForCuisReade
tokens = FileUtil.readBsvTokens( reader, rrfPath );
continue;
}
+ formattedTexts.removeAll( unwantedTexts );
textCount += cuisAndText.addAll( cuiCode, formattedTexts );
}
}
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java Tue Nov 4 16:19:20 2014
@@ -13,6 +13,7 @@ final public class CreatorProperties {
static private final String DEFAULT_DATA_DIR = "./data/default";
static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesSnomedTuis.txt";
+ static private final String DEFAULT_ANAT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesAnatTuis.txt";
static private final String DEFAULT_MED_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesDrugTuis.txt";
static private final String DEFAULT_SOURCE_FILE = DEFAULT_DATA_DIR + "/CtakesSources.txt";
@@ -61,7 +62,10 @@ final public class CreatorProperties {
System.out.println(
"If an Input Tui List Path is not specified then the cTakes Snomed Tuis are used: " + DEFAULT_TUI_FILE );
System.out.println(
- "If an Input Drug Tui List Path is not specified then the cTakes Medication Tuis are used: "
+ "If an Input Anatomical Site Tui List Path is not specified then the defaults are used: "
+ + DEFAULT_ANAT_TUI_FILE );
+ System.out.println(
+ "If an Input Drug Tui List Path is not specified then the defaults are used: "
+ DEFAULT_MED_TUI_FILE );
System.out.println( "If a Source Type List Path is not specified then Snomed is used: " + DEFAULT_SOURCE_FILE );
}
@@ -86,6 +90,9 @@ final public class CreatorProperties {
if ( !Option.TUI_LIST.hasValue() ) {
Option.TUI_LIST.parseValue( Option.TUI_LIST.__key, DEFAULT_TUI_FILE );
}
+ if ( !Option.ANAT_TUI_LIST.hasValue() ) {
+ Option.ANAT_TUI_LIST.parseValue( Option.ANAT_TUI_LIST.__key, DEFAULT_ANAT_TUI_FILE );
+ }
if ( !Option.MED_TUI_LIST.hasValue() ) {
Option.MED_TUI_LIST.parseValue( Option.MED_TUI_LIST.__key, DEFAULT_MED_TUI_FILE );
}
@@ -106,6 +113,7 @@ final public class CreatorProperties {
ORANGE_BOOK( "Orangebook Path", "-ob" ),
FORMAT_DATA( "Format Data Directory", "-fd" ),
TUI_LIST( "Input Tui List Path", "-tui" ),
+ ANAT_TUI_LIST( "Anatomical Site Tui List Path", "-atui" ),
MED_TUI_LIST( "Medication Tui List Path", "-mtui" ),
// SEM_LIST( "Input Semantic Group List Path", "-sem" ),
SOURCE( "Source Type List Path", "-src" ),
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java Tue Nov 4 16:19:20 2014
@@ -18,6 +18,7 @@ final public class UmlsTermUtil {
static private enum DATA_FILE {
REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
+ REMOVAL_FUNCTION_TRIGGERS( "RemovalFunctionTriggers.txt" ),
REMOVAL_COLON_TRIGGERS( "RemovalColonTriggers.txt" ),
UNWANTED_PREFIXES( "UnwantedPrefixes.txt" ),
UNWANTED_SUFFIXES( "UnwantedSuffixes.txt" ),
@@ -37,6 +38,7 @@ final public class UmlsTermUtil {
final private Collection<String> _removalPrefixTriggers;
final private Collection<String> _removalSuffixTriggers;
final private Collection<String> _removalColonTriggers;
+ final private Collection<String> _removalFunctionTriggers;
final private Collection<String> _unwantedPrefixes;
final private Collection<String> _unwantedSuffixes;
final private Collection<String> _modifierSuffixes;
@@ -46,6 +48,7 @@ final public class UmlsTermUtil {
this( getDataPath( dataDir, DATA_FILE.REMOVAL_PREFIX_TRIGGERS ),
getDataPath( dataDir, DATA_FILE.REMOVAL_SUFFIX_TRIGGERS ),
getDataPath( dataDir, DATA_FILE.REMOVAL_COLON_TRIGGERS ),
+ getDataPath( dataDir, DATA_FILE.REMOVAL_FUNCTION_TRIGGERS ),
getDataPath( dataDir, DATA_FILE.UNWANTED_PREFIXES ),
getDataPath( dataDir, DATA_FILE.UNWANTED_SUFFIXES ),
getDataPath( dataDir, DATA_FILE.MODIFIER_SUFFIXES ),
@@ -53,12 +56,13 @@ final public class UmlsTermUtil {
}
public UmlsTermUtil( final String removalPrefixTriggersPath, final String removalSuffixTriggersPath,
- final String removalColonTriggersPath,
+ final String removalColonTriggersPath, final String removalFunctionTriggersPath,
final String unwantedPrefixesPath, final String unwantedSuffixesPath,
final String modifierSuffixesPath, final String abbreviationsPath ) {
_removalPrefixTriggers = FileUtil.readOneColumn( removalPrefixTriggersPath, "term removal Prefix Triggers" );
_removalSuffixTriggers = FileUtil.readOneColumn( removalSuffixTriggersPath, "term removal Suffix Triggers" );
_removalColonTriggers = FileUtil.readOneColumn( removalColonTriggersPath, "term removal Colon Triggers" );
+ _removalFunctionTriggers = FileUtil.readOneColumn( removalFunctionTriggersPath, "term removal Function Triggers" );
_unwantedPrefixes = FileUtil.readOneColumn( unwantedPrefixesPath, "unwanted Prefixes" );
_unwantedSuffixes = FileUtil.readOneColumn( unwantedSuffixesPath, "unwanted Suffixes" );
_modifierSuffixes = FileUtil.readOneColumn( modifierSuffixesPath, "modifier Suffixes" );
@@ -112,9 +116,9 @@ final public class UmlsTermUtil {
if ( extractAbbreviations ) {
// add embedded abbreviations
extractedTerms = extractAbbreviations( validText );
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractAcronyms( validText );
- }
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractAcronyms( validText );
+// }
}
if ( extractedTerms.isEmpty() ) {
extractedTerms = extractModifiers( validText );
@@ -123,36 +127,36 @@ final public class UmlsTermUtil {
extractedTerms.add( validText );
return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
}
- // Check for embedded and / or terms
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractColonParaTerms( validText );
- }
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractOrParaTerms( validText );
- }
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractColonBracketTerms( validText );
- }
- // if ( extractedTerms.isEmpty() ) {
- // extractedTerms = autoExtractAndBracketTerms( validText );
- // }
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractOrBracketTerms( validText );
- }
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = autoExtractAndOrOtherTerms( validText );
- }
- if ( !extractedTerms.isEmpty() ) {
- // System.out.println( validText );
- // for ( String et : extractedTerms ) {
- // System.out.println(" " + et);
- // }
- return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
- } else {
+// // Check for embedded and / or terms
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractColonParaTerms( validText );
+// }
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractOrParaTerms( validText );
+// }
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractColonBracketTerms( validText );
+// }
+// // if ( extractedTerms.isEmpty() ) {
+// // extractedTerms = autoExtractAndBracketTerms( validText );
+// // }
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractOrBracketTerms( validText );
+// }
+// if ( extractedTerms.isEmpty() ) {
+// extractedTerms = autoExtractAndOrOtherTerms( validText );
+// }
+// if ( !extractedTerms.isEmpty() ) {
+// // System.out.println( validText );
+// // for ( String et : extractedTerms ) {
+// // System.out.println(" " + et);
+// // }
+// return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
+// } else {
Collection<String> texts = new HashSet<>( 1 );
texts.add( validText );
return getFormattedTexts( getPluralTerms( getValidTexts( texts ) ), minWordLength, maxWordCount );
- }
+// }
}
static private Collection<String> getPluralTerms( final Collection<String> texts ) {
@@ -197,6 +201,11 @@ final public class UmlsTermUtil {
return false;
}
}
+ for ( String removalFunction : _removalFunctionTriggers ) {
+ if ( text.contains( removalFunction ) ) {
+ return false;
+ }
+ }
return true;
}
@@ -210,14 +219,14 @@ final public class UmlsTermUtil {
private String getValidText( final String text ) {
// remove form underlines
- if ( text.contains( "_ _ _" ) ) {
- final int lastParen = text.lastIndexOf( '(' );
- final int lastDash = text.indexOf( "_ _ _" );
- final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
- if ( deleteIndex > 0 ) {
- return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
- }
- }
+// if ( text.contains( "_ _ _" ) ) {
+// final int lastParen = text.lastIndexOf( '(' );
+// final int lastDash = text.indexOf( "_ _ _" );
+// final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
+// if ( deleteIndex > 0 ) {
+// return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+// }
+// }
// remove unmatched parentheses, brackets, etc.
// if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
// return getValidText( text.substring( 1 ).trim() );
@@ -264,10 +273,16 @@ final public class UmlsTermUtil {
strippedText = strippedText.substring( 0, strippedText.length() - suffix.length() ).trim();
}
}
+ if ( !isTextValid( strippedText ) ) {
+ return "";
+ }
}
if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
return "";
}
+// if ( strippedText.length() != text.trim().length() ) {
+// System.out.println( text.trim() + " > " + strippedText );
+// }
return strippedText;
}