You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2016/07/05 19:40:58 UTC
svn commit: r1751544 - in /ctakes/sandbox/dictionary-gui/src:
main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/
main/java/org/apache/ctakes/dictionary/creator/gui/main/
main/java/org/apache/ctakes/dictionary/creator/gui/umls/ main/java/org/apa...
Author: seanfinan
Date: Tue Jul 5 19:40:57 2016
New Revision: 1751544
URL: http://svn.apache.org/viewvc?rev=1751544&view=rev
Log:
DictionaryBuilder only parses mrconso once
MainPanel prepped for language selection, "source" vocabulary ignored
Concept improved to remove subsuming texts, recognize unwanted status (e.g. concept has a dose)
DoseUtil recognizes units in terms
MrconsoParser has exclusions by term type in vocabularies
UmlsTermUtil is cleaned up
RareWordDbWriter mostly project-specific changes
RareWordUtil logging
TextTokenizer minor refactor
Adding DoseUtilTester
Added:
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java
ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/
ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java
Modified:
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java
ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java Tue Jul 5 19:40:57 2016
@@ -8,10 +8,8 @@ import org.apache.logging.log4j.LogManag
import org.apache.logging.log4j.Logger;
import java.io.File;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Map;
+import java.util.*;
+import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
@@ -23,107 +21,142 @@ final public class DictionaryBuilder {
static private final Logger LOGGER = LogManager.getLogger( "DictionaryBuilder" );
- static private final String DEFAULT_DATA_DIR = "./data/default";
+// static private final String DEFAULT_DATA_DIR = "./data/default";
+static private final String DEFAULT_DATA_DIR = "./data/tiny";
static private final String CTAKES_APP_DB_PATH = "resources/org/apache/ctakes/dictionary/lookup/fast";
static private final String CTAKES_RES_MODULE = "ctakes-dictionary-lookup-fast-res";
static private final String CTAKES_RES_DB_PATH = CTAKES_RES_MODULE+"/src/main/" + CTAKES_APP_DB_PATH;
- static private final int MIN_TERM_LENGTH = 2;
- static private final int MIN_DRUG_TERM_LENGTH = 1;
+ static private final int MIN_CHAR_LENGTH = 2;
+ static private final int MAX_CHAR_LENGTH = 50;
+ static private final int MAX_WORD_COUNT = 12;
+ static private final int MAX_SYM_COUNT = 7;
+ static private final int MIN_DRUG_CHAR_LENGTH = 1;
private DictionaryBuilder() {}
+
static public boolean buildDictionary( final String umlsDirPath,
final String ctakesDirPath,
final String dictionaryName,
- final Collection<String> wantedSources,
+ final Collection<String> wantedLanguages,
final Collection<String> wantedTargets,
final Collection<Tui> wantedTuis ) {
// Set up the term utility
final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( DEFAULT_DATA_DIR );
- final Map<Long,Concept> anatomies = parseAnatomy( umlsTermUtil, umlsDirPath, wantedSources, wantedTargets, wantedTuis );
- final Map<Long,Concept> nonSpecials = parseNonSpecial( umlsTermUtil, umlsDirPath, wantedSources, wantedTargets, wantedTuis, anatomies
- .values() );
- final Map<Long,Concept> medications = parseMedication( umlsTermUtil, umlsDirPath, wantedTargets, wantedTuis );
- writeDatabase( ctakesDirPath, dictionaryName, anatomies, nonSpecials, medications );
+ final Map<Long,Concept> conceptMap = parseAll( umlsTermUtil, umlsDirPath, wantedLanguages, wantedTargets, wantedTuis );
-// Process process = ProcessBuilder( "java -Xmx2G -cp lib/hsqldb_1_8_0_10.jar org.hsqldb.util.SqlTool --rcfile " )
+ // special case for nitric oxide "no"
+ final Concept nitricOxide = conceptMap.get( 28128l );
+ if ( nitricOxide != null ) {
+ nitricOxide.removeTexts( Collections.singletonList( "no" ) );
+ }
+ // special case for nitric oxide synthase "nos"
+ final Concept nitricOxides = conceptMap.get( 132555l );
+ if ( nitricOxides != null ) {
+ nitricOxides.removeTexts( Arrays.asList( "nos", "synthase" ) );
+ }
+
+ writeDatabase( ctakesDirPath, dictionaryName, conceptMap );
return true;
}
- static private Map<Long,Concept> parseAnatomy( final UmlsTermUtil umlsTermUtil,
- final String umlsDirPath,
- final Collection<String> wantedSources,
- final Collection<String> wantedTargets,
- final Collection<Tui> wantedTuis ) {
- LOGGER.info( "Parsing Anatomical Site Concepts" );
- final Collection<Tui> wantedAnatTuis = new ArrayList<>( wantedTuis );
- wantedAnatTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
- final Map<Long,Concept> concepts = ConceptMapFactory.createConceptMap( umlsDirPath, wantedSources,
- wantedAnatTuis, "Anatomical Site" );
+
+
+
+ static private Map<Long,Concept> parseAll( final UmlsTermUtil umlsTermUtil,
+ final String umlsDirPath,
+ final Collection<String> wantedLanguages,
+ final Collection<String> wantedTargets,
+ final Collection<Tui> wantedTuis ) {
+ LOGGER.info( "Parsing Concepts" );
+ // Create a map of Cuis to empty Concepts for all wanted Tuis and source vocabularies
+ final Map<Long,Concept> conceptMap
+ = ConceptMapFactory.createInitialConceptMap( umlsDirPath, wantedTargets, wantedTuis );
// Fill in information for all valid concepts
- MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, MIN_TERM_LENGTH, 7 );
- LOGGER.info( "Done Parsing Anatomical Site Concepts" );
- return concepts;
+ MrconsoParser.parseAllConcepts( umlsDirPath, conceptMap, wantedTargets, umlsTermUtil,
+ wantedLanguages, true, MIN_CHAR_LENGTH, MAX_CHAR_LENGTH, MAX_WORD_COUNT, MAX_SYM_COUNT );
+ removeUnwantedConcepts( conceptMap );
+ // remove concepts that have only drug tuis but are not in rxnorm
+ removeNonRxNormDrugs( conceptMap, wantedTuis );
+ // remove concepts that are in rxnorm but have non-drug tuis
+ removeRxNormNonDrugs( conceptMap, wantedTuis );
+ // Cull non-ANAT texts by ANAT texts as determined by ANAT tuis
+ removeAnatTexts( conceptMap.values(), wantedTuis );
+ conceptMap.values().forEach( Concept::minimizeTexts );
+ LOGGER.info( "Done Parsing Concepts" );
+ return conceptMap;
+ }
+
+ /**
+ * Remove any concepts that are unwanted - don't have any text from a desired vocabulary
+ * @param conceptMap -
+ */
+ static private void removeUnwantedConcepts( final Map<Long, Concept> conceptMap ) {
+ final Collection<Long> empties = conceptMap.entrySet().stream()
+ .filter( e -> e.getValue().isUnwanted() )
+ .map( Map.Entry::getKey )
+ .collect( Collectors.toSet() );
+ conceptMap.keySet().removeAll( empties );
}
- static private Map<Long,Concept> parseNonSpecial( final UmlsTermUtil umlsTermUtil,
- final String umlsDirPath,
- final Collection<String> wantedSources,
- final Collection<String> wantedTargets,
- final Collection<Tui> wantedTuis,
- final Collection<Concept> anatomyConcepts ) {
- LOGGER.info( "Parsing Non-Anatomical Site, Non-Medication Concepts" );
- final Collection<Tui> wantedNormTuis = new ArrayList<>( wantedTuis );
- wantedNormTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
-// wantedNormTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
- final Map<Long,Concept> concepts = ConceptMapFactory.createConceptMap( umlsDirPath, wantedSources,
- wantedNormTuis, "Non-Anatomical Site, Non-Medication" );
- // We don't want anatomical site texts to be anything but, so make them unavailable for other concepts
- final Collection<String> anatomyTexts = anatomyConcepts.stream()
+ static private Collection<String> getAnatTexts( final Collection<Concept> concepts, final Collection<Tui> wantedTuis ) {
+ final Collection<Tui> wantedAnatTuis = new ArrayList<>( wantedTuis );
+ wantedAnatTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
+ return concepts.stream()
+ .filter( c -> c.hasTui( wantedAnatTuis ) )
.map( Concept::getTexts )
.flatMap( Collection::stream )
.collect( Collectors.toSet() );
- // Fill in information for all valid concepts
- MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil,
- anatomyTexts, true, MIN_TERM_LENGTH, 7 );
- LOGGER.info( "Done Parsing Non-Anatomical Site, Non-Medication Concepts" );
- return concepts;
}
- static private Map<Long,Concept> parseMedication( final UmlsTermUtil umlsTermUtil,
- final String umlsDirPath,
- final Collection<String> wantedTargets,
+ static private void removeAnatTexts( final Collection<Concept> concepts,
+ final Collection<Tui> wantedTuis,
+ final Collection<String> anatTexts ) {
+ final Collection<Tui> nonAnatTuis = new ArrayList<>( wantedTuis );
+ nonAnatTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
+ concepts.stream()
+ .filter( c -> c.hasTui( nonAnatTuis ) )
+ .forEach( c -> c.removeTexts( anatTexts ) );
+ }
+
+ static private void removeAnatTexts( final Collection<Concept> concepts,
final Collection<Tui> wantedTuis ) {
- LOGGER.info( "Parsing Medication Concepts" );
- final Collection<Tui> wantedDrugTuis = new ArrayList<>( wantedTuis );
- wantedDrugTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
- final Map<Long,Concept> concepts = ConceptMapFactory.createRxConceptMap( umlsDirPath, wantedDrugTuis );
- // Fill in information for all valid concepts
- MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, MIN_DRUG_TERM_LENGTH, 11 );
- // special case for nitric oxide "no"
- final Concept nitricOxide = concepts.get( 28128l );
- if ( nitricOxide != null ) {
- nitricOxide.removeText( "no" );
- }
- LOGGER.info( "Done Parsing Medication Concepts" );
- return concepts;
+ final Collection<String> anatTexts = getAnatTexts( concepts, wantedTuis );
+ removeAnatTexts( concepts, wantedTuis, anatTexts );
}
- static private boolean writeDatabase( final String ctakesDirPath,
- final String dictionaryName,
- final Map<Long,Concept> anatomies,
- final Map<Long,Concept> nonSpecials,
- final Map<Long,Concept> medications ) {
- mergeConcepts( nonSpecials, anatomies );
- mergeConcepts( nonSpecials, medications );
- return writeDatabase( ctakesDirPath, dictionaryName, nonSpecials );
+ static private void removeNonRxNormDrugs( final Map<Long,Concept> conceptMap, Collection<Tui> wantedTuis ) {
+ final Collection<Tui> drugTuis = new ArrayList<>( wantedTuis );
+ drugTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
+ final Predicate<Map.Entry<Long,Concept>> isNonRxNormDrug
+ = e -> drugTuis.containsAll( e.getValue().getTuis() )
+ && !e.getValue().getVocabularies().contains( "RXNORM" );
+ final Collection<Long> removalCuis = conceptMap.entrySet().stream()
+ .filter( isNonRxNormDrug )
+ .map( Map.Entry::getKey )
+ .collect( Collectors.toSet() );
+ conceptMap.keySet().removeAll( removalCuis );
+ }
+
+ static private void removeRxNormNonDrugs( final Map<Long,Concept> conceptMap, Collection<Tui> wantedTuis ) {
+ final Collection<Tui> nonDrugTuis = new ArrayList<>( wantedTuis );
+ nonDrugTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
+ final Predicate<Map.Entry<Long,Concept>> isRxNormNonDrug
+ = e -> e.getValue().getVocabularies().contains( "RXNORM" )
+ && nonDrugTuis.containsAll( e.getValue().getTuis() );
+ final Collection<Long> removalCuis = conceptMap.entrySet().stream()
+ .filter( isRxNormNonDrug )
+ .map( Map.Entry::getKey )
+ .collect( Collectors.toSet() );
+ conceptMap.keySet().removeAll( removalCuis );
}
+
static private boolean writeDatabase( final String ctakesDirPath,
final String dictionaryName,
- final Map<Long,Concept> concepts ) {
+ final Map<Long,Concept> conceptMap ) {
final File ctakesRoot = new File( ctakesDirPath );
String databaseDirPath = ctakesDirPath + "/" + CTAKES_APP_DB_PATH;
if ( Arrays.asList( ctakesRoot.list() ).contains( CTAKES_RES_MODULE ) ) {
@@ -136,20 +169,7 @@ final public class DictionaryBuilder {
return false;
}
final String url = HsqlUtil.URL_PREFIX + databaseDirPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" + dictionaryName;
- return RareWordDbWriter.writeConcepts( concepts, url, "sa", "" );
- }
-
- static private void mergeConcepts( final Map<Long,Concept> mainConepts,
- final Map<Long,Concept> mergingConcepts ) {
- for ( Map.Entry<Long,Concept> mergable : mergingConcepts.entrySet() ) {
- final Concept mainConcept = mainConepts.get( mergable.getKey() );
- if ( mainConcept == null ) {
- mainConepts.put( mergable.getKey(), mergable.getValue() );
- continue;
- }
- mainConcept.mergeWith( mergable.getValue() );
- }
- mergingConcepts.clear();
+ return RareWordDbWriter.writeConcepts( conceptMap, url, "sa", "" );
}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java Tue Jul 5 19:40:57 2016
@@ -21,6 +21,7 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Collections;
import java.util.HashSet;
/**
@@ -48,8 +49,6 @@ final public class MainPanel extends JPa
final JComponent centerPanel = new JPanel( new GridLayout( 1, 2 ) );
centerPanel.add( createSourceTable( _sourceModel ) );
centerPanel.add( createTuiTable( _tuiModel ) );
-// add( createTuiTable( _tuiModel ), BorderLayout.WEST );
-// add( createSourceTable( _sourceModel ), BorderLayout.EAST );
add( centerPanel, BorderLayout.CENTER );
add( createGoPanel(), BorderLayout.SOUTH );
}
@@ -145,30 +144,37 @@ final public class MainPanel extends JPa
private void buildDictionary( final String dictionaryName ) {
SwingUtilities.invokeLater(
new DictionaryBuildRunner( _umlsDirPath, _ctakesPath, dictionaryName,
- _sourceModel.getWantedSources(), _sourceModel.getWantedTargets(), _tuiModel.getWantedTuis() ) );
+ _sourceModel.getWantedTargets(), _tuiModel.getWantedTuis() ) );
}
+ private void error( final String title, final String message ) {
+ LOGGER.error( message );
+ JOptionPane.showMessageDialog( MainPanel.this, message, title, JOptionPane.ERROR_MESSAGE );
+ }
+
+
+
private class DictionaryBuildRunner implements Runnable {
private final String __umlsDirPath;
private final String __ctakesDirPath;
private final String __dictionaryName;
- private final Collection<String> __wantedSources;
private final Collection<String> __wantedTargets;
private final Collection<Tui> __wantedTuis;
private DictionaryBuildRunner( final String umlsDirPath, final String ctakesDirPath, final String dictionaryName,
- final Collection<String> wantedSources, final Collection<String> wantedTargets,
+ final Collection<String> wantedTargets,
final Collection<Tui> wantedTuis ) {
__umlsDirPath = umlsDirPath;
__ctakesDirPath = ctakesDirPath;
__dictionaryName = dictionaryName;
- __wantedSources = new ArrayList<>( wantedSources );
__wantedTargets = new ArrayList<>( wantedTargets );
__wantedTuis = new ArrayList<>( wantedTuis );
}
+
public void run() {
SwingUtilities.getRoot( MainPanel.this ).setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
if ( DictionaryBuilder.buildDictionary( __umlsDirPath, __ctakesDirPath, __dictionaryName,
- __wantedSources, __wantedTargets, __wantedTuis ) ) {
+ Collections.singletonList( "ENG" ),
+ __wantedTargets, __wantedTuis ) ) {
final String message = "Dictionary " + __dictionaryName + " successfully built in " + __ctakesDirPath;
LOGGER.info( message );
JOptionPane.showMessageDialog( MainPanel.this, message, "Dictionary Built", JOptionPane.INFORMATION_MESSAGE );
@@ -179,10 +185,7 @@ final public class MainPanel extends JPa
}
}
- private void error( final String title, final String message ) {
- LOGGER.error( message );
- JOptionPane.showMessageDialog( MainPanel.this, message, title, JOptionPane.ERROR_MESSAGE );
- }
+
private class UmlsDirListener implements ActionListener {
public void actionPerformed( final ActionEvent event ) {
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java Tue Jul 5 19:40:57 2016
@@ -3,10 +3,7 @@ package org.apache.ctakes.dictionary.cre
import org.apache.ctakes.dictionary.creator.util.collection.CollectionMap;
import org.apache.ctakes.dictionary.creator.util.collection.HashSetMap;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.HashSet;
+import java.util.*;
/**
* Author: SPF
@@ -18,11 +15,14 @@ final public class Concept {
static public String PREFERRED_TERM_UNKNOWN = "Unknown Preferred Term";
private String _preferredText = null;
+ private boolean _hasDose = false;
+
final private Collection<String> _texts;
final private CollectionMap<String, String, ? extends Collection<String>> _codes;
final private Collection<Tui> _tuis;
+
public Concept() {
_codes = new HashSetMap<>( 0 );
_texts = new HashSet<>( 1 );
@@ -33,14 +33,41 @@ final public class Concept {
_texts.addAll( texts );
}
- public void removeText( final String text ) {
- _texts.remove( text );
+ public void removeTexts( final Collection<String> texts ) {
+ _texts.removeAll( texts );
}
public Collection<String> getTexts() {
return _texts;
}
+ public void minimizeTexts() {
+ if ( _texts.size() < 2 ) {
+ return;
+ }
+ final List<String> textList = new ArrayList<>( _texts );
+ final Collection<String> extensionTexts = new HashSet<>();
+ for ( int i=0; i<textList.size()-1; i++ ) {
+ final String iText = textList.get( i );
+ for ( int j=i+1; j<textList.size(); j++ ) {
+ final String jText = textList.get( j );
+ if ( textContained( jText, iText ) ) {
+ extensionTexts.add( jText );
+ } else if ( textContained( iText, jText ) ) {
+ extensionTexts.add( iText );
+ }
+ }
+ }
+ _texts.removeAll( extensionTexts );
+ }
+
+ static private boolean textContained( final String containerText, final String containedText ) {
+ final int index = containerText.indexOf( containedText );
+ return index >= 0
+ && ( index == 0 || containerText.charAt( index-1 ) == ' ' )
+ && ( index+containedText.length() == containerText.length() || containerText.charAt( index + containedText.length() ) == ' ' );
+ }
+
public void setPreferredText( final String text ) {
_preferredText = text;
}
@@ -52,17 +79,16 @@ final public class Concept {
return PREFERRED_TERM_UNKNOWN;
}
- public void addCode( final String vocabulary, final String code ) {
- _codes.placeValue( vocabulary, code );
- Vocabulary.getInstance().addVocabulary( vocabulary, code );
+ public void addCode( final String source, final String code ) {
+ _codes.placeValue( source, code );
}
public Collection<String> getVocabularies() {
return _codes.keySet();
}
- public Collection<String> getCodes( final String vocabulary ) {
- final Collection<String> codes = _codes.getCollection( vocabulary );
+ public Collection<String> getCodes( final String source ) {
+ final Collection<String> codes = _codes.getCollection( source );
if ( codes == null ) {
return Collections.emptyList();
}
@@ -77,17 +103,24 @@ final public class Concept {
return _tuis;
}
- public void mergeWith( final Concept concept ) {
- addTexts( concept.getTexts() );
- concept.getTuis().stream().forEach( this::addTui );
- if ( _preferredText == null || _preferredText.isEmpty() ) {
- setPreferredText( concept.getPreferredText() );
- }
- for ( String vocabulary : concept.getVocabularies() ) {
- for ( String code : concept.getCodes( vocabulary ) ) {
- addCode( vocabulary, code );
- }
- }
+ public boolean hasTui( final Collection<Tui> tuis ) {
+ return _tuis.stream().anyMatch( tuis::contains );
+ }
+
+ public boolean isEmpty() {
+ return _texts.isEmpty() || _codes.isEmpty();
+ }
+
+ public void setHasDose() {
+ _hasDose = true;
+ }
+
+ public boolean hasDose() {
+ return _hasDose;
+ }
+
+ public boolean isUnwanted() {
+ return hasDose() || isEmpty();
}
}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java Tue Jul 5 19:40:57 2016
@@ -17,45 +17,23 @@ public class ConceptMapFactory {
static private final Logger LOGGER = LogManager.getLogger( "ConceptMapFactory" );
-
- static public Map<Long,Concept> createConceptMap( final String umlsDirPath,
- final Collection<String> wantedSources,
- final Collection<Tui> wantedTuis,
- final String tuiTypes ) {
- if ( wantedTuis.isEmpty() ) {
- LOGGER.warn( "No valid " + tuiTypes + " Tuis" );
+ static public Map<Long,Concept> createInitialConceptMap( final String umlsDirPath,
+ final Collection<String> wantedSources,
+ final Collection<Tui> wantedTuis ) {
+ if ( wantedSources.isEmpty() ) {
+ LOGGER.warn( "No source vocabularies specified" );
return Collections.emptyMap();
}
-
- // get the valid Cuis for all wanted Tuis
- final Map<Long, Concept> concepts = MrstyParser.createConceptsForTuis( umlsDirPath, wantedTuis );
- if ( concepts.isEmpty() ) {
- LOGGER.warn( "No valid " + tuiTypes + " Tuis" );
- return Collections.emptyMap();
- }
- // filter out the Cuis that do not belong to the given sources
- final Collection<Long> validVocabularyCuis = MrconsoParser.getValidVocabularyCuis( umlsDirPath, wantedSources );
- concepts.keySet().retainAll( validVocabularyCuis );
- LOGGER.info( "Total Valid Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
- return concepts;
- }
-
- static public Map<Long,Concept> createRxConceptMap( final String umlsDirPath,
- final Collection<Tui> wantedTuis ) {
if ( wantedTuis.isEmpty() ) {
- LOGGER.warn( "No valid Medication Tuis" );
+ LOGGER.warn( "No TUIs specified" );
return Collections.emptyMap();
}
// get the valid Cuis for all wanted Tuis
final Map<Long, Concept> concepts = MrstyParser.createConceptsForTuis( umlsDirPath, wantedTuis );
- if ( concepts.isEmpty() ) {
- LOGGER.warn( "No valid Medication Tuis" );
- return Collections.emptyMap();
- }
// filter out the Cuis that do not belong to the given sources
- final Collection<Long> validVocabularyCuis = MrconsoParser.getValidRxNormCuis( umlsDirPath );
+ final Collection<Long> validVocabularyCuis = MrconsoParser.getValidVocabularyCuis( umlsDirPath, wantedSources );
concepts.keySet().retainAll( validVocabularyCuis );
- LOGGER.info( "Total Valid Medication Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
+ LOGGER.info( "Total Valid Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
return concepts;
}
Added: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java?rev=1751544&view=auto
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java (added)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java Tue Jul 5 19:40:57 2016
@@ -0,0 +1,71 @@
+package org.apache.ctakes.dictionary.creator.gui.umls;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.logging.Logger;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 7/14/14
+ */
+final public class DoseUtil {
+
+ private DoseUtil() {
+ }
+
+ static private final Logger LOGGER = Logger.getLogger( "DoseUtil" );
+
+
+ // some of these are not strictly units, e.g. "ud" : "ut dictum" or "as directed"
+ // but can be properly trimmed as they appear in the same place as would a unit
+
+ static private final String[] UNIT_ARRAY = { "gr", "gm", "gram", "grams", "g",
+ "mg", "milligram", "milligrams", "kg",
+ "microgram", "micrograms", "mcg", "ug",
+ "millicurie", "mic", "oz",
+ "lf", "ml", "liter", "milliliter", "l",
+ "milliequivalent", "meq",
+ "hour", "hours", "hr", //"day", "days", "daily", //"24hr", "8hr", "12hr",
+ "week", "weeks", "weekly", "biweekly",
+ "usp", "titradose",
+ "unit", "units", "unt", "iu", "u", "mmu",
+ "mm", "cm",
+ "gauge", "intl","au", "bau", "mci", "ud",
+ "ww", "vv", "wv",
+ "%", "percent", "%ww", "%vv", "%wv",
+ "actuation", "actuat", "vial", "vil", "packet", "pkt" };
+ static private final Collection<String> UNITS = Arrays.asList( UNIT_ARRAY );
+
+
+ static public boolean hasUnit( final String text ) {
+ final String[] splits = text.split( "\\s+" );
+ if ( splits.length <= 1 ) {
+ return false;
+ }
+ for ( String split : splits ) {
+ for ( String unit : UNITS ) {
+ if ( !split.endsWith( unit ) ) {
+ continue;
+ }
+ final int diff = split.length() - unit.length();
+ if ( diff == 0 ) {
+ return true;
+ }
+ boolean isAmount = true;
+ for ( int i=0; i<diff; i++ ) {
+ if ( !Character.isDigit( split.charAt( i ) ) ) {
+ isAmount = false;
+ break;
+ }
+ }
+ if ( isAmount ) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+
+}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java Tue Jul 5 19:40:57 2016
@@ -2,6 +2,7 @@ package org.apache.ctakes.dictionary.cre
import org.apache.ctakes.dictionary.creator.util.FileUtil;
+import org.apache.ctakes.dictionary.creator.util.TextTokenizer;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -22,54 +23,105 @@ final public class MrconsoParser {
static private final String MR_CONSO_SUB_PATH = "/META/MRCONSO.RRF";
+ // TODO - put all exclusions in a data file, display for user, allow changes and save, etc.
+
+ // https://www.nlm.nih.gov/research/umls/sourcereleasedocs
// https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT_US/stats.html
// https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/RXNORM/stats.html
- static private final String[] EXCLUSION_TYPES = { "FN", "CCS", "CA2", "CA3", "PSN", "TMSY",
+ static private final String[] DEFAULT_EXCLUSIONS = { "FN", "CCS", "CA2", "CA3", "PSN", "TMSY",
"SBD", "SBDC", "SBDF", "SBDG",
- "SCD", "SCDC", "SCDF", "SCDG", "BPCK", "GPCK" };
- static private final String EXCLUSION_RXNORM = "SY";
+ "SCD", "SCDC", "SCDF", "SCDG", "BPCK", "GPCK", "XM" };
+
+ static private final String[] SNOMED_OBSOLETES = { "OF", "MTH_OF", "OAP", "MTH_OAP", "OAF", "MTH_OAF",
+ "IS", "MTH_IS", "OAS", "MTH_OAS",
+ "OP", "MTH_OP" };
+ // Snomed OF = Obsolete Fully Specified Name MTH_OF
+ // Snomed OAP = Obsolete Active Preferred Term MTH_OAP
+ // Snomed OAF = Obsolete Active Full Name MTH_OAF
+ // Snomed IS = Obsolete Synonym MTH_IS
+ // Snomed OAS = Obsolete Active Synonym MTH_OAS
+ // Snomed OP = Obsolete Preferred Name MTH_OP
+ // Snomed PT = Preferred Term , but we don't need that for valid cuis ... or do we want only those with preferred terms?
+ // Snomed PTGB = British Preferred Term
+
+ // GO has same snomed obsoletes +
+ // GO EOT = Obsolete Entry Term
+ // HPO has same snomed obsoletes
+
+ // MTHSPL - DP is Drug Product as is MTH_RXN_DP MTHSPL SU is active substance
+ // VANDF AB is abbreviation for drug VANDF CD is Clinical Drug. Both are dosed.
+ // NDFRT AB? Looks like ingredient. NDFRT PT can be dosed
+
+ static private final String[] GO_OBSOLETES = { "EOT" };
+
+ static private final String[] LOINC_OBSOLETES = { "LO", "OLC", "MTH_LO", "OOSN" };
+
+ static private final String[] MEDRA_OBSOLETES = { "OL", "MTH_OL" };
+
+ static private final String[] MESH_EXCLUSIONS = { "N1", "EN", "PEN" };
+
+ static private final String[] RXNORM_EXCLUSIONS = { "SY" }; // What is IN ? Ingredient?
+
+ static private final String[] NCI_EXCLUSIONS = { "CSN" };
+
+ // Related to, but not synonymous
+ static private final String[] UMDNS_EXCLUSIONS = { "RT" };
private MrconsoParser() {
}
- static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
- final Map<Long, Concept> concepts,
- final Collection<String> wantedTargets,
- final UmlsTermUtil umlsTermUtil ) {
- return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, 1, Integer.MAX_VALUE );
+ static public String[] getDefaultExclusions() {
+ return DEFAULT_EXCLUSIONS;
}
- static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
- final Map<Long, Concept> concepts,
- final Collection<String> wantedTargets,
- final UmlsTermUtil umlsTermUtil,
- final int minCharLength,
- final int maxWordCount ) {
- return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, true, minCharLength, maxWordCount );
+ static public String[] getSnomedExclusions() {
+ final String[] defaults = getDefaultExclusions();
+ final String[] exclusionTypes = Arrays.copyOf( defaults,
+ defaults.length + SNOMED_OBSOLETES.length );
+ System.arraycopy( SNOMED_OBSOLETES, 0, exclusionTypes, defaults.length, SNOMED_OBSOLETES.length );
+ return exclusionTypes;
}
- static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
- final Map<Long, Concept> concepts,
- final Collection<String> wantedTargets,
- final UmlsTermUtil umlsTermUtil,
- final boolean extractAbbreviations,
- final int minWordLength,
- final int maxWordCount ) {
- return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, Collections.emptyList(),
- extractAbbreviations, minWordLength, maxWordCount );
+ static public String[] getNonRxnormExclusions() {
+ final String[] snomeds = getSnomedExclusions();
+ final String[] exclusionTypes = Arrays.copyOf( snomeds,
+ snomeds.length
+ + GO_OBSOLETES.length
+ + LOINC_OBSOLETES.length
+ + MEDRA_OBSOLETES.length
+ + MESH_EXCLUSIONS.length
+ + NCI_EXCLUSIONS.length
+ + UMDNS_EXCLUSIONS.length );
+ int start = snomeds.length;
+ System.arraycopy( GO_OBSOLETES, 0, exclusionTypes, start, GO_OBSOLETES.length );
+ start += GO_OBSOLETES.length;
+ System.arraycopy( LOINC_OBSOLETES, 0, exclusionTypes, start, LOINC_OBSOLETES.length );
+ start += LOINC_OBSOLETES.length;
+ System.arraycopy( MEDRA_OBSOLETES, 0, exclusionTypes, start, MEDRA_OBSOLETES.length );
+ start += MEDRA_OBSOLETES.length;
+ System.arraycopy( MESH_EXCLUSIONS, 0, exclusionTypes, start, MESH_EXCLUSIONS.length );
+ start += MESH_EXCLUSIONS.length;
+ System.arraycopy( NCI_EXCLUSIONS, 0, exclusionTypes, start, NCI_EXCLUSIONS.length );
+ start += NCI_EXCLUSIONS.length;
+ System.arraycopy( UMDNS_EXCLUSIONS, 0, exclusionTypes, start, UMDNS_EXCLUSIONS.length );
+ return exclusionTypes;
}
- static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
+
+ static public Map<Long, Concept> parseAllConcepts( final String umlsDirPath,
final Map<Long, Concept> concepts,
final Collection<String> wantedTargets,
final UmlsTermUtil umlsTermUtil,
- final Collection<String> unwantedTexts,
+ final Collection<String> languages,
final boolean extractAbbreviations,
- final int minWordLength,
- final int maxWordCount ) {
+ final int minCharLength,
+ final int maxCharLength,
+ final int maxWordCount,
+ final int maxSymCount ) {
final String mrconsoPath = umlsDirPath + MR_CONSO_SUB_PATH;
- LOGGER.info( "Compiling map of Umls Cuis and Texts from " + mrconsoPath );
+ final Collection<String> invalidTypeSet = new HashSet<>( Arrays.asList( getNonRxnormExclusions() ) );
+ LOGGER.info( "Compiling map of Concepts from " + mrconsoPath );
long lineCount = 0;
try ( final BufferedReader reader = FileUtil.createReader( mrconsoPath ) ) {
List<String> tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
@@ -78,28 +130,52 @@ final public class MrconsoParser {
if ( lineCount % 100000 == 0 ) {
LOGGER.info( "File Line " + lineCount );
}
- if ( tokens.size() > TEXT._index && getToken( tokens, LANGUAGE ).equals( "ENG" ) ) {
- final Long cuiCode = CuiCodeUtil.getInstance().getCuiCode( getToken( tokens, CUI ) );
- final Concept concept = concepts.get( cuiCode );
- if ( concept == null ) {
- tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
- continue;
- }
- final String source = getToken( tokens, SOURCE );
- if ( wantedTargets.contains( source ) ) {
- concept.addCode( source, getToken( tokens, SOURCE_CODE ) );
- }
- final String text = getToken( tokens, TEXT );
- if ( getToken( tokens, STATUS ).equals( "P" ) && getToken( tokens, FORM ).equals( "PF" ) ) {
- concept.setPreferredText( text );
- }
- Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( text, extractAbbreviations,
- minWordLength, maxWordCount );
- if ( formattedTexts == null || formattedTexts.isEmpty() ) {
- tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
- continue;
+ if ( !isRowOk( tokens, languages, invalidTypeSet ) ) {
+ tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+ continue;
+ }
+ final Long cuiCode = CuiCodeUtil.getInstance().getCuiCode( getToken( tokens, CUI ) );
+ final Concept concept = concepts.get( cuiCode );
+ if ( concept == null ) {
+ // cui for current row is unwanted
+ tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+ continue;
+ }
+ final String text = getToken( tokens, TEXT );
+ if ( isPreferredTerm( tokens ) ) {
+ concept.setPreferredText( text );
+ }
+ final String source = getToken( tokens, SOURCE );
+ if ( wantedTargets.contains( source ) ) {
+ final String code = getToken( tokens, SOURCE_CODE );
+ if ( !code.equals( "NOCODE" ) ) {
+ Vocabulary.getInstance().addVocabulary( source, code );
+ concept.addCode( source, code );
}
- formattedTexts.removeAll( unwantedTexts );
+ }
+ final String tokenizedText = TextTokenizer.getTokenizedText( text );
+ if ( tokenizedText == null || tokenizedText.isEmpty()
+ || !umlsTermUtil.isTextValid( tokenizedText ) ) {
+ // no tokenizable text or tokenized text is invalid for some reason
+ tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+ continue;
+ }
+ if ( DoseUtil.hasUnit( tokenizedText ) ) {
+ concept.setHasDose();
+ tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+ continue;
+ }
+ final String strippedText = umlsTermUtil.getStrippedText( tokenizedText );
+ if ( strippedText == null || strippedText.isEmpty()
+ || UmlsTermUtil.isTextTooShort( strippedText, minCharLength )
+ || UmlsTermUtil.isTextTooLong( strippedText, maxCharLength, maxWordCount, maxSymCount ) ) {
+ // after stripping unwanted prefixes and suffixes there is no valid text
+ tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+ continue;
+ }
+ final Collection<String> formattedTexts
+ = umlsTermUtil.getFormattedTexts( strippedText, extractAbbreviations, minCharLength, maxCharLength, maxWordCount, maxSymCount );
+ if ( formattedTexts != null && !formattedTexts.isEmpty() ) {
concept.addTexts( formattedTexts );
}
tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
@@ -112,6 +188,27 @@ final public class MrconsoParser {
}
+ static private boolean isRowOk( final List<String> tokens,
+ final Collection<String> languages,
+ final Collection<String> invalidTypeSet ) {
+ if ( tokens.size() <= TEXT._index || !languages.contains( getToken( tokens, LANGUAGE ) ) ) {
+ return false;
+ }
+ final String type = getToken( tokens, TERM_TYPE );
+ if ( invalidTypeSet.contains( type ) ) {
+ return false;
+ }
+ // "Synonyms" are actually undesirable in the rxnorm vocabulary
+ final String source = getToken( tokens, SOURCE );
+ return !( source.equals( "RXNORM" ) && type.equals( "SY" ) );
+ }
+
+
+ static private boolean isPreferredTerm( final List<String> tokens ) {
+ return getToken( tokens, STATUS ).equals( "P" ) && getToken( tokens, FORM ).equals( "PF" );
+ }
+
+
/**
* Can cull the given collection of cuis
*
@@ -121,20 +218,18 @@ final public class MrconsoParser {
*/
static public Collection<Long> getValidVocabularyCuis( final String umlsDirPath,
final Collection<String> sourceVocabularies ) {
- return getValidVocabularyCuis( umlsDirPath, sourceVocabularies, EXCLUSION_TYPES );
+ return getValidVocabularyCuis( umlsDirPath, sourceVocabularies, getDefaultExclusions() );
}
- /**
- * Can cull the given collection of cuis
- *
- * @param umlsDirPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
- * @return Subset of cuis that exist in in the given sources
- */
- static public Collection<Long> getValidRxNormCuis( final String umlsDirPath ) {
- final String[] exclusionTypes = Arrays.copyOf( EXCLUSION_TYPES, EXCLUSION_TYPES.length + 1 );
- exclusionTypes[ EXCLUSION_TYPES.length ] = EXCLUSION_RXNORM;
- return getValidVocabularyCuis( umlsDirPath, Collections.singletonList( "RXNORM" ), exclusionTypes );
- }
+// /**
+// * Can cull the given collection of cuis
+// *
+// * @param umlsDirPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
+// * @return Subset of cuis that exist in the given sources
+// */
+// static public Collection<Long> getValidRxNormCuis( final String umlsDirPath ) {
+// return getValidVocabularyCuis( umlsDirPath, Collections.singletonList( "RXNORM" ), getRxnormExclusions() );
+// }
/**
* Can cull the given collection of cuis
@@ -173,21 +268,6 @@ final public class MrconsoParser {
return validCuis;
}
-// /**
-// * Given a collection of cuis, returns all of the cuis that don't exist for the given source types
-// *
-// * @param rrfPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
-// * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
-// * @param cuiCodes current list of cui codes
-// * @return Subset of cuis that don't exist in in the given sources
-// */
-// static public Collection<Long> getSourceTypeInvalidCuis( final String rrfPath,
-// final Collection<String> sourceTypes,
-// final Collection<Long> cuiCodes ) {
-// final Collection<Long> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuiCodes );
-// final Predicate<Long> validCui = validCuis::contains;
-// return cuiCodes.stream().filter( validCui.negate() ).collect( Collectors.toSet() );
-// }
static private String getToken( final List<String> tokens, final MrconsoIndex mrconsoIndex ) {
return tokens.get( mrconsoIndex._index );
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java Tue Jul 5 19:40:57 2016
@@ -1,11 +1,12 @@
package org.apache.ctakes.dictionary.creator.gui.umls;
import org.apache.ctakes.dictionary.creator.util.FileUtil;
-import org.apache.ctakes.dictionary.creator.util.TextTokenizer;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
/**
@@ -18,7 +19,7 @@ import java.util.HashSet;
final public class UmlsTermUtil {
- static private enum DATA_FILE {
+ private enum DATA_FILE {
REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
REMOVAL_FUNCTION_TRIGGERS( "RemovalFunctionTriggers.txt" ),
@@ -29,7 +30,7 @@ final public class UmlsTermUtil {
RIGHT_ABBREVIATIONS( "RightAbbreviations.txt" );
final private String __name;
- private DATA_FILE( final String name ) {
+ DATA_FILE( final String name ) {
__name = name;
}
}
@@ -72,111 +73,7 @@ final public class UmlsTermUtil {
_abbreviations = FileUtil.readOneColumn( abbreviationsPath, "Abbreviations to expand" );
}
- public Collection<String> getFormattedTexts( final String text ) {
- return getFormattedTexts( text, true, 1, Integer.MAX_VALUE );
- }
-
- public Collection<String> getFormattedTexts( final Collection<String> extractedTerms,
- final int minWordLength, final int maxWordCount ) {
- final Collection<String> removalTexts = new HashSet<>();
- for ( String term : extractedTerms ) {
- if ( term.length() < minWordLength ) {
- removalTexts.add( term );
- continue;
- }
- final String[] splits = term.split( "\\s+" );
- if ( splits.length > maxWordCount ) {
- int count = 0;
- for ( String split : splits ) {
- if ( split.length() > 2 ) {
- count++;
- if ( count > maxWordCount ) {
- removalTexts.add( term );
- break;
- }
- }
- }
- }
- }
- extractedTerms.removeAll( removalTexts );
- return extractedTerms;
- }
-
- public Collection<String> getFormattedTexts( final String text, final boolean extractAbbreviations,
- final int minWordLength, final int maxWordCount ) {
- final String tokenizedText = TextTokenizer.getTokenizedText( text );
-// final String tokenizedText = TextTokenizerCtakesPTB.getTokenizedText( text ); PTB is not worth the trouble
- if ( tokenizedText == null || tokenizedText.isEmpty() ) {
- return Collections.emptyList();
- }
- if ( !isTextValid( tokenizedText ) ) {
- return Collections.emptyList();
- }
- final String validText = getValidText( tokenizedText );
- if ( validText == null || validText.isEmpty() ) {
- return Collections.emptyList();
- }
- Collection<String> extractedTerms = Collections.emptySet();
- if ( extractAbbreviations ) {
- // add embedded abbreviations
- extractedTerms = extractAbbreviations( validText );
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractAcronyms( validText );
-// }
- }
- if ( extractedTerms.isEmpty() ) {
- extractedTerms = extractModifiers( validText );
- }
- if ( !extractedTerms.isEmpty() ) {
- extractedTerms.add( validText );
- return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
- }
-// // Check for embedded and / or terms
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractColonParaTerms( validText );
-// }
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractOrParaTerms( validText );
-// }
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractColonBracketTerms( validText );
-// }
-// // if ( extractedTerms.isEmpty() ) {
-// // extractedTerms = autoExtractAndBracketTerms( validText );
-// // }
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractOrBracketTerms( validText );
-// }
-// if ( extractedTerms.isEmpty() ) {
-// extractedTerms = autoExtractAndOrOtherTerms( validText );
-// }
-// if ( !extractedTerms.isEmpty() ) {
-// // System.out.println( validText );
-// // for ( String et : extractedTerms ) {
-// // System.out.println(" " + et);
-// // }
-// return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
-// } else {
- Collection<String> texts = new HashSet<>( 1 );
- texts.add( validText );
- return getFormattedTexts( getPluralTerms( getValidTexts( texts ) ), minWordLength, maxWordCount );
-// }
- }
-
- static private Collection<String> getPluralTerms( final Collection<String> texts ) {
- final Collection<String> plurals = new HashSet<>();
- for ( String text : texts ) {
- if ( text.endsWith( "( s )" ) ) {
- final String singular = text.substring( 0, text.length() - 5 ).trim();
- plurals.add( singular );
- plurals.add( singular + "s" );
- }
- }
- texts.addAll( plurals );
- return texts;
- }
-
- private boolean isTextValid( final String text ) {
+ public boolean isTextValid( final String text ) {
// Check for illegal characters
for ( int i = 0; i < text.length(); i++ ) {
if ( text.charAt( i ) < ' ' || text.charAt( i ) > '~' ) {
@@ -190,77 +87,141 @@ final public class UmlsTermUtil {
if ( text.length() == 3 && text.charAt( 0 ) == '(' ) {
return false;
}
- for ( String removalPrefix : _removalPrefixTriggers ) {
- if ( text.startsWith( removalPrefix ) ) {
- return false;
- }
+ if ( _removalPrefixTriggers.stream().anyMatch( text::startsWith ) ) {
+ return false;
}
- for ( String removalSuffix : _removalSuffixTriggers ) {
- if ( text.endsWith( removalSuffix ) ) {
- return false;
- }
+ if ( _removalSuffixTriggers.stream().anyMatch( text::endsWith ) ) {
+ return false;
}
- for ( String removalColon : _removalColonTriggers ) {
- if ( text.contains( removalColon ) ) {
- return false;
- }
+ if ( _removalColonTriggers.stream().anyMatch( text::contains ) ) {
+ return false;
}
- for ( String removalFunction : _removalFunctionTriggers ) {
- if ( text.contains( removalFunction ) ) {
- return false;
- }
+ if ( _removalFunctionTriggers.stream().anyMatch( text::contains ) ) {
+ return false;
}
return true;
}
- private Collection<String> getValidTexts( final Collection<String> texts ) {
- final Collection<String> validTexts = new HashSet<>( texts.size() );
- for ( String text : texts ) {
- validTexts.add( getValidText( text ) );
+ static public boolean isTextTooShort( final String text, final int minCharLength ) {
+ return text.length() < minCharLength;
+ }
+
+ static private int cccc = 0;
+
+ static public boolean isTextTooLong( final String text, final int maxCharLength,
+ final int maxWordCount, final int maxSymCount ) {
+ final String[] splits = text.split( "\\s+" );
+ int wordCount = 0;
+ int symCount = 0;
+ for ( String split : splits ) {
+ if ( split.length() > maxCharLength ) {
+ return true;
+ }
+ if ( split.length() > 2 ) {
+ wordCount++;
+ } else {
+ symCount++;
+ }
}
- return validTexts;
+ return wordCount > maxWordCount || symCount > maxSymCount;
}
- private String getValidText( final String text ) {
+
+ public Collection<String> getFormattedTexts( final String strippedText, final boolean extractAbbreviations,
+ final int minCharLength, final int maxCharLength,
+ final int maxWordCount, final int maxSymCount ) {
+ Collection<String> extractedTerms = Collections.emptySet();
+ if ( extractAbbreviations ) {
+ // add embedded abbreviations
+ extractedTerms = extractAbbreviations( strippedText );
+ }
+ if ( extractedTerms.isEmpty() ) {
+ extractedTerms = extractModifiers( strippedText );
+ }
+ if ( !extractedTerms.isEmpty() ) {
+ extractedTerms.add( strippedText );
+ return getFormattedTexts( getPluralTerms( getStrippedTexts( extractedTerms ) ), minCharLength, maxCharLength, maxWordCount, maxSymCount );
+ }
+ Collection<String> texts = new HashSet<>( 1 );
+ texts.add( strippedText );
+ return getFormattedTexts( getPluralTerms( getStrippedTexts( texts ) ), minCharLength, maxCharLength, maxWordCount, maxSymCount );
+ }
+
+
+ static private Collection<String> getFormattedTexts( final Collection<String> extractedTerms,
+ final int minCharLength, final int maxCharLength,
+ final int maxWordCount, final int maxSymCount ) {
+ return extractedTerms.stream()
+ .filter( t -> !isTextTooShort( t, minCharLength ) )
+ .filter( t -> !isTextTooLong( t, maxCharLength, maxWordCount, maxSymCount ) )
+ .collect( Collectors.toList() );
+ }
+
+ static private Collection<String> getPluralTerms( final Collection<String> texts ) {
+ final Collection<String> plurals = texts.stream()
+ .filter( t -> t.endsWith( "( s )" ) )
+ .collect( Collectors.toList() );
+ if ( plurals.isEmpty() ) {
+ return texts;
+ }
+ texts.removeAll( plurals );
+ final Consumer<String> addPlural = t -> {
+ texts.add( t );
+ texts.add( t + "s" );
+ };
+ plurals.stream()
+ .map( t -> t.substring( 0, t.length() - 5 ) )
+ .forEach( addPlural );
+ return texts;
+ }
+
+ private Collection<String> getStrippedTexts( final Collection<String> texts ) {
+ return texts.stream()
+ .map( this::getStrippedText )
+ .filter( t -> !t.isEmpty() )
+ .collect( Collectors.toSet() );
+ }
+
+ public String getStrippedText( final String text ) {
// remove form underlines
// if ( text.contains( "_ _ _" ) ) {
// final int lastParen = text.lastIndexOf( '(' );
// final int lastDash = text.indexOf( "_ _ _" );
// final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
// if ( deleteIndex > 0 ) {
-// return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+// return getStrippedText( text.substring( 0, deleteIndex - 1 ).trim() );
// }
// }
// remove unmatched parentheses, brackets, etc.
// if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
- // return getValidText( text.substring( 1 ).trim() );
+ // return getStrippedText( text.substring( 1 ).trim() );
// }
// if ( text.startsWith( "[" ) && !text.contains( "]" ) ) {
- // return getValidText( text.substring( 1 ).trim() );
+ // return getStrippedText( text.substring( 1 ).trim() );
// }
// if ( text.startsWith( "(" ) && text.endsWith( ") or" ) ) {
- // return getValidText( text.substring( 1, text.length() - 4 ).trim() );
+ // return getStrippedText( text.substring( 1, text.length() - 4 ).trim() );
// }
// if ( text.startsWith( "or (" ) ) {
- // return getValidText( text.substring( 2 ).trim() );
+ // return getStrippedText( text.substring( 2 ).trim() );
// }
// if ( text.startsWith( "\"" ) && text.endsWith( "\"" ) ) {
- // return getValidText( text.substring( 1 ).trim() );
+ // return getStrippedText( text.substring( 1 ).trim() );
// }
// if ( text.startsWith( "(" ) && text.endsWith( ")" ) ) {
- // return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+ // return getStrippedText( text.substring( 1, text.length() - 2 ).trim() );
// }
// if ( text.startsWith( "[" ) && text.endsWith( "]" ) ) {
- // return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+ // return getStrippedText( text.substring( 1, text.length() - 2 ).trim() );
// }
// if ( text.startsWith( "&" ) ) {
- // return getValidText( text.substring( 1 ).trim() );
+ // return getStrippedText( text.substring( 1 ).trim() );
// }
// if ( text.endsWith( "]" ) && !text.contains( "[" ) ) {
- // return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+ // return getStrippedText( text.substring( 0, text.length() - 2 ).trim() );
// }
// if ( text.endsWith( ")" ) && !text.contains( "(" ) ) {
- // return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+ // return getStrippedText( text.substring( 0, text.length() - 2 ).trim() );
// }
String strippedText = text.trim();
// Text in umls can have multiple suffixes and/or prefixes. Stripping just once doesn't do the trick
@@ -284,9 +245,6 @@ final public class UmlsTermUtil {
if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
return "";
}
-// if ( strippedText.length() != text.trim().length() ) {
-// System.out.println( text.trim() + " > " + strippedText );
-// }
return strippedText;
}
@@ -325,225 +283,5 @@ final public class UmlsTermUtil {
return Collections.emptyList();
}
- private Collection<String> autoExtractAcronyms( final String tokenizedText ) {
- final int dashIndex = tokenizedText.indexOf( '-' );
- if ( dashIndex > 1 ) {
- // have text ABC - DEF, check for acronym
- final String acronym = tokenizedText.substring( 0, dashIndex - 1 ).trim();
- if ( acronym.isEmpty() || acronym.length() > 8 || acronym.equals( "dose" ) ) {
- return Collections.emptyList();
- }
- final String[] splits = acronym.split( "\\s+" );
- if ( (splits.length == 1 && acronym.length() > 6) || splits.length > 2 ) {
- return Collections.emptyList();
- }
- final String definition = tokenizedText.substring( dashIndex + 1 ).trim();
- if ( definition.isEmpty() ) {
- return Collections.emptyList();
- }
- if ( (acronym.charAt( 0 ) != definition.charAt( 0 ) && !definition.contains( "' s" )) ) {
- return Collections.emptyList();
- }
- final String[] definitionSplits = definition.split( "\\s+" );
- if ( acronym.length() != definitionSplits.length
- || definitionSplits[definitionSplits.length - 1].charAt( 0 ) != acronym.charAt(
- acronym.length() - 1 ) ) {
- return Collections.emptyList();
- }
- final Collection<String> extractedAbbreviations = new HashSet<>( 2 );
- extractedAbbreviations.add( acronym );
- extractedAbbreviations.add( definition );
- return extractedAbbreviations;
- }
- return Collections.emptyList();
- }
-
- private Collection<String> autoExtractColonBracketTerms( final String tokenizedText ) {
- final int colonIndex = tokenizedText.indexOf( ':' );
- if ( colonIndex < 0 ) {
- return Collections.emptyList();
- }
- final int orIndex = tokenizedText.indexOf( "] or [" );
- final int andOrIndex = tokenizedText.indexOf( "] & / or [" );
- if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
- return Collections.emptyList();
- }
- String splitter = "\\] or \\[";
- if ( andOrIndex > 0 ) {
- splitter = "\\] & / or \\[";
- }
- final Collection<String> extractedTerms = new HashSet<>( 2 );
- final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
- final String types = tokenizedText.substring( colonIndex + 1 ).trim();
- final String[] splits = types.split( splitter );
- for ( String split : splits ) {
- split = trimBracketText( split );
- if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
- || split.equals( "other" ) || split.isEmpty() ) {
- extractedTerms.addAll( getFormattedTexts( thing ) );
- } else {
- extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
- extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
- }
- }
- return extractedTerms;
- }
-
- private Collection<String> autoExtractAndBracketTerms( final String tokenizedText ) {
- final int andIndex = tokenizedText.indexOf( "( &" );
- if ( andIndex < 0 || tokenizedText.indexOf( "] or [" ) < andIndex ) {
- return Collections.emptyList();
- }
- final Collection<String> extractedTerms = new HashSet<>( 3 );
- final String thing = tokenizedText.substring( 0, andIndex - 1 ).trim();
- extractedTerms.add( thing );
- final String types = tokenizedText.substring( andIndex + 3 ).trim();
- final String[] splits = types.split( "\\] or \\[" );
- for ( String split : splits ) {
- split = trimBracketText( split );
- extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
- extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
- }
- return extractedTerms;
- }
-
- private Collection<String> autoExtractOrBracketTerms( final String tokenizedText ) {
- if ( !tokenizedText.contains( "] or [" ) && !tokenizedText.contains( "] & / or [" ) ) {
- return Collections.emptyList();
- }
- final int lastOf = tokenizedText.lastIndexOf( " of " );
- if ( lastOf > tokenizedText.lastIndexOf( ']' ) ) {
- final String ofTerm = tokenizedText.substring( lastOf ).trim();
- final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
- lastOf ).trim() );
- final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
- for ( String ofText : ofExtractions ) {
- ofTexts.add( ofText + " " + ofTerm );
- }
- return ofTexts;
- }
- final Collection<String> extractedTerms = new HashSet<>( 2 );
- String splitter = "\\] or \\[";
- if ( tokenizedText.contains( "] & / or [" ) ) {
- splitter = "\\] & / or \\[";
- }
- final String[] splits = tokenizedText.split( splitter );
- for ( String split : splits ) {
- split = trimBracketText( split );
- if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
- extractedTerms.addAll( getFormattedTexts( split ) );
- }
- }
- return extractedTerms;
- }
-
- private Collection<String> autoExtractOrParaTerms( final String tokenizedText ) {
- if ( !tokenizedText.contains( ") or (" ) && !tokenizedText.contains( ") & / or (" ) ) {
- return Collections.emptyList();
- }
- final int lastOf = tokenizedText.lastIndexOf( " of " );
- if ( lastOf > tokenizedText.lastIndexOf( ')' ) ) {
- final String ofTerm = tokenizedText.substring( lastOf ).trim();
- final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
- lastOf ).trim() );
- final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
- for ( String ofText : ofExtractions ) {
- ofTexts.add( ofText + " " + ofTerm );
- }
- return ofTexts;
- }
- final Collection<String> extractedTerms = new HashSet<>( 2 );
- String splitter = "\\) or \\(";
- if ( tokenizedText.contains( ") & / or (" ) ) {
- splitter = "\\) & / or \\(";
- }
- final String[] splits = tokenizedText.split( splitter );
- for ( String split : splits ) {
- split = trimParaText( split );
- if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
- extractedTerms.addAll( getFormattedTexts( split ) );
- }
- }
- return extractedTerms;
- }
-
- private Collection<String> autoExtractColonParaTerms( final String tokenizedText ) {
- final int colonIndex = tokenizedText.indexOf( ':' );
- if ( colonIndex < 0 || colonIndex > tokenizedText.indexOf( '(' ) ) {
- return Collections.emptyList();
- }
- final int orIndex = tokenizedText.indexOf( ") or (" );
- final int andOrIndex = tokenizedText.indexOf( ") & / or (" );
- if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
- return Collections.emptyList();
- }
- String splitter = "\\) or \\(";
- if ( andOrIndex > 0 ) {
- splitter = "\\) & / or \\(";
- }
- final Collection<String> extractedTerms = new HashSet<>( 2 );
- final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
- final String types = tokenizedText.substring( colonIndex + 1 ).trim();
- final String[] splits = types.split( splitter );
- for ( String split : splits ) {
- split = trimParaText( split );
- if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
- || split.equals( "other" ) || split.isEmpty() ) {
- extractedTerms.addAll( getFormattedTexts( thing ) );
- } else {
- extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
- extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
- }
- }
- return extractedTerms;
- }
-
- private Collection<String> autoExtractAndOrOtherTerms( final String tokenizedText ) {
- final int otherIndex = tokenizedText.indexOf( " & / or other " );
- if ( otherIndex < 0 ) {
- return Collections.emptyList();
- }
- final Collection<String> otherTexts = new HashSet<>( 2 );
- otherTexts.add( tokenizedText.substring( 0, otherIndex ).trim() );
- otherTexts.add( tokenizedText.substring( otherIndex + 14 ).trim() );
- return otherTexts;
- }
-
- static private String trimParaText( String paraText ) {
- if ( paraText.startsWith( "(" ) ) {
- paraText = paraText.substring( 1 );
- }
- if ( paraText.endsWith( " nos " ) || paraText.endsWith( " nec " ) ) {
- return paraText.substring( 0, paraText.length() - 4 ).trim();
- } else if ( paraText.endsWith( ", unspecified " ) ) {
- return paraText.substring( 0, paraText.length() - 14 ).trim();
- } else if ( paraText.endsWith( " nos )" ) || paraText.endsWith( " nec )" ) ) {
- return paraText.substring( 0, paraText.length() - 5 ).trim();
- } else if ( paraText.endsWith( ", unspecified )" ) ) {
- return paraText.substring( 0, paraText.length() - 15 ).trim();
- } else if ( paraText.endsWith( ")" ) ) {
- return paraText.substring( 0, paraText.length() - 1 ).trim();
- }
- return paraText.trim();
- }
-
- static private String trimBracketText( String bracketText ) {
- if ( bracketText.startsWith( "[" ) ) {
- bracketText = bracketText.substring( 1 );
- }
- if ( bracketText.endsWith( " nos " ) || bracketText.endsWith( " nec " ) ) {
- return bracketText.substring( 0, bracketText.length() - 4 ).trim();
- } else if ( bracketText.endsWith( ", unspecified " ) ) {
- return bracketText.substring( 0, bracketText.length() - 14 ).trim();
- } else if ( bracketText.endsWith( " nos ]" ) || bracketText.endsWith( " nec ]" ) ) {
- return bracketText.substring( 0, bracketText.length() - 5 ).trim();
- } else if ( bracketText.endsWith( ", unspecified ]" ) ) {
- return bracketText.substring( 0, bracketText.length() - 15 ).trim();
- } else if ( bracketText.endsWith( "]" ) ) {
- return bracketText.substring( 0, bracketText.length() - 1 ).trim();
- }
- return bracketText.trim();
- }
-
}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java Tue Jul 5 19:40:57 2016
@@ -28,6 +28,7 @@ final public class RareWordDbWriter {
static private final Logger LOGGER = LogManager.getLogger( "RareWordDbWriter" );
+
private RareWordDbWriter() {
}
@@ -49,6 +50,25 @@ final public class RareWordDbWriter {
final String url, final String user, final String pass ) {
// Get Count of appearance in dictionary per term token
final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( concepts.values() );
+ // For pmsdn tesseract user-words
+// try ( Writer rareWordWriter = new BufferedWriter( new FileWriter( "C:\\Spiffy\\prj_pmsdn\\data\\internal\\dictionaryTemp\\rarewords.txt" ) ) ) {
+// for ( Map.Entry<String,Integer> entry : tokenCounts.entrySet() ) {
+// if ( entry.getValue() > 24 && entry.getKey().length() > 4 && entry.getKey().length() < 15 ) {
+// boolean allAlpha = true;
+// for ( char c : entry.getKey().toCharArray() ) {
+// if ( !Character.isLetter( c ) ) {
+// allAlpha = false;
+// break;
+// }
+// }
+// if ( allAlpha ) {
+// rareWordWriter.write( entry.getKey() + "\n" );
+// }
+// }
+// }
+// } catch ( IOException ioE ) {
+// LOGGER.error( ioE.getMessage() );
+// }
// Create insert sql statements
final String mainTableSql = JdbcUtil.createRowInsertSql( "CUI_TERMS", CuiTermsField.values() );
final String tuiTableSql = JdbcUtil.createCodeInsertSql( "tui" );
@@ -70,14 +90,15 @@ final public class RareWordDbWriter {
for ( Map.Entry<Long, Concept> conceptEntry : concepts.entrySet() ) {
final long cui = conceptEntry.getKey();
final Concept concept = conceptEntry.getValue();
- final Collection<String> texts = concept.getTexts();
- if ( texts.isEmpty() ) {
- continue;
- }
// write main term table
+ boolean conceptOk = false;
for ( String text : conceptEntry.getValue().getTexts() ) {
final RareWordUtil.IndexedRareWord indexedRareWord = RareWordUtil.getIndexedRareWord( text,
tokenCounts );
+ if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+ continue;
+ }
+ conceptOk = true;
mainTableStatement.setLong( CuiTermsField.CUI.__index, cui );
mainTableStatement.setInt( CuiTermsField.RINDEX.__index, indexedRareWord.__index );
mainTableStatement.setInt( CuiTermsField.TCOUNT.__index, indexedRareWord.__tokenCount );
@@ -86,6 +107,9 @@ final public class RareWordDbWriter {
mainTableStatement.executeUpdate();
mainTableCount = incrementCount( "Main", mainTableCount );
}
+ if ( !conceptOk ) {
+ continue;
+ }
// write tui table
for ( Tui tui : concept.getTuis() ) {
tuiStatement.setLong( CuiTermsField.CUI.__index, cui );
@@ -144,9 +168,10 @@ final public class RareWordDbWriter {
LOGGER.info( "Main Table Rows " + mainTableCount );
LOGGER.info( "Tui Table Rows " + tuiTableCount );
LOGGER.info( "Preferred Term Table Rows " + preftermTableCount );
- final Function<String,String> getCountInfo
- = vocabulary -> vocabulary + " Table Rows " + codeTableCounts.get( vocabulary );
- Vocabulary.getInstance().getAllVocabularies().stream().map( getCountInfo ).forEach( LOGGER::info );
+ final Function<String,String> vocabCount = v -> v + " Table Rows " + codeTableCounts.get( v );
+ Vocabulary.getInstance().getAllVocabularies().stream()
+ .map( vocabCount )
+ .forEach( LOGGER::info );
return true;
}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java Tue Jul 5 19:40:57 2016
@@ -150,6 +150,8 @@ final public class RareWordUtil {
}
}
+ static public final IndexedRareWord NULL_RARE_WORD = new IndexedRareWord( null, -1, -1 );
+
static public IndexedRareWord getIndexedRareWord( final String text,
final Map<String, Integer> tokenCounts ) {
final String[] tokens = text.split( "\\s+" );
@@ -162,6 +164,9 @@ final public class RareWordUtil {
bestCount = count;
}
}
+ if ( bestCount == Integer.MAX_VALUE ) {
+ return NULL_RARE_WORD;
+ }
return new IndexedRareWord( tokens[bestIndex], bestIndex, tokens.length );
}
}
Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java Tue Jul 5 19:40:57 2016
@@ -1,6 +1,7 @@
package org.apache.ctakes.dictionary.creator.util;
import java.util.*;
+import java.util.stream.Collectors;
/**
* Author: SPF
@@ -183,16 +184,10 @@ final public class TextTokenizer {
// get rid of last comma or semicolon or period
splits[splits.length - 1] = lastSplit.substring( 0, lastSplit.length() - 1 );
}
- final StringBuilder sb = new StringBuilder();
- for ( String split : splits ) {
- final List<String> tokens = getTokens( split, separateDigits );
- for ( String token : tokens ) {
- sb.append( token ).append( " " );
- }
- }
- // trim whitespace
- sb.setLength( Math.max( 0, sb.length() - 1 ) );
- return sb.toString();
+ return Arrays.stream( splits )
+ .map( s -> getTokens( s, separateDigits ) )
+ .flatMap( Collection::stream )
+ .collect( Collectors.joining( " " ) );
}
Added: ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java?rev=1751544&view=auto
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java (added)
+++ ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java Tue Jul 5 19:40:57 2016
@@ -0,0 +1,30 @@
+package org.apache.ctakes.dictionary.creator.gui.umls;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.logging.Logger;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/5/2016
+ */
+public class DoseUtilTester {
+ // JUnit test class that calls DoseUtil.hasUnit on sample term texts and asserts the result of each call.
+ static private final Logger LOGGER = Logger.getLogger( "DoseUtilTester" ); // NOTE(review): declared but not referenced anywhere in this class
+ // Asserts that DoseUtil.hasUnit returns true for every sample below; each failure message names the unit token expected to be found.
+ @Test
+ public void testHasUnit() {
+ Assert.assertTrue( "No ml detected!",
+ DoseUtil.hasUnit( "alcohol . 31 ml in 1 ml topical cloth [ alcohol wipes ]" ) );
+ Assert.assertTrue( "No mpa detected!",
+ DoseUtil.hasUnit( "polyquaternium - 32 ( 30000 mpa . s at 2 % )" ) );
+ Assert.assertTrue( "No mg detected!",
+ DoseUtil.hasUnit( "myasthenia gravis ( mg )" ) ); // NOTE(review): "mg" here presumably abbreviates the disease name, yet a unit match is still expected - confirm intent
+ Assert.assertTrue( "No % detected!",
+ DoseUtil.hasUnit( "imiquimod 2 . 5 % top cream" ) );
+ // NOTE(review): only positive cases are asserted; no term expected to yield false is checked here.
+ }
+
+}