You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2016/07/05 19:40:58 UTC
svn commit: r1751544 - in /ctakes/sandbox/dictionary-gui/src: main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/ main/java/org/apache/ctakes/dictionary/creator/gui/main/ main/java/org/apache/ctakes/dictionary/creator/gui/umls/ main/java/org/apa...

Author: seanfinan
Date: Tue Jul  5 19:40:57 2016
New Revision: 1751544

URL: http://svn.apache.org/viewvc?rev=1751544&view=rev
Log:
DictionaryBuilder only parses mrconso once
MainPanel prepped for language selection, "source" vocabulary ignored
Concept improved to remove subsuming texts, recognize unwanted status (e.g. concept has a dose)
DoseUtil recognizes units in terms
MrconsoParser has exclusions by term type in vocabularies
UmlsTermUtil is cleaned up
RareWordDbWriter mostly project-specific changes
RareWordUtil logging
TextTokenizer minor refactor
Adding DoseUtilTester

Added:
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java
    ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/
    ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java
Modified:
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java
    ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/ctakes/DictionaryBuilder.java Tue Jul  5 19:40:57 2016
@@ -8,10 +8,8 @@ import org.apache.logging.log4j.LogManag
 import org.apache.logging.log4j.Logger;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Map;
+import java.util.*;
+import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 /**
@@ -23,107 +21,142 @@ final public class DictionaryBuilder {
 
    static private final Logger LOGGER = LogManager.getLogger( "DictionaryBuilder" );
 
-   static private final String DEFAULT_DATA_DIR = "./data/default";
+//   static private final String DEFAULT_DATA_DIR = "./data/default";
+static private final String DEFAULT_DATA_DIR = "./data/tiny";
    static private final String CTAKES_APP_DB_PATH = "resources/org/apache/ctakes/dictionary/lookup/fast";
    static private final String CTAKES_RES_MODULE = "ctakes-dictionary-lookup-fast-res";
    static private final String CTAKES_RES_DB_PATH = CTAKES_RES_MODULE+"/src/main/" + CTAKES_APP_DB_PATH;
-   static private final int MIN_TERM_LENGTH = 2;
-   static private final int MIN_DRUG_TERM_LENGTH = 1;
+   static private final int MIN_CHAR_LENGTH = 2;
+   static private final int MAX_CHAR_LENGTH = 50;
+   static private final int MAX_WORD_COUNT = 12;
+   static private final int MAX_SYM_COUNT = 7;
+   static private final int MIN_DRUG_CHAR_LENGTH = 1;
 
 
    private DictionaryBuilder() {}
 
 
+
    static public boolean buildDictionary( final String umlsDirPath,
                                           final String ctakesDirPath,
                                           final String dictionaryName,
-                                          final Collection<String> wantedSources,
+                                           final Collection<String> wantedLanguages,
                                           final Collection<String> wantedTargets,
                                           final Collection<Tui> wantedTuis ) {
       // Set up the term utility
       final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( DEFAULT_DATA_DIR );
-      final Map<Long,Concept> anatomies = parseAnatomy( umlsTermUtil, umlsDirPath, wantedSources, wantedTargets, wantedTuis );
-      final Map<Long,Concept> nonSpecials = parseNonSpecial( umlsTermUtil, umlsDirPath, wantedSources, wantedTargets, wantedTuis, anatomies
-            .values() );
-      final Map<Long,Concept> medications = parseMedication( umlsTermUtil, umlsDirPath, wantedTargets, wantedTuis );
-      writeDatabase( ctakesDirPath, dictionaryName, anatomies, nonSpecials, medications );
+      final Map<Long,Concept> conceptMap = parseAll( umlsTermUtil, umlsDirPath, wantedLanguages, wantedTargets, wantedTuis );
 
-//      Process process = ProcessBuilder( "java -Xmx2G -cp lib/hsqldb_1_8_0_10.jar org.hsqldb.util.SqlTool --rcfile " )
+      // special case for nitric oxide "no"
+      final Concept nitricOxide = conceptMap.get( 28128l );
+      if ( nitricOxide != null ) {
+         nitricOxide.removeTexts( Collections.singletonList( "no" ) );
+      }
+      // special case for nitric oxide synthase "nos"
+      final Concept nitricOxides = conceptMap.get( 132555l );
+      if ( nitricOxides != null ) {
+         nitricOxides.removeTexts( Arrays.asList( "nos", "synthase" ) );
+      }
+
+      writeDatabase( ctakesDirPath, dictionaryName, conceptMap );
       return true;
    }
 
-   static private Map<Long,Concept> parseAnatomy( final UmlsTermUtil umlsTermUtil,
-                                                final String umlsDirPath,
-                                                final Collection<String> wantedSources,
-                                                final Collection<String> wantedTargets,
-                                                final Collection<Tui> wantedTuis ) {
-      LOGGER.info( "Parsing Anatomical Site Concepts" );
-      final Collection<Tui> wantedAnatTuis = new ArrayList<>( wantedTuis );
-      wantedAnatTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
-      final Map<Long,Concept> concepts = ConceptMapFactory.createConceptMap( umlsDirPath, wantedSources,
-            wantedAnatTuis, "Anatomical Site" );
+
+
+
+   static private Map<Long,Concept> parseAll( final UmlsTermUtil umlsTermUtil,
+                                              final String umlsDirPath,
+                                              final Collection<String> wantedLanguages,
+                                              final Collection<String> wantedTargets,
+                                              final Collection<Tui> wantedTuis ) {
+      LOGGER.info( "Parsing Concepts" );
+      // Create a map of Cuis to empty Concepts for all wanted Tuis and source vocabularies
+      final Map<Long,Concept> conceptMap
+            = ConceptMapFactory.createInitialConceptMap( umlsDirPath, wantedTargets, wantedTuis );
       // Fill in information for all valid concepts
-      MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, MIN_TERM_LENGTH, 7 );
-      LOGGER.info( "Done Parsing Anatomical Site Concepts" );
-      return concepts;
+      MrconsoParser.parseAllConcepts( umlsDirPath, conceptMap, wantedTargets, umlsTermUtil,
+            wantedLanguages, true, MIN_CHAR_LENGTH, MAX_CHAR_LENGTH, MAX_WORD_COUNT, MAX_SYM_COUNT );
+      removeUnwantedConcepts( conceptMap );
+      // remove concepts that have only drug tuis but are not in rxnorm
+      removeNonRxNormDrugs( conceptMap, wantedTuis );
+      // remove concepts that are in rxnorm but have non-drug tuis
+      removeRxNormNonDrugs( conceptMap, wantedTuis );
+      // Cull non-ANAT texts by ANAT texts as determined by ANAT tuis
+      removeAnatTexts( conceptMap.values(), wantedTuis );
+      conceptMap.values().forEach( Concept::minimizeTexts );
+      LOGGER.info( "Done Parsing Concepts" );
+      return conceptMap;
+   }
+
+   /**
+    * Remove any concepts that are unwanted - don't have any text from a desired vocabulary
+    * @param conceptMap -
+    */
+   static private void removeUnwantedConcepts( final Map<Long, Concept> conceptMap ) {
+      final Collection<Long> empties = conceptMap.entrySet().stream()
+            .filter( e -> e.getValue().isUnwanted() )
+            .map( Map.Entry::getKey )
+            .collect( Collectors.toSet() );
+      conceptMap.keySet().removeAll( empties );
    }
 
-   static private Map<Long,Concept> parseNonSpecial( final UmlsTermUtil umlsTermUtil,
-                                       final String umlsDirPath,
-                                       final Collection<String> wantedSources,
-                                        final Collection<String> wantedTargets,
-                                        final Collection<Tui> wantedTuis,
-                                        final Collection<Concept> anatomyConcepts ) {
-      LOGGER.info( "Parsing Non-Anatomical Site, Non-Medication Concepts" );
-      final Collection<Tui> wantedNormTuis = new ArrayList<>( wantedTuis );
-      wantedNormTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
-//      wantedNormTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
-      final Map<Long,Concept> concepts = ConceptMapFactory.createConceptMap( umlsDirPath, wantedSources,
-            wantedNormTuis, "Non-Anatomical Site, Non-Medication" );
-      // We don't want anatomical site texts to be anything but, so make them unavailable for other concepts
-      final Collection<String> anatomyTexts = anatomyConcepts.stream()
+   static private Collection<String> getAnatTexts( final Collection<Concept> concepts, final Collection<Tui> wantedTuis ) {
+      final Collection<Tui> wantedAnatTuis = new ArrayList<>( wantedTuis );
+      wantedAnatTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
+      return concepts.stream()
+            .filter( c -> c.hasTui( wantedAnatTuis ) )
             .map( Concept::getTexts )
             .flatMap( Collection::stream )
             .collect( Collectors.toSet() );
-      // Fill in information for all valid concepts
-      MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil,
-            anatomyTexts, true, MIN_TERM_LENGTH, 7 );
-      LOGGER.info( "Done Parsing Non-Anatomical Site, Non-Medication Concepts" );
-      return concepts;
    }
 
-   static private Map<Long,Concept> parseMedication( final UmlsTermUtil umlsTermUtil,
-                                        final String umlsDirPath,
-                                        final Collection<String> wantedTargets,
+   static private void removeAnatTexts( final Collection<Concept> concepts,
+                                        final Collection<Tui> wantedTuis,
+                                        final Collection<String> anatTexts ) {
+      final Collection<Tui> nonAnatTuis = new ArrayList<>( wantedTuis );
+      nonAnatTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_ANAT ) );
+      concepts.stream()
+            .filter( c -> c.hasTui( nonAnatTuis ) )
+            .forEach( c -> c.removeTexts( anatTexts ) );
+   }
+
+   static private void removeAnatTexts( final Collection<Concept> concepts,
                                         final Collection<Tui> wantedTuis ) {
-      LOGGER.info( "Parsing Medication Concepts" );
-      final Collection<Tui> wantedDrugTuis = new ArrayList<>( wantedTuis );
-      wantedDrugTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
-      final Map<Long,Concept> concepts = ConceptMapFactory.createRxConceptMap( umlsDirPath, wantedDrugTuis );
-      // Fill in information for all valid concepts
-      MrconsoParser.parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, MIN_DRUG_TERM_LENGTH, 11 );
-      // special case for nitric oxide "no"
-      final Concept nitricOxide = concepts.get( 28128l );
-      if ( nitricOxide != null ) {
-         nitricOxide.removeText( "no" );
-      }
-      LOGGER.info( "Done Parsing Medication Concepts" );
-      return concepts;
+      final Collection<String> anatTexts = getAnatTexts( concepts, wantedTuis );
+      removeAnatTexts( concepts, wantedTuis, anatTexts );
    }
 
-   static private boolean writeDatabase( final String ctakesDirPath,
-                                         final String dictionaryName,
-                                         final Map<Long,Concept> anatomies,
-                                         final Map<Long,Concept> nonSpecials,
-                                         final Map<Long,Concept> medications ) {
-      mergeConcepts( nonSpecials, anatomies );
-      mergeConcepts( nonSpecials, medications );
-      return writeDatabase( ctakesDirPath, dictionaryName, nonSpecials );
+   static private void removeNonRxNormDrugs( final Map<Long,Concept> conceptMap, Collection<Tui> wantedTuis ) {
+      final Collection<Tui> drugTuis = new ArrayList<>( wantedTuis );
+      drugTuis.retainAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
+      final Predicate<Map.Entry<Long,Concept>> isNonRxNormDrug
+            = e -> drugTuis.containsAll( e.getValue().getTuis() )
+                   && !e.getValue().getVocabularies().contains( "RXNORM" );
+      final Collection<Long> removalCuis = conceptMap.entrySet().stream()
+            .filter( isNonRxNormDrug )
+            .map( Map.Entry::getKey )
+            .collect( Collectors.toSet() );
+      conceptMap.keySet().removeAll( removalCuis );
+   }
+
+   static private void removeRxNormNonDrugs( final Map<Long,Concept> conceptMap, Collection<Tui> wantedTuis ) {
+      final Collection<Tui> nonDrugTuis = new ArrayList<>( wantedTuis );
+      nonDrugTuis.removeAll( Arrays.asList( TuiTableModel.CTAKES_DRUG ) );
+      final Predicate<Map.Entry<Long,Concept>> isRxNormNonDrug
+            = e -> e.getValue().getVocabularies().contains( "RXNORM" )
+                   && nonDrugTuis.containsAll( e.getValue().getTuis() );
+      final Collection<Long> removalCuis = conceptMap.entrySet().stream()
+            .filter( isRxNormNonDrug )
+            .map( Map.Entry::getKey )
+            .collect( Collectors.toSet() );
+      conceptMap.keySet().removeAll( removalCuis );
    }
 
+
    static private boolean writeDatabase( final String ctakesDirPath,
                                          final String dictionaryName,
-                                         final Map<Long,Concept> concepts ) {
+                                         final Map<Long,Concept> conceptMap ) {
       final File ctakesRoot = new File( ctakesDirPath );
       String databaseDirPath = ctakesDirPath + "/" + CTAKES_APP_DB_PATH;
       if ( Arrays.asList( ctakesRoot.list() ).contains( CTAKES_RES_MODULE ) ) {
@@ -136,20 +169,7 @@ final public class DictionaryBuilder {
          return false;
       }
       final String url = HsqlUtil.URL_PREFIX + databaseDirPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" + dictionaryName;
-      return RareWordDbWriter.writeConcepts( concepts, url, "sa", "" );
-   }
-
-   static private void mergeConcepts( final Map<Long,Concept> mainConepts,
-                                      final Map<Long,Concept> mergingConcepts ) {
-      for ( Map.Entry<Long,Concept> mergable : mergingConcepts.entrySet() ) {
-         final Concept mainConcept = mainConepts.get( mergable.getKey() );
-         if ( mainConcept == null ) {
-            mainConepts.put( mergable.getKey(), mergable.getValue() );
-            continue;
-         }
-         mainConcept.mergeWith( mergable.getValue() );
-      }
-      mergingConcepts.clear();
+      return RareWordDbWriter.writeConcepts( conceptMap, url, "sa", "" );
    }
 
 

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/main/MainPanel.java Tue Jul  5 19:40:57 2016
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 
 /**
@@ -48,8 +49,6 @@ final public class MainPanel extends JPa
       final JComponent centerPanel = new JPanel( new GridLayout( 1, 2 ) );
       centerPanel.add( createSourceTable( _sourceModel ) );
       centerPanel.add( createTuiTable( _tuiModel ) );
-//      add( createTuiTable( _tuiModel ), BorderLayout.WEST );
-//      add( createSourceTable( _sourceModel ), BorderLayout.EAST );
       add( centerPanel, BorderLayout.CENTER );
       add( createGoPanel(), BorderLayout.SOUTH );
    }
@@ -145,30 +144,37 @@ final public class MainPanel extends JPa
    private void buildDictionary( final String dictionaryName ) {
       SwingUtilities.invokeLater(
             new DictionaryBuildRunner( _umlsDirPath, _ctakesPath, dictionaryName,
-                  _sourceModel.getWantedSources(), _sourceModel.getWantedTargets(), _tuiModel.getWantedTuis() ) );
+                  _sourceModel.getWantedTargets(), _tuiModel.getWantedTuis() ) );
    }
 
+   private void error( final String title, final String message ) {
+      LOGGER.error( message );
+      JOptionPane.showMessageDialog( MainPanel.this, message, title, JOptionPane.ERROR_MESSAGE );
+   }
+
+
+
    private class DictionaryBuildRunner implements Runnable {
       private final String __umlsDirPath;
       private final String __ctakesDirPath;
       private final String __dictionaryName;
-      private final Collection<String> __wantedSources;
       private final Collection<String> __wantedTargets;
       private final Collection<Tui> __wantedTuis;
       private DictionaryBuildRunner( final String umlsDirPath, final String ctakesDirPath, final String dictionaryName,
-                                     final Collection<String> wantedSources, final Collection<String> wantedTargets,
+                                     final Collection<String> wantedTargets,
                                      final Collection<Tui> wantedTuis ) {
          __umlsDirPath = umlsDirPath;
          __ctakesDirPath = ctakesDirPath;
          __dictionaryName = dictionaryName;
-         __wantedSources = new ArrayList<>( wantedSources );
          __wantedTargets = new ArrayList<>( wantedTargets );
          __wantedTuis = new ArrayList<>( wantedTuis );
       }
+
       public void run() {
          SwingUtilities.getRoot( MainPanel.this ).setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
          if ( DictionaryBuilder.buildDictionary( __umlsDirPath, __ctakesDirPath, __dictionaryName,
-               __wantedSources, __wantedTargets, __wantedTuis ) ) {
+               Collections.singletonList( "ENG" ),
+               __wantedTargets, __wantedTuis ) ) {
             final String message = "Dictionary " + __dictionaryName + " successfully built in " + __ctakesDirPath;
             LOGGER.info( message );
             JOptionPane.showMessageDialog( MainPanel.this, message, "Dictionary Built", JOptionPane.INFORMATION_MESSAGE );
@@ -179,10 +185,7 @@ final public class MainPanel extends JPa
       }
    }
 
-   private void error( final String title, final String message ) {
-      LOGGER.error( message );
-      JOptionPane.showMessageDialog( MainPanel.this, message, title, JOptionPane.ERROR_MESSAGE );
-   }
+
 
    private class UmlsDirListener implements ActionListener {
       public void actionPerformed( final ActionEvent event ) {

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/Concept.java Tue Jul  5 19:40:57 2016
@@ -3,10 +3,7 @@ package org.apache.ctakes.dictionary.cre
 import org.apache.ctakes.dictionary.creator.util.collection.CollectionMap;
 import org.apache.ctakes.dictionary.creator.util.collection.HashSetMap;
 
-import java.util.Collection;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.HashSet;
+import java.util.*;
 
 /**
  * Author: SPF
@@ -18,11 +15,14 @@ final public class Concept {
    static public String PREFERRED_TERM_UNKNOWN = "Unknown Preferred Term";
 
    private String _preferredText = null;
+   private boolean _hasDose = false;
+
    final private Collection<String> _texts;
    final private CollectionMap<String, String, ? extends Collection<String>> _codes;
    final private Collection<Tui> _tuis;
 
 
+
    public Concept() {
       _codes = new HashSetMap<>( 0 );
       _texts = new HashSet<>( 1 );
@@ -33,14 +33,41 @@ final public class Concept {
       _texts.addAll( texts );
    }
 
-   public void removeText( final String text ) {
-      _texts.remove( text );
+   public void removeTexts( final Collection<String> texts ) {
+      _texts.removeAll( texts );
    }
 
    public Collection<String> getTexts() {
       return _texts;
    }
 
+   public void minimizeTexts() {
+      if ( _texts.size() < 2 ) {
+         return;
+      }
+      final List<String> textList = new ArrayList<>( _texts );
+      final Collection<String> extensionTexts = new HashSet<>();
+      for ( int i=0; i<textList.size()-1; i++ ) {
+         final String iText = textList.get( i );
+         for ( int j=i+1; j<textList.size(); j++ ) {
+            final String jText = textList.get( j );
+            if ( textContained( jText, iText ) ) {
+               extensionTexts.add( jText );
+            } else if ( textContained( iText, jText ) ) {
+               extensionTexts.add( iText );
+            }
+         }
+      }
+      _texts.removeAll( extensionTexts );
+   }
+
+   static private boolean textContained( final String containerText, final String containedText ) {
+      final int index = containerText.indexOf( containedText );
+      return index >= 0
+             && ( index == 0 || containerText.charAt( index-1 ) == ' ' )
+           && ( index+containedText.length() == containerText.length() || containerText.charAt( index + containedText.length() ) == ' ' );
+   }
+
    public void setPreferredText( final String text ) {
       _preferredText = text;
    }
@@ -52,17 +79,16 @@ final public class Concept {
       return PREFERRED_TERM_UNKNOWN;
    }
 
-   public void addCode( final String vocabulary, final String code ) {
-      _codes.placeValue( vocabulary, code );
-      Vocabulary.getInstance().addVocabulary( vocabulary, code );
+   public void addCode( final String source, final String code ) {
+      _codes.placeValue( source, code );
    }
 
    public Collection<String> getVocabularies() {
       return _codes.keySet();
    }
 
-   public Collection<String> getCodes( final String vocabulary ) {
-      final Collection<String> codes = _codes.getCollection( vocabulary );
+   public Collection<String> getCodes( final String source ) {
+      final Collection<String> codes = _codes.getCollection( source );
       if ( codes == null ) {
          return Collections.emptyList();
       }
@@ -77,17 +103,24 @@ final public class Concept {
       return _tuis;
    }
 
-   public void mergeWith( final Concept concept ) {
-      addTexts( concept.getTexts() );
-      concept.getTuis().stream().forEach( this::addTui );
-      if ( _preferredText == null || _preferredText.isEmpty() ) {
-         setPreferredText( concept.getPreferredText() );
-      }
-      for ( String vocabulary : concept.getVocabularies() ) {
-         for ( String code : concept.getCodes( vocabulary ) ) {
-            addCode( vocabulary, code );
-         }
-      }
+   public boolean hasTui( final Collection<Tui> tuis ) {
+      return _tuis.stream().anyMatch( tuis::contains );
+   }
+
+   public boolean isEmpty() {
+      return _texts.isEmpty() || _codes.isEmpty();
+   }
+
+   public void setHasDose() {
+      _hasDose = true;
+   }
+
+   public boolean hasDose() {
+      return _hasDose;
+   }
+
+   public boolean isUnwanted() {
+      return hasDose() || isEmpty();
    }
 
 }

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/ConceptMapFactory.java Tue Jul  5 19:40:57 2016
@@ -17,45 +17,23 @@ public class ConceptMapFactory {
 
    static private final Logger LOGGER = LogManager.getLogger( "ConceptMapFactory" );
 
-
-   static public Map<Long,Concept> createConceptMap( final String umlsDirPath,
-                                                      final Collection<String> wantedSources,
-                                                      final Collection<Tui> wantedTuis,
-                                                      final String tuiTypes ) {
-      if ( wantedTuis.isEmpty() ) {
-         LOGGER.warn( "No valid " + tuiTypes + " Tuis" );
+   static public Map<Long,Concept> createInitialConceptMap( final String umlsDirPath,
+                                                     final Collection<String> wantedSources,
+                                                     final Collection<Tui> wantedTuis ) {
+      if ( wantedSources.isEmpty() ) {
+         LOGGER.warn( "No source vocabularies specified" );
          return Collections.emptyMap();
       }
-
-      // get the valid Cuis for all wanted Tuis
-      final Map<Long, Concept> concepts = MrstyParser.createConceptsForTuis( umlsDirPath, wantedTuis );
-      if ( concepts.isEmpty() ) {
-         LOGGER.warn( "No valid " + tuiTypes + " Tuis" );
-         return Collections.emptyMap();
-      }
-      // filter out the Cuis that do not belong to the given sources
-      final Collection<Long> validVocabularyCuis = MrconsoParser.getValidVocabularyCuis( umlsDirPath, wantedSources );
-      concepts.keySet().retainAll( validVocabularyCuis );
-      LOGGER.info( "Total Valid Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
-      return concepts;
-   }
-
-   static public Map<Long,Concept> createRxConceptMap( final String umlsDirPath,
-                                                      final Collection<Tui> wantedTuis ) {
       if ( wantedTuis.isEmpty() ) {
-         LOGGER.warn( "No valid Medication Tuis" );
+         LOGGER.warn( "No TUIs specified" );
          return Collections.emptyMap();
       }
       // get the valid Cuis for all wanted Tuis
       final Map<Long, Concept> concepts = MrstyParser.createConceptsForTuis( umlsDirPath, wantedTuis );
-      if ( concepts.isEmpty() ) {
-         LOGGER.warn( "No valid Medication Tuis" );
-         return Collections.emptyMap();
-      }
       // filter out the Cuis that do not belong to the given sources
-      final Collection<Long> validVocabularyCuis = MrconsoParser.getValidRxNormCuis( umlsDirPath );
+      final Collection<Long> validVocabularyCuis = MrconsoParser.getValidVocabularyCuis( umlsDirPath, wantedSources );
       concepts.keySet().retainAll( validVocabularyCuis );
-      LOGGER.info( "Total Valid Medication Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
+      LOGGER.info( "Total Valid Cuis " + concepts.size() + "\t from wanted Tuis and Vocabularies" );
       return concepts;
    }
 

Added: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java?rev=1751544&view=auto
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java (added)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtil.java Tue Jul  5 19:40:57 2016
@@ -0,0 +1,71 @@
+package org.apache.ctakes.dictionary.creator.gui.umls;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.logging.Logger;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 7/14/14
+ */
+final public class DoseUtil {
+
+   private DoseUtil() {
+   }
+
+   static private final Logger LOGGER = Logger.getLogger( "DoseUtil" );
+
+
+   // some of these are not strictly units, e.g. "ud" : "ut dictum" or "as directed"
+   // but can be properly trimmed as they appear in the same place as would a unit
+
+   static private final String[] UNIT_ARRAY = { "gr", "gm", "gram", "grams", "g",
+                                           "mg", "milligram", "milligrams", "kg",
+                                           "microgram", "micrograms", "mcg", "ug",
+                                           "millicurie", "mic", "oz",
+                                            "lf", "ml", "liter", "milliliter", "l",
+                                           "milliequivalent", "meq",
+                                           "hour", "hours", "hr", //"day", "days", "daily", //"24hr", "8hr", "12hr",
+                                                "week", "weeks", "weekly", "biweekly",
+                                           "usp", "titradose",
+                                           "unit", "units", "unt", "iu", "u", "mmu",
+                                           "mm", "cm",
+                                           "gauge", "intl","au", "bau", "mci", "ud",
+                                           "ww", "vv", "wv",
+                                           "%", "percent", "%ww", "%vv", "%wv",
+                                           "actuation", "actuat", "vial", "vil", "packet", "pkt" };
+   static private final Collection<String> UNITS = Arrays.asList( UNIT_ARRAY );
+
+
+   static public boolean hasUnit( final String text ) {
+      final String[] splits = text.split( "\\s+" );
+      if ( splits.length <= 1 ) {
+         return false;
+      }
+      for ( String split : splits ) {
+         for ( String unit : UNITS ) {
+            if ( !split.endsWith( unit ) ) {
+               continue;
+            }
+            final int diff = split.length() - unit.length();
+            if ( diff == 0 ) {
+               return true;
+            }
+            boolean isAmount = true;
+            for ( int i=0; i<diff; i++ ) {
+               if ( !Character.isDigit( split.charAt( i ) ) ) {
+                  isAmount = false;
+                  break;
+               }
+            }
+            if ( isAmount ) {
+               return true;
+            }
+         }
+      }
+      return false;
+   }
+
+
+}

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/MrconsoParser.java Tue Jul  5 19:40:57 2016
@@ -2,6 +2,7 @@ package org.apache.ctakes.dictionary.cre
 
 
 import org.apache.ctakes.dictionary.creator.util.FileUtil;
+import org.apache.ctakes.dictionary.creator.util.TextTokenizer;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
@@ -22,54 +23,105 @@ final public class MrconsoParser {
 
    static private final String MR_CONSO_SUB_PATH = "/META/MRCONSO.RRF";
 
+   // TODO - put all exclusions in a data file, display for user, allow changes and save, etc.
+
+   //  https://www.nlm.nih.gov/research/umls/sourcereleasedocs
    //   https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT_US/stats.html
    //   https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/RXNORM/stats.html
-   static private final String[] EXCLUSION_TYPES = { "FN", "CCS", "CA2", "CA3", "PSN", "TMSY",
+   static private final String[] DEFAULT_EXCLUSIONS = { "FN", "CCS", "CA2", "CA3", "PSN", "TMSY",
                                                      "SBD", "SBDC", "SBDF", "SBDG",
-                                                     "SCD", "SCDC", "SCDF", "SCDG", "BPCK", "GPCK" };
-   static private final String EXCLUSION_RXNORM = "SY";
+                                                     "SCD", "SCDC", "SCDF", "SCDG", "BPCK", "GPCK", "XM" };
+
+   static private final String[] SNOMED_OBSOLETES = { "OF", "MTH_OF", "OAP", "MTH_OAP", "OAF", "MTH_OAF",
+                                                     "IS", "MTH_IS", "OAS", "MTH_OAS",
+                                                     "OP", "MTH_OP" };
+   // Snomed OF  = Obsolete Fully Specified Name      MTH_OF
+   // Snomed OAP = Obsolete Active Preferred Term     MTH_OAP
+   // Snomed OAF = Obsolete Active Full Name          MTH_OAF
+   // Snomed IS  = Obsolete Synonym                   MTH_IS
+   // Snomed OAS = Obsolete Active Synonym            MTH_OAS
+   // Snomed OP  = Obsolete Preferred Name            MTH_OP
+   // Snomed PT  = Preferred Term , but we don't need that for valid cuis ...  or do we want only those with preferred terms?
+   // Snomed PTGB = British Preferred Term
+
+   // GO has same snomed obsoletes +
+   // GO EOT = Obsolete Entry Term
+   // HPO has same snomed obsoletes
+
+   // MTHSPL - DP is Drug Product  as is MTH_RXN_DP      MTHSPL SU is active substance
+   // VANDF AB  is abbreviation for drug  VANDF CD is Clinical Drug.  Both are dosed.
+   //  NDFRT AB?  Looks like ingredient.  NDFRT PT can be dosed
+
+   static private final String[] GO_OBSOLETES = { "EOT" };
+
+   static private final String[] LOINC_OBSOLETES = { "LO", "OLC", "MTH_LO", "OOSN" };
+
+   static private final String[] MEDRA_OBSOLETES = { "OL", "MTH_OL" };
+
+   static private final String[] MESH_EXCLUSIONS = { "N1", "EN", "PEN" };
+
+   static private final String[] RXNORM_EXCLUSIONS = { "SY" };   // What is IN ?  Ingredient?
+
+   static private final String[] NCI_EXCLUSIONS = { "CSN" };
+
+   // Related to, but not synonymous
+   static private final String[] UMDNS_EXCLUSIONS = { "RT" };
 
    private MrconsoParser() {
    }
 
-   static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
-                                                   final Map<Long, Concept> concepts,
-                                                   final Collection<String> wantedTargets,
-                                                   final UmlsTermUtil umlsTermUtil ) {
-      return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, 1, Integer.MAX_VALUE );
+   /**
+    * Supplies the term types that are excluded by default.
+    * A copy is returned so that callers cannot alter the shared default array.
+    *
+    * @return new array containing the default excluded term types
+    */
+   static public String[] getDefaultExclusions() {
+      return DEFAULT_EXCLUSIONS.clone();
    }
 
-   static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
-                                                   final Map<Long, Concept> concepts,
-                                                   final Collection<String> wantedTargets,
-                                                   final UmlsTermUtil umlsTermUtil,
-                                                   final int minCharLength,
-                                                   final int maxWordCount ) {
-      return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, true, minCharLength, maxWordCount );
+   /**
+    * Supplies the default excluded term types followed by the snomed obsolete term types.
+    *
+    * @return new array: default exclusions first, snomed obsoletes after
+    */
+   static public String[] getSnomedExclusions() {
+      final String[] defaults = getDefaultExclusions();
+      final String[] combined = new String[ defaults.length + SNOMED_OBSOLETES.length ];
+      System.arraycopy( defaults, 0, combined, 0, defaults.length );
+      System.arraycopy( SNOMED_OBSOLETES, 0, combined, defaults.length, SNOMED_OBSOLETES.length );
+      return combined;
    }
 
-   static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
-                                                   final Map<Long, Concept> concepts,
-                                                   final Collection<String> wantedTargets,
-                                                   final UmlsTermUtil umlsTermUtil,
-                                                   final boolean extractAbbreviations,
-                                                   final int minWordLength,
-                                                   final int maxWordCount ) {
-      return parseConcepts( umlsDirPath, concepts, wantedTargets, umlsTermUtil, Collections.emptyList(),
-            extractAbbreviations, minWordLength, maxWordCount );
+   /**
+    * Supplies the snomed exclusions followed by the GO, LOINC, MEDRA, MESH, NCI and UMDNS exclusions,
+    * in that order.  The rxnorm exclusions are not part of the result.
+    *
+    * @return new array containing all of the excluded term types listed above
+    */
+   static public String[] getNonRxnormExclusions() {
+      final String[][] additions = { GO_OBSOLETES, LOINC_OBSOLETES, MEDRA_OBSOLETES,
+                                     MESH_EXCLUSIONS, NCI_EXCLUSIONS, UMDNS_EXCLUSIONS };
+      final String[] snomeds = getSnomedExclusions();
+      int totalLength = snomeds.length;
+      for ( String[] addition : additions ) {
+         totalLength += addition.length;
+      }
+      final String[] exclusionTypes = Arrays.copyOf( snomeds, totalLength );
+      // append each vocabulary's exclusions directly behind the previous one
+      int offset = snomeds.length;
+      for ( String[] addition : additions ) {
+         System.arraycopy( addition, 0, exclusionTypes, offset, addition.length );
+         offset += addition.length;
+      }
+      return exclusionTypes;
    }
 
 
-   static public Map<Long, Concept> parseConcepts( final String umlsDirPath,
+
+   static public Map<Long, Concept> parseAllConcepts( final String umlsDirPath,
                                                    final Map<Long, Concept> concepts,
                                                    final Collection<String> wantedTargets,
                                                    final UmlsTermUtil umlsTermUtil,
-                                                   final Collection<String> unwantedTexts,
+                                                   final Collection<String> languages,
                                                    final boolean extractAbbreviations,
-                                                   final int minWordLength,
-                                                   final int maxWordCount ) {
+                                                   final int minCharLength,
+                                                      final int maxCharLength,
+                                                   final int maxWordCount,
+                                                      final int maxSymCount ) {
       final String mrconsoPath = umlsDirPath + MR_CONSO_SUB_PATH;
-      LOGGER.info( "Compiling map of Umls Cuis and Texts from " + mrconsoPath );
+      final Collection<String> invalidTypeSet = new HashSet<>( Arrays.asList( getNonRxnormExclusions() ) );
+      LOGGER.info( "Compiling map of Concepts from " + mrconsoPath );
       long lineCount = 0;
       try ( final BufferedReader reader = FileUtil.createReader( mrconsoPath ) ) {
          List<String> tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
@@ -78,28 +130,52 @@ final public class MrconsoParser {
             if ( lineCount % 100000 == 0 ) {
                LOGGER.info( "File Line " + lineCount );
             }
-            if ( tokens.size() > TEXT._index && getToken( tokens, LANGUAGE ).equals( "ENG" ) ) {
-               final Long cuiCode = CuiCodeUtil.getInstance().getCuiCode( getToken( tokens, CUI ) );
-               final Concept concept = concepts.get( cuiCode );
-               if ( concept == null ) {
-                  tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
-                  continue;
-               }
-               final String source = getToken( tokens, SOURCE );
-               if ( wantedTargets.contains( source ) ) {
-                  concept.addCode( source, getToken( tokens, SOURCE_CODE ) );
-               }
-               final String text = getToken( tokens, TEXT );
-               if ( getToken( tokens, STATUS ).equals( "P" ) && getToken( tokens, FORM ).equals( "PF" ) ) {
-                  concept.setPreferredText( text );
-               }
-               Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( text, extractAbbreviations,
-                                                                                   minWordLength, maxWordCount );
-               if ( formattedTexts == null || formattedTexts.isEmpty() ) {
-                  tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
-                  continue;
+            if ( !isRowOk( tokens, languages, invalidTypeSet ) ) {
+               tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+               continue;
+            }
+            final Long cuiCode = CuiCodeUtil.getInstance().getCuiCode( getToken( tokens, CUI ) );
+            final Concept concept = concepts.get( cuiCode );
+            if ( concept == null ) {
+               // cui for current row is unwanted
+               tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+               continue;
+            }
+            final String text = getToken( tokens, TEXT );
+            if ( isPreferredTerm( tokens ) ) {
+               concept.setPreferredText( text );
+            }
+            final String source = getToken( tokens, SOURCE );
+            if ( wantedTargets.contains( source ) ) {
+               final String code = getToken( tokens, SOURCE_CODE );
+               if ( !code.equals( "NOCODE" ) ) {
+                  Vocabulary.getInstance().addVocabulary( source, code );
+                  concept.addCode( source, code );
                }
-               formattedTexts.removeAll( unwantedTexts );
+            }
+            final String tokenizedText = TextTokenizer.getTokenizedText( text );
+            if ( tokenizedText == null || tokenizedText.isEmpty()
+                 || !umlsTermUtil.isTextValid( tokenizedText ) ) {
+               // no tokenizable text or tokenized text is invalid for some reason
+               tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+               continue;
+            }
+            if ( DoseUtil.hasUnit( tokenizedText ) ) {
+               concept.setHasDose();
+               tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+               continue;
+            }
+            final String strippedText = umlsTermUtil.getStrippedText( tokenizedText );
+            if ( strippedText == null || strippedText.isEmpty()
+                 || UmlsTermUtil.isTextTooShort( strippedText, minCharLength )
+                 || UmlsTermUtil.isTextTooLong( strippedText, maxCharLength, maxWordCount, maxSymCount ) ) {
+               // after stripping unwanted prefixes and suffixes there is no valid text
+               tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
+               continue;
+            }
+            final Collection<String> formattedTexts
+                  = umlsTermUtil.getFormattedTexts( strippedText, extractAbbreviations, minCharLength, maxCharLength, maxWordCount, maxSymCount );
+            if ( formattedTexts != null && !formattedTexts.isEmpty() ) {
                concept.addTexts( formattedTexts );
             }
             tokens = FileUtil.readBsvTokens( reader, mrconsoPath );
@@ -112,6 +188,27 @@ final public class MrconsoParser {
    }
 
 
+   /**
+    * Decides whether a tokenized mrconso row should be used.
+    *
+    * @param tokens         column values of one row
+    * @param languages      wanted languages
+    * @param invalidTypeSet term types that are not wanted
+    * @return false if the row has no text column, has an unwanted language or term type,
+    * or is an rxnorm row of term type "SY"; true otherwise
+    */
+   static private boolean isRowOk( final List<String> tokens,
+                                   final Collection<String> languages,
+                                   final Collection<String> invalidTypeSet ) {
+      final boolean hasTextColumn = tokens.size() > TEXT._index;
+      if ( !hasTextColumn || !languages.contains( getToken( tokens, LANGUAGE ) ) ) {
+         return false;
+      }
+      final String termType = getToken( tokens, TERM_TYPE );
+      if ( invalidTypeSet.contains( termType ) ) {
+         return false;
+      }
+      // "Synonyms" are actually undesirable in the rxnorm vocabulary
+      final String vocabulary = getToken( tokens, SOURCE );
+      return !vocabulary.equals( "RXNORM" ) || !termType.equals( "SY" );
+   }
+
+
+   /**
+    * @param tokens column values of one mrconso row
+    * @return true if the row has status "P" and form "PF"
+    */
+   static private boolean isPreferredTerm( final List<String> tokens ) {
+      if ( !getToken( tokens, STATUS ).equals( "P" ) ) {
+         return false;
+      }
+      return getToken( tokens, FORM ).equals( "PF" );
+   }
+
+
    /**
     * Can cull the given collection of cuis
     *
@@ -121,20 +218,18 @@ final public class MrconsoParser {
     */
    static public Collection<Long> getValidVocabularyCuis( final String umlsDirPath,
                                                           final Collection<String> sourceVocabularies ) {
-      return getValidVocabularyCuis( umlsDirPath, sourceVocabularies, EXCLUSION_TYPES );
+      return getValidVocabularyCuis( umlsDirPath, sourceVocabularies, getDefaultExclusions() );
    }
 
-   /**
-    * Can cull the given collection of cuis
-    *
-    * @param umlsDirPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
-    * @return Subset of cuis that exist in in the given sources
-    */
-   static public Collection<Long> getValidRxNormCuis( final String umlsDirPath ) {
-      final String[] exclusionTypes = Arrays.copyOf( EXCLUSION_TYPES, EXCLUSION_TYPES.length + 1 );
-      exclusionTypes[ EXCLUSION_TYPES.length ] = EXCLUSION_RXNORM;
-      return getValidVocabularyCuis( umlsDirPath, Collections.singletonList( "RXNORM" ), exclusionTypes );
-   }
+//   /**
+//    * Can cull the given collection of cuis
+//    *
+//    * @param umlsDirPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
+//    * @return Subset of cuis that exist in the given sources
+//    */
+//   static public Collection<Long> getValidRxNormCuis( final String umlsDirPath ) {
+//      return getValidVocabularyCuis( umlsDirPath, Collections.singletonList( "RXNORM" ), getRxnormExclusions() );
+//   }
 
    /**
     * Can cull the given collection of cuis
@@ -173,21 +268,6 @@ final public class MrconsoParser {
       return validCuis;
    }
 
-//   /**
-//    * Given a collection of cuis, returns all of the cuis that don't exist for the given source types
-//    *
-//    * @param rrfPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
-//    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
-//    * @param cuiCodes    current list of cui codes
-//    * @return Subset of cuis that don't exist in in the given sources
-//    */
-//   static public Collection<Long> getSourceTypeInvalidCuis( final String rrfPath,
-//                                                            final Collection<String> sourceTypes,
-//                                                            final Collection<Long> cuiCodes ) {
-//      final Collection<Long> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuiCodes );
-//      final Predicate<Long> validCui = validCuis::contains;
-//      return cuiCodes.stream().filter( validCui.negate() ).collect( Collectors.toSet() );
-//   }
 
    static private String getToken( final List<String> tokens, final MrconsoIndex mrconsoIndex ) {
       return tokens.get( mrconsoIndex._index );

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/gui/umls/UmlsTermUtil.java Tue Jul  5 19:40:57 2016
@@ -1,11 +1,12 @@
 package org.apache.ctakes.dictionary.creator.gui.umls;
 
 import org.apache.ctakes.dictionary.creator.util.FileUtil;
-import org.apache.ctakes.dictionary.creator.util.TextTokenizer;
 
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
 
 
 /**
@@ -18,7 +19,7 @@ import java.util.HashSet;
 final public class UmlsTermUtil {
 
 
-   static private enum DATA_FILE {
+   private enum DATA_FILE {
       REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
       REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
       REMOVAL_FUNCTION_TRIGGERS( "RemovalFunctionTriggers.txt" ),
@@ -29,7 +30,7 @@ final public class UmlsTermUtil {
       RIGHT_ABBREVIATIONS( "RightAbbreviations.txt" );
       final private String __name;
 
-      private DATA_FILE( final String name ) {
+      DATA_FILE( final String name ) {
          __name = name;
       }
    }
@@ -72,111 +73,7 @@ final public class UmlsTermUtil {
       _abbreviations = FileUtil.readOneColumn( abbreviationsPath, "Abbreviations to expand" );
    }
 
-   public Collection<String> getFormattedTexts( final String text ) {
-      return getFormattedTexts( text, true, 1, Integer.MAX_VALUE );
-   }
-
-   public Collection<String> getFormattedTexts( final Collection<String> extractedTerms,
-                                                final int minWordLength, final int maxWordCount ) {
-      final Collection<String> removalTexts = new HashSet<>();
-      for ( String term : extractedTerms ) {
-         if ( term.length() < minWordLength ) {
-            removalTexts.add( term );
-            continue;
-         }
-         final String[] splits = term.split( "\\s+" );
-         if ( splits.length > maxWordCount ) {
-            int count = 0;
-            for ( String split : splits ) {
-               if ( split.length() > 2 ) {
-                  count++;
-                  if ( count > maxWordCount ) {
-                     removalTexts.add( term );
-                     break;
-                  }
-               }
-            }
-         }
-      }
-      extractedTerms.removeAll( removalTexts );
-      return extractedTerms;
-   }
-
-   public Collection<String> getFormattedTexts( final String text, final boolean extractAbbreviations,
-                                                final int minWordLength, final int maxWordCount ) {
-      final String tokenizedText = TextTokenizer.getTokenizedText( text );
-//      final String tokenizedText = TextTokenizerCtakesPTB.getTokenizedText( text );  PTB is not worth the trouble
-      if ( tokenizedText == null || tokenizedText.isEmpty() ) {
-         return Collections.emptyList();
-      }
-      if ( !isTextValid( tokenizedText ) ) {
-         return Collections.emptyList();
-      }
-      final String validText = getValidText( tokenizedText );
-      if ( validText == null || validText.isEmpty() ) {
-         return Collections.emptyList();
-      }
-      Collection<String> extractedTerms = Collections.emptySet();
-      if ( extractAbbreviations ) {
-         // add embedded abbreviations
-         extractedTerms = extractAbbreviations( validText );
-//         if ( extractedTerms.isEmpty() ) {
-//            extractedTerms = autoExtractAcronyms( validText );
-//         }
-      }
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = extractModifiers( validText );
-      }
-      if ( !extractedTerms.isEmpty() ) {
-         extractedTerms.add( validText );
-         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
-      }
-//      // Check for embedded and / or terms
-//      if ( extractedTerms.isEmpty() ) {
-//         extractedTerms = autoExtractColonParaTerms( validText );
-//      }
-//      if ( extractedTerms.isEmpty() ) {
-//         extractedTerms = autoExtractOrParaTerms( validText );
-//      }
-//      if ( extractedTerms.isEmpty() ) {
-//         extractedTerms = autoExtractColonBracketTerms( validText );
-//      }
-//      //      if ( extractedTerms.isEmpty() ) {
-//      //         extractedTerms = autoExtractAndBracketTerms( validText );
-//      //      }
-//      if ( extractedTerms.isEmpty() ) {
-//         extractedTerms = autoExtractOrBracketTerms( validText );
-//      }
-//      if ( extractedTerms.isEmpty() ) {
-//         extractedTerms = autoExtractAndOrOtherTerms( validText );
-//      }
-//      if ( !extractedTerms.isEmpty() ) {
-//         //         System.out.println( validText );
-//         //         for ( String et : extractedTerms ) {
-//         //            System.out.println("  " + et);
-//         //         }
-//         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
-//      } else {
-         Collection<String> texts = new HashSet<>( 1 );
-         texts.add( validText );
-         return getFormattedTexts( getPluralTerms( getValidTexts( texts ) ), minWordLength, maxWordCount );
-//      }
-   }
-
-   static private Collection<String> getPluralTerms( final Collection<String> texts ) {
-      final Collection<String> plurals = new HashSet<>();
-      for ( String text : texts ) {
-         if ( text.endsWith( "( s )" ) ) {
-            final String singular = text.substring( 0, text.length() - 5 ).trim();
-            plurals.add( singular );
-            plurals.add( singular + "s" );
-         }
-      }
-      texts.addAll( plurals );
-      return texts;
-   }
-
-   private boolean isTextValid( final String text ) {
+   public boolean isTextValid( final String text ) {
       // Check for illegal characters
       for ( int i = 0; i < text.length(); i++ ) {
          if ( text.charAt( i ) < ' ' || text.charAt( i ) > '~' ) {
@@ -190,77 +87,141 @@ final public class UmlsTermUtil {
       if ( text.length() == 3 && text.charAt( 0 ) == '(' ) {
          return false;
       }
-      for ( String removalPrefix : _removalPrefixTriggers ) {
-         if ( text.startsWith( removalPrefix ) ) {
-            return false;
-         }
+      if ( _removalPrefixTriggers.stream().anyMatch( text::startsWith ) ) {
+         return false;
       }
-      for ( String removalSuffix : _removalSuffixTriggers ) {
-         if ( text.endsWith( removalSuffix ) ) {
-            return false;
-         }
+      if ( _removalSuffixTriggers.stream().anyMatch( text::endsWith ) ) {
+         return false;
       }
-      for ( String removalColon : _removalColonTriggers ) {
-         if ( text.contains( removalColon ) ) {
-            return false;
-         }
+      if ( _removalColonTriggers.stream().anyMatch( text::contains ) ) {
+         return false;
       }
-      for ( String removalFunction : _removalFunctionTriggers ) {
-         if ( text.contains( removalFunction ) ) {
-            return false;
-         }
+      if ( _removalFunctionTriggers.stream().anyMatch( text::contains ) ) {
+         return false;
       }
       return true;
    }
 
-   private Collection<String> getValidTexts( final Collection<String> texts ) {
-      final Collection<String> validTexts = new HashSet<>( texts.size() );
-      for ( String text : texts ) {
-         validTexts.add( getValidText( text ) );
+   /**
+    * @param text          text to check
+    * @param minCharLength minimum number of characters that a text must have
+    * @return true if the text has fewer than minCharLength characters
+    */
+   static public boolean isTextTooShort( final String text, final int minCharLength ) {
+      return minCharLength > text.length();
+   }
+
+   static private int cccc = 0;
+
+   static public boolean isTextTooLong( final String text, final int maxCharLength,
+                                 final int maxWordCount, final int maxSymCount ) {  // checks token length, word count and symbol count
+      final String[] splits = text.split( "\\s+" );  // tokens are delimited by whitespace
+      int wordCount = 0;
+      int symCount = 0;
+      for ( String split : splits ) {
+         if ( split.length() > maxCharLength ) {  // maxCharLength limits each token, not the whole text
+            return true;
+         }
+         if ( split.length() > 2 ) {  // tokens longer than 2 characters count as words
+            wordCount++;
+         } else {  // tokens of at most 2 characters count as symbols
+            symCount++;
+         }
       }
-      return validTexts;
+      return wordCount > maxWordCount || symCount > maxSymCount;  // too long if either count exceeds its maximum
    }
 
-   private String getValidText( final String text ) {
+
+   /**
+    * Creates the collection of texts to use for a stripped text.
+    * Embedded abbreviations are extracted if requested; if none are found then modifiers are extracted.
+    * The stripped text itself is always added.  All texts are then stripped, expanded for the
+    * "( s )" plural form and filtered by length.
+    *
+    * @param strippedText         text that has already been stripped
+    * @param extractAbbreviations true if embedded abbreviations should be extracted
+    * @param minCharLength        minimum character length of a text
+    * @param maxCharLength        maximum character length of a token
+    * @param maxWordCount         maximum number of words in a text
+    * @param maxSymCount          maximum number of symbols in a text
+    * @return texts that are neither too short nor too long
+    */
+   public Collection<String> getFormattedTexts( final String strippedText, final boolean extractAbbreviations,
+                                                final int minCharLength, final int maxCharLength,
+                                                final int maxWordCount, final int maxSymCount ) {
+      Collection<String> terms = Collections.emptySet();
+      if ( extractAbbreviations ) {
+         // add embedded abbreviations
+         terms = extractAbbreviations( strippedText );
+      }
+      if ( terms.isEmpty() ) {
+         terms = extractModifiers( strippedText );
+      }
+      if ( terms.isEmpty() ) {
+         // nothing was extracted, the stripped text is the only term
+         terms = new HashSet<>( 1 );
+      }
+      terms.add( strippedText );
+      final Collection<String> expandedTerms = getPluralTerms( getStrippedTexts( terms ) );
+      return getFormattedTexts( expandedTerms, minCharLength, maxCharLength, maxWordCount, maxSymCount );
+   }
+
+
+   /**
+    * Removes the terms that are too short or too long.
+    *
+    * @param extractedTerms terms to filter
+    * @param minCharLength  minimum character length of a term
+    * @param maxCharLength  maximum character length of a token
+    * @param maxWordCount   maximum number of words in a term
+    * @param maxSymCount    maximum number of symbols in a term
+    * @return list of the terms that are neither too short nor too long
+    */
+   static private Collection<String> getFormattedTexts( final Collection<String> extractedTerms,
+                                                final int minCharLength, final int maxCharLength,
+                                                final int maxWordCount, final int maxSymCount ) {
+      return extractedTerms.stream()
+            .filter( term -> !( isTextTooShort( term, minCharLength )
+                                || isTextTooLong( term, maxCharLength, maxWordCount, maxSymCount ) ) )
+            .collect( Collectors.toList() );
+   }
+
+   /**
+    * Replaces each text that ends with the optional plural marker "( s )" by its singular and plural forms.
+    * The singular form is trimmed so that "tablet ( s )" becomes "tablet" and "tablets",
+    * not "tablet " and "tablet s".  Texts without the marker are kept as they are.
+    * The given collection is not modified.
+    *
+    * @param texts texts that may end with "( s )"
+    * @return new collection with singular and plural forms in place of the marked texts
+    */
+   static private Collection<String> getPluralTerms( final Collection<String> texts ) {
+      final String pluralMarker = "( s )";
+      final Collection<String> expandedTexts = new HashSet<>( texts.size() );
+      for ( String text : texts ) {
+         if ( !text.endsWith( pluralMarker ) ) {
+            expandedTexts.add( text );
+            continue;
+         }
+         final String singular = text.substring( 0, text.length() - pluralMarker.length() ).trim();
+         if ( singular.isEmpty() ) {
+            // the text was nothing but the marker, there is no term to keep
+            continue;
+         }
+         expandedTexts.add( singular );
+         expandedTexts.add( singular + "s" );
+      }
+      return expandedTexts;
+   }
+
+   /**
+    * Strips each of the given texts and collects the results that are not empty.
+    *
+    * @param texts texts to strip
+    * @return set of the non-empty stripped texts
+    */
+   private Collection<String> getStrippedTexts( final Collection<String> texts ) {
+      final Collection<String> strippedTexts = new HashSet<>();
+      for ( String text : texts ) {
+         final String strippedText = getStrippedText( text );
+         if ( !strippedText.isEmpty() ) {
+            strippedTexts.add( strippedText );
+         }
+      }
+      return strippedTexts;
+   }
+
+   public String getStrippedText( final String text ) {
       // remove form underlines
 //      if ( text.contains( "_ _ _" ) ) {
 //         final int lastParen = text.lastIndexOf( '(' );
 //         final int lastDash = text.indexOf( "_ _ _" );
 //         final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
 //         if ( deleteIndex > 0 ) {
-//            return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+//            return getStrippedText( text.substring( 0, deleteIndex - 1 ).trim() );
 //         }
 //      }
       // remove unmatched parentheses, brackets, etc.
       //      if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
-      //         return getValidText( text.substring( 1 ).trim() );
+      //         return getStrippedText( text.substring( 1 ).trim() );
       //      }
       //      if ( text.startsWith( "[" ) && !text.contains( "]" ) ) {
-      //         return getValidText( text.substring( 1 ).trim() );
+      //         return getStrippedText( text.substring( 1 ).trim() );
       //      }
       //      if ( text.startsWith( "(" ) && text.endsWith( ") or" ) ) {
-      //         return getValidText( text.substring( 1, text.length() - 4 ).trim() );
+      //         return getStrippedText( text.substring( 1, text.length() - 4 ).trim() );
       //      }
       //      if ( text.startsWith( "or (" ) ) {
-      //         return getValidText( text.substring( 2 ).trim() );
+      //         return getStrippedText( text.substring( 2 ).trim() );
       //      }
       //      if ( text.startsWith( "\"" ) && text.endsWith( "\"" ) ) {
-      //         return getValidText( text.substring( 1 ).trim() );
+      //         return getStrippedText( text.substring( 1 ).trim() );
       //      }
       //      if ( text.startsWith( "(" ) && text.endsWith( ")" ) ) {
-      //         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+      //         return getStrippedText( text.substring( 1, text.length() - 2 ).trim() );
       //      }
       //      if ( text.startsWith( "[" ) && text.endsWith( "]" ) ) {
-      //         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+      //         return getStrippedText( text.substring( 1, text.length() - 2 ).trim() );
       //      }
       //      if ( text.startsWith( "&" ) ) {
-      //         return getValidText( text.substring( 1 ).trim() );
+      //         return getStrippedText( text.substring( 1 ).trim() );
       //      }
       //      if ( text.endsWith( "]" ) && !text.contains( "[" ) ) {
-      //         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+      //         return getStrippedText( text.substring( 0, text.length() - 2 ).trim() );
       //      }
       //      if ( text.endsWith( ")" ) && !text.contains( "(" ) ) {
-      //         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+      //         return getStrippedText( text.substring( 0, text.length() - 2 ).trim() );
       //      }
       String strippedText = text.trim();
       // Text in umls can have multiple suffixes and/or prefixes.  Stripping just once doesn't do the trick
@@ -284,9 +245,6 @@ final public class UmlsTermUtil {
       if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
          return "";
       }
-//      if ( strippedText.length() != text.trim().length() ) {
-//         System.out.println( text.trim() + " > " + strippedText );
-//      }
       return strippedText;
    }
 
@@ -325,225 +283,5 @@ final public class UmlsTermUtil {
       return Collections.emptyList();
    }
 
-   private Collection<String> autoExtractAcronyms( final String tokenizedText ) {
-      final int dashIndex = tokenizedText.indexOf( '-' );
-      if ( dashIndex > 1 ) {
-         // have text ABC - DEF, check for acronym
-         final String acronym = tokenizedText.substring( 0, dashIndex - 1 ).trim();
-         if ( acronym.isEmpty() || acronym.length() > 8 || acronym.equals( "dose" ) ) {
-            return Collections.emptyList();
-         }
-         final String[] splits = acronym.split( "\\s+" );
-         if ( (splits.length == 1 && acronym.length() > 6) || splits.length > 2 ) {
-            return Collections.emptyList();
-         }
-         final String definition = tokenizedText.substring( dashIndex + 1 ).trim();
-         if ( definition.isEmpty() ) {
-            return Collections.emptyList();
-         }
-         if ( (acronym.charAt( 0 ) != definition.charAt( 0 ) && !definition.contains( "' s" )) ) {
-            return Collections.emptyList();
-         }
-         final String[] definitionSplits = definition.split( "\\s+" );
-         if ( acronym.length() != definitionSplits.length
-               || definitionSplits[definitionSplits.length - 1].charAt( 0 ) != acronym.charAt(
-               acronym.length() - 1 ) ) {
-            return Collections.emptyList();
-         }
-         final Collection<String> extractedAbbreviations = new HashSet<>( 2 );
-         extractedAbbreviations.add( acronym );
-         extractedAbbreviations.add( definition );
-         return extractedAbbreviations;
-      }
-      return Collections.emptyList();
-   }
-
-   private Collection<String> autoExtractColonBracketTerms( final String tokenizedText ) {
-      final int colonIndex = tokenizedText.indexOf( ':' );
-      if ( colonIndex < 0 ) {
-         return Collections.emptyList();
-      }
-      final int orIndex = tokenizedText.indexOf( "] or [" );
-      final int andOrIndex = tokenizedText.indexOf( "] & / or [" );
-      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
-         return Collections.emptyList();
-      }
-      String splitter = "\\] or \\[";
-      if ( andOrIndex > 0 ) {
-         splitter = "\\] & / or \\[";
-      }
-      final Collection<String> extractedTerms = new HashSet<>( 2 );
-      final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
-      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
-      final String[] splits = types.split( splitter );
-      for ( String split : splits ) {
-         split = trimBracketText( split );
-         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
-               || split.equals( "other" ) || split.isEmpty() ) {
-            extractedTerms.addAll( getFormattedTexts( thing ) );
-         } else {
-            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
-            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
-         }
-      }
-      return extractedTerms;
-   }
-
-   private Collection<String> autoExtractAndBracketTerms( final String tokenizedText ) {
-      final int andIndex = tokenizedText.indexOf( "( &" );
-      if ( andIndex < 0 || tokenizedText.indexOf( "] or [" ) < andIndex ) {
-         return Collections.emptyList();
-      }
-      final Collection<String> extractedTerms = new HashSet<>( 3 );
-      final String thing = tokenizedText.substring( 0, andIndex - 1 ).trim();
-      extractedTerms.add( thing );
-      final String types = tokenizedText.substring( andIndex + 3 ).trim();
-      final String[] splits = types.split( "\\] or \\[" );
-      for ( String split : splits ) {
-         split = trimBracketText( split );
-         extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
-         extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
-      }
-      return extractedTerms;
-   }
-
-   private Collection<String> autoExtractOrBracketTerms( final String tokenizedText ) {
-      if ( !tokenizedText.contains( "] or [" ) && !tokenizedText.contains( "] & / or [" ) ) {
-         return Collections.emptyList();
-      }
-      final int lastOf = tokenizedText.lastIndexOf( " of " );
-      if ( lastOf > tokenizedText.lastIndexOf( ']' ) ) {
-         final String ofTerm = tokenizedText.substring( lastOf ).trim();
-         final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
-                                                                                                      lastOf ).trim() );
-         final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
-         for ( String ofText : ofExtractions ) {
-            ofTexts.add( ofText + " " + ofTerm );
-         }
-         return ofTexts;
-      }
-      final Collection<String> extractedTerms = new HashSet<>( 2 );
-      String splitter = "\\] or \\[";
-      if ( tokenizedText.contains( "] & / or [" ) ) {
-         splitter = "\\] & / or \\[";
-      }
-      final String[] splits = tokenizedText.split( splitter );
-      for ( String split : splits ) {
-         split = trimBracketText( split );
-         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
-            extractedTerms.addAll( getFormattedTexts( split ) );
-         }
-      }
-      return extractedTerms;
-   }
-
-   private Collection<String> autoExtractOrParaTerms( final String tokenizedText ) {
-      if ( !tokenizedText.contains( ") or (" ) && !tokenizedText.contains( ") & / or (" ) ) {
-         return Collections.emptyList();
-      }
-      final int lastOf = tokenizedText.lastIndexOf( " of " );
-      if ( lastOf > tokenizedText.lastIndexOf( ')' ) ) {
-         final String ofTerm = tokenizedText.substring( lastOf ).trim();
-         final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
-                                                                                                      lastOf ).trim() );
-         final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
-         for ( String ofText : ofExtractions ) {
-            ofTexts.add( ofText + " " + ofTerm );
-         }
-         return ofTexts;
-      }
-      final Collection<String> extractedTerms = new HashSet<>( 2 );
-      String splitter = "\\) or \\(";
-      if ( tokenizedText.contains( ") & / or (" ) ) {
-         splitter = "\\) & / or \\(";
-      }
-      final String[] splits = tokenizedText.split( splitter );
-      for ( String split : splits ) {
-         split = trimParaText( split );
-         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
-            extractedTerms.addAll( getFormattedTexts( split ) );
-         }
-      }
-      return extractedTerms;
-   }
-
-   private Collection<String> autoExtractColonParaTerms( final String tokenizedText ) {
-      final int colonIndex = tokenizedText.indexOf( ':' );
-      if ( colonIndex < 0 || colonIndex > tokenizedText.indexOf( '(' ) ) {
-         return Collections.emptyList();
-      }
-      final int orIndex = tokenizedText.indexOf( ") or (" );
-      final int andOrIndex = tokenizedText.indexOf( ") & / or (" );
-      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
-         return Collections.emptyList();
-      }
-      String splitter = "\\) or \\(";
-      if ( andOrIndex > 0 ) {
-         splitter = "\\) & / or \\(";
-      }
-      final Collection<String> extractedTerms = new HashSet<>( 2 );
-      final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
-      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
-      final String[] splits = types.split( splitter );
-      for ( String split : splits ) {
-         split = trimParaText( split );
-         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
-               || split.equals( "other" ) || split.isEmpty() ) {
-            extractedTerms.addAll( getFormattedTexts( thing ) );
-         } else {
-            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
-            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
-         }
-      }
-      return extractedTerms;
-   }
-
-   private Collection<String> autoExtractAndOrOtherTerms( final String tokenizedText ) {
-      final int otherIndex = tokenizedText.indexOf( " & / or other " );
-      if ( otherIndex < 0 ) {
-         return Collections.emptyList();
-      }
-      final Collection<String> otherTexts = new HashSet<>( 2 );
-      otherTexts.add( tokenizedText.substring( 0, otherIndex ).trim() );
-      otherTexts.add( tokenizedText.substring( otherIndex + 14 ).trim() );
-      return otherTexts;
-   }
-
-   static private String trimParaText( String paraText ) {
-      if ( paraText.startsWith( "(" ) ) {
-         paraText = paraText.substring( 1 );
-      }
-      if ( paraText.endsWith( " nos " ) || paraText.endsWith( " nec " ) ) {
-         return paraText.substring( 0, paraText.length() - 4 ).trim();
-      } else if ( paraText.endsWith( ", unspecified " ) ) {
-         return paraText.substring( 0, paraText.length() - 14 ).trim();
-      } else if ( paraText.endsWith( " nos )" ) || paraText.endsWith( " nec )" ) ) {
-         return paraText.substring( 0, paraText.length() - 5 ).trim();
-      } else if ( paraText.endsWith( ", unspecified )" ) ) {
-         return paraText.substring( 0, paraText.length() - 15 ).trim();
-      } else if ( paraText.endsWith( ")" ) ) {
-         return paraText.substring( 0, paraText.length() - 1 ).trim();
-      }
-      return paraText.trim();
-   }
-
-   static private String trimBracketText( String bracketText ) {
-      if ( bracketText.startsWith( "[" ) ) {
-         bracketText = bracketText.substring( 1 );
-      }
-      if ( bracketText.endsWith( " nos " ) || bracketText.endsWith( " nec " ) ) {
-         return bracketText.substring( 0, bracketText.length() - 4 ).trim();
-      } else if ( bracketText.endsWith( ", unspecified " ) ) {
-         return bracketText.substring( 0, bracketText.length() - 14 ).trim();
-      } else if ( bracketText.endsWith( " nos ]" ) || bracketText.endsWith( " nec ]" ) ) {
-         return bracketText.substring( 0, bracketText.length() - 5 ).trim();
-      } else if ( bracketText.endsWith( ", unspecified ]" ) ) {
-         return bracketText.substring( 0, bracketText.length() - 15 ).trim();
-      } else if ( bracketText.endsWith( "]" ) ) {
-         return bracketText.substring( 0, bracketText.length() - 1 ).trim();
-      }
-      return bracketText.trim();
-   }
-
 
 }

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordDbWriter.java Tue Jul  5 19:40:57 2016
@@ -28,6 +28,7 @@ final public class RareWordDbWriter {
 
    static private final Logger LOGGER = LogManager.getLogger( "RareWordDbWriter" );
 
+
    private RareWordDbWriter() {
    }
 
@@ -49,6 +50,25 @@ final public class RareWordDbWriter {
                                      final String url, final String user, final String pass ) {
       // Get Count of appearance in dictionary per term token
       final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( concepts.values() );
+      // For pmsdn tesseract user-words
+//      try ( Writer rareWordWriter = new BufferedWriter( new FileWriter( "C:\\Spiffy\\prj_pmsdn\\data\\internal\\dictionaryTemp\\rarewords.txt" ) ) ) {
+//         for ( Map.Entry<String,Integer> entry : tokenCounts.entrySet() ) {
+//            if ( entry.getValue() > 24 && entry.getKey().length() > 4 && entry.getKey().length() < 15 ) {
+//               boolean allAlpha = true;
+//               for ( char c : entry.getKey().toCharArray() ) {
+//                  if ( !Character.isLetter( c ) ) {
+//                     allAlpha = false;
+//                     break;
+//                  }
+//               }
+//               if ( allAlpha ) {
+//                  rareWordWriter.write( entry.getKey() + "\n" );
+//               }
+//            }
+//         }
+//      } catch ( IOException ioE ) {
+//         LOGGER.error( ioE.getMessage() );
+//      }
       // Create insert sql statements
       final String mainTableSql = JdbcUtil.createRowInsertSql( "CUI_TERMS", CuiTermsField.values() );
       final String tuiTableSql = JdbcUtil.createCodeInsertSql( "tui" );
@@ -70,14 +90,15 @@ final public class RareWordDbWriter {
          for ( Map.Entry<Long, Concept> conceptEntry : concepts.entrySet() ) {
             final long cui = conceptEntry.getKey();
             final Concept concept = conceptEntry.getValue();
-            final Collection<String> texts = concept.getTexts();
-            if ( texts.isEmpty() ) {
-               continue;
-            }
             // write main term table
+            boolean conceptOk = false;
             for ( String text : conceptEntry.getValue().getTexts() ) {
                final RareWordUtil.IndexedRareWord indexedRareWord = RareWordUtil.getIndexedRareWord( text,
                                                                                                      tokenCounts );
+               if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+                  continue;
+               }
+               conceptOk = true;
                mainTableStatement.setLong( CuiTermsField.CUI.__index, cui );
                mainTableStatement.setInt( CuiTermsField.RINDEX.__index, indexedRareWord.__index );
                mainTableStatement.setInt( CuiTermsField.TCOUNT.__index, indexedRareWord.__tokenCount );
@@ -86,6 +107,9 @@ final public class RareWordDbWriter {
                mainTableStatement.executeUpdate();
                mainTableCount = incrementCount( "Main", mainTableCount );
             }
+            if ( !conceptOk ) {
+               continue;
+            }
             // write tui table
             for ( Tui tui : concept.getTuis() ) {
                tuiStatement.setLong( CuiTermsField.CUI.__index, cui );
@@ -144,9 +168,10 @@ final public class RareWordDbWriter {
       LOGGER.info( "Main Table Rows " + mainTableCount );
       LOGGER.info( "Tui Table Rows " + tuiTableCount );
       LOGGER.info( "Preferred Term Table Rows " + preftermTableCount );
-      final Function<String,String> getCountInfo
-            = vocabulary -> vocabulary + " Table Rows " + codeTableCounts.get( vocabulary );
-      Vocabulary.getInstance().getAllVocabularies().stream().map( getCountInfo ).forEach( LOGGER::info );
+      final Function<String,String> vocabCount = v -> v + " Table Rows " + codeTableCounts.get( v );
+      Vocabulary.getInstance().getAllVocabularies().stream()
+            .map( vocabCount )
+            .forEach( LOGGER::info );
       return true;
    }
 

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/RareWordUtil.java Tue Jul  5 19:40:57 2016
@@ -150,6 +150,8 @@ final public class RareWordUtil {
       }
    }
 
+   static public final IndexedRareWord NULL_RARE_WORD = new IndexedRareWord( null, -1, -1 );
+
    static public IndexedRareWord getIndexedRareWord( final String text,
                                                      final Map<String, Integer> tokenCounts ) {
       final String[] tokens = text.split( "\\s+" );
@@ -162,6 +164,9 @@ final public class RareWordUtil {
             bestCount = count;
          }
       }
+      if ( bestCount == Integer.MAX_VALUE ) {
+         return NULL_RARE_WORD;
+      }
       return new IndexedRareWord( tokens[bestIndex], bestIndex, tokens.length );
    }
 }

Modified: ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java?rev=1751544&r1=1751543&r2=1751544&view=diff
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java (original)
+++ ctakes/sandbox/dictionary-gui/src/main/java/org/apache/ctakes/dictionary/creator/util/TextTokenizer.java Tue Jul  5 19:40:57 2016
@@ -1,6 +1,7 @@
 package org.apache.ctakes.dictionary.creator.util;
 
 import java.util.*;
+import java.util.stream.Collectors;
 
 /**
  * Author: SPF
@@ -183,16 +184,10 @@ final public class TextTokenizer {
          // get rid of last comma or semicolon or period
          splits[splits.length - 1] = lastSplit.substring( 0, lastSplit.length() - 1 );
       }
-      final StringBuilder sb = new StringBuilder();
-      for ( String split : splits ) {
-         final List<String> tokens = getTokens( split, separateDigits );
-         for ( String token : tokens ) {
-            sb.append( token ).append( " " );
-         }
-      }
-      // trim whitespace
-      sb.setLength( Math.max( 0, sb.length() - 1 ) );
-      return sb.toString();
+      return Arrays.stream( splits )
+            .map( s -> getTokens( s, separateDigits ) )
+            .flatMap( Collection::stream )
+            .collect( Collectors.joining( " " ) );
    }
 
 

Added: ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java?rev=1751544&view=auto
==============================================================================
--- ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java (added)
+++ ctakes/sandbox/dictionary-gui/src/test/java/org/apache/ctakes/dictionary/creator/gui/umls/DoseUtilTester.java Tue Jul  5 19:40:57 2016
@@ -0,0 +1,30 @@
+package org.apache.ctakes.dictionary.creator.gui.umls;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.logging.Logger;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/5/2016
+ */
+public class DoseUtilTester {
+
+   static private final Logger LOGGER = Logger.getLogger( "DoseUtilTester" );
+
+   @Test
+   public void testHasUnit() {
+      Assert.assertTrue( "No ml detected!",
+            DoseUtil.hasUnit( "alcohol . 31 ml in 1 ml topical cloth [ alcohol wipes ]" ) );
+      Assert.assertTrue( "No mpa detected!",
+            DoseUtil.hasUnit( "polyquaternium - 32 ( 30000 mpa . s at 2 % )" ) );
+      Assert.assertTrue( "No mg detected!",
+            DoseUtil.hasUnit( "myasthenia gravis ( mg )" ) );
+      Assert.assertTrue( "No % detected!",
+            DoseUtil.hasUnit( "imiquimod 2 . 5 % top cream" ) );
+
+   }
+
+}