You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2020/09/25 01:04:47 UTC
svn commit: r1881995 [1/4] - in /ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased: ./ table/ term/ umls/ umls/abbreviation/ umls/file/

Author: seanfinan
Date: Fri Sep 25 01:04:47 2020
New Revision: 1881995

URL: http://svn.apache.org/viewvc?rev=1881995&view=rev
Log:
New Case Sensitive Dictionary Lookup Dictionary Creator Gui

Added:
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/UmlsParser.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Atn.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/IsPref.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Lat.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Rel.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Rela.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Srl.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Stt.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Stype.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Ts.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrConso.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrRel.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrSat.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrSty.java
    ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/Tty.java

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,77 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.component.DisablerPane;
+import org.apache.log4j.Logger;
+
+import javax.swing.*;
+import java.awt.*;
+
+
+/**
+ * Launcher for the cTAKES Cased Dictionary Creator gui.
+ * Creates the application frame, fills it with a {@link CasedMainPanel}
+ * and logs step-by-step usage instructions.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public class CasedDictionaryCreator {
+
+   static private final Logger LOGGER = Logger.getLogger( "CasedDictionaryCreator" );
+
+
+   /**
+    * @return a frame of (minimum) size 1024 x 768 that exits the jvm on close
+    * and carries a menu bar holding a single, empty "File" menu.
+    */
+   static private JFrame createFrame() {
+      final JFrame frame = new JFrame( "cTAKES Cased Dictionary Creator" );
+      frame.setDefaultCloseOperation( WindowConstants.EXIT_ON_CLOSE );
+      // Use 1024 x 768 as the minimum required resolution (XGA)
+      // iPhone 3 : 480 x 320 (3:2, HVGA)
+      // iPhone 4 : 960 x 640  (3:2, unique to Apple)
+      // iPhone 5 : 1136 x 640 (under 16:9, unique to Apple)
+      // iPad 3&4 : 2048 x 1536 (4:3, QXGA)
+      // iPad Mini: 1024 x 768 (4:3, XGA)
+      final Dimension size = new Dimension( 1024, 768 );
+      frame.setSize( size );
+      frame.setMinimumSize( size );
+      final JMenuBar menuBar = new JMenuBar();
+      final JMenu fileMenu = new JMenu( "File" );
+      menuBar.add( fileMenu );
+
+      frame.setJMenuBar( menuBar );
+      return frame;
+   }
+
+   /**
+    * @return the main content panel of the application.
+    */
+   static private JComponent createMainPanel() {
+      return new CasedMainPanel();
+   }
+
+   /**
+    * Sets the system look and feel, shows the application frame and logs the usage instructions.
+    *
+    * @param args unused.
+    */
+   public static void main( final String... args ) {
+      // Request the macOS screen menu bar before any look and feel or menu bar is created.
+      // It used to be set after the JMenuBar had already been built.
+      // NOTE(review): the property is presumably read when the menu bar ui is created - confirm on macOS.
+      System.setProperty( "apple.laf.useScreenMenuBar", "true" );
+      try {
+         UIManager.setLookAndFeel( UIManager.getSystemLookAndFeelClassName() );
+         UIManager.getDefaults().put( "SplitPane.border", BorderFactory.createEmptyBorder() );
+      } catch ( ClassNotFoundException | InstantiationException
+            | IllegalAccessException | UnsupportedLookAndFeelException multE ) {
+         // Not fatal: the default look and feel stays in place.  Keep the throwable so the cause is not lost.
+         LOGGER.error( multE.getLocalizedMessage(), multE );
+      }
+      final JFrame frame = createFrame();
+      final JComponent mainPanel = createMainPanel();
+      frame.add( mainPanel );
+      frame.pack();
+      frame.setVisible( true );
+      DisablerPane.getInstance().initialize( frame );
+      LOGGER.info( "1. Select your Apache cTAKES root directory." );
+      LOGGER.info( "   It can be a pre-built binary installation or a developer sandbox." );
+      LOGGER.info( "2. Select your Unified Medical Language System (UMLS) root directory." );
+      LOGGER.info( "   Once selected, your UMLS database will be parsed for available content." );
+      LOGGER.info( "3. Select your desired Vocabulary sources and targets in the left table." );
+      LOGGER.info( "   Recommended Vocabulary sources are pre-selected." );
+      LOGGER.info( "4. Select your desired Languages in the center table." );
+      LOGGER.info( "   English (ENG) is pre-selected if available." );
+      LOGGER.info( "5. Select your desired Semantic Types in the right table." );
+      LOGGER.info( "   Recommended Semantic types are pre-selected." );
+      LOGGER.info( "6. Type a name for your dictionary." );
+      LOGGER.info( "7. Click \'Build Dictionary\'" );
+      LOGGER.info( "-  You can resize this log panel by clicking the top and dragging up or down." );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,396 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.gui.component.DisablerPane;
+import org.apache.ctakes.gui.component.FileChooserPanel;
+import org.apache.ctakes.gui.component.LoggerPanel;
+import org.apache.ctakes.gui.component.PositionedSplitPane;
+import org.apache.ctakes.gui.dictionary.cased.table.SemanticTuiModel;
+import org.apache.ctakes.gui.dictionary.cased.table.TextTypeModel;
+import org.apache.ctakes.gui.dictionary.cased.term.CuiTerm;
+import org.apache.ctakes.gui.dictionary.cased.umls.UmlsParser;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+import org.apache.ctakes.gui.dictionary.umls.LanguageTableModel;
+import org.apache.ctakes.gui.dictionary.umls.MrconsoIndex;
+import org.apache.ctakes.gui.dictionary.umls.MrsabIndex;
+import org.apache.ctakes.gui.dictionary.umls.SourceTableModel;
+import org.apache.ctakes.gui.dictionary.util.FileUtil;
+import org.apache.log4j.Logger;
+
+import javax.swing.*;
+import javax.swing.border.EmptyBorder;
+import javax.swing.table.TableModel;
+import javax.swing.text.JTextComponent;
+import java.awt.*;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 12/10/2015
+ */
+final class CasedMainPanel extends JPanel {
+
+   static private final Logger LOGGER = Logger.getLogger( "CasedMainPanel" );
+
+   // Selected UMLS directory (the one that contains META); starts as the working directory.
+   private String _umlsDirPath = System.getProperty( "user.dir" );
+   // Selected cTAKES directory; starts as the working directory.
+   private String _ctakesPath = System.getProperty( "user.dir" );
+   // Table models backing the "Semantic Types", "Vocabularies", "Text Types" and "Languages" tabs.
+   private final SemanticTuiModel _tuiModel = new SemanticTuiModel();
+   private final SourceTableModel _sourceModel = new SourceTableModel();
+   private final TextTypeModel _textTypeModel = new TextTypeModel();
+   private final LanguageTableModel _languageModel = new LanguageTableModel();
+
+   /**
+    * Lays out the panel: the cTAKES and UMLS directory choosers stacked at the top,
+    * the tabbed tables, dictionary name row and log below them.
+    */
+   CasedMainPanel() {
+      super( new BorderLayout() );
+      final JComponent directoryPanel = new JPanel( new GridLayout( 2, 1 ) );
+      final FileChooserPanel ctakesChooser
+            = new FileChooserPanel( "cTAKES Installation:", _ctakesPath, true, new CtakesDirListener() );
+      directoryPanel.add( ctakesChooser );
+      final FileChooserPanel umlsChooser
+            = new FileChooserPanel( "UMLS Installation:", _umlsDirPath, true, new UmlsDirListener() );
+      directoryPanel.add( umlsChooser );
+      add( directoryPanel, BorderLayout.NORTH );
+
+      final JComponent centerPanel
+            = createCenterPanel( _sourceModel, _tuiModel, _textTypeModel, _languageModel );
+      add( centerPanel, BorderLayout.CENTER );
+   }
+
+   /**
+    * Builds the central component: a vertical split with the tabbed tables and the
+    * dictionary name / build row on top and the logger panel at the bottom.
+    *
+    * @param sourceModel   model for the "Vocabularies" tab.
+    * @param tuiModel      model for the "Semantic Types" tab.
+    * @param textTypeModel model for the "Text Types" tab.
+    * @param languageModel model for the "Languages" tab.
+    * @return the split pane holding everything.
+    */
+   private JComponent createCenterPanel( final TableModel sourceModel,
+                                         final TableModel tuiModel,
+                                         final TableModel textTypeModel,
+                                         final TableModel languageModel ) {
+      final JTabbedPane tabs = new JTabbedPane();
+      tabs.addTab( "Vocabularies", createTable( sourceModel ) );
+      tabs.addTab( "Semantic Types", createTable( tuiModel ) );
+      tabs.addTab( "Text Types", create50_100Table( textTypeModel ) );
+      tabs.addTab( "Languages", createLangTable( languageModel ) );
+
+      // Tabs fill the upper area, the name field and build button sit right below them.
+      final JPanel upperPanel = new JPanel( new BorderLayout() );
+      upperPanel.add( tabs, BorderLayout.CENTER );
+      upperPanel.add( createGoPanel(), BorderLayout.SOUTH );
+
+      final JSplitPane splitPane = new PositionedSplitPane( JSplitPane.VERTICAL_SPLIT );
+      splitPane.setTopComponent( upperPanel );
+      splitPane.setBottomComponent( LoggerPanel.createLoggerPanel() );
+      splitPane.setDividerLocation( 0.6d );
+      return splitPane;
+   }
+
+   /**
+    * Wraps a sortable table for the given model in a scroll pane.
+    * The first two columns are capped at 70 and 100 pixels, the last column takes the remaining width.
+    *
+    * @param model table model, must have at least two columns.
+    * @return scroll pane containing the table.
+    */
+   static private JComponent createTable( final TableModel model ) {
+      final JTable jTable = new JTable( model );
+      jTable.setCellSelectionEnabled( false );
+      jTable.setShowVerticalLines( false );
+      jTable.setAutoCreateRowSorter( true );
+      jTable.setAutoResizeMode( JTable.AUTO_RESIZE_LAST_COLUMN );
+      final int[] maxWidths = { 70, 100 };
+      for ( int column = 0; column < maxWidths.length; column++ ) {
+         jTable.getColumnModel().getColumn( column ).setMaxWidth( maxWidths[ column ] );
+      }
+      return new JScrollPane( jTable );
+   }
+
+   /**
+    * Wraps a sortable table for the given model in a scroll pane, capping the first two column widths.
+    * NOTE(review): despite the method name the caps are 70 and 100 pixels, the same as createTable - confirm intent.
+    *
+    * @param model table model, must have at least two columns.
+    * @return scroll pane containing the table.
+    */
+   static private JComponent create50_100Table( final TableModel model ) {
+      final JTable jTable = new JTable( model );
+      jTable.setCellSelectionEnabled( false );
+      jTable.setShowVerticalLines( false );
+      jTable.setAutoCreateRowSorter( true );
+      jTable.setAutoResizeMode( JTable.AUTO_RESIZE_LAST_COLUMN );
+      final int firstColumnMax = 70;
+      final int secondColumnMax = 100;
+      jTable.getColumnModel().getColumn( 0 ).setMaxWidth( firstColumnMax );
+      jTable.getColumnModel().getColumn( 1 ).setMaxWidth( secondColumnMax );
+      final JScrollPane scroll = new JScrollPane( jTable );
+      return scroll;
+   }
+
+   /**
+    * Wraps a sortable table for the language model in a scroll pane.
+    * Only the first column is capped (50 pixels).
+    *
+    * @param model table model, must have at least one column.
+    * @return scroll pane containing the table.
+    */
+   static private JComponent createLangTable( final TableModel model ) {
+      final JTable languageTable = new JTable( model );
+      languageTable.setCellSelectionEnabled( false );
+      languageTable.setShowVerticalLines( false );
+      languageTable.setAutoCreateRowSorter( true );
+      languageTable.setAutoResizeMode( JTable.AUTO_RESIZE_LAST_COLUMN );
+      final int firstColumnMax = 50;
+      languageTable.getColumnModel().getColumn( 0 ).setMaxWidth( firstColumnMax );
+      final JScrollPane scroll = new JScrollPane( languageTable );
+      return scroll;
+   }
+
+   /**
+    * Builds the row with the "Dictionary Name:" label, the name text field (preset to "custom")
+    * and the button that triggers {@link BuildDictionaryAction}.
+    *
+    * @return panel holding label, text field and button.
+    */
+   private JComponent createGoPanel() {
+      final JLabel nameLabel = new JLabel( "Dictionary Name:" );
+      nameLabel.setPreferredSize( new Dimension( 100, 0 ) );
+      nameLabel.setHorizontalAlignment( SwingConstants.TRAILING );
+
+      final JTextField nameField = new JTextField( "custom" );
+      // The action reads the dictionary name from the text field when the button is pressed.
+      final JButton goButton = new JButton( new BuildDictionaryAction( nameField ) );
+
+      final JPanel goPanel = new JPanel( new BorderLayout( 10, 10 ) );
+      goPanel.setBorder( new EmptyBorder( 2, 10, 2, 10 ) );
+      goPanel.add( nameLabel, BorderLayout.WEST );
+      goPanel.add( nameField, BorderLayout.CENTER );
+      goPanel.add( goButton, BorderLayout.EAST );
+      return goPanel;
+   }
+
+
+   /**
+    * Validates a user-selected UMLS directory and, if valid, stores the UMLS root (the parent of META).
+    * The selection may be the META directory itself or the directory that contains META.
+    * When neither holds MRCONSO.RRF an error dialog is shown and the stored path is left unchanged.
+    *
+    * @param umlsDirPath path selected by the user.
+    * @return the stored UMLS root after this call; equal to the previous value when the selection is invalid.
+    */
+   private String setUmlsDirPath( final String umlsDirPath ) {
+      final File directMrConso = new File( umlsDirPath, "MRCONSO.RRF" );
+      if ( directMrConso.isFile() ) {
+         // META itself was selected.  Resolve against the absolute path so that a relative
+         // selection such as "META" still yields a parent instead of null.
+         final String umlsRoot = directMrConso.getAbsoluteFile().getParentFile().getParent();
+         if ( umlsRoot != null ) {
+            _umlsDirPath = umlsRoot;
+         } else {
+            // META sits at a filesystem root: there is no directory that could serve as UMLS root.
+            error( "Invalid UMLS Installation", umlsDirPath + " is not a valid path to a UMLS installation" );
+         }
+         return _umlsDirPath;
+      }
+      final String plusMetaPath = new File( umlsDirPath, "META" ).getPath();
+      final File metaMrConso = new File( plusMetaPath, "MRCONSO.RRF" );
+      if ( metaMrConso.isFile() ) {
+         _umlsDirPath = umlsDirPath;
+      } else {
+         error( "Invalid UMLS Installation", umlsDirPath + " is not a valid path to a UMLS installation" );
+      }
+      return _umlsDirPath;
+   }
+
+   /**
+    * Parses the vocabularies and languages of the current UMLS directory on a background thread.
+    */
+   private void loadSources() {
+      final ExecutorService executor = Executors.newSingleThreadExecutor();
+      executor.execute( new SourceLoadRunner( _umlsDirPath ) );
+      // The executor is single-use.  shutdown() still runs the submitted task,
+      // then lets the worker thread end instead of lingering after every call.
+      executor.shutdown();
+   }
+
+   /**
+    * Reads META/MRCONSO.RRF and META/MRSAB.RRF below a UMLS directory and fills the
+    * source (vocabulary) and language table models.  While running, the gui is covered
+    * by the disabler pane and shows a wait cursor.
+    */
+   private class SourceLoadRunner implements Runnable {
+      private final String __umlsDirPath;
+
+      /**
+       * @param umlsDirPath directory that contains the META directory.
+       */
+      private SourceLoadRunner( final String umlsDirPath ) {
+         __umlsDirPath = umlsDirPath;
+      }
+
+      @Override
+      public void run() {
+         final JFrame frame = (JFrame)SwingUtilities.getRoot( CasedMainPanel.this );
+         frame.setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
+         DisablerPane.getInstance().setVisible( true );
+         try {
+            final Collection<String> sources = parseSourcesAndLanguages();
+            parseSourceInfo( sources );
+         } finally {
+            // Always give the gui back, even if parsing dies with an unchecked exception.
+            DisablerPane.getInstance().setVisible( false );
+            frame.setCursor( Cursor.getDefaultCursor() );
+         }
+      }
+
+      /**
+       * Collects the distinct vocabulary and language values of MRCONSO.RRF and hands them to the table models.
+       * An IOException is reported through an error dialog; the models are then not updated.
+       *
+       * @return vocabularies read so far, possibly incomplete or empty after a read failure.
+       */
+      private Collection<String> parseSourcesAndLanguages() {
+         final File mrConso = new File( __umlsDirPath + "/META", "MRCONSO.RRF" );
+         final String mrConsoPath = mrConso.getPath();
+         LOGGER.info( "Parsing vocabulary types from " + mrConsoPath );
+         final Collection<String> sources = new HashSet<>();
+         final Collection<String> languages = new HashSet<>();
+         try ( final BufferedReader reader = FileUtil.createReader( mrConsoPath ) ) {
+            java.util.List<String> tokens = FileUtil.readBsvTokens( reader, mrConsoPath );
+            while ( tokens != null ) {
+               if ( tokens.size() > MrconsoIndex.SOURCE._index ) {
+                  sources.add( tokens.get( MrconsoIndex.SOURCE._index ) );
+                  languages.add( tokens.get( MrconsoIndex.LANGUAGE._index ) );
+               }
+               tokens = FileUtil.readBsvTokens( reader, mrConsoPath );
+            }
+            LOGGER.info( "Parsed " + sources.size() + " vocabulary types" );
+            _sourceModel.setSources( sources );
+            LOGGER.info( "Parsed " + languages.size() + " languages" );
+            _languageModel.setLangauges( languages );
+         } catch ( IOException ioE ) {
+            error( "Vocabulary Parse Error", ioE.getMessage() );
+         }
+         return sources;
+      }
+
+      /**
+       * Reads name, version and count column for the wanted vocabularies from MRSAB.RRF
+       * and hands them to the source table model.
+       * When a vocabulary appears on several lines, the version of the line with the longest
+       * count text is kept.
+       *
+       * @param sources vocabularies found in MRCONSO.RRF; lines for other vocabularies are ignored.
+       */
+      private void parseSourceInfo( final Collection<String> sources ) {
+         final File mrSab = new File( __umlsDirPath + "/META", "MRSAB.RRF" );
+         final String mrSabPath = mrSab.getPath();
+         final Map<String, String> sourceNames = new HashMap<>();
+         final Map<String, String> sourceVersions = new HashMap<>();
+         final Map<String, String> sourceCuiCounts = new HashMap<>();
+         LOGGER.info( "Parsing vocabulary names from " + mrSabPath );
+         try ( final BufferedReader reader = FileUtil.createReader( mrSabPath ) ) {
+            int lineCount = 0;
+            java.util.List<String> tokens = FileUtil.readBsvTokens( reader, mrSabPath );
+            while ( tokens != null ) {
+               lineCount++;
+               if ( tokens.size() > MrsabIndex.CFR._index ) {
+                  final String sab = tokens.get( MrsabIndex.RSAB._index );
+                  if ( sources.contains( sab ) ) {
+                     sourceNames.put( sab, tokens.get( MrsabIndex.SON._index ) );
+                     final String oldCounts = sourceCuiCounts.getOrDefault( sab, "" );
+                     final String newCounts = tokens.get( MrsabIndex.CFR._index );
+                     if ( newCounts.length() > oldCounts.length() ) {
+                        sourceVersions.put( sab, tokens.get( MrsabIndex.SVER._index ) );
+                        sourceCuiCounts.put( sab, newCounts );
+                     }
+                  }
+               }
+               if ( lineCount % 100000 == 0 ) {
+                  LOGGER.info( "File Line " + lineCount + "\t Vocabularies " + sources.size() );
+               }
+               // Fixed: this used to pass the MRCONSO path although the MRSAB reader is being read.
+               tokens = FileUtil.readBsvTokens( reader, mrSabPath );
+            }
+            LOGGER.info( "Parsed " + sources.size() + " vocabulary names" );
+            _sourceModel.setSourceInfo( sourceNames, sourceVersions, sourceCuiCounts );
+         } catch ( IOException ioE ) {
+            error( "Vocabulary Parse Error", ioE.getMessage() );
+         }
+      }
+   }
+
+   /**
+    * Builds the dictionary on a background thread using the current selections of all table models.
+    *
+    * @param dictionaryName name of the dictionary to create.
+    */
+   private void buildDictionary( final String dictionaryName ) {
+      final ExecutorService executor = Executors.newSingleThreadExecutor();
+      executor.execute( new CasedDictionaryBuilder( _umlsDirPath,
+            _ctakesPath, dictionaryName,
+            _sourceModel.getWantedSources(),
+            _sourceModel.getWantedTargets(),
+            _tuiModel.getWantedTuis(),
+            _textTypeModel.getWantedTypes(),
+            _languageModel.getWantedLanguages() ) );
+      // The executor is single-use.  shutdown() still runs the submitted build,
+      // then lets the worker thread end instead of lingering after every build.
+      executor.shutdown();
+   }
+
+   private void error( final String title, final String message ) {
+      LOGGER.error( message );
+      JOptionPane.showMessageDialog( CasedMainPanel.this, message, title, JOptionPane.ERROR_MESSAGE );
+   }
+
+   private class CasedDictionaryBuilder implements Runnable {
+      private final String _consoPath;
+      private final String _styPath;
+      private final String _rankPath;
+      private final String _hsqlPath;
+      private final String _dictionaryName;
+      private final Collection<String> _wantedVocabularies;
+      private final Collection<String> _writtenSchema;
+      private final Collection<SemanticTui> _wantedTuis;
+      private final Collection<Tty> _wantedTextTypes;
+      private final Collection<String> _wantedLanguages;
+
+      public CasedDictionaryBuilder( final String umlsPath,
+                                     final String ctakesPath,
+                                     final String dictionaryName,
+                                     final Collection<String> wantedVocabularies,
+                                     final Collection<String> writtenSchema,
+                                     final Collection<SemanticTui> wantedTuis,
+                                     final Collection<Tty> wantedTermTypes,
+                                     final Collection<String> wantedLanguages ) {
+         this( umlsPath + "/META/MRCONSO.RRF",
+               umlsPath + "/META/MRSTY.RRF",
+               umlsPath + "/META/MRRANK.RRF",
+               ctakesPath + "/resources/org/apache/ctakes/dictionary/lookup/cased",
+               dictionaryName,
+               wantedVocabularies,
+               writtenSchema,
+               wantedTuis,
+               wantedTermTypes,
+               wantedLanguages );
+      }
+
+      public CasedDictionaryBuilder( final String consoPath,
+                                     final String styPath,
+                                     final String rankPath,
+                                     final String hsqlPath,
+                                     final String dictionaryName,
+                                     final Collection<String> wantedVocabularies,
+                                     final Collection<String> writtenSchema,
+                                     final Collection<SemanticTui> wantedTuis,
+                                     final Collection<Tty> wantedTermTypes,
+                                     final Collection<String> wantedLanguages ) {
+         _consoPath = consoPath;
+         _styPath = styPath;
+         _rankPath = rankPath;
+         _hsqlPath = hsqlPath;
+         _dictionaryName = dictionaryName;
+         _wantedVocabularies = wantedVocabularies;
+         _writtenSchema = writtenSchema;
+         _wantedTuis = wantedTuis;
+         _wantedTextTypes = wantedTermTypes;
+         _wantedLanguages = wantedLanguages;
+      }
+
+      public void run() {
+         SwingUtilities.getRoot( CasedMainPanel.this ).setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
+         DisablerPane.getInstance().setVisible( true );
+         final Collection<CuiTerm> cuiTerms
+               = UmlsParser.createCuiTerms( _consoPath,
+               _styPath,
+               _rankPath,
+               _wantedTuis,
+               _wantedVocabularies,
+               _wantedTextTypes,
+               _wantedLanguages,
+               _writtenSchema );
+         if ( cuiTerms.isEmpty() ) {
+            final String message = "No Terms fit your parameters for the dictionary";
+            LOGGER.error( message );
+            JOptionPane
+                  .showMessageDialog( CasedMainPanel.this, message, "Cannot Build Dictionary", JOptionPane.ERROR_MESSAGE );
+         } else {
+            if ( HsqlWriter.writeHsql( _hsqlPath, _dictionaryName, _writtenSchema, cuiTerms ) ) {
+               final String message = "Dictionary " + _dictionaryName + " successfully built in " + _hsqlPath;
+               LOGGER.info( message );
+               JOptionPane
+                     .showMessageDialog( CasedMainPanel.this, message, "Dictionary Built", JOptionPane.INFORMATION_MESSAGE );
+            } else {
+               error( "Build Failure", "Dictionary " + _dictionaryName + " could not be built in " + _hsqlPath );
+            }
+            if ( CasedPiperWriter.writePiper( _hsqlPath, _dictionaryName, _writtenSchema ) ) {
+               final String message = "Dictionary Piper " + _dictionaryName + " successfully built in " + _hsqlPath;
+               LOGGER.info( message );
+               JOptionPane
+                     .showMessageDialog( CasedMainPanel.this, message, "Dictionary Piper Built", JOptionPane.INFORMATION_MESSAGE );
+            } else {
+               error( "Build Failure", "Dictionary Piper " + _dictionaryName + " could not be built in " + _hsqlPath );
+            }
+         }
+         DisablerPane.getInstance().setVisible( false );
+         SwingUtilities.getRoot( CasedMainPanel.this ).setCursor( Cursor.getDefaultCursor() );
+      }
+   }
+
+
+   /**
+    * Reacts to a selection in the UMLS directory chooser: the action command is taken as the
+    * selected path, and the vocabularies are (re)loaded only when the stored UMLS path changed.
+    */
+   private class UmlsDirListener implements ActionListener {
+      @Override
+      public void actionPerformed( final ActionEvent event ) {
+         final String previousPath = _umlsDirPath;
+         final String selectedPath = event.getActionCommand();
+         final String currentPath = setUmlsDirPath( selectedPath );
+         if ( previousPath.equals( currentPath ) ) {
+            // Invalid selection or same directory as before: nothing new to parse.
+            return;
+         }
+         loadSources();
+      }
+   }
+
+
+   /**
+    * Reacts to a selection in the cTAKES directory chooser: the action command is stored,
+    * unvalidated, as the cTAKES path.
+    */
+   private class CtakesDirListener implements ActionListener {
+      @Override
+      public void actionPerformed( final ActionEvent event ) {
+         final String selectedPath = event.getActionCommand();
+         _ctakesPath = selectedPath;
+      }
+   }
+
+
+   /**
+    * Validates the current selections and the dictionary name, then starts the dictionary build.
+    * Each failed check shows an error dialog and aborts.
+    */
+   private class BuildDictionaryAction extends AbstractAction {
+      private final JTextComponent __textComponent;
+
+      /**
+       * @param textComponent component from which the dictionary name is read when the action fires.
+       */
+      private BuildDictionaryAction( final JTextComponent textComponent ) {
+         super( "Build Dictionary" );
+         __textComponent = textComponent;
+      }
+
+      @Override
+      public void actionPerformed( final ActionEvent event ) {
+         if ( _sourceModel.getRowCount() == 0 ) {
+            error( "UMLS not yet loaded", "Please specify a UMLS installation." );
+            return;
+         }
+         if ( _sourceModel.getWantedSources().isEmpty() ) {
+            error( "Vocabularies not selected", "Please specify one or more source vocabularies." );
+            return;
+         }
+         if ( _textTypeModel.getWantedTypes().isEmpty() ) {
+            error( "Text Types not selected", "Please specify one or more source text types." );
+            return;
+         }
+         if ( _languageModel.getWantedLanguages().isEmpty() ) {
+            error( "Language not selected", "Please specify one or more languages." );
+            return;
+         }
+         final String text = __textComponent.getText();
+         // Surrounding whitespace is dropped so that a blank name is rejected instead of being used.
+         final String dictionaryName = text == null ? "" : text.trim();
+         if ( dictionaryName.isEmpty() ) {
+            error( "Invalid Dictionary Name", "Please Specify a Dictionary Name" );
+            return;
+         }
+         // Locale.ROOT keeps the lower-cased name independent of the user's locale (e.g. Turkish dotless i).
+         buildDictionary( dictionaryName.toLowerCase( java.util.Locale.ROOT ) );
+      }
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,153 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.HsqlUtil;
+import org.apache.log4j.Logger;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/28/2020
+ */
+final public class CasedPiperWriter {
+
+   static private final Logger LOGGER = Logger.getLogger( "CasedPiperWriter" );
+
+
+   // Private so the class cannot be instantiated from outside; writePiper is static.
+   private CasedPiperWriter() {
+   }
+
+
+   static public boolean writePiper( final String hsqlPath,
+                                     final String dictionaryName,
+                                     final Collection<String> writtenSchema ) {
+      final String url = HsqlUtil.URL_PREFIX + hsqlPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" +
+                         dictionaryName;
+      final List<String> schemaList = new ArrayList<>( writtenSchema );
+      Collections.sort( schemaList );
+      schemaList.add( "TUI" );
+      schemaList.add( "PREFERRED_TEXT" );
+      final String schemas = String.join( ",", schemaList );
+      final File piperFile = new File( hsqlPath, dictionaryName + ".piper" );
+      try ( final Writer writer = new BufferedWriter( new FileWriter( piperFile ) ) ) {
+         writer.write( "// This piper file contains instructions to set up your custom dictionary and encoders for Case-sensitive Dictionary Lookup.\n" );
+         writer.write( "// To use your new dictionary, load this piper in your main piper:\n" );
+         writer.write( "// load " + hsqlPath + "\n" );
+         writer.write( "\n" );
+         writer.write( "//             ===  Setup common to all Dictionaries  ===\n" );
+         writer.write( "//               =  Trigger Part of Speech  =\n" );
+         writer.write( "//    Use Verbs as lookup tokens.  Default = yes.\n" );
+         writer.write( "// set lookupVerbs=yes\n" );
+         writer.write( "//    Use Nouns as lookup tokens.  Default = yes.\n" );
+         writer.write( "// set lookupNouns=yes\n" );
+         writer.write( "//    Use Adjectives as lookup tokens.  Default = yes.\n" );
+         writer.write( "// set lookupAdjectives=yes\n" );
+         writer.write( "//    Use Adverbs as lookup tokens.  Default = yes.\n" );
+         writer.write( "// set lookupAdverbs=yes\n" );
+         writer.write( "//    Comma delimited array of other parts of speech to use for lookup.  Default is empty.\n" );
+         writer.write( "// set otherLookups=\n" );
+         writer.write( "//               =  Trigger Word Length  =\n" );
+         writer.write( "//    Minimum character span to use for lookup.  Default is 3.\n" );
+         writer.write( "// set minimumSpan=3\n" );
+         writer.write( "//               =  Text Loose Matching  =\n" );
+         writer.write( "//    Allow words to be skipped in lookup.  Default is no.\n" );
+         writer.write( "// set allowWordSkips=no\n" );
+         writer.write( "//    Number of words that can be skipped consecutively in lookup.  Default is 2.\n" );
+         writer.write( "// set consecutiveSkips=2\n" );
+         writer.write( "//    Number of words that can be skipped in total in lookup.  Default is 4.\n" );
+         writer.write( "// set totalSkips=4\n" );
+         writer.write( "//               =  Subsumption  =\n" );
+         writer.write( "//    Subsume small terms by larger enclosing terms in the same semantic group.  Default is yes.\n" );
+         writer.write( "//      This is not the default behavior of the default dictionary lookup, but that of the PrecisionTermConsumer.\n" );
+         writer.write( "// set subsume=yes\n" );
+         writer.write( "//    Subsume contained terms of the same and certain other semantic groups.  Default is yes.\n" );
+         writer.write( "//      This is not the default behavior of the default dictionary lookup, but that of the SemanticCleanupTermConsumer.\n" );
+         writer.write( "// set subsumeSemantics=yes\n" );
+         writer.write( "//    Comma delimited array of semantic types to group reassignment key:value pairs.  Default is empty.\n" );
+         writer.write( "//      Within the comma delimited array types and groups are separated by a colon.\n" );
+         writer.write( "//      Semantic Type can be indicated by name or TUI.  Semantic Group must be indicated by name.\n" );
+         writer.write( "//      Example:     set reassignSemantics=Cell:Finding,T065:Event\n" );
+         writer.write( "// set reassignSemantics=\n" );
+         writer.write( "\n" );
+         writer.write( "//             ===  Dictionaries Setup  ===\n" );
+         writer.write( "//               =  Dictionary Names  =\n" );
+         writer.write( "//    Comma delimited array of dictionary names.\n" );
+         writer.write( "set dictionaries=" + dictionaryName + "\n" );
+         writer.write( "\n" );
+         writer.write( "//             ===  Individual Dictionary Setup  ===\n" );
+         writer.write( "//    Individual Dictionary setup parameters are named {dictionaryName}_{parameterName}.\n" );
+         writer.write( "//               =  Dictionary Type  =\n" );
+         writer.write( "//    Declare the source type the Dictionary.  {dictionaryName}_type\n" );
+         writer.write( "set " + dictionaryName + "_type=JDBC\n" );
+         writer.write( "\n" );
+         writer.write( "//               =  JDBC Database  =\n" );
+         writer.write( "//    JDBC Driver for the Dictionary.  {dictionaryName}_driver\n" );
+         writer.write( "set " + dictionaryName + "_driver=org.hsqldb.jdbcDriver\n" );
+         writer.write( "//    Url for the Database.  {dictionaryName}_url\n" );
+         writer.write( "set " + dictionaryName + "_url=" + url + "\n" );
+         writer.write( "//    User for the Database.  {dictionaryName}_user.\n" );
+         writer.write( "// set " + dictionaryName + "_user=sa\n" );
+         writer.write( "//    Password for the Database.  {dictionaryName}_pass\n" );
+         writer.write( "// set " + dictionaryName + "_pass=\n" );
+         writer.write( "//               =  JDBC Term Tables  =\n" );
+         writer.write( "//    Upper case Term Table in the Database.  {dictionaryName}_upper\n" );
+         writer.write( "// set " + dictionaryName + "_upper=UPPER\n" );
+         writer.write( "//    Mixed case Term Table in the Database.  {dictionaryName}_mixed\n" );
+         writer.write( "// set " + dictionaryName + "_mixed=MIXED\n" );
+         writer.write( "//    Lower case Term Table in the Database.  {dictionaryName}_lower\n" );
+         writer.write( "// set " + dictionaryName + "_lower=LOWER\n" );
+         writer.write( "\n" );
+         writer.write( "//             ===  Encoders Setup  ===\n" );
+         writer.write( "//    Comma delimited array of encoder names.  Note that these names also indicate a Code Schema name.\n" );
+         writer.write( "set encoders=" + schemas + "\n" );
+         writer.write( "\n" );
+         writer.write( "//             ===  Individual Encoder Setup  ===\n" );
+         writer.write( "//    Individual Encoder setup parameters are named {encoderName}_{parameterName}.\n" );
+         writer.write( "//               =  Encoder Type  =\n" );
+         writer.write( "//    Declare the source type the Encoder.  {encoderName}_type\n" );
+         for ( String schema : schemaList ) {
+            writer.write( "set " + schema + "_type=JDBC\n" );
+         }
+         writer.write( "\n" );
+         writer.write( "//               =  JDBC Database  =\n" );
+         writer.write( "//    JDBC Driver for the Encoder.  {encoderName}_driver\n" );
+         writer.write( "//    The default JDBC driver is org.hsqldb.jdbcDriver\n\n" );
+         writer.write( "//    Url for the Database.  {encoderName}_url\n" );
+         for ( String schema : schemaList ) {
+            writer.write( "set " + schema + "_url=jdbc:hsqldb:file:resources/org/apache/ctakes/dictionary/lookup/cased/"
+                          + dictionaryName + "/" + dictionaryName + "\n" );
+         }
+         writer.write( "//    Most of the following settings are left empty to exemplify brevity.\n\n" );
+         writer.write( "//    User for the Database.  {encoderName}_user   Default user is sa\n\n" );
+         writer.write( "//    Password for the Database.  {encoderName}_pass   Default password is empty.\n\n" );
+         writer.write( "//               =  JDBC Encoder Tables  =\n" );
+         writer.write( "//    Encoding Table in the Database.  {encoderName}_table   Default table is the schema name.\n\n" );
+         writer.write( "//    Encoding Table Class Type.  {encoderName}_class\n" );
+         for ( String schema : schemaList ) {
+            if ( schema.equals( "TUI" ) ) {
+               writer.write( "set TUI_class=tui\n" );
+               continue;
+            } else if ( schema.equals( "PREFERRED_TEXT" ) ) {
+               writer.write( "set PREFERRED_TEXT_class=pref_text\n" );
+               continue;
+            }
+            writer.write( "set " + schema + "_class="
+                          + VocabularyStore.getInstance().getCtakesClass( schema ) + "\n" );
+         }
+         writer.write( "\n" );
+      } catch ( IOException ioE ) {
+         LOGGER.error( ioE.getMessage() );
+         return false;
+      }
+      return true;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,315 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.dictionary.cased.term.CuiTerm;
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.HsqlUtil;
+import org.apache.ctakes.gui.dictionary.util.JdbcUtil;
+import org.apache.ctakes.gui.dictionary.util.RareWordUtil;
+import org.apache.log4j.Logger;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+final public class HsqlWriter {
+
+   static private final Logger LOGGER = Logger.getLogger( "HsqlWriter" );
+
+   /**
+    * Creates the HSQL database for a dictionary and fills its tables with the given terms.
+    * <p>
+    * The UPPER, MIXED and LOWER synonym tables, the TUI and PREFERRED_TEXT tables and one code table per
+    * written schema are created, populated and committed, then the database is shut down.
+    * Statements and the connection are released whether or not the write succeeds.
+    *
+    * @param hsqlPath       directory under which the dictionary directory lives; backslashes are converted
+    *                       to forward slashes for the jdbc url.
+    * @param dictionaryName name of the dictionary, used for both its directory and its database files.
+    * @param writtenSchema  names of the vocabularies for which a code table is created and filled.
+    * @param cuiTerms       terms to write.
+    * @return true if the database was written and shut down, false if the tables could not be created
+    * or an SQLException occurred while writing.
+    */
+   static public boolean writeHsql( final String hsqlPath,
+                                    final String dictionaryName,
+                                    final Collection<String> writtenSchema,
+                                    final Collection<CuiTerm> cuiTerms ) {
+      final String url = HsqlUtil.URL_PREFIX + hsqlPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" +
+                         dictionaryName;
+      final Connection connection = JdbcUtil.createDatabaseConnection( url, "SA", "" );
+      if ( !createDatabase( connection, writtenSchema ) ) {
+         closeQuietly( connection );
+         return false;
+      }
+      try {
+         // Get Count of appearance in dictionary per term token
+         final Map<String, Long> upperTokenCounts = getUpperTokenCounts( cuiTerms );
+         final Map<String, Long> mixedTokenCounts = getMixedTokenCounts( cuiTerms );
+         final Map<String, Long> lowerTokenCounts = getLowerTokenCounts( cuiTerms );
+         // Create insert sql statements
+         final String upperSql = JdbcUtil.createRowInsertSql( "UPPER", Synonym.values() );
+         final String mixedSql = JdbcUtil.createRowInsertSql( "MIXED", Synonym.values() );
+         final String lowerSql = JdbcUtil.createRowInsertSql( "LOWER", Synonym.values() );
+         final String tuiSql = JdbcUtil.createCodeInsertSql( "TUI" );
+         final String prefTextSql = JdbcUtil.createCodeInsertSql( "PREFERRED_TEXT" );
+         final Map<String, String> insertCodeSqls = createCodeInsertSqls( writtenSchema );
+
+         try ( PreparedStatement upperStatement = connection.prepareStatement( upperSql );
+               PreparedStatement mixedStatement = connection.prepareStatement( mixedSql );
+               PreparedStatement lowerStatement = connection.prepareStatement( lowerSql );
+               PreparedStatement tuiStatement = connection.prepareStatement( tuiSql );
+               PreparedStatement prefTextStatement = connection.prepareStatement( prefTextSql ) ) {
+            final Map<String, PreparedStatement> codeStatements
+                  = createCodeStatements( connection, insertCodeSqls );
+            try {
+               for ( CuiTerm cuiTerm : cuiTerms ) {
+                  final long cui = cuiTerm.getCuiCode();
+                  // write main term tables
+                  writeSynonyms( upperStatement, cuiTerm, cuiTerm.getUpperOnly(), upperTokenCounts );
+                  writeSynonyms( mixedStatement, cuiTerm, cuiTerm.getMixedOnly(), mixedTokenCounts );
+                  writeSynonyms( lowerStatement, cuiTerm, cuiTerm.getLowerOnly(), lowerTokenCounts );
+                  // write tui table
+                  for ( int tui : cuiTerm.getTuis() ) {
+                     tuiStatement.setLong( 1, cui );
+                     tuiStatement.setInt( 2, tui );
+                     tuiStatement.executeUpdate();
+                  }
+                  // write preferred term table
+                  String preferredText = cuiTerm.getPreferredText();
+                  if ( !preferredText.isEmpty() ) {
+                     prefTextStatement.setLong( 1, cui );
+                     // The PREFERRED_TEXT column is declared as VARCHAR(255), longer text is truncated.
+                     if ( preferredText.length() > 255 ) {
+                        preferredText = preferredText.substring( 0, 255 );
+                     }
+                     prefTextStatement.setString( 2, preferredText );
+                     prefTextStatement.executeUpdate();
+                  }
+                  // write extra vocabulary code tables
+                  final Map<String, Collection<String>> schemaCodeMap = cuiTerm.getSchemaCodes();
+                  for ( Map.Entry<String, Collection<String>> schemaCodes : schemaCodeMap.entrySet() ) {
+                     final String schema = fixVocabName( schemaCodes.getKey() );
+                     final PreparedStatement statement = codeStatements.get( schema );
+                     if ( statement == null ) {
+                        // No table exists for a schema that was not in writtenSchema, so nothing can be inserted.
+                        LOGGER.warn( "No code table for schema " + schema + ", skipping its codes for cui " + cui );
+                        continue;
+                     }
+                     statement.setLong( 1, cui );
+                     for ( String code : schemaCodes.getValue() ) {
+                        setCodeAppropriately( statement, code, VocabularyStore.getInstance()
+                                                                              .getVocabularyClass( schema ) );
+                        statement.executeUpdate();
+                     }
+                  }
+               }
+               connection.commit();
+            } finally {
+               codeStatements.values().forEach( HsqlWriter::closeQuietly );
+            }
+         }
+
+         connection.commit();
+         try ( Statement shutdownStatement = connection.createStatement() ) {
+            shutdownStatement.execute( "SHUTDOWN" );
+         }
+         connection.commit();
+         connection.close();
+         return true;
+      } catch ( SQLException sqlE ) {
+         LOGGER.error( sqlE.getMessage(), sqlE );
+         return false;
+      } finally {
+         // Covers every failure path.  Closing an already closed connection is a no-op.
+         closeQuietly( connection );
+      }
+   }
+
+   /**
+    * Inserts one row per synonym text into a synonym table.
+    * Texts for which RareWordUtil yields NULL_RARE_WORD are skipped.
+    *
+    * @param statement   insert statement for the UPPER, MIXED or LOWER table.
+    * @param cuiTerm     term that owns the texts; supplies the cui, rank and instance count.
+    * @param texts       synonym texts of the term for the table.
+    * @param tokenCounts count of appearances per token for the table, used to pick the index word.
+    * @throws SQLException if a parameter cannot be set or a row cannot be inserted.
+    */
+   static private void writeSynonyms( final PreparedStatement statement,
+                                      final CuiTerm cuiTerm,
+                                      final Iterable<String> texts,
+                                      final Map<String, Long> tokenCounts ) throws SQLException {
+      final long cui = cuiTerm.getCuiCode();
+      for ( String text : texts ) {
+         final RareWordUtil.IndexedRareWord indexedRareWord
+               = RareWordUtil.getIndexedRareWord( text, tokenCounts );
+         if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+            continue;
+         }
+         statement.setLong( Synonym.CUI.getColumn(), cui );
+         statement.setString( Synonym.PREFIX.getColumn(), getPrefix( text, indexedRareWord.__word ) );
+         statement.setString( Synonym.INDEX_WORD.getColumn(), indexedRareWord.__word );
+         statement.setString( Synonym.SUFFIX.getColumn(), getSuffix( text, indexedRareWord.__word ) );
+         statement.setInt( Synonym.RANK.getColumn(), cuiTerm.getRank( text ) );
+         statement.setInt( Synonym.INSTANCES.getColumn(), cuiTerm.getInstances( text ) );
+         statement.executeUpdate();
+      }
+   }
+
+   /**
+    * Closes a resource, logging instead of propagating any failure so that cleanup never masks the original error.
+    *
+    * @param closeable resource to close, may be null.
+    */
+   static private void closeQuietly( final AutoCloseable closeable ) {
+      if ( closeable == null ) {
+         return;
+      }
+      try {
+         closeable.close();
+      } catch ( Exception e ) {
+         LOGGER.warn( "Could not close " + closeable.getClass().getName() + " : " + e.getMessage() );
+      }
+   }
+
+
+   /**
+    * Converts a vocabulary name to the form used for its table and column name:
+    * upper case, with '.' and '-' replaced by '_'.
+    * The conversion is locale independent so that the same name is produced on every machine.
+    *
+    * @param vocabulary name of a vocabulary.
+    * @return the upper case name with dots and dashes replaced by underscores.
+    */
+   static private String fixVocabName( final String vocabulary ) {
+      return vocabulary.toUpperCase( java.util.Locale.ROOT ).replace( '.', '_' ).replace( '-', '_' );
+   }
+
+   static private Map<String, String> createCodeInsertSqls( final Collection<String> writtenSchema ) {
+      return writtenSchema.stream().map( HsqlWriter::fixVocabName )
+                          .collect( Collectors.toMap( Function.identity(), HsqlWriter::createCodeInsertSql ) );
+   }
+
+   /**
+    * Builds the insert sql for a code table, whose columns are "CUI" and one named like the table itself.
+    *
+    * @param vocabulary name used for both the table and its code column.
+    * @return sql created by JdbcUtil for inserting a row in the table.
+    */
+   static public String createCodeInsertSql( final String vocabulary ) {
+      final String insertSql = JdbcUtil.createRowInsertSql( vocabulary, "CUI", vocabulary );
+      return insertSql;
+   }
+
+   /**
+    * Prepares one insert statement per code table.
+    *
+    * @param connection     connection on which the statements are prepared.
+    * @param insertCodeSqls map of table name to insert sql.
+    * @return map of table name to the prepared insert statement for that table.
+    * @throws SQLException if any statement cannot be prepared.
+    */
+   static private Map<String, PreparedStatement> createCodeStatements( final Connection connection,
+                                                                       final Map<String, String> insertCodeSqls )
+         throws SQLException {
+      final Map<String, PreparedStatement> statements = new HashMap<>( insertCodeSqls.size() );
+      for ( String tableName : insertCodeSqls.keySet() ) {
+         final String insertSql = insertCodeSqls.get( tableName );
+         statements.put( tableName, connection.prepareStatement( insertSql ) );
+      }
+      return statements;
+   }
+
+   /**
+    * Sets the code (second) parameter of an insert statement using the setter that fits the code type.
+    * Codes of a type other than String, Double, Long or Integer, or of unknown (null) type,
+    * are logged as an error and set as a String.
+    *
+    * @param statement insert statement for a code table.
+    * @param code      text form of the code.
+    * @param type      class used to store codes of the vocabulary, may be null.
+    * @throws SQLException          if the parameter cannot be set.
+    * @throws NumberFormatException if the type is numeric and the code cannot be parsed as that type.
+    */
+   static private void setCodeAppropriately( final PreparedStatement statement, final String code,
+                                             final Class<?> type ) throws SQLException {
+      if ( String.class.equals( type ) ) {
+         statement.setString( 2, code );
+      } else if ( Double.class.equals( type ) ) {
+         statement.setDouble( 2, Double.parseDouble( code ) );
+      } else if ( Long.class.equals( type ) ) {
+         statement.setLong( 2, Long.parseLong( code ) );
+      } else if ( Integer.class.equals( type ) ) {
+         statement.setInt( 2, Integer.parseInt( code ) );
+      } else {
+         // A null type also lands here, so it must not be dereferenced for the message.
+         LOGGER.error( "Could not set code for " + ( type == null ? "null" : type.getName() ) );
+         statement.setString( 2, code );
+      }
+   }
+
+
+   /**
+    * Creates every table and index of the dictionary database, then sets the write delay.
+    *
+    * @param connection    connection to the database.
+    * @param writtenSchema names of the vocabularies for which a code table is created.
+    * @return true if everything was created, false if an SQLException occurred.
+    */
+   static private boolean createDatabase( final Connection connection, final Collection<String> writtenSchema ) {
+      try {
+         // synonym tables, one per text case
+         for ( String synonymTable : new String[]{ "UPPER", "MIXED", "LOWER" } ) {
+            createSynonymTable( connection, synonymTable );
+         }
+         // semantic type table
+         createTable( connection, "TUI", "CUI BIGINT", "TUI INTEGER" );
+         createIndex( connection, "TUI", "CUI" );
+         // preferred text table
+         createTable( connection, "PREFERRED_TEXT", "CUI BIGINT", "PREFERRED_TEXT VARCHAR(255)" );
+         createIndex( connection, "PREFERRED_TEXT", "CUI" );
+
+         // one code table per vocabulary, with a code column named like the table
+         for ( String schemaName : writtenSchema ) {
+            final String jdbcType = VocabularyStore.getInstance().getJdbcClass( schemaName );
+            final String codeTable = fixVocabName( schemaName );
+            final String codeColumn = codeTable + " " + jdbcType;
+            createTable( connection, codeTable, "CUI BIGINT", codeColumn );
+            createIndex( connection, codeTable, "CUI" );
+         }
+
+         executeStatement( connection, "SET WRITE_DELAY 10" );
+         return true;
+      } catch ( SQLException sqlE ) {
+         LOGGER.error( sqlE.getMessage() );
+         return false;
+      }
+   }
+
+   static private void createSynonymTable( final Connection connection, final String tableName ) throws SQLException {
+      createTable( connection, tableName,
+            "CUI BIGINT",
+            "PREFIX VARCHAR(78)",
+            "INDEX_WORD VARCHAR(48)",
+            "SUFFIX VARCHAR(78)",
+            "RANK INTEGER",
+            "INSTANCES INTEGER" );
+      createIndex( connection, tableName, "INDEX_WORD" );
+   }
+
+   static private void createTable( final Connection connection, final String tableName, final String... fieldNames )
+         throws SQLException {
+      final String fields = String.join( ",", fieldNames );
+      final String creator = "CREATE MEMORY TABLE " + tableName + "(" + fields + ")";
+      executeStatement( connection, creator );
+   }
+
+   /**
+    * Creates an index named IDX_{tableName} on a single column of a table.
+    *
+    * @param connection connection to the database.
+    * @param tableName  name of the table.
+    * @param indexField name of the column to index.
+    * @throws SQLException if the index cannot be created.
+    */
+   static private void createIndex( final Connection connection, final String tableName,
+                                    final String indexField ) throws SQLException {
+      final StringBuilder indexSql = new StringBuilder( "CREATE INDEX IDX_" );
+      indexSql.append( tableName )
+              .append( " ON " )
+              .append( tableName )
+              .append( "(" )
+              .append( indexField )
+              .append( ")" );
+      executeStatement( connection, indexSql.toString() );
+   }
+
+   /**
+    * Executes a single sql command.  The statement is closed even when the command fails.
+    *
+    * @param connection connection to the database.
+    * @param command    sql to execute.
+    * @throws SQLException if the statement cannot be created or the command fails.
+    */
+   static private void executeStatement( final Connection connection, final String command ) throws SQLException {
+      try ( Statement statement = connection.createStatement() ) {
+         statement.execute( command );
+      }
+   }
+
+
+   static private final Pattern SPACE_PATTERN = Pattern.compile( "\\s+" );
+
+   /**
+    * @param cuiTerms terms of the dictionary.
+    * @return count of appearances of each rarable token in the upper case only synonyms of all terms.
+    */
+   static private Map<String, Long> getUpperTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+      return countTokens( cuiTerms, CuiTerm::getUpperOnly );
+   }
+
+   /**
+    * @param cuiTerms terms of the dictionary.
+    * @return count of appearances of each rarable token in the mixed case only synonyms of all terms.
+    */
+   static private Map<String, Long> getMixedTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+      return countTokens( cuiTerms, CuiTerm::getMixedOnly );
+   }
+
+   /**
+    * @param cuiTerms terms of the dictionary.
+    * @return count of appearances of each rarable token in the lower case only synonyms of all terms.
+    */
+   static private Map<String, Long> getLowerTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+      return countTokens( cuiTerms, CuiTerm::getLowerOnly );
+   }
+
+   /**
+    * Splits the selected synonyms of every term on whitespace and counts each token accepted by
+    * {@code RareWordUtil.isRarableToken}.
+    *
+    * @param cuiTerms terms of the dictionary.
+    * @param synonyms selects the synonym texts of a term that should be counted.
+    * @return map of token to the number of times it appears in the selected synonyms.
+    */
+   static private Map<String, Long> countTokens( final Collection<CuiTerm> cuiTerms,
+                                                 final Function<CuiTerm, Collection<String>> synonyms ) {
+      return cuiTerms.stream()
+                     .map( synonyms )
+                     .flatMap( Collection::stream )
+                     .map( SPACE_PATTERN::split )
+                     .flatMap( Arrays::stream )
+                     .filter( RareWordUtil::isRarableToken )
+                     .collect( Collectors.groupingBy( Function.identity(), Collectors.counting() ) );
+   }
+
+
+   static private String getPrefix( final String text, final String indexedRareWord ) {
+      if ( text.equals( indexedRareWord ) ) {
+         return "";
+      }
+      if ( text.startsWith( indexedRareWord ) ) {
+         return "";
+      }
+      return text.substring( 0, text.indexOf( indexedRareWord ) ).trim();
+   }
+
+   static private String getSuffix( final String text, final String indexedRareWord ) {
+      if ( text.equals( indexedRareWord ) ) {
+         return "";
+      }
+      if ( text.endsWith( indexedRareWord ) ) {
+         return "";
+      }
+      return text.substring( text.indexOf( indexedRareWord ) + indexedRareWord.length() ).trim();
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,52 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public enum Ranks {
+   INSTANCE;
+
+   static public Ranks getInstance() {
+      return INSTANCE;
+   }
+
+   // Rank value per rank code, as given to setUmlsRank.
+   private final Map<String, Integer> _ranks = new HashMap<>();
+   // Rank codes ordered by ascending rank value.  Built on demand, null whenever _ranks has changed since.
+   private List<String> _rankList;
+
+
+   /**
+    * Stores the rank value of a vocabulary and term type.
+    * Any previously computed ordering is discarded so that later lookups take the new value into account.
+    *
+    * @param vocabulary name of the vocabulary.
+    * @param tty        term type in the vocabulary.
+    * @param rank       rank value of the term type in the vocabulary.
+    */
+   public void setUmlsRank( final String vocabulary, final String tty, final int rank ) {
+      _ranks.put( getRankCode( vocabulary, tty ), rank );
+      _rankList = null;
+   }
+
+   /**
+    * @param vocabulary name of the vocabulary.
+    * @param tty        term type in the vocabulary.
+    * @return position of the vocabulary and term type among all stored ranks, see {@link #getCodeRank(String)}.
+    */
+   public int getRank( final String vocabulary, final String tty ) {
+      return getCodeRank( getRankCode( vocabulary, tty ) );
+   }
+
+   /**
+    * Gets the position of a rank code among all stored ranks.
+    * The code with the greatest stored rank value has position 1, the code with the smallest value has a
+    * position equal to the number of stored codes, and an unknown code gets that number plus one.
+    *
+    * @param rankCode code created by {@link #getRankCode(String, String)}.
+    * @return position of the code, from 1 up to the number of stored codes plus one.
+    */
+   public int getCodeRank( final String rankCode ) {
+      if ( _rankList == null ) {
+         _rankList = _ranks.entrySet()
+                           .stream()
+                           .sorted( Comparator.comparingInt( Map.Entry::getValue ) )
+                           .map( Map.Entry::getKey )
+                           .collect( Collectors.toList() );
+      }
+      return _rankList.size() - _rankList.indexOf( rankCode );
+   }
+
+   /**
+    * @param vocabulary name of the vocabulary.
+    * @param tty        term type in the vocabulary.
+    * @return the vocabulary and term type joined by an underscore.
+    */
+   static public String getRankCode( final String vocabulary, final String tty ) {
+      return vocabulary + "_" + tty;
+   }
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,32 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+public enum Synonym {
+   CUI( 1, Long.class ),
+   PREFIX( 2, String.class ),
+   INDEX_WORD( 3, String.class ),
+   SUFFIX( 4, String.class ),
+   RANK( 5, Integer.class ),
+   INSTANCES( 6, Integer.class );
+
+   // Column number of the field, starting at 1.
+   final private int _columnNumber;
+   // Java class of the values held by the field.
+   final private Class<?> _valueClass;
+
+   /**
+    * @param columnNumber column number of the field.
+    * @param valueClass   java class of the values held by the field.
+    */
+   Synonym( final int columnNumber, final Class<?> valueClass ) {
+      this._columnNumber = columnNumber;
+      this._valueClass = valueClass;
+   }
+
+   /**
+    * @return column number of the field.
+    */
+   public int getColumn() {
+      return this._columnNumber;
+   }
+
+   /**
+    * @return java class of the values held by the field.
+    */
+   public Class<?> getClassType() {
+      return this._valueClass;
+   }
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,143 @@
+package org.apache.ctakes.gui.dictionary.cased.table;
+
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+
+import javax.swing.event.EventListenerList;
+import javax.swing.event.TableModelListener;
+import javax.swing.table.TableModel;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+
+import static org.apache.ctakes.core.util.annotation.SemanticTui.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/27/2020
+ */
+public class SemanticTuiModel implements TableModel {
+
+   static private final String[] COLUMN_NAMES = { "Use TUI", "TUI", "Semantic Type", "Semantic Group" };
+   static private final Class<?>[] COLUMN_CLASSES = { Boolean.class, String.class, String.class, String.class };
+
+   static private final Collection<SemanticTui> UNWANTED_TUIS
+         = EnumSet.of( T116, T087, T123, T118, T026, T043, T025, T103, T120, T104, T077, T049, T088, T065, T196,
+         T050, T018, T126, T168, T045, T028, T125, T078, T129, T055, T197, T170, T130, T119, T063,
+         T066, T041, T073, T044, T085, T114, T124, T086, T115, T109, T040, T042, T046, T039,
+         T192, T062, T075, T054, T056, T169, T185, T058, T033, UNKNOWN );
+
+   private final EventListenerList _listenerList = new EventListenerList();
+   private final Collection<SemanticTui> _wantedTuis = EnumSet.noneOf( SemanticTui.class );
+
+   public SemanticTuiModel() {
+      final Collection<SemanticGroup> wantedGroups
+            = EnumSet.of( SemanticGroup.ANATOMY,
+            SemanticGroup.DISORDER,
+            SemanticGroup.FINDING,
+            SemanticGroup.DEVICE,
+            SemanticGroup.PROCEDURE,
+            SemanticGroup.DRUG );
+      Arrays.stream( SemanticTui.values() )
+            .filter( t -> !UNWANTED_TUIS.contains( t ) )
+            .filter( t -> wantedGroups.contains( t.getGroup() ) )
+            .forEach( _wantedTuis::add );
+   }
+
+   public Collection<SemanticTui> getWantedTuis() {
+      return _wantedTuis;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getRowCount() {
+      return SemanticTui.values().length;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getColumnCount() {
+      return 4;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getColumnName( final int columnIndex ) {
+      return COLUMN_NAMES[ columnIndex ];
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Class<?> getColumnClass( final int columnIndex ) {
+      return COLUMN_CLASSES[ columnIndex ];
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean isCellEditable( final int rowIndex, final int columnIndex ) {
+      return columnIndex == 0;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Object getValueAt( final int rowIndex, final int columnIndex ) {
+      final SemanticTui tui = SemanticTui.values()[ rowIndex ];
+      switch ( columnIndex ) {
+         case 0:
+            return _wantedTuis.contains( tui );
+         case 1:
+            return tui.name();
+         case 2:
+            return tui.getSemanticType();
+         case 3:
+            return tui.getGroupName();
+      }
+      return "ERROR";
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void setValueAt( final Object aValue, final int rowIndex, final int columnIndex ) {
+      if ( aValue instanceof Boolean && columnIndex == 0 ) {
+         final SemanticTui tui = SemanticTui.values()[ rowIndex ];
+         if ( (Boolean)aValue ) {
+            _wantedTuis.add( tui );
+         } else {
+            _wantedTuis.remove( tui );
+         }
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void addTableModelListener( final TableModelListener listener ) {
+      _listenerList.add( TableModelListener.class, listener );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void removeTableModelListener( final TableModelListener listener ) {
+      _listenerList.remove( TableModelListener.class, listener );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,124 @@
+package org.apache.ctakes.gui.dictionary.cased.table;
+
+
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+
+import javax.swing.event.EventListenerList;
+import javax.swing.event.TableModelListener;
+import javax.swing.table.TableModel;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/27/2020
+ */
+public class TextTypeModel implements TableModel {
+
+
+   static private final String[] COLUMN_NAMES = { "Use Type", "TTY", "Text Type" };
+   static private final Class<?>[] COLUMN_CLASSES = { Boolean.class, String.class, String.class };
+
+   private final EventListenerList _listenerList = new EventListenerList();
+   private final Collection<Tty> _wantedTypes = EnumSet.noneOf( Tty.class );
+
+   public TextTypeModel() {
+      Arrays.stream( Tty.values() ).filter( Tty::collect ).forEach( _wantedTypes::add );
+   }
+
+
+   public Collection<Tty> getWantedTypes() {
+      return _wantedTypes;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getRowCount() {
+      return Tty.values().length;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getColumnCount() {
+      return 3;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getColumnName( final int columnIndex ) {
+      return COLUMN_NAMES[ columnIndex ];
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Class<?> getColumnClass( final int columnIndex ) {
+      return COLUMN_CLASSES[ columnIndex ];
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean isCellEditable( final int rowIndex, final int columnIndex ) {
+      return columnIndex == 0;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Object getValueAt( final int rowIndex, final int columnIndex ) {
+      final Tty type = Tty.values()[ rowIndex ];
+      switch ( columnIndex ) {
+         case 0:
+            return _wantedTypes.contains( type );
+         case 1:
+            return type.name();
+         case 2:
+            return type.getDescription();
+      }
+      return "ERROR";
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void setValueAt( final Object aValue, final int rowIndex, final int columnIndex ) {
+      if ( aValue instanceof Boolean && columnIndex == 0 ) {
+         final Tty type = Tty.values()[ rowIndex ];
+         if ( (Boolean)aValue ) {
+            _wantedTypes.add( type );
+         } else {
+            _wantedTypes.remove( type );
+         }
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void addTableModelListener( final TableModelListener listener ) {
+      _listenerList.add( TableModelListener.class, listener );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void removeTableModelListener( final TableModelListener listener ) {
+      _listenerList.remove( TableModelListener.class, listener );
+   }
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,159 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.gui.dictionary.cased.umls.abbreviation.Lat;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.MrConso;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+
+import static org.apache.ctakes.gui.dictionary.cased.umls.file.MrConso.*;
+
+/**
+ * One line of pipe-delimited text whose columns are addressed with the {@link MrConso} constants.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/21/2019
+ */
+final public class ConsoLine implements TermLine {
+   final private String[] _columns;
+
+   /**
+    * @param line text of a single line, its columns separated by the '|' character
+    */
+   public ConsoLine( final String line ) {
+      _columns = StringUtil.fastSplit( line, '|' );
+   }
+
+   /**
+    * @return true if the line is in the wanted language and its text has only printable ascii characters
+    */
+   private boolean isTextOk() {
+      return isWantedLat() && hasNoSpecialChars();
+   }
+
+   /**
+    * @return true if the text is acceptable and {@code Tty.collect} accepts the value of the TTY column
+    */
+   public boolean collect() {
+      return isTextOk() && Tty.collect( column( TTY ) );
+   }
+
+   /**
+    * Tests the tokenized text for the look of a drug term that is not wanted.
+    *
+    * @return true if the tokenized text contains " in " and ends with " dosage form", or ends with
+    * " oral tablet", " oral capsule", "ml vial" or "ml injection", or contains a space while
+    * {@code Tty.keep} rejects the value of the TTY column
+    */
+   public boolean isUnwantedDrug() {
+      final String text = getTokenizedText();
+      if ( text.contains( " in " ) && text.endsWith( " dosage form" ) ) {
+         return true;
+      }
+      if ( text.endsWith( " oral tablet" ) || text.endsWith( " oral capsule" )
+           || text.endsWith( "ml vial" ) || text.endsWith( "ml injection" ) ) {
+         return true;
+      }
+      return text.contains( " " ) && !Tty.keep( column( TTY ) );
+   }
+
+   /**
+    * @return true if the TTY column resolves to {@code Tty.OAP} or {@code Tty.OF}
+    */
+   public boolean isObsolete() {
+      final Tty tty = Tty.getType( column( TTY ) );
+      return tty == Tty.OAP || tty == Tty.OF;
+   }
+
+   /**
+    * @return value of the CUI column, exactly as it appears in the line
+    */
+   @Override
+   public String getCui() {
+      return column( CUI );
+   }
+
+   /**
+    * @return value of the STR column
+    */
+   @Override
+   public String getText() {
+      return column( STR );
+   }
+
+   /**
+    * Scores how suitable the text is as a preferred text.  The score starts at 1 and is multiplied for
+    * favored values of the TS, STT and TTY columns, then reduced according to the form of the text.
+    *
+    * @return 1 for text with fewer than 3 characters or fewer than 3 alphabetic characters, otherwise the
+    * weighted score after any reduction
+    */
+   @Override
+   public int getPrefScore() {
+      final String text = getText();
+      if ( text.length() < 3 ) {
+         return 1;
+      }
+      if ( text.chars().filter( Character::isAlphabetic ).count() < 3 ) {
+         return 1;
+      }
+
+      int score = 1;
+      score = upScore( TS, "P", score, 2 );
+
+      score = upScore( STT, "PF", score, 2 );
+      score = upScore( STT, "VC", score, 2 );
+      score = upScore( STT, "VO", score, 2 );
+
+      score = upScore( TTY, "PT", score, 3 );
+      score = upScore( TTY, "PN", score, 3 );
+      score = upScore( TTY, "RXN_PT", score, 2 );
+      score = upScore( TTY, "DN", score, 2 );
+
+      if ( text.startsWith( "Entire " )
+           || text.startsWith( "Structure of " )
+           || text.endsWith( " structure" )
+           || text.endsWith( ")" )
+           || text.endsWith( "]" )
+           || text.endsWith( " NOS" )
+           || text.contains( " or " )
+           || !Character.isLetter( text.charAt( 0 ) ) ) {
+         // Integer division: a score below 3 becomes 0 here.
+         return score / 3;
+      }
+      if ( text.equals( text.toUpperCase() ) ) {
+         // Text without any lowercase letter.  Should also work for numbers.
+         return score / 2;
+      }
+      // Prefer fewer-word terms, but only slightly
+      final long spaces = text.chars().filter( Character::isWhitespace ).count();
+      return (int)Math.max( 1, score - spaces );
+   }
+
+   /**
+    * @return value of the SAB column
+    */
+   @Override
+   public String getSource() {
+      return column( SAB );
+   }
+
+   /**
+    * @return value of the CODE column
+    */
+   @Override
+   public String getCode() {
+      return column( CODE );
+   }
+
+   /**
+    * @param conso column of interest; its ordinal is used as the index into the split line
+    * @return text of that column.  A line with too few columns makes the array access fail.
+    */
+   private String column( final MrConso conso ) {
+      return _columns[ conso.ordinal() ];
+   }
+
+   /**
+    * @param scoredColumn column to inspect
+    * @param wanted       value that earns the weight
+    * @param score        score so far
+    * @param weight       multiplier applied when the column holds the wanted value
+    * @return weight * score if the column equals the wanted value, otherwise the score unchanged
+    */
+   private int upScore( final MrConso scoredColumn, final String wanted, final int score, final int weight ) {
+      if ( column( scoredColumn ).equals( wanted ) ) {
+         return weight * score;
+      }
+      return score;
+   }
+
+   /**
+    * @return true if the LAT column holds the name of {@code Lat.ENG}
+    */
+   public boolean isWantedLat() {
+      return column( LAT ).equals( Lat.ENG.name() );
+   }
+
+   /**
+    * @return true if every character of the text is a printable ascii character, space through tilde
+    */
+   private boolean hasNoSpecialChars() {
+      // Gives the same answer as removing all non-ascii characters, all ascii control characters and all
+      // characters of the unicode "other" category and then comparing with the untouched text,
+      // but in a single pass and without compiling regular expressions for each line.
+      return getText().chars().allMatch( c -> c >= 0x20 && c <= 0x7E );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String toString() {
+      return String.join( " | ", _columns );
+   }
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,342 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.gui.dictionary.cased.Ranks;
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.TextTokenizer;
+
+import java.util.*;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+
+/**
+ * Gathers what is known about a single cui: its semantic types, its codes in source vocabularies,
+ * the texts offered as preferred text, and its tokenized synonyms together with the rank codes
+ * under which each synonym was supplied.
+ * <p>
+ * NOTE(review): the class is annotated {@code @Immutable} although addTui, addSchemaCode and addSynonym
+ * modify it, and the annotation is taken from a jdk-internal nashorn package - confirm whether it should stay.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+@Immutable
+final public class CuiTerm {
+
+   static private final int MIN_SYNONYM_LENGTH = 3;
+   static private final int MAX_SYNONYM_LENGTH = 79;
+   static private final int MAX_SYNONYM_TOKENS = 5;
+
+   private final long _cuiCode;
+
+   private final Collection<SemanticTui> _semanticTuis = EnumSet.noneOf( SemanticTui.class );
+
+   // Key is a tokenized synonym, value holds the rank codes obtained for the sab and tty that supplied it.
+   private final Map<String, Collection<String>> _tokenizedVocabTuis = new HashMap<>();
+   // Every text handed to addSynonym, whether or not it became a dictionary synonym.
+   private final Collection<ScoredText> _textScores = new HashSet<>();
+   // Key is a sab, value holds the codes added for that sab.
+   private final Map<String, Collection<String>> _schemaCodes = new HashMap<>();
+
+
+   /**
+    * @param cuiCode cui for the term
+    */
+   public CuiTerm( final long cuiCode ) {
+      _cuiCode = cuiCode;
+   }
+
+   /**
+    * @param semanticTui semantic type to associate with the term
+    */
+   public void addTui( final SemanticTui semanticTui ) {
+      _semanticTuis.add( semanticTui );
+   }
+
+   /**
+    * Stores a code for a source vocabulary.  The first time a code is seen for the sab it is also
+    * handed to the {@link VocabularyStore}.
+    *
+    * @param sab  source vocabulary
+    * @param code code of the term in that vocabulary
+    */
+   public void addSchemaCode( final String sab, final String code ) {
+      if ( _schemaCodes.computeIfAbsent( sab, c -> new HashSet<>() ).add( code ) ) {
+         VocabularyStore.getInstance().addVocabulary( sab, code );
+      }
+   }
+
+   /**
+    * @return the internal map of sab to codes; it is not a copy
+    */
+   public Map<String, Collection<String>> getSchemaCodes() {
+      return _schemaCodes;
+   }
+
+
+   /**
+    * Records the text as a candidate for preferred text and, if its tokenized form is fit for a
+    * dictionary, stores that form as a synonym under the rank code of the sab and tty.
+    *
+    * @param text synonym text
+    * @param sab  source vocabulary of the text
+    * @param tuis semantic types, used to decide whether capitalization is kept
+    * @param ts   value scored for preferred text; "P" scores highest
+    * @param stt  value scored for preferred text; "PF", "VC" and "VO" score highest
+    * @param tty  value scored for preferred text and used for the rank code
+    */
+   public void addSynonym( final String text,
+                           final String sab,
+                           final Collection<SemanticTui> tuis,
+                           final String ts,
+                           final String stt,
+                           final String tty ) {
+      _textScores.add( new ScoredText( text, ts, stt, tty ) );
+      final String tokenized = TextTokenizer.getTokenizedText( text );
+      final String stripped = stripForm( tokenized );
+      if ( !isDictionaryable( stripped ) ) {
+         return;
+      }
+      final String rankCode = Ranks.getRankCode( sab, tty );
+      _tokenizedVocabTuis.computeIfAbsent( maybeUncap( stripped, tuis ), s -> new HashSet<>() ).add( rankCode );
+   }
+
+
+   /**
+    * @param tokenized tokenized text
+    * @return an empty text if the tokenized text contains "_ _ _", otherwise the tokenized text
+    */
+   static private String stripForm( final String tokenized ) {
+      return tokenized.contains( "_ _ _" ) ? "" : tokenized;
+   }
+
+   /**
+    * @param tokenized tokenized text
+    * @return false if the text is shorter or longer than allowed, has too many space-separated tokens,
+    * starts with "[", ends with ")" or "]", or contains any of # @ &amp; ; "
+    */
+   static private boolean isDictionaryable( final String tokenized ) {
+      final boolean absolutelyNot = tokenized.length() < MIN_SYNONYM_LENGTH
+                                    || tokenized.length() > MAX_SYNONYM_LENGTH
+                                    || (StringUtil.fastSplit( tokenized, ' ' ).length > MAX_SYNONYM_TOKENS);
+      if ( absolutelyNot ) {
+         return false;
+      }
+      final boolean hasGarbage = tokenized.startsWith( "[" )
+                                 || tokenized.contains( "#" )
+                                 || tokenized.contains( "@" )
+                                 || tokenized.contains( "&" )
+                                 || tokenized.contains( ";" )
+                                 || tokenized.contains( "\"" )
+                                 || tokenized.endsWith( ")" )
+                                 || tokenized.endsWith( "]" );
+      return !hasGarbage;
+   }
+
+   /**
+    * @return umls cui for the term
+    */
+   public long getCuiCode() {
+      return _cuiCode;
+   }
+
+   /**
+    * @return codes of the semantic types added to the term
+    */
+   public Collection<Integer> getTuis() {
+      return _semanticTuis.stream()
+                          .map( SemanticTui::getCode )
+                          .collect( Collectors.toSet() );
+   }
+
+   private Collection<String> getTokenizedSynonyms() {
+      return _tokenizedVocabTuis.keySet();
+   }
+
+   /**
+    * @param text any text
+    * @return the text in lowercase, folded with the root locale so the result is the same on every machine
+    */
+   static private String toLower( final String text ) {
+      return text.toLowerCase( Locale.ROOT );
+   }
+
+
+   static private final Collection<String> UNITS = new HashSet<>( Arrays.asList(
+         "MG", "MG/MG", "ML", "mL", "MG/ML", "mg/mL", "ML/ML", "GM", "MCG", "MCG/ML", "mcg/mL", "BAU/ML",
+         "MEQ", "MEQ/ML", "UNT", "UNT/MG", "UNT/ML", "unt/mL", "UNT/GM", "MG/ACTUAT", "MG/HR" ) );
+
+   /**
+    * @param text a single word
+    * @return the word in lowercase if it is one of the known units, otherwise the word unchanged
+    */
+   static private String uncapUnits( final String text ) {
+      return UNITS.contains( text ) ? toLower( text ) : text;
+   }
+
+   /**
+    * @param text a single word
+    * @return the word in lowercase if it is one or more digits directly followed by one of the known units,
+    * for instance "5MG", otherwise the word unchanged
+    */
+   static private String uncapNumUnits( final String text ) {
+      int digitCount = 0;
+      while ( digitCount < text.length() && Character.isDigit( text.charAt( digitCount ) ) ) {
+         digitCount++;
+      }
+      // There must be at least one leading digit and at least one character after the digits.
+      if ( digitCount == 0 || digitCount == text.length() ) {
+         return text;
+      }
+      final String remainder = text.substring( digitCount );
+      return UNITS.contains( remainder ) ? toLower( text ) : text;
+   }
+
+   static private final Collection<String> OTHERS = new HashSet<>( Arrays.asList( "NOS", "USP", "(USP)" ) );
+
+   /**
+    * @param text a single word
+    * @return the word in lowercase if it is "NOS", "USP" or "(USP)", otherwise the word unchanged
+    */
+   static private String uncapOther( final String text ) {
+      return OTHERS.contains( text ) ? toLower( text ) : text;
+   }
+
+   /**
+    * @param text a single word, possibly empty
+    * @return the word with its first character in lowercase
+    */
+   static private String uncapitalize( final String text ) {
+      if ( text.isEmpty() ) {
+         // Nothing to lowercase; taking the first character of an empty word would fail.
+         return text;
+      }
+      return toLower( text.substring( 0, 1 ) ) + text.substring( 1 );
+   }
+
+   /**
+    * Decides whether the capitalization of a synonym carries meaning.
+    *
+    * @param tokenized tokenized synonym
+    * @param tuis      semantic types of the term
+    * @return the synonym in lowercase if its only uppercase characters are in units, in NOS / USP, or -
+    * for synonyms of several words and for terms without a semantic type of the drug group - are the
+    * first characters of words.  Otherwise the synonym unchanged.
+    */
+   static private String maybeUncap( final String tokenized, final Collection<SemanticTui> tuis ) {
+      final String lowered = toLower( tokenized );
+      final String[] words = StringUtil.fastSplit( tokenized, ' ' );
+      final String uncapped = Arrays.stream( words )
+                                    .map( CuiTerm::uncapOther )
+                                    .map( CuiTerm::uncapUnits )
+                                    .map( CuiTerm::uncapNumUnits )
+                                    .collect( Collectors.joining( " " ) );
+      if ( uncapped.equals( lowered ) ) {
+         return lowered;
+      }
+      final String[] words2 = StringUtil.fastSplit( uncapped, ' ' );
+      // A single capitalized word is kept as it is for terms with a semantic type of the drug group.
+      final boolean removeSingleCap = tuis.stream()
+                                          .map( SemanticTui::getGroup )
+                                          .noneMatch( SemanticGroup.DRUG::equals );
+      if ( words2.length > 1 || removeSingleCap ) {
+         final String uncapped2 = Arrays.stream( words2 )
+                                        .map( CuiTerm::uncapitalize )
+                                        .collect( Collectors.joining( " " ) );
+         if ( uncapped2.equals( lowered ) ) {
+            return lowered;
+         }
+      }
+      return tokenized;
+   }
+
+
+   /**
+    * @return synonyms without any lowercase character whose lowercase form is neither a lowercase synonym
+    * nor the lowercase form of a mixed case synonym
+    */
+   public Collection<String> getUpperOnly() {
+      final Collection<String> lowerOnly = getLowerOnly();
+      final Collection<String> lowerMixed = getMixedOnly().stream()
+                                                          .map( CuiTerm::toLower )
+                                                          .collect( Collectors.toSet() );
+      return getTokenizedSynonyms()
+            .stream()
+            .filter( t -> t.chars().noneMatch( Character::isLowerCase ) )
+            .filter( t -> !lowerOnly.contains( toLower( t ) ) )
+            .filter( t -> !lowerMixed.contains( toLower( t ) ) )
+            .collect( Collectors.toSet() );
+   }
+
+   /**
+    * @return synonyms with both uppercase and lowercase characters whose lowercase form is not itself
+    * a lowercase synonym
+    */
+   public Collection<String> getMixedOnly() {
+      final Collection<String> lowerOnly = getLowerOnly();
+      return getTokenizedSynonyms()
+            .stream()
+            .filter( t -> t.chars().anyMatch( Character::isUpperCase ) )
+            .filter( t -> t.chars().anyMatch( Character::isLowerCase ) )
+            .filter( t -> !lowerOnly.contains( toLower( t ) ) )
+            .collect( Collectors.toSet() );
+   }
+
+   /**
+    * @return synonyms without any uppercase character
+    */
+   public Collection<String> getLowerOnly() {
+      return getTokenizedSynonyms()
+            .stream()
+            .filter( t -> t.chars().noneMatch( Character::isUpperCase ) )
+            .collect( Collectors.toSet() );
+   }
+
+
+   /**
+    * @return the best scoring text handed to addSynonym, or an empty text if there was none
+    */
+   public String getPreferredText() {
+      return _textScores.stream()
+                        .max( prefScorer )
+                        .map( ScoredText::getText )
+                        .orElse( "" );
+   }
+
+   /**
+    * @param text tokenized synonym, as returned by one of the get...Only methods
+    * @return number of distinct rank codes stored for the synonym, 0 for an unknown synonym
+    */
+   public int getInstances( final String text ) {
+      return _tokenizedVocabTuis.getOrDefault( text, Collections.emptyList() ).size();
+   }
+
+   /**
+    * @param text tokenized synonym, as returned by one of the get...Only methods
+    * @return the smallest rank among the rank codes stored for the synonym, 0 for an unknown synonym
+    */
+   public int getRank( final String text ) {
+      return _tokenizedVocabTuis.getOrDefault( text, Collections.emptyList() )
+                                .stream()
+                                .mapToInt( Ranks.getInstance()::getCodeRank )
+                                .min()
+                                .orElse( 0 );
+   }
+
+
+   /**
+    * A text together with the scores used to choose the preferred text of the term.
+    */
+   static private final class ScoredText {
+      private final String _text;
+      private final int _tsScore;
+      private final int _sttScore;
+      private final int _ttyScore;
+      private final int _lengthScore;
+      private final int _wordCountScore;
+      private final int _uppercaseScore;
+      static private final Collection<String> GOOD_STT = Arrays.asList( "PF", "VC", "VO" );
+      static private final Collection<String> GREAT_TTY = Arrays.asList( "PT", "PN" );
+      static private final Collection<String> GOOD_TTY = Arrays.asList( "RXN_PT", "DN" );
+
+      private ScoredText( final String text,
+                          final String ts,
+                          final String stt,
+                          final String tty ) {
+         _text = text;
+         _tsScore = ts.equals( "P" ) ? 2 : 1;
+         _sttScore = GOOD_STT.contains( stt ) ? 2 : 1;
+         _ttyScore = GREAT_TTY.contains( tty ) ? 3 : (GOOD_TTY.contains( tty ) ? 2 : 1);
+         _lengthScore = text.length();
+         // Prefer fewer-word terms - this should be last in a comparison
+         _wordCountScore = 10 - StringUtil.fastSplit( text, ' ' ).length;
+         // An empty text has no first character to inspect and simply scores 0.
+         _uppercaseScore = !text.isEmpty() && Character.isUpperCase( text.charAt( 0 ) ) ? 1 : 0;
+      }
+
+      public String getText() {
+         return _text;
+      }
+
+      public int getTsScore() {
+         return _tsScore;
+      }
+
+      public int getSttScore() {
+         return _sttScore;
+      }
+
+      public int getTtyScore() {
+         return _ttyScore;
+      }
+
+      public int getLengthScore() {
+         return _lengthScore;
+      }
+
+      public int getWordCountScore() {
+         return _wordCountScore;
+      }
+
+      public int getUppercaseScore() {
+         return _uppercaseScore;
+      }
+   }
+
+
+   // Orders texts from least to most preferred.  The text itself is compared last so that the preferred text
+   // of texts with equal scores does not depend on the iteration order of the hash set that holds them.
+   static private final Comparator<ScoredText> prefScorer
+         = Comparator.comparingInt( ScoredText::getUppercaseScore )
+                     .thenComparingInt( ScoredText::getTtyScore )
+                     .thenComparingInt( ScoredText::getSttScore )
+                     .thenComparingInt( ScoredText::getTsScore )
+                     .thenComparingInt( ScoredText::getWordCountScore )
+                     .thenComparing( ScoredText::getText );
+
+
+   /**
+    * {@inheritDoc}
+    * Two terms are equal when they have the same cui code.
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      return value instanceof CuiTerm && ((CuiTerm)value).getCuiCode() == getCuiCode();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int hashCode() {
+      return Long.hashCode( _cuiCode );
+   }
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,62 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+
+import org.apache.ctakes.core.util.StringUtil;
+
+/**
+ * One line of pipe-delimited text for a custom term: cui, text and an optional preference score.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 1/8/2020
+ */
+final public class CustomTermLine implements TermLine {
+
+
+   final private String[] _columns;
+
+   /**
+    * @param line text of a single line, its columns separated by the '|' character
+    */
+   public CustomTermLine( final String line ) {
+      _columns = StringUtil.fastSplit( line, '|' );
+   }
+
+   /**
+    * @return first column of the line
+    */
+   @Override
+   public String getCui() {
+      return _columns[ 0 ];
+   }
+
+   /**
+    * @return second column of the line.  A line without a second column makes the array access fail.
+    */
+   @Override
+   public String getText() {
+      return _columns[ 1 ];
+   }
+
+   /**
+    * @return score parsed from the third column, or the half score if the line has no third column
+    */
+   @Override
+   public int getPrefScore() {
+      return _columns.length > 2 ? getPrefScore( _columns[ 2 ] ) : TermLine.getHalfScore();
+   }
+
+   /**
+    * @return always "CUSTOM"
+    */
+   @Override
+   public String getSource() {
+      return "CUSTOM";
+   }
+
+   /**
+    * @return the cui; a custom term has no separate code column
+    */
+   @Override
+   public String getCode() {
+      return getCui();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String toString() {
+      return String.join( " | ", _columns );
+   }
+
+
+   /**
+    * @param text content of the score column
+    * @return the whole number in the column limited to the range 0 to {@code TermLine.getMaxScore()},
+    * or the half score if the column is blank or does not hold a whole number
+    */
+   private int getPrefScore( final String text ) {
+      // Columns may be padded with spaces, toString() itself writes " | " between columns,
+      // and a padded number would otherwise be rejected by the parser and silently replaced.
+      final String trimmed = text.trim();
+      if ( trimmed.isEmpty() ) {
+         return TermLine.getHalfScore();
+      }
+      try {
+         final int parseInt = Integer.parseInt( trimmed );
+         return Math.min( Math.max( 0, parseInt ), TermLine.getMaxScore() );
+      } catch ( NumberFormatException nfE ) {
+         // Not a whole number: fall back to the neutral score rather than reject the term.
+         return TermLine.getHalfScore();
+      }
+   }
+
+
+}

Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,51 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+import org.apache.ctakes.gui.dictionary.util.TextTokenizer;
+
+/**
+ * A single term entry offering a cui, a text, a preference score, a source and a code.
+ * Static helpers supply the score limits and the tokenized form of a text.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 1/8/2020
+ */
+public interface TermLine {
+
+   /**
+    * @return cui of the term
+    */
+   String getCui();
+
+   /**
+    * @return text of the term
+    */
+   String getText();
+
+   /**
+    * @return score expressing how suitable the text is as a preferred text
+    */
+   int getPrefScore();
+
+   /**
+    * @return source of the term
+    */
+   String getSource();
+
+   /**
+    * @return code of the term in its source
+    */
+   String getCode();
+
+   /**
+    * @return tokenized form of {@link #getText()}
+    */
+   default String getTokenizedText() {
+      final String text = getText();
+      return getTokenizedText( text );
+   }
+
+   /**
+    * @return the highest score, 50
+    */
+   static int getMaxScore() {
+      // Right now the max score is TS=P SAB=NCI STT=PF TTY=PN  + 2 for custom term.  = 50
+      final int weightProduct = 2 * 3 * 2 * 4;
+      return weightProduct + 2;
+   }
+
+   /**
+    * @return half of the highest score
+    */
+   static int getHalfScore() {
+      final int max = getMaxScore();
+      return max / 2;
+   }
+
+   /**
+    * Tokenizes the text, then drops a trailing " nos" (and a " ," left dangling in front of it)
+    * and a leading "[ x ] " or "[ d ] ".
+    *
+    * @param text text of a term
+    * @return tokenized text without those affixes
+    */
+   static String getTokenizedText( final String text ) {
+      String result = TextTokenizer.getTokenizedText( text );
+      if ( result.endsWith( " nos" ) ) {
+         result = result.substring( 0, result.length() - " nos".length() );
+         if ( result.endsWith( " ," ) ) {
+            result = result.substring( 0, result.length() - " ,".length() );
+         }
+      }
+      // Both markers have the same length, so one cut serves either.
+      if ( result.startsWith( "[ x ] " ) || result.startsWith( "[ d ] " ) ) {
+         result = result.substring( "[ x ] ".length() );
+      }
+      return result;
+   }
+
+}