You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2020/09/25 01:04:47 UTC
svn commit: r1881995 [1/4] - in
/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased:
./ table/ term/ umls/ umls/abbreviation/ umls/file/
Author: seanfinan
Date: Fri Sep 25 01:04:47 2020
New Revision: 1881995
URL: http://svn.apache.org/viewvc?rev=1881995&view=rev
Log:
New Case Sensitive Dictionary Lookup Dictionary Creator Gui
Added:
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/UmlsParser.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Atn.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/IsPref.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Lat.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Rel.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Rela.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Srl.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Stt.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Stype.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/abbreviation/Ts.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrConso.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrRel.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrSat.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/MrSty.java
ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/umls/file/Tty.java
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedDictionaryCreator.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,77 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.component.DisablerPane;
+import org.apache.log4j.Logger;
+
+import javax.swing.*;
+import java.awt.*;
+
+
+/**
+ * Launcher for the cTAKES case-sensitive ("cased") dictionary creator GUI.
+ * Creates the application frame, places a {@link CasedMainPanel} in it and logs step-by-step usage instructions.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public class CasedDictionaryCreator {
+
+ static private final Logger LOGGER = Logger.getLogger( "CasedDictionaryCreator" );
+
+
+ /**
+ * @return a frame of at least 1024 x 768 that exits the JVM when closed and has a menu bar with a "File" menu.
+ */
+ static private JFrame createFrame() {
+ final JFrame frame = new JFrame( "cTAKES Cased Dictionary Creator" );
+ frame.setDefaultCloseOperation( WindowConstants.EXIT_ON_CLOSE );
+ // Use 1024 x 768 as the minimum required resolution (XGA)
+ // iPhone 3 : 480 x 320 (3:2, HVGA)
+ // iPhone 4 : 960 x 640 (3:2, unique to Apple)
+ // iPhone 5 : 1136 x 640 (under 16:9, unique to Apple)
+ // iPad 3&4 : 2048 x 1536 (4:3, QXGA)
+ // iPad Mini: 1024 x 768 (4:3, XGA)
+ final Dimension size = new Dimension( 1024, 768 );
+ frame.setSize( size );
+ frame.setMinimumSize( size );
+ final JMenuBar menuBar = new JMenuBar();
+ final JMenu fileMenu = new JMenu( "File" );
+ menuBar.add( fileMenu );
+
+ frame.setJMenuBar( menuBar );
+ return frame;
+ }
+
+ /**
+ * @return the main content panel of the application.
+ */
+ static private JComponent createMainPanel() {
+ return new CasedMainPanel();
+ }
+
+ /**
+ * Builds the frame and its content, shows it and logs the usage instructions.
+ * Intended to run on the event dispatch thread.
+ */
+ static private void createAndShowGui() {
+ final JFrame frame = createFrame();
+ final JComponent mainPanel = createMainPanel();
+ frame.add( mainPanel );
+ frame.pack();
+ frame.setVisible( true );
+ DisablerPane.getInstance().initialize( frame );
+ LOGGER.info( "1. Select your Apache cTAKES root directory." );
+ LOGGER.info( " It can be a pre-built binary installation or a developer sandbox." );
+ LOGGER.info( "2. Select your Unified Medical Language System (UMLS) root directory." );
+ LOGGER.info( " Once selected, your UMLS database will be parsed for available content." );
+ LOGGER.info( "3. Select your desired Vocabulary sources and targets in the left table." );
+ LOGGER.info( " Recommended Vocabulary sources are pre-selected." );
+ LOGGER.info( "4. Select your desired Languages in the center table." );
+ LOGGER.info( " English (ENG) is pre-selected if available." );
+ LOGGER.info( "5. Select your desired Semantic Types in the right table." );
+ LOGGER.info( " Recommended Semantic types are pre-selected." );
+ LOGGER.info( "6. Type a name for your dictionary." );
+ LOGGER.info( "7. Click \'Build Dictionary\'" );
+ LOGGER.info( "- You can resize this log panel by clicking the top and dragging up or down." );
+ }
+
+ /**
+ * Installs the system look and feel and starts the GUI.
+ *
+ * @param args ignored.
+ */
+ public static void main( final String... args ) {
+ // Request the macOS screen menu bar up-front so the setting is already in place
+ // when the look and feel and the menu bar are created.
+ System.setProperty( "apple.laf.useScreenMenuBar", "true" );
+ try {
+ UIManager.setLookAndFeel( UIManager.getSystemLookAndFeelClassName() );
+ UIManager.getDefaults().put( "SplitPane.border", BorderFactory.createEmptyBorder() );
+ } catch ( ClassNotFoundException | InstantiationException
+ | IllegalAccessException | UnsupportedLookAndFeelException multE ) {
+ // Not fatal: the default look and feel stays active. Keep the stack trace for diagnosis.
+ LOGGER.error( multE.getLocalizedMessage(), multE );
+ }
+ // Swing components are created and shown on the event dispatch thread.
+ SwingUtilities.invokeLater( CasedDictionaryCreator::createAndShowGui );
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedMainPanel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,396 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.gui.component.DisablerPane;
+import org.apache.ctakes.gui.component.FileChooserPanel;
+import org.apache.ctakes.gui.component.LoggerPanel;
+import org.apache.ctakes.gui.component.PositionedSplitPane;
+import org.apache.ctakes.gui.dictionary.cased.table.SemanticTuiModel;
+import org.apache.ctakes.gui.dictionary.cased.table.TextTypeModel;
+import org.apache.ctakes.gui.dictionary.cased.term.CuiTerm;
+import org.apache.ctakes.gui.dictionary.cased.umls.UmlsParser;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+import org.apache.ctakes.gui.dictionary.umls.LanguageTableModel;
+import org.apache.ctakes.gui.dictionary.umls.MrconsoIndex;
+import org.apache.ctakes.gui.dictionary.umls.MrsabIndex;
+import org.apache.ctakes.gui.dictionary.umls.SourceTableModel;
+import org.apache.ctakes.gui.dictionary.util.FileUtil;
+import org.apache.log4j.Logger;
+
+import javax.swing.*;
+import javax.swing.border.EmptyBorder;
+import javax.swing.table.TableModel;
+import javax.swing.text.JTextComponent;
+import java.awt.*;
+import java.awt.event.ActionEvent;
+import java.awt.event.ActionListener;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 12/10/2015
+ */
+final class CasedMainPanel extends JPanel {
+
+ static private final Logger LOGGER = Logger.getLogger( "CasedMainPanel" );
+
+ private String _umlsDirPath = System.getProperty( "user.dir" );
+ private String _ctakesPath = System.getProperty( "user.dir" );
+ private final SemanticTuiModel _tuiModel = new SemanticTuiModel();
+ private final SourceTableModel _sourceModel = new SourceTableModel();
+ private final TextTypeModel _textTypeModel = new TextTypeModel();
+ private final LanguageTableModel _languageModel = new LanguageTableModel();
+
+ /**
+ * Lays out the panel: the cTAKES and UMLS directory choosers at the top,
+ * and the tabbed selection tables, build controls and log in the center.
+ */
+ CasedMainPanel() {
+ super( new BorderLayout() );
+ final JComponent directoryPanel = new JPanel( new GridLayout( 2, 1 ) );
+ final FileChooserPanel ctakesChooser
+ = new FileChooserPanel( "cTAKES Installation:", _ctakesPath, true, new CtakesDirListener() );
+ directoryPanel.add( ctakesChooser );
+ final FileChooserPanel umlsChooser
+ = new FileChooserPanel( "UMLS Installation:", _umlsDirPath, true, new UmlsDirListener() );
+ directoryPanel.add( umlsChooser );
+ add( directoryPanel, BorderLayout.NORTH );
+
+ final JComponent centerPanel = createCenterPanel( _sourceModel, _tuiModel, _textTypeModel, _languageModel );
+ add( centerPanel, BorderLayout.CENTER );
+ }
+
+ /**
+ * Creates the center of the main panel: a tab per table model with the build row beneath,
+ * split vertically from the logger panel.
+ *
+ * @param sourceModel model shown on the "Vocabularies" tab.
+ * @param tuiModel model shown on the "Semantic Types" tab.
+ * @param textTypeModel model shown on the "Text Types" tab.
+ * @param languageModel model shown on the "Languages" tab.
+ * @return split pane with the selection tabs on top and the log at the bottom.
+ */
+ private JComponent createCenterPanel( final TableModel sourceModel,
+ final TableModel tuiModel,
+ final TableModel textTypeModel,
+ final TableModel languageModel ) {
+ final JTabbedPane tabs = new JTabbedPane();
+ tabs.addTab( "Vocabularies", createTable( sourceModel ) );
+ tabs.addTab( "Semantic Types", createTable( tuiModel ) );
+ tabs.addTab( "Text Types", create50_100Table( textTypeModel ) );
+ tabs.addTab( "Languages", createLangTable( languageModel ) );
+
+ final JPanel selectionPanel = new JPanel( new BorderLayout() );
+ selectionPanel.add( tabs, BorderLayout.CENTER );
+ selectionPanel.add( createGoPanel(), BorderLayout.SOUTH );
+
+ final JSplitPane splitPane = new PositionedSplitPane( JSplitPane.VERTICAL_SPLIT );
+ splitPane.setTopComponent( selectionPanel );
+ splitPane.setBottomComponent( LoggerPanel.createLoggerPanel() );
+ // Proportional location: the top component gets 60% of the height.
+ splitPane.setDividerLocation( 0.6d );
+ return splitPane;
+ }
+
+ /**
+ * @param model table model to display.
+ * @return scrollable table whose first two columns are capped at 70 and 100 pixels.
+ */
+ static private JComponent createTable( final TableModel model ) {
+ return createScrollTable( model, 70, 100 );
+ }
+
+ /**
+ * Despite its name this uses the same 70 / 100 pixel caps as {@link #createTable(TableModel)}.
+ *
+ * @param model table model to display.
+ * @return scrollable table whose first two columns are capped at 70 and 100 pixels.
+ */
+ static private JComponent create50_100Table( final TableModel model ) {
+ return createScrollTable( model, 70, 100 );
+ }
+
+ /**
+ * @param model table model to display.
+ * @return scrollable table whose first column is capped at 50 pixels.
+ */
+ static private JComponent createLangTable( final TableModel model ) {
+ return createScrollTable( model, 50 );
+ }
+
+ /**
+ * Shared table setup: no cell selection, no vertical grid lines, sortable rows,
+ * and only the last column absorbs resizing.
+ *
+ * @param model table model to display.
+ * @param maxWidths maximum widths for the leading columns, in column order.
+ * @return the configured table wrapped in a scroll pane.
+ */
+ static private JComponent createScrollTable( final TableModel model, final int... maxWidths ) {
+ final JTable table = new JTable( model );
+ table.setCellSelectionEnabled( false );
+ table.setShowVerticalLines( false );
+ table.setAutoCreateRowSorter( true );
+ table.setAutoResizeMode( JTable.AUTO_RESIZE_LAST_COLUMN );
+ for ( int column = 0; column < maxWidths.length; column++ ) {
+ table.getColumnModel().getColumn( column ).setMaxWidth( maxWidths[ column ] );
+ }
+ return new JScrollPane( table );
+ }
+
+ /**
+ * @return row holding the "Dictionary Name:" label, the name field (initially "custom")
+ * and the button that triggers the dictionary build.
+ */
+ private JComponent createGoPanel() {
+ final JPanel goPanel = new JPanel( new BorderLayout( 10, 10 ) );
+ goPanel.setBorder( new EmptyBorder( 2, 10, 2, 10 ) );
+
+ final JLabel nameLabel = new JLabel( "Dictionary Name:" );
+ nameLabel.setPreferredSize( new Dimension( 100, 0 ) );
+ nameLabel.setHorizontalAlignment( SwingConstants.TRAILING );
+ goPanel.add( nameLabel, BorderLayout.WEST );
+
+ // The build action reads the dictionary name from this field when the button is pressed.
+ final JTextField nameField = new JTextField( "custom" );
+ goPanel.add( nameField, BorderLayout.CENTER );
+ goPanel.add( new JButton( new BuildDictionaryAction( nameField ) ), BorderLayout.EAST );
+ return goPanel;
+ }
+
+
+ /**
+ * Accepts either a UMLS root directory (containing META/MRCONSO.RRF) or the META directory itself
+ * (containing MRCONSO.RRF) and stores the corresponding root directory.
+ * If neither layout is found an error dialog is shown and the stored path is left unchanged.
+ *
+ * @param umlsDirPath directory chosen by the user.
+ * @return the stored UMLS root path after the call; unchanged when the selection was invalid.
+ */
+ private String setUmlsDirPath( final String umlsDirPath ) {
+ if ( new File( umlsDirPath, "MRCONSO.RRF" ).isFile() ) {
+ // The META directory was chosen, so the root is its parent. Resolve to an absolute path first:
+ // the parent of a bare relative name is null, which would later break the path comparison.
+ final String rootPath = new File( umlsDirPath ).getAbsoluteFile().getParent();
+ if ( rootPath != null ) {
+ _umlsDirPath = rootPath;
+ return _umlsDirPath;
+ }
+ } else if ( new File( new File( umlsDirPath, "META" ), "MRCONSO.RRF" ).isFile() ) {
+ _umlsDirPath = umlsDirPath;
+ return _umlsDirPath;
+ }
+ error( "Invalid UMLS Installation", umlsDirPath + " is not a valid path to a UMLS installation" );
+ return _umlsDirPath;
+ }
+
+ /**
+ * Starts parsing of the currently selected UMLS installation on a background thread.
+ */
+ private void loadSources() {
+ final ExecutorService executor = Executors.newSingleThreadExecutor();
+ executor.execute( new SourceLoadRunner( _umlsDirPath ) );
+ // The submitted task still runs; shutting down lets the worker thread end afterwards
+ // instead of leaving one idle thread behind per load.
+ executor.shutdown();
+ }
+
+ /**
+ * Background task that reads META/MRCONSO.RRF for the vocabularies and languages present,
+ * then META/MRSAB.RRF for each vocabulary's name, version and CFR value,
+ * and hands the results to the source and language table models.
+ */
+ private class SourceLoadRunner implements Runnable {
+ private final String __umlsDirPath;
+
+ private SourceLoadRunner( final String umlsDirPath ) {
+ __umlsDirPath = umlsDirPath;
+ }
+
+ /**
+ * Shows the wait cursor and the disabler pane while both files are parsed.
+ */
+ @Override
+ public void run() {
+ final Component root = SwingUtilities.getRoot( CasedMainPanel.this );
+ root.setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
+ DisablerPane.getInstance().setVisible( true );
+ try {
+ final Collection<String> sources = parseSourcesAndLanguages();
+ parseSourceInfo( sources );
+ } finally {
+ // Restore the UI even if parsing fails with an unexpected runtime exception.
+ DisablerPane.getInstance().setVisible( false );
+ root.setCursor( Cursor.getDefaultCursor() );
+ }
+ }
+
+ /**
+ * Collects the distinct source and language values of MRCONSO.RRF and sets them on the table models.
+ *
+ * @return the source abbreviations found; possibly incomplete if reading failed.
+ */
+ private Collection<String> parseSourcesAndLanguages() {
+ final String mrConsoPath = new File( __umlsDirPath + "/META", "MRCONSO.RRF" ).getPath();
+ LOGGER.info( "Parsing vocabulary types from " + mrConsoPath );
+ final Collection<String> sources = new HashSet<>();
+ final Collection<String> languages = new HashSet<>();
+ try ( final BufferedReader reader = FileUtil.createReader( mrConsoPath ) ) {
+ java.util.List<String> tokens = FileUtil.readBsvTokens( reader, mrConsoPath );
+ while ( tokens != null ) {
+ if ( tokens.size() > MrconsoIndex.SOURCE._index ) {
+ sources.add( tokens.get( MrconsoIndex.SOURCE._index ) );
+ languages.add( tokens.get( MrconsoIndex.LANGUAGE._index ) );
+ }
+ tokens = FileUtil.readBsvTokens( reader, mrConsoPath );
+ }
+ LOGGER.info( "Parsed " + sources.size() + " vocabulary types" );
+ _sourceModel.setSources( sources );
+ LOGGER.info( "Parsed " + languages.size() + " languages" );
+ _languageModel.setLangauges( languages );
+ } catch ( IOException ioE ) {
+ error( "Vocabulary Parse Error", ioE.getMessage() );
+ }
+ return sources;
+ }
+
+ /**
+ * Reads MRSAB.RRF and sets name, version and CFR value on the source model for every known source.
+ * When a source has several rows, the version and CFR of the row with the longest CFR string are kept.
+ *
+ * @param sources source abbreviations for which information is wanted.
+ */
+ private void parseSourceInfo( final Collection<String> sources ) {
+ final String mrSabPath = new File( __umlsDirPath + "/META", "MRSAB.RRF" ).getPath();
+ final Map<String, String> sourceNames = new HashMap<>();
+ final Map<String, String> sourceVersions = new HashMap<>();
+ final Map<String, String> sourceCuiCounts = new HashMap<>();
+ LOGGER.info( "Parsing vocabulary names from " + mrSabPath );
+ try ( final BufferedReader reader = FileUtil.createReader( mrSabPath ) ) {
+ int lineCount = 0;
+ java.util.List<String> tokens = FileUtil.readBsvTokens( reader, mrSabPath );
+ while ( tokens != null ) {
+ lineCount++;
+ if ( tokens.size() > MrsabIndex.CFR._index ) {
+ final String sab = tokens.get( MrsabIndex.RSAB._index );
+ if ( sources.contains( sab ) ) {
+ sourceNames.put( sab, tokens.get( MrsabIndex.SON._index ) );
+ final String oldCounts = sourceCuiCounts.getOrDefault( sab, "" );
+ final String newCounts = tokens.get( MrsabIndex.CFR._index );
+ if ( newCounts.length() > oldCounts.length() ) {
+ sourceVersions.put( sab, tokens.get( MrsabIndex.SVER._index ) );
+ sourceCuiCounts.put( sab, newCounts );
+ }
+ }
+ }
+ if ( lineCount % 100000 == 0 ) {
+ LOGGER.info( "File Line " + lineCount + "\t Vocabularies " + sources.size() );
+ }
+ // Pass the path of the file actually being read (previously the MRCONSO path was passed here).
+ tokens = FileUtil.readBsvTokens( reader, mrSabPath );
+ }
+ LOGGER.info( "Parsed " + sources.size() + " vocabulary names" );
+ _sourceModel.setSourceInfo( sourceNames, sourceVersions, sourceCuiCounts );
+ } catch ( IOException ioE ) {
+ error( "Vocabulary Parse Error", ioE.getMessage() );
+ }
+ }
+ }
+
+ /**
+ * Starts a dictionary build on a background thread using the current table selections.
+ *
+ * @param dictionaryName name of the dictionary to create.
+ */
+ private void buildDictionary( final String dictionaryName ) {
+ final ExecutorService executor = Executors.newSingleThreadExecutor();
+ executor.execute( new CasedDictionaryBuilder( _umlsDirPath,
+ _ctakesPath, dictionaryName,
+ _sourceModel.getWantedSources(),
+ _sourceModel.getWantedTargets(),
+ _tuiModel.getWantedTuis(),
+ _textTypeModel.getWantedTypes(),
+ _languageModel.getWantedLanguages() ) );
+ // The submitted build still runs; shutting down lets the worker thread end afterwards
+ // instead of leaving one idle thread behind per build.
+ executor.shutdown();
+ }
+
+ /**
+ * Logs the message at error level, then shows it in an error dialog parented to this panel.
+ *
+ * @param title dialog title.
+ * @param message text that is logged and displayed.
+ */
+ private void error( final String title, final String message ) {
+ LOGGER.error( message );
+ final Component dialogParent = this;
+ JOptionPane.showMessageDialog( dialogParent, message, title, JOptionPane.ERROR_MESSAGE );
+ }
+
+ /**
+ * Background task that parses the selected UMLS content into terms, writes them to an HSQL dictionary
+ * and writes the accompanying piper file. Results are reported through the log and message dialogs.
+ */
+ private class CasedDictionaryBuilder implements Runnable {
+ private final String _consoPath;
+ private final String _styPath;
+ private final String _rankPath;
+ private final String _hsqlPath;
+ private final String _dictionaryName;
+ private final Collection<String> _wantedVocabularies;
+ private final Collection<String> _writtenSchema;
+ private final Collection<SemanticTui> _wantedTuis;
+ private final Collection<Tty> _wantedTextTypes;
+ private final Collection<String> _wantedLanguages;
+
+ /**
+ * Derives the MRCONSO, MRSTY and MRRANK paths from the UMLS root and the output directory from the cTAKES root.
+ *
+ * @param umlsPath UMLS root directory; the files are expected under its META subdirectory.
+ * @param ctakesPath cTAKES root directory; output goes under resources/org/apache/ctakes/dictionary/lookup/cased.
+ * @param dictionaryName name of the dictionary to create.
+ * @param wantedVocabularies vocabularies to read terms from.
+ * @param writtenSchema vocabularies whose codes are written to the dictionary.
+ * @param wantedTuis semantic types to include.
+ * @param wantedTermTypes term types to include.
+ * @param wantedLanguages languages to include.
+ */
+ public CasedDictionaryBuilder( final String umlsPath,
+ final String ctakesPath,
+ final String dictionaryName,
+ final Collection<String> wantedVocabularies,
+ final Collection<String> writtenSchema,
+ final Collection<SemanticTui> wantedTuis,
+ final Collection<Tty> wantedTermTypes,
+ final Collection<String> wantedLanguages ) {
+ this( umlsPath + "/META/MRCONSO.RRF",
+ umlsPath + "/META/MRSTY.RRF",
+ umlsPath + "/META/MRRANK.RRF",
+ ctakesPath + "/resources/org/apache/ctakes/dictionary/lookup/cased",
+ dictionaryName,
+ wantedVocabularies,
+ writtenSchema,
+ wantedTuis,
+ wantedTermTypes,
+ wantedLanguages );
+ }
+
+ /**
+ * @param consoPath path to MRCONSO.RRF.
+ * @param styPath path to MRSTY.RRF.
+ * @param rankPath path to MRRANK.RRF.
+ * @param hsqlPath directory in which the dictionary and piper file are written.
+ * @param dictionaryName name of the dictionary to create.
+ * @param wantedVocabularies vocabularies to read terms from.
+ * @param writtenSchema vocabularies whose codes are written to the dictionary.
+ * @param wantedTuis semantic types to include.
+ * @param wantedTermTypes term types to include.
+ * @param wantedLanguages languages to include.
+ */
+ public CasedDictionaryBuilder( final String consoPath,
+ final String styPath,
+ final String rankPath,
+ final String hsqlPath,
+ final String dictionaryName,
+ final Collection<String> wantedVocabularies,
+ final Collection<String> writtenSchema,
+ final Collection<SemanticTui> wantedTuis,
+ final Collection<Tty> wantedTermTypes,
+ final Collection<String> wantedLanguages ) {
+ _consoPath = consoPath;
+ _styPath = styPath;
+ _rankPath = rankPath;
+ _hsqlPath = hsqlPath;
+ _dictionaryName = dictionaryName;
+ _wantedVocabularies = wantedVocabularies;
+ _writtenSchema = writtenSchema;
+ _wantedTuis = wantedTuis;
+ _wantedTextTypes = wantedTermTypes;
+ _wantedLanguages = wantedLanguages;
+ }
+
+ /**
+ * Shows the wait cursor and the disabler pane for the duration of the build.
+ */
+ @Override
+ public void run() {
+ final Component root = SwingUtilities.getRoot( CasedMainPanel.this );
+ root.setCursor( Cursor.getPredefinedCursor( Cursor.WAIT_CURSOR ) );
+ DisablerPane.getInstance().setVisible( true );
+ try {
+ buildAndReport();
+ } finally {
+ // Restore the UI even if the build fails with an unexpected runtime exception.
+ DisablerPane.getInstance().setVisible( false );
+ root.setCursor( Cursor.getDefaultCursor() );
+ }
+ }
+
+ /**
+ * Creates the terms, writes the HSQL dictionary and then the piper file, reporting each outcome.
+ * Nothing is written when no term matches the selections.
+ */
+ private void buildAndReport() {
+ final Collection<CuiTerm> cuiTerms
+ = UmlsParser.createCuiTerms( _consoPath,
+ _styPath,
+ _rankPath,
+ _wantedTuis,
+ _wantedVocabularies,
+ _wantedTextTypes,
+ _wantedLanguages,
+ _writtenSchema );
+ if ( cuiTerms.isEmpty() ) {
+ error( "Cannot Build Dictionary", "No Terms fit your parameters for the dictionary" );
+ return;
+ }
+ if ( HsqlWriter.writeHsql( _hsqlPath, _dictionaryName, _writtenSchema, cuiTerms ) ) {
+ final String message = "Dictionary " + _dictionaryName + " successfully built in " + _hsqlPath;
+ LOGGER.info( message );
+ JOptionPane
+ .showMessageDialog( CasedMainPanel.this, message, "Dictionary Built", JOptionPane.INFORMATION_MESSAGE );
+ } else {
+ error( "Build Failure", "Dictionary " + _dictionaryName + " could not be built in " + _hsqlPath );
+ }
+ // The piper file is written whether or not the HSQL step succeeded, as before.
+ if ( CasedPiperWriter.writePiper( _hsqlPath, _dictionaryName, _writtenSchema ) ) {
+ final String message = "Dictionary Piper " + _dictionaryName + " successfully built in " + _hsqlPath;
+ LOGGER.info( message );
+ JOptionPane
+ .showMessageDialog( CasedMainPanel.this, message, "Dictionary Piper Built", JOptionPane.INFORMATION_MESSAGE );
+ } else {
+ error( "Build Failure", "Dictionary Piper " + _dictionaryName + " could not be built in " + _hsqlPath );
+ }
+ }
+ }
+
+
+ /**
+ * Reacts to a UMLS directory selection: stores the directory and reloads the sources
+ * only when the stored path actually changed.
+ */
+ private class UmlsDirListener implements ActionListener {
+ @Override
+ public void actionPerformed( final ActionEvent event ) {
+ final String previousPath = _umlsDirPath;
+ // The action command carries the selected directory.
+ final String currentPath = setUmlsDirPath( event.getActionCommand() );
+ if ( previousPath.equals( currentPath ) ) {
+ return;
+ }
+ loadSources();
+ }
+ }
+
+
+ /**
+ * Reacts to a cTAKES directory selection by storing the selected directory without validation.
+ */
+ private class CtakesDirListener implements ActionListener {
+ @Override
+ public void actionPerformed( final ActionEvent event ) {
+ // The action command carries the selected directory.
+ final String selectedPath = event.getActionCommand();
+ _ctakesPath = selectedPath;
+ }
+ }
+
+
+ /**
+ * Validates the current selections and the dictionary name, then starts the dictionary build.
+ * Each failed check shows an error dialog and aborts the action.
+ */
+ private class BuildDictionaryAction extends AbstractAction {
+ private final JTextComponent __textComponent;
+
+ /**
+ * @param textComponent component from which the dictionary name is read when the action fires.
+ */
+ private BuildDictionaryAction( final JTextComponent textComponent ) {
+ super( "Build Dictionary" );
+ __textComponent = textComponent;
+ }
+
+ @Override
+ public void actionPerformed( final ActionEvent event ) {
+ if ( _sourceModel.getRowCount() == 0 ) {
+ error( "UMLS not yet loaded", "Please specify a UMLS installation." );
+ return;
+ }
+ if ( _sourceModel.getWantedSources().isEmpty() ) {
+ error( "Vocabularies not selected", "Please specify one or more source vocabularies." );
+ return;
+ }
+ if ( _textTypeModel.getWantedTypes().isEmpty() ) {
+ error( "Text Types not selected", "Please specify one or more source text types." );
+ return;
+ }
+ if ( _languageModel.getWantedLanguages().isEmpty() ) {
+ error( "Language not selected", "Please specify one or more languages." );
+ return;
+ }
+ // The name ends up in file paths and piper parameter names, so surrounding whitespace is dropped
+ // and a whitespace-only name is rejected like an empty one.
+ final String text = __textComponent.getText();
+ final String dictionaryName = text == null ? "" : text.trim();
+ if ( dictionaryName.isEmpty() ) {
+ error( "Invalid Dictionary Name", "Please Specify a Dictionary Name" );
+ return;
+ }
+ // Locale-independent lower-casing keeps the name identical on every machine.
+ buildDictionary( dictionaryName.toLowerCase( java.util.Locale.ROOT ) );
+ }
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/CasedPiperWriter.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,153 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.HsqlUtil;
+import org.apache.log4j.Logger;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Writes the piper file that configures a newly built dictionary and its encoders
+ * for the case-sensitive dictionary lookup.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/28/2020
+ */
+final public class CasedPiperWriter {
+
+ static private final Logger LOGGER = Logger.getLogger( "CasedPiperWriter" );
+
+
+ private CasedPiperWriter() {
+ }
+
+
+ /**
+ * Writes {dictionaryName}.piper into the given directory. The file declares one JDBC dictionary and
+ * one JDBC encoder per written schema, plus the TUI and PREFERRED_TEXT encoders.
+ *
+ * @param hsqlPath directory that holds the dictionary database; the piper file is written here.
+ * @param dictionaryName name of the dictionary; used for the file name and as parameter prefix.
+ * @param writtenSchema code schemas for which encoders are declared.
+ * @return true if the file was written, false if an IOException occurred (it is logged).
+ */
+ static public boolean writePiper( final String hsqlPath,
+ final String dictionaryName,
+ final Collection<String> writtenSchema ) {
+ final String url = HsqlUtil.URL_PREFIX + hsqlPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" +
+ dictionaryName;
+ // Schemas are listed alphabetically, followed by the two encoders that always exist.
+ final List<String> schemaList = new ArrayList<>( writtenSchema );
+ Collections.sort( schemaList );
+ schemaList.add( "TUI" );
+ schemaList.add( "PREFERRED_TEXT" );
+ final String schemas = String.join( ",", schemaList );
+ final File piperFile = new File( hsqlPath, dictionaryName + ".piper" );
+ try ( final Writer writer = new BufferedWriter( new FileWriter( piperFile ) ) ) {
+ writer.write( "// This piper file contains instructions to set up your custom dictionary and encoders for Case-sensitive Dictionary Lookup.\n" );
+ writer.write( "// To use your new dictionary, load this piper in your main piper:\n" );
+ // Point the hint at the piper file itself rather than at the directory that contains it.
+ writer.write( "// load " + piperFile.getPath() + "\n" );
+ writer.write( "\n" );
+ writer.write( "// === Setup common to all Dictionaries ===\n" );
+ writer.write( "// = Trigger Part of Speech =\n" );
+ writer.write( "// Use Verbs as lookup tokens. Default = yes.\n" );
+ writer.write( "// set lookupVerbs=yes\n" );
+ writer.write( "// Use Nouns as lookup tokens. Default = yes.\n" );
+ writer.write( "// set lookupNouns=yes\n" );
+ writer.write( "// Use Adjectives as lookup tokens. Default = yes.\n" );
+ writer.write( "// set lookupAdjectives=yes\n" );
+ writer.write( "// Use Adverbs as lookup tokens. Default = yes.\n" );
+ writer.write( "// set lookupAdverbs=yes\n" );
+ writer.write( "// Comma delimited array of other parts of speech to use for lookup. Default is empty.\n" );
+ writer.write( "// set otherLookups=\n" );
+ writer.write( "// = Trigger Word Length =\n" );
+ writer.write( "// Minimum character span to use for lookup. Default is 3.\n" );
+ writer.write( "// set minimumSpan=3\n" );
+ writer.write( "// = Text Loose Matching =\n" );
+ writer.write( "// Allow words to be skipped in lookup. Default is no.\n" );
+ writer.write( "// set allowWordSkips=no\n" );
+ writer.write( "// Number of words that can be skipped consecutively in lookup. Default is 2.\n" );
+ writer.write( "// set consecutiveSkips=2\n" );
+ writer.write( "// Number of words that can be skipped in total in lookup. Default is 4.\n" );
+ writer.write( "// set totalSkips=4\n" );
+ writer.write( "// = Subsumption =\n" );
+ writer.write( "// Subsume small terms by larger enclosing terms in the same semantic group. Default is yes.\n" );
+ writer.write( "// This is not the default behavior of the default dictionary lookup, but that of the PrecisionTermConsumer.\n" );
+ writer.write( "// set subsume=yes\n" );
+ writer.write( "// Subsume contained terms of the same and certain other semantic groups. Default is yes.\n" );
+ writer.write( "// This is not the default behavior of the default dictionary lookup, but that of the SemanticCleanupTermConsumer.\n" );
+ writer.write( "// set subsumeSemantics=yes\n" );
+ writer.write( "// Comma delimited array of semantic types to group reassignment key:value pairs. Default is empty.\n" );
+ writer.write( "// Within the comma delimited array types and groups are separated by a colon.\n" );
+ writer.write( "// Semantic Type can be indicated by name or TUI. Semantic Group must be indicated by name.\n" );
+ writer.write( "// Example: set reassignSemantics=Cell:Finding,T065:Event\n" );
+ writer.write( "// set reassignSemantics=\n" );
+ writer.write( "\n" );
+ writer.write( "// === Dictionaries Setup ===\n" );
+ writer.write( "// = Dictionary Names =\n" );
+ writer.write( "// Comma delimited array of dictionary names.\n" );
+ writer.write( "set dictionaries=" + dictionaryName + "\n" );
+ writer.write( "\n" );
+ writer.write( "// === Individual Dictionary Setup ===\n" );
+ writer.write( "// Individual Dictionary setup parameters are named {dictionaryName}_{parameterName}.\n" );
+ writer.write( "// = Dictionary Type =\n" );
+ writer.write( "// Declare the source type the Dictionary. {dictionaryName}_type\n" );
+ writer.write( "set " + dictionaryName + "_type=JDBC\n" );
+ writer.write( "\n" );
+ writer.write( "// = JDBC Database =\n" );
+ writer.write( "// JDBC Driver for the Dictionary. {dictionaryName}_driver\n" );
+ writer.write( "set " + dictionaryName + "_driver=org.hsqldb.jdbcDriver\n" );
+ writer.write( "// Url for the Database. {dictionaryName}_url\n" );
+ writer.write( "set " + dictionaryName + "_url=" + url + "\n" );
+ writer.write( "// User for the Database. {dictionaryName}_user.\n" );
+ writer.write( "// set " + dictionaryName + "_user=sa\n" );
+ writer.write( "// Password for the Database. {dictionaryName}_pass\n" );
+ writer.write( "// set " + dictionaryName + "_pass=\n" );
+ writer.write( "// = JDBC Term Tables =\n" );
+ writer.write( "// Upper case Term Table in the Database. {dictionaryName}_upper\n" );
+ writer.write( "// set " + dictionaryName + "_upper=UPPER\n" );
+ writer.write( "// Mixed case Term Table in the Database. {dictionaryName}_mixed\n" );
+ writer.write( "// set " + dictionaryName + "_mixed=MIXED\n" );
+ writer.write( "// Lower case Term Table in the Database. {dictionaryName}_lower\n" );
+ writer.write( "// set " + dictionaryName + "_lower=LOWER\n" );
+ writer.write( "\n" );
+ writer.write( "// === Encoders Setup ===\n" );
+ writer.write( "// Comma delimited array of encoder names. Note that these names also indicate a Code Schema name.\n" );
+ writer.write( "set encoders=" + schemas + "\n" );
+ writer.write( "\n" );
+ writer.write( "// === Individual Encoder Setup ===\n" );
+ writer.write( "// Individual Encoder setup parameters are named {encoderName}_{parameterName}.\n" );
+ writer.write( "// = Encoder Type =\n" );
+ writer.write( "// Declare the source type the Encoder. {encoderName}_type\n" );
+ for ( String schema : schemaList ) {
+ writer.write( "set " + schema + "_type=JDBC\n" );
+ }
+ writer.write( "\n" );
+ writer.write( "// = JDBC Database =\n" );
+ writer.write( "// JDBC Driver for the Encoder. {encoderName}_driver\n" );
+ writer.write( "// The default JDBC driver is org.hsqldb.jdbcDriver\n\n" );
+ writer.write( "// Url for the Database. {encoderName}_url\n" );
+ // NOTE(review): the encoder url is a fixed relative resource path, whereas the dictionary url above
+ // is derived from hsqlPath. Confirm this difference is intended.
+ for ( String schema : schemaList ) {
+ writer.write( "set " + schema + "_url=jdbc:hsqldb:file:resources/org/apache/ctakes/dictionary/lookup/cased/"
+ + dictionaryName + "/" + dictionaryName + "\n" );
+ }
+ writer.write( "// Most of the following settings are left empty to exemplify brevity.\n\n" );
+ writer.write( "// User for the Database. {encoderName}_user Default user is sa\n\n" );
+ writer.write( "// Password for the Database. {encoderName}_pass Default password is empty.\n\n" );
+ writer.write( "// = JDBC Encoder Tables =\n" );
+ writer.write( "// Encoding Table in the Database. {encoderName}_table Default table is the schema name.\n\n" );
+ writer.write( "// Encoding Table Class Type. {encoderName}_class\n" );
+ for ( String schema : schemaList ) {
+ if ( schema.equals( "TUI" ) ) {
+ writer.write( "set TUI_class=tui\n" );
+ } else if ( schema.equals( "PREFERRED_TEXT" ) ) {
+ writer.write( "set PREFERRED_TEXT_class=pref_text\n" );
+ } else {
+ // Every other schema takes its class from the vocabulary store.
+ writer.write( "set " + schema + "_class="
+ + VocabularyStore.getInstance().getCtakesClass( schema ) + "\n" );
+ }
+ }
+ writer.write( "\n" );
+ } catch ( IOException ioE ) {
+ // Keep the stack trace so that the cause of the failed write can be diagnosed.
+ LOGGER.error( ioE.getMessage(), ioE );
+ return false;
+ }
+ return true;
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/HsqlWriter.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,315 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+
+import org.apache.ctakes.gui.dictionary.cased.term.CuiTerm;
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.HsqlUtil;
+import org.apache.ctakes.gui.dictionary.util.JdbcUtil;
+import org.apache.ctakes.gui.dictionary.util.RareWordUtil;
+import org.apache.log4j.Logger;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+final public class HsqlWriter {
+
+ static private final Logger LOGGER = Logger.getLogger( "HsqlWriter" );
+
+ static public boolean writeHsql( final String hsqlPath,
+ final String dictionaryName,
+ final Collection<String> writtenSchema,
+ final Collection<CuiTerm> cuiTerms ) {
+ final String url = HsqlUtil.URL_PREFIX + hsqlPath.replace( '\\', '/' ) + "/" + dictionaryName + "/" +
+ dictionaryName;
+ final Connection connection = JdbcUtil.createDatabaseConnection( url, "SA", "" );
+ if ( !createDatabase( connection, writtenSchema ) ) {
+ return false;
+ }
+ // Get Count of appearance in dictionary per term token
+ final Map<String, Long> upperTokenCounts = getUpperTokenCounts( cuiTerms );
+ final Map<String, Long> mixedTokenCounts = getMixedTokenCounts( cuiTerms );
+ final Map<String, Long> lowerTokenCounts = getLowerTokenCounts( cuiTerms );
+ // Create insert sql statements
+ final String upperSql = JdbcUtil.createRowInsertSql( "UPPER", Synonym.values() );
+ final String mixedSql = JdbcUtil.createRowInsertSql( "MIXED", Synonym.values() );
+ final String lowerSql = JdbcUtil.createRowInsertSql( "LOWER", Synonym.values() );
+ final String tuiSql = JdbcUtil.createCodeInsertSql( "TUI" );
+ final String prefTextSql = JdbcUtil.createCodeInsertSql( "PREFERRED_TEXT" );
+ final Map<String, String> insertCodeSqls = createCodeInsertSqls( writtenSchema );
+
+ try {
+
+ final PreparedStatement upperStatement = connection.prepareStatement( upperSql );
+ final PreparedStatement mixedStatement = connection.prepareStatement( mixedSql );
+ final PreparedStatement lowerStatement = connection.prepareStatement( lowerSql );
+ final PreparedStatement tuiStatement = connection.prepareStatement( tuiSql );
+ final PreparedStatement prefTextStatement = connection.prepareStatement( prefTextSql );
+ final Map<String, PreparedStatement> codeStatements = createCodeStatements( connection, insertCodeSqls );
+
+ for ( CuiTerm cuiTerm : cuiTerms ) {
+ final long cui = cuiTerm.getCuiCode();
+ // write main term table
+ for ( String text : cuiTerm.getUpperOnly() ) {
+ final RareWordUtil.IndexedRareWord indexedRareWord
+ = RareWordUtil.getIndexedRareWord( text, upperTokenCounts );
+ if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+ continue;
+ }
+ upperStatement.setLong( Synonym.CUI.getColumn(), cui );
+ upperStatement.setString( Synonym.PREFIX.getColumn(), getPrefix( text, indexedRareWord.__word ) );
+ upperStatement.setString( Synonym.INDEX_WORD.getColumn(), indexedRareWord.__word );
+ upperStatement.setString( Synonym.SUFFIX.getColumn(), getSuffix( text, indexedRareWord.__word ) );
+ upperStatement.setInt( Synonym.RANK.getColumn(), cuiTerm.getRank( text ) );
+ upperStatement.setInt( Synonym.INSTANCES.getColumn(), cuiTerm.getInstances( text ) );
+ upperStatement.executeUpdate();
+ }
+ for ( String text : cuiTerm.getMixedOnly() ) {
+ final RareWordUtil.IndexedRareWord indexedRareWord
+ = RareWordUtil.getIndexedRareWord( text, mixedTokenCounts );
+ if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+ continue;
+ }
+ mixedStatement.setLong( Synonym.CUI.getColumn(), cui );
+ mixedStatement.setString( Synonym.PREFIX.getColumn(), getPrefix( text, indexedRareWord.__word ) );
+ mixedStatement.setString( Synonym.INDEX_WORD.getColumn(), indexedRareWord.__word );
+ mixedStatement.setString( Synonym.SUFFIX.getColumn(), getSuffix( text, indexedRareWord.__word ) );
+ mixedStatement.setInt( Synonym.RANK.getColumn(), cuiTerm.getRank( text ) );
+ mixedStatement.setInt( Synonym.INSTANCES.getColumn(), cuiTerm.getInstances( text ) );
+ mixedStatement.executeUpdate();
+ }
+ for ( String text : cuiTerm.getLowerOnly() ) {
+ final RareWordUtil.IndexedRareWord indexedRareWord
+ = RareWordUtil.getIndexedRareWord( text, lowerTokenCounts );
+ if ( RareWordUtil.NULL_RARE_WORD.equals( indexedRareWord ) ) {
+ continue;
+ }
+ lowerStatement.setLong( Synonym.CUI.getColumn(), cui );
+ lowerStatement.setString( Synonym.PREFIX.getColumn(), getPrefix( text, indexedRareWord.__word ) );
+ lowerStatement.setString( Synonym.INDEX_WORD.getColumn(), indexedRareWord.__word );
+ lowerStatement.setString( Synonym.SUFFIX.getColumn(), getSuffix( text, indexedRareWord.__word ) );
+ lowerStatement.setInt( Synonym.RANK.getColumn(), cuiTerm.getRank( text ) );
+ lowerStatement.setInt( Synonym.INSTANCES.getColumn(), cuiTerm.getInstances( text ) );
+ lowerStatement.executeUpdate();
+ }
+ // write tui table
+ for ( int tui : cuiTerm.getTuis() ) {
+ tuiStatement.setLong( 1, cui );
+ tuiStatement.setInt( 2, tui );
+ tuiStatement.executeUpdate();
+ }
+ // write preferred term table
+ String preferredText = cuiTerm.getPreferredText();
+ if ( !preferredText.isEmpty() ) {
+ prefTextStatement.setLong( 1, cui );
+ if ( preferredText.length() > 255 ) {
+ preferredText = preferredText.substring( 0, 255 );
+ }
+ prefTextStatement.setString( 2, preferredText );
+ prefTextStatement.executeUpdate();
+ }
+ // write extra vocabulary code tables
+ final Map<String, Collection<String>> schemaCodeMap = cuiTerm.getSchemaCodes();
+ for ( Map.Entry<String, Collection<String>> schemaCodes : schemaCodeMap.entrySet() ) {
+ final String schema = fixVocabName( schemaCodes.getKey() );
+ final PreparedStatement statement = codeStatements.get( schema );
+ statement.setLong( 1, cui );
+ for ( String code : schemaCodes.getValue() ) {
+ setCodeAppropriately( statement, code, VocabularyStore.getInstance()
+ .getVocabularyClass( schema ) );
+ statement.executeUpdate();
+ }
+ }
+ }
+ connection.commit();
+ upperStatement.close();
+ mixedStatement.close();
+ lowerStatement.close();
+ tuiStatement.close();
+ prefTextStatement.close();
+ for ( PreparedStatement codeStatement : codeStatements.values() ) {
+ codeStatement.close();
+ }
+
+ connection.commit();
+ final Statement shutdownStatement = connection.createStatement();
+ shutdownStatement.execute( "SHUTDOWN" );
+ shutdownStatement.close();
+ connection.commit();
+ connection.close();
+ } catch ( SQLException sqlE ) {
+ LOGGER.error( sqlE.getMessage() );
+ return false;
+ }
+ return true;
+ }
+
+
+ static private String fixVocabName( final String vocabulary ) {
+ return vocabulary.toUpperCase().replace( '.', '_' ).replace( '-', '_' );
+ }
+
+ static private Map<String, String> createCodeInsertSqls( final Collection<String> writtenSchema ) {
+ return writtenSchema.stream().map( HsqlWriter::fixVocabName )
+ .collect( Collectors.toMap( Function.identity(), HsqlWriter::createCodeInsertSql ) );
+ }
+
+ static public String createCodeInsertSql( final String vocabulary ) {
+ return JdbcUtil.createRowInsertSql( vocabulary, "CUI", vocabulary );
+ }
+
+ static private Map<String, PreparedStatement> createCodeStatements( final Connection connection,
+ final Map<String, String> insertCodeSqls )
+ throws SQLException {
+ final Map<String, PreparedStatement> codeStatements = new HashMap<>( insertCodeSqls.size() );
+ for ( Map.Entry<String, String> codeSql : insertCodeSqls.entrySet() ) {
+ codeStatements.put( codeSql.getKey(), connection.prepareStatement( codeSql.getValue() ) );
+ }
+ return codeStatements;
+ }
+
+ static private void setCodeAppropriately( final PreparedStatement statement, final String code,
+ final Class<?> type ) throws SQLException {
+ if ( String.class.equals( type ) ) {
+ statement.setString( 2, code );
+ } else if ( Double.class.equals( type ) ) {
+ statement.setDouble( 2, Double.valueOf( code ) );
+ } else if ( Long.class.equals( type ) ) {
+ statement.setLong( 2, Long.valueOf( code ) );
+ } else if ( Integer.class.equals( type ) ) {
+ statement.setInt( 2, Integer.valueOf( code ) );
+ } else {
+ LOGGER.error( "Could not set code for " + type.getName() );
+ statement.setString( 2, code );
+ }
+ }
+
+
+ static private boolean createDatabase( final Connection connection, final Collection<String> writtenSchema ) {
+ try {
+ // main tables
+ createSynonymTable( connection, "UPPER" );
+ createSynonymTable( connection, "MIXED" );
+ createSynonymTable( connection, "LOWER" );
+ // tui table
+ createTable( connection, "TUI", "CUI BIGINT", "TUI INTEGER" );
+ createIndex( connection, "TUI", "CUI" );
+ // preferred text table
+ createTable( connection, "PREFERRED_TEXT", "CUI BIGINT", "PREFERRED_TEXT VARCHAR(255)" );
+ createIndex( connection, "PREFERRED_TEXT", "CUI" );
+
+ // schema codes tables
+ for ( String vocabulary : writtenSchema ) {
+ final String jdbcClass = VocabularyStore.getInstance().getJdbcClass( vocabulary );
+ final String tableName = fixVocabName( vocabulary );
+ createTable( connection, tableName, "CUI BIGINT", tableName + " " + jdbcClass );
+ createIndex( connection, tableName, "CUI" );
+ }
+
+ executeStatement( connection, "SET WRITE_DELAY 10" );
+ } catch ( SQLException sqlE ) {
+ LOGGER.error( sqlE.getMessage() );
+ return false;
+ }
+ return true;
+ }
+
+ static private void createSynonymTable( final Connection connection, final String tableName ) throws SQLException {
+ createTable( connection, tableName,
+ "CUI BIGINT",
+ "PREFIX VARCHAR(78)",
+ "INDEX_WORD VARCHAR(48)",
+ "SUFFIX VARCHAR(78)",
+ "RANK INTEGER",
+ "INSTANCES INTEGER" );
+ createIndex( connection, tableName, "INDEX_WORD" );
+ }
+
+ static private void createTable( final Connection connection, final String tableName, final String... fieldNames )
+ throws SQLException {
+ final String fields = String.join( ",", fieldNames );
+ final String creator = "CREATE MEMORY TABLE " + tableName + "(" + fields + ")";
+ executeStatement( connection, creator );
+ }
+
+ static private void createIndex( final Connection connection, final String tableName,
+ final String indexField ) throws SQLException {
+ final String indexer = "CREATE INDEX IDX_" + tableName + " ON " + tableName + "(" + indexField + ")";
+ executeStatement( connection, indexer );
+ }
+
+ static private void executeStatement( final Connection connection, final String command ) throws SQLException {
+ final Statement statement = connection.createStatement();
+ statement.execute( command );
+ statement.close();
+ }
+
+
+ static private final Pattern SPACE_PATTERN = Pattern.compile( "\\s+" );
+
+ static private Map<String, Long> getUpperTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+ return cuiTerms.stream()
+ .map( CuiTerm::getUpperOnly )
+ .flatMap( Collection::stream )
+ .map( SPACE_PATTERN::split )
+ .flatMap( Arrays::stream )
+ .filter( RareWordUtil::isRarableToken )
+ .collect( Collectors.groupingBy( Function.identity(), Collectors.counting() ) );
+ }
+
+ static private Map<String, Long> getMixedTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+ return cuiTerms.stream()
+ .map( CuiTerm::getMixedOnly )
+ .flatMap( Collection::stream )
+ .map( SPACE_PATTERN::split )
+ .flatMap( Arrays::stream )
+ .filter( RareWordUtil::isRarableToken )
+ .collect( Collectors.groupingBy( Function.identity(), Collectors.counting() ) );
+ }
+
+ static private Map<String, Long> getLowerTokenCounts( final Collection<CuiTerm> cuiTerms ) {
+ return cuiTerms.stream()
+ .map( CuiTerm::getLowerOnly )
+ .flatMap( Collection::stream )
+ .map( SPACE_PATTERN::split )
+ .flatMap( Arrays::stream )
+ .filter( RareWordUtil::isRarableToken )
+ .collect( Collectors.groupingBy( Function.identity(), Collectors.counting() ) );
+ }
+
+
+ static private String getPrefix( final String text, final String indexedRareWord ) {
+ if ( text.equals( indexedRareWord ) ) {
+ return "";
+ }
+ if ( text.startsWith( indexedRareWord ) ) {
+ return "";
+ }
+ return text.substring( 0, text.indexOf( indexedRareWord ) ).trim();
+ }
+
+ static private String getSuffix( final String text, final String indexedRareWord ) {
+ if ( text.equals( indexedRareWord ) ) {
+ return "";
+ }
+ if ( text.endsWith( indexedRareWord ) ) {
+ return "";
+ }
+ return text.substring( text.indexOf( indexedRareWord ) + indexedRareWord.length() ).trim();
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Ranks.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,52 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public enum Ranks {
+ INSTANCE;
+
+ static public Ranks getInstance() {
+ return INSTANCE;
+ }
+
+ private final Map<String, Integer> _ranks = new HashMap<>();
+ private List<String> _rankList;
+
+
+ public void setUmlsRank( final String vocabulary, final String tty, final int rank ) {
+ _ranks.put( getRankCode( vocabulary, tty ), rank );
+ }
+
+// public int getRank( final String vocabulary, final String tty ) {
+// return _ranks.getOrDefault( getCode( vocabulary, tty ), -1 );
+// }
+
+ public int getRank( final String vocabulary, final String tty ) {
+ return getCodeRank( getRankCode( vocabulary, tty ) );
+ }
+
+ public int getCodeRank( final String rankCode ) {
+ if ( _rankList == null ) {
+ _rankList = _ranks.entrySet()
+ .stream()
+ .sorted( Comparator.comparingInt( Map.Entry::getValue ) )
+ .map( Map.Entry::getKey )
+ .collect( Collectors.toList() );
+ }
+ return _rankList.size() - _rankList.indexOf( rankCode );
+ }
+
+ static public String getRankCode( final String vocabulary, final String tty ) {
+ return vocabulary + "_" + tty;
+ }
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/Synonym.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,32 @@
+package org.apache.ctakes.gui.dictionary.cased;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+public enum Synonym {
+ CUI( 1, Long.class ),
+ PREFIX( 2, String.class ),
+ INDEX_WORD( 3, String.class ),
+ SUFFIX( 4, String.class ),
+ RANK( 5, Integer.class ),
+ INSTANCES( 6, Integer.class );
+
+ final private int _column;
+ final private Class<?> _class;
+
+ Synonym( final int column, final Class<?> clazz ) {
+ _column = column;
+ _class = clazz;
+ }
+
+ public int getColumn() {
+ return _column;
+ }
+
+ public Class<?> getClassType() {
+ return _class;
+ }
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/SemanticTuiModel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,143 @@
+package org.apache.ctakes.gui.dictionary.cased.table;
+
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+
+import javax.swing.event.EventListenerList;
+import javax.swing.event.TableModelListener;
+import javax.swing.table.TableModel;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+
+import static org.apache.ctakes.core.util.annotation.SemanticTui.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/27/2020
+ */
+public class SemanticTuiModel implements TableModel {
+
+ static private final String[] COLUMN_NAMES = { "Use TUI", "TUI", "Semantic Type", "Semantic Group" };
+ static private final Class<?>[] COLUMN_CLASSES = { Boolean.class, String.class, String.class, String.class };
+
+ static private final Collection<SemanticTui> UNWANTED_TUIS
+ = EnumSet.of( T116, T087, T123, T118, T026, T043, T025, T103, T120, T104, T077, T049, T088, T065, T196,
+ T050, T018, T126, T168, T045, T028, T125, T078, T129, T055, T197, T170, T130, T119, T063,
+ T066, T041, T073, T044, T085, T114, T124, T086, T115, T109, T040, T042, T046, T039,
+ T192, T062, T075, T054, T056, T169, T185, T058, T033, UNKNOWN );
+
+ private final EventListenerList _listenerList = new EventListenerList();
+ private final Collection<SemanticTui> _wantedTuis = EnumSet.noneOf( SemanticTui.class );
+
+ public SemanticTuiModel() {
+ final Collection<SemanticGroup> wantedGroups
+ = EnumSet.of( SemanticGroup.ANATOMY,
+ SemanticGroup.DISORDER,
+ SemanticGroup.FINDING,
+ SemanticGroup.DEVICE,
+ SemanticGroup.PROCEDURE,
+ SemanticGroup.DRUG );
+ Arrays.stream( SemanticTui.values() )
+ .filter( t -> !UNWANTED_TUIS.contains( t ) )
+ .filter( t -> wantedGroups.contains( t.getGroup() ) )
+ .forEach( _wantedTuis::add );
+ }
+
+ public Collection<SemanticTui> getWantedTuis() {
+ return _wantedTuis;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getRowCount() {
+ return SemanticTui.values().length;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getColumnCount() {
+ return 4;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public String getColumnName( final int columnIndex ) {
+ return COLUMN_NAMES[ columnIndex ];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public Class<?> getColumnClass( final int columnIndex ) {
+ return COLUMN_CLASSES[ columnIndex ];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isCellEditable( final int rowIndex, final int columnIndex ) {
+ return columnIndex == 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public Object getValueAt( final int rowIndex, final int columnIndex ) {
+ final SemanticTui tui = SemanticTui.values()[ rowIndex ];
+ switch ( columnIndex ) {
+ case 0:
+ return _wantedTuis.contains( tui );
+ case 1:
+ return tui.name();
+ case 2:
+ return tui.getSemanticType();
+ case 3:
+ return tui.getGroupName();
+ }
+ return "ERROR";
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void setValueAt( final Object aValue, final int rowIndex, final int columnIndex ) {
+ if ( aValue instanceof Boolean && columnIndex == 0 ) {
+ final SemanticTui tui = SemanticTui.values()[ rowIndex ];
+ if ( (Boolean)aValue ) {
+ _wantedTuis.add( tui );
+ } else {
+ _wantedTuis.remove( tui );
+ }
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void addTableModelListener( final TableModelListener listener ) {
+ _listenerList.add( TableModelListener.class, listener );
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void removeTableModelListener( final TableModelListener listener ) {
+ _listenerList.remove( TableModelListener.class, listener );
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/table/TextTypeModel.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,124 @@
+package org.apache.ctakes.gui.dictionary.cased.table;
+
+
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+
+import javax.swing.event.EventListenerList;
+import javax.swing.event.TableModelListener;
+import javax.swing.table.TableModel;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/27/2020
+ */
+public class TextTypeModel implements TableModel {
+
+
+ static private final String[] COLUMN_NAMES = { "Use Type", "TTY", "Text Type" };
+ static private final Class<?>[] COLUMN_CLASSES = { Boolean.class, String.class, String.class };
+
+ private final EventListenerList _listenerList = new EventListenerList();
+ private final Collection<Tty> _wantedTypes = EnumSet.noneOf( Tty.class );
+
+ public TextTypeModel() {
+ Arrays.stream( Tty.values() ).filter( Tty::collect ).forEach( _wantedTypes::add );
+ }
+
+
+ public Collection<Tty> getWantedTypes() {
+ return _wantedTypes;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getRowCount() {
+ return Tty.values().length;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int getColumnCount() {
+ return 3;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public String getColumnName( final int columnIndex ) {
+ return COLUMN_NAMES[ columnIndex ];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public Class<?> getColumnClass( final int columnIndex ) {
+ return COLUMN_CLASSES[ columnIndex ];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean isCellEditable( final int rowIndex, final int columnIndex ) {
+ return columnIndex == 0;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public Object getValueAt( final int rowIndex, final int columnIndex ) {
+ final Tty type = Tty.values()[ rowIndex ];
+ switch ( columnIndex ) {
+ case 0:
+ return _wantedTypes.contains( type );
+ case 1:
+ return type.name();
+ case 2:
+ return type.getDescription();
+ }
+ return "ERROR";
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void setValueAt( final Object aValue, final int rowIndex, final int columnIndex ) {
+ if ( aValue instanceof Boolean && columnIndex == 0 ) {
+ final Tty type = Tty.values()[ rowIndex ];
+ if ( (Boolean)aValue ) {
+ _wantedTypes.add( type );
+ } else {
+ _wantedTypes.remove( type );
+ }
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void addTableModelListener( final TableModelListener listener ) {
+ _listenerList.add( TableModelListener.class, listener );
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void removeTableModelListener( final TableModelListener listener ) {
+ _listenerList.remove( TableModelListener.class, listener );
+ }
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/ConsoLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,159 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.gui.dictionary.cased.umls.abbreviation.Lat;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.MrConso;
+import org.apache.ctakes.gui.dictionary.cased.umls.file.Tty;
+
+import static org.apache.ctakes.gui.dictionary.cased.umls.file.MrConso.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/21/2019
+ */
+final public class ConsoLine implements TermLine {
+ final private String[] _columns;
+
+ public ConsoLine( final String line ) {
+ _columns = StringUtil.fastSplit( line, '|' );
+ }
+
+ private boolean isTextOk() {
+ return isWantedLat() && hasNoSpecialChars();
+ }
+
+ public boolean collect() {
+ return isTextOk()
+// && hasWantedSynonyms()
+ && Tty.collect( column( TTY ) );
+ }
+
+ public boolean isUnwantedDrug() {
+ final String text = getTokenizedText();
+ if ( text.contains( " in " ) && text.endsWith( " dosage form" ) ) {
+ return true;
+ }
+ if ( text.endsWith( " oral tablet" ) || text.endsWith( " oral capsule" )
+ || text.endsWith( "ml vial" ) || text.endsWith( "ml injection" ) ) {
+ return true;
+ }
+// if ( UmlsSource.getSource( column( SAB ) ) != UmlsSource.RXNORM ) {
+// return false;
+// }
+ return text.contains( " " ) && !Tty.keep( column( TTY ) );
+ }
+
+ public boolean isObsolete() {
+ final Tty tty = Tty.getType( column( TTY ) );
+ return tty == Tty.OAP || tty == Tty.OF;
+ }
+
+ // public String getCui() {
+// return CuiUtil.getCui( column( CUI ) );
+// }
+ public String getCui() {
+ return column( CUI );
+ }
+
+ public String getText() {
+ return column( STR );
+ }
+
+ public int getPrefScore() {
+ final String text = getText();
+ if ( text.length() < 3 ) {
+ return 1;
+ }
+ if ( text.chars().filter( Character::isAlphabetic ).count() < 3 ) {
+ return 1;
+ }
+
+ int score = 1;
+ score = upScore( TS, "P", score, 2 );
+
+// score = upScore( SAB, UmlsSource.NCI.getName(), score, 3 );
+// score = upScore( SAB, UmlsSource.FMA.getName(), score, 2 );
+// score = upScore( SAB, UmlsSource.SNOMEDCT_US.getName(), score, 2 );
+// score = upScore( SAB, UmlsSource.MTH.getName(), score, 2 );
+// score = upScore( SAB, UmlsSource.NCI_MTH.getName(), score, 3 );
+
+ score = upScore( STT, "PF", score, 2 );
+ score = upScore( STT, "VC", score, 2 );
+ score = upScore( STT, "VO", score, 2 );
+// score = upScore( ISPREF, "Y", score ); // It usually looks reversed.
+// score = upScore( ISPREF, "N", score, 2 );
+
+ score = upScore( TTY, "PT", score, 3 );
+ score = upScore( TTY, "PN", score, 3 );
+ score = upScore( TTY, "RXN_PT", score, 2 );
+ score = upScore( TTY, "DN", score, 2 );
+
+ if ( text.startsWith( "Entire " )
+ || text.startsWith( "Structure of " )
+ || text.endsWith( " structure" )
+ || text.endsWith( ")" )
+ || text.endsWith( "]" )
+ || text.endsWith( " NOS" )
+// || text.contains( "-" )
+ || text.contains( " or " )
+ || !Character.isLetter( text.charAt( 0 ) ) ) {
+ return score / 3;
+ }
+ if ( text.equals( text.toUpperCase() ) ) {
+ // Should also work for numbers.
+ return score / 2;
+ }
+ // Prefer fewer-word terms, but only slightly
+ final long spaces = text.chars().filter( Character::isWhitespace ).count();
+ return (int)Math.max( 1, score - spaces );
+ }
+
+ public String getSource() {
+ return column( SAB );
+ }
+
+ public String getCode() {
+ return column( CODE );
+ }
+
+ private String column( final MrConso conso ) {
+ return _columns[ conso.ordinal() ];
+ }
+
+ private int upScore( final MrConso column, final String wanted, final int score, final int weight ) {
+ if ( column( column ).equals( wanted ) ) {
+ return weight * score;
+ }
+ return score;
+ }
+
+ public boolean isWantedLat() {
+ return column( LAT ).equals( Lat.ENG.name() );
+ }
+
+// private boolean hasWantedSynonyms() {
+// return WantedSource.hasWantedSynonyms( column( SAB ) );
+// }
+
+
+ private boolean hasNoSpecialChars() {
+ final String text = getText();
+ // strips off all non-ASCII characters
+ String txt = text.replaceAll( "[^\\x00-\\x7F]", "" );
+ // erases all the ASCII control characters
+ txt = txt.replaceAll( "[\\p{Cntrl}&&[^\r\n\t]]", "" );
+ // removes non-printable characters from Unicode
+ txt = txt.replaceAll( "\\p{C}", "" );
+ return text.equals( txt );
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public String toString() {
+ return String.join( " | ", _columns );
+ }
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CuiTerm.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,342 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.gui.dictionary.cased.Ranks;
+import org.apache.ctakes.gui.dictionary.umls.VocabularyStore;
+import org.apache.ctakes.gui.dictionary.util.TextTokenizer;
+
+import java.util.*;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+@Immutable
+final public class CuiTerm {
+
+ static private final int MIN_SYNONYM_LENGTH = 3;
+ static private final int MAX_SYNONYM_LENGTH = 79;
+ static private final int MAX_SYNONYM_TOKENS = 5;
+
+ private final long _cuiCode;
+
+ private final Collection<SemanticTui> _semanticTuis = EnumSet.noneOf( SemanticTui.class );
+
+ private final Map<String, Collection<String>> _tokenizedVocabTuis = new HashMap<>();
+ private final Collection<ScoredText> _textScores = new HashSet<>();
+ private final Map<String, Collection<String>> _schemaCodes = new HashMap<>();
+
+
+   /**
+    * Creates a term that starts with no tuis, synonyms or schema codes.
+    *
+    * @param cuiCode numeric cui code identifying this term.
+    */
+   public CuiTerm( final long cuiCode ) {
+      this._cuiCode = cuiCode;
+   }
+
+// public void addTui( final String tui ) {
+// addTui( SemanticTui.getTuiFromCode( tui ) );
+// }
+
+   /**
+    * Adds a semantic type to this term.  The tuis are held in an enum set, so a repeated tui has no effect.
+    *
+    * @param semanticTui semantic type to associate with this term.
+    */
+   public void addTui( final SemanticTui semanticTui ) {
+      this._semanticTuis.add( semanticTui );
+   }
+
+   /**
+    * Records a code under a source vocabulary for this term.
+    * The vocabulary store is only notified the first time this term sees the given sab / code pair.
+    *
+    * @param sab  name of the source vocabulary.
+    * @param code code within that vocabulary.
+    */
+   public void addSchemaCode( final String sab, final String code ) {
+      final Collection<String> codes = _schemaCodes.computeIfAbsent( sab, c -> new HashSet<>() );
+      final boolean isNewCode = codes.add( code );
+      if ( !isNewCode ) {
+         return;
+      }
+      VocabularyStore.getInstance().addVocabulary( sab, code );
+   }
+
+   /**
+    * @return the map of source vocabulary name to codes kept by this term.  It is the internal map, not a copy.
+    */
+   public Map<String, Collection<String>> getSchemaCodes() {
+      return this._schemaCodes;
+   }
+
+
+   /**
+    * Adds a synonym text to this term.
+    * The raw text is always recorded as a candidate for the preferred text.
+    * Its tokenized form is only stored as a dictionary synonym when it passes {@code isDictionaryable}.
+    *
+    * @param text raw synonym text.
+    * @param sab  source vocabulary of the synonym, used to build its rank code.
+    * @param tuis semantic types of the term, used to decide whether capitalization is kept.
+    * @param ts   TS value of the synonym, used for preferred text scoring.
+    * @param stt  STT value of the synonym, used for preferred text scoring.
+    * @param tty  TTY value of the synonym, used for preferred text scoring and the rank code.
+    */
+   public void addSynonym( final String text,
+                           final String sab,
+                           final Collection<SemanticTui> tuis,
+                           final String ts,
+                           final String stt,
+                           final String tty ) {
+      final ScoredText scoredText = new ScoredText( text, ts, stt, tty );
+      _textScores.add( scoredText );
+      final String stripped = stripForm( TextTokenizer.getTokenizedText( text ) );
+      if ( isDictionaryable( stripped ) ) {
+         final String rankCode = Ranks.getRankCode( sab, tty );
+         final String synonym = maybeUncap( stripped, tuis );
+         _tokenizedVocabTuis.computeIfAbsent( synonym, s -> new HashSet<>() ).add( rankCode );
+      }
+   }
+
+
+   /**
+    * @param tokenized tokenized synonym text.
+    * @return an empty string if the text contains "_ _ _", otherwise the given text.
+    */
+   static private String stripForm( final String tokenized ) {
+      if ( tokenized.contains( "_ _ _" ) ) {
+         return "";
+      }
+      return tokenized;
+   }
+
+ static private String replaceEnd( final String text, final String end ) {
+ return text.toLowerCase().endsWith( end ) ? text.substring( 0, text.length() - end.length() ).trim() : text;
+ }
+
+ static private String replaceBegin( final String text, final String begin ) {
+ return text.toLowerCase().startsWith( begin ) ? text.substring( begin.length() ).trim() : text;
+ }
+
+   /**
+    * Rejects text that is too short, too long, has too many space-separated tokens, has no alphabetic
+    * character, or is of minimum length and starts with an open parenthesis.
+    *
+    * @param tokenized tokenized synonym text.
+    * @return true if none of the rejection rules apply.
+    */
+   static private boolean isTextValid( final String tokenized ) {
+      final int length = tokenized.length();
+      if ( length < MIN_SYNONYM_LENGTH || length > MAX_SYNONYM_LENGTH ) {
+         return false;
+      }
+      if ( StringUtil.fastSplit( tokenized, ' ' ).length > MAX_SYNONYM_TOKENS ) {
+         return false;
+      }
+      // Check for auto-created note form
+//      if ( StringUtil.fastSplit( tokenized, '@' ).length > 2 ) { return false; }
+      if ( tokenized.chars().noneMatch( Character::isAlphabetic ) ) {
+         return false;
+      }
+      final boolean shortParenthetical = length == MIN_SYNONYM_LENGTH && tokenized.charAt( 0 ) == '(';
+      return !shortParenthetical;
+   }
+
+
+   /**
+    * Decides whether tokenized text may be stored as a dictionary synonym.
+    * Text is refused when its length or space-separated token count is outside the class limits,
+    * when it starts with "[", ends with ")" or "]", or contains any of # @ &amp; ; or a double quote.
+    *
+    * @param tokenized tokenized synonym text.
+    * @return true if the text passed every check.
+    */
+   static private boolean isDictionaryable( final String tokenized ) {
+      final int length = tokenized.length();
+      if ( length < MIN_SYNONYM_LENGTH || length > MAX_SYNONYM_LENGTH ) {
+         return false;
+      }
+      if ( StringUtil.fastSplit( tokenized, ' ' ).length > MAX_SYNONYM_TOKENS ) {
+         return false;
+      }
+      if ( tokenized.startsWith( "[" ) || tokenized.endsWith( ")" ) || tokenized.endsWith( "]" ) ) {
+         return false;
+      }
+      final String[] garbage = { "#", "@", "&", ";", "\"" };
+      for ( String unwanted : garbage ) {
+         if ( tokenized.contains( unwanted ) ) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * @return umls cui for the term, as the numeric code given at construction.
+    */
+   public long getCuiCode() {
+      return this._cuiCode;
+   }
+
+   /**
+    * @return the distinct codes of the semantic types added to this term, in a new set.
+    */
+   public Collection<Integer> getTuis() {
+      final Collection<Integer> codes = new HashSet<>();
+      for ( SemanticTui semanticTui : _semanticTuis ) {
+         codes.add( semanticTui.getCode() );
+      }
+      return codes;
+   }
+
+   /**
+    * @return the key set of the synonym map, which holds every tokenized synonym stored for this term.
+    */
+   private Collection<String> getTokenizedSynonyms() {
+      return this._tokenizedVocabTuis.keySet();
+   }
+
+ static private final Predicate<String> onlyCapped
+ = t -> t.substring( 1 ).equals( t.substring( 1 ).toLowerCase() );
+
+
+ static private final Collection<String> UNITS = new HashSet<>( Arrays.asList(
+ "MG", "MG/MG", "ML", "mL", "MG/ML", "mg/mL", "ML/ML", "GM", "MCG", "MCG/ML", "mcg/mL", "BAU/ML",
+ "MEQ", "MEQ/ML", "UNT", "UNT/MG", "UNT/ML", "unt/mL", "UNT/GM", "MG/ACTUAT", "MG/HR" ) );
+
+   /**
+    * @param text a single word.
+    * @return the lower-cased word if it is listed in {@code UNITS}, otherwise the given word.
+    */
+   static private String uncapUnits( final String text ) {
+      if ( UNITS.contains( text ) ) {
+         return text.toLowerCase();
+      }
+      return text;
+   }
+
+   /**
+    * Lower-cases a word made of leading digits directly followed by a unit listed in {@code UNITS}.
+    *
+    * @param text a single word.
+    * @return the lower-cased word if it is one or more digits followed by a listed unit, otherwise the given word.
+    */
+   static private String uncapNumUnits( final String text ) {
+      final int length = text.length();
+      int digitCount = 0;
+      while ( digitCount < length && Character.isDigit( text.charAt( digitCount ) ) ) {
+         digitCount++;
+      }
+      // Need at least one leading digit and at least one character after the digits.
+      if ( digitCount == 0 || digitCount >= length ) {
+         return text;
+      }
+      final String unit = text.substring( digitCount );
+      if ( UNITS.contains( unit ) ) {
+         return text.toLowerCase();
+      }
+      return text;
+   }
+
+ static private final Collection<String> OTHERS = new HashSet<>( Arrays.asList( "NOS", "USP", "(USP)" ) );
+
+   /**
+    * @param text a single word.
+    * @return the lower-cased word if it is listed in {@code OTHERS}, otherwise the given word.
+    */
+   static private String uncapOther( final String text ) {
+      if ( OTHERS.contains( text ) ) {
+         return text.toLowerCase();
+      }
+      return text;
+   }
+
+   /**
+    * Lower-cases the first character of a word and leaves the rest untouched.
+    *
+    * @param text a single word; may be empty.
+    * @return the word with its first character in lower case, or the given text if it is empty.
+    */
+   static private String uncapitalize( final String text ) {
+      // An empty word has no first character; substring( 0, 1 ) would throw on it.
+      if ( text.isEmpty() ) {
+         return text;
+      }
+      // For a one-character word substring( 1 ) is empty, so no separate branch is needed.
+      return text.substring( 0, 1 ).toLowerCase() + text.substring( 1 );
+   }
+
+ static private final Collection<SemanticGroup> keepSingleCapTuis
+ = EnumSet.of( SemanticGroup.DEVICE, SemanticGroup.TITLE, SemanticGroup.DRUG );
+
+   /**
+    * Lower-cases tokenized text when its only capitals come from listed words, units, or word-initial letters.
+    * First, words listed in {@code OTHERS} or {@code UNITS}, and numbers followed by a listed unit,
+    * are lower-cased.  If that yields the fully lower-cased text, the lower-cased text is returned.
+    * Otherwise the first letter of each word is lower-cased and compared the same way.
+    * That second step is skipped for a single word when any tui belongs to the DRUG group.
+    * The {@code keepSingleCapTuis} collection is not consulted here.
+    *
+    * @param tokenized tokenized synonym text.
+    * @param tuis      semantic types of the term owning the synonym.
+    * @return the lower-cased text if one of the two steps produced it, otherwise the given text.
+    */
+   static private String maybeUncap( final String tokenized, final Collection<SemanticTui> tuis ) {
+      final String lowered = tokenized.toLowerCase();
+      final String knownUncapped = Arrays.stream( StringUtil.fastSplit( tokenized, ' ' ) )
+                                         .map( CuiTerm::uncapOther )
+                                         .map( CuiTerm::uncapUnits )
+                                         .map( CuiTerm::uncapNumUnits )
+                                         .collect( Collectors.joining( " " ) );
+      if ( knownUncapped.equals( lowered ) ) {
+         return lowered;
+      }
+      final String[] words = StringUtil.fastSplit( knownUncapped, ' ' );
+      final boolean hasDrugTui = tuis.stream()
+                                     .map( SemanticTui::getGroup )
+                                     .anyMatch( SemanticGroup.DRUG::equals );
+      if ( words.length <= 1 && hasDrugTui ) {
+         // A lone word of a drug term keeps its capitalization.
+         return tokenized;
+      }
+      final String firstsUncapped = Arrays.stream( words )
+                                          .map( CuiTerm::uncapitalize )
+                                          .collect( Collectors.joining( " " ) );
+      return firstsUncapped.equals( lowered ) ? lowered : tokenized;
+   }
+
+
+   /**
+    * @return the stored synonyms that have no lower-case character and whose lower-cased form is neither
+    * a lower-only synonym nor the lower-cased form of a mixed-case synonym.
+    */
+   public Collection<String> getUpperOnly() {
+      final Collection<String> lowerOnly = getLowerOnly();
+      final Collection<String> lowerMixed = new HashSet<>();
+      for ( String mixed : getMixedOnly() ) {
+         lowerMixed.add( mixed.toLowerCase() );
+      }
+      final Collection<String> upperOnly = new HashSet<>();
+      for ( String synonym : getTokenizedSynonyms() ) {
+         if ( synonym.chars().anyMatch( Character::isLowerCase ) ) {
+            continue;
+         }
+         final String lowered = synonym.toLowerCase();
+         if ( lowerOnly.contains( lowered ) || lowerMixed.contains( lowered ) ) {
+            continue;
+         }
+         upperOnly.add( synonym );
+      }
+      return upperOnly;
+   }
+
+   /**
+    * @return the stored synonyms that have both an upper-case and a lower-case character and whose
+    * lower-cased form is not itself a lower-only synonym.
+    */
+   public Collection<String> getMixedOnly() {
+      final Collection<String> lowerOnly = getLowerOnly();
+      final Collection<String> mixedOnly = new HashSet<>();
+      for ( String synonym : getTokenizedSynonyms() ) {
+         final boolean hasUpper = synonym.chars().anyMatch( Character::isUpperCase );
+         final boolean hasLower = synonym.chars().anyMatch( Character::isLowerCase );
+         if ( hasUpper && hasLower && !lowerOnly.contains( synonym.toLowerCase() ) ) {
+            mixedOnly.add( synonym );
+         }
+      }
+      return mixedOnly;
+   }
+
+   /**
+    * @return the stored synonyms that have no upper-case character, in a new set.
+    */
+   public Collection<String> getLowerOnly() {
+      final Collection<String> lowerOnly = new HashSet<>( getTokenizedSynonyms() );
+      lowerOnly.removeIf( t -> t.chars().anyMatch( Character::isUpperCase ) );
+      return lowerOnly;
+   }
+
+
+   /**
+    * @return the raw synonym text ranked highest by {@code prefScorer}, or an empty string if no synonym was added.
+    */
+   public String getPreferredText() {
+      final Optional<ScoredText> best = _textScores.stream().max( prefScorer );
+      if ( !best.isPresent() ) {
+         return "";
+      }
+      return best.get().getText();
+   }
+
+   /**
+    * @param text a stored synonym, in the form in which it was stored.
+    * @return the number of distinct rank codes recorded for the synonym, or 0 if the synonym is unknown.
+    */
+   public int getInstances( final String text ) {
+      final Collection<String> rankCodes = _tokenizedVocabTuis.get( text );
+      return rankCodes == null ? 0 : rankCodes.size();
+   }
+
+   /**
+    * @param text a stored synonym, in the form in which it was stored.
+    * @return the lowest code rank among the rank codes recorded for the synonym, or 0 if there are none.
+    */
+   public int getRank( final String text ) {
+      final Collection<String> rankCodes = _tokenizedVocabTuis.getOrDefault( text, Collections.emptyList() );
+      final Ranks ranks = Ranks.getInstance();
+      final OptionalInt lowest = rankCodes.stream()
+                                          .mapToInt( ranks::getCodeRank )
+                                          .min();
+      return lowest.isPresent() ? lowest.getAsInt() : 0;
+   }
+
+
+   /**
+    * A raw synonym text together with the integer scores used to choose the preferred text of a term.
+    * All scores are computed once, at construction.
+    */
+   static private final class ScoredText {
+      static private final Collection<String> GOOD_STT = Arrays.asList( "PF", "VC", "VO" );
+      static private final Collection<String> GREAT_TTY = Arrays.asList( "PT", "PN" );
+      static private final Collection<String> GOOD_TTY = Arrays.asList( "RXN_PT", "DN" );
+
+      private final String _text;
+      private final int _tsScore;
+      private final int _sttScore;
+      private final int _ttyScore;
+      private final int _lengthScore;
+      private final int _wordCountScore;
+      private final int _uppercaseScore;
+
+      /**
+       * @param text raw synonym text; may be empty.
+       * @param ts   TS value; "P" scores 2, anything else 1.
+       * @param stt  STT value; a value in {@code GOOD_STT} scores 2, anything else 1.
+       * @param tty  TTY value; a value in {@code GREAT_TTY} scores 3, in {@code GOOD_TTY} 2, anything else 1.
+       */
+      private ScoredText( final String text,
+                          final String ts,
+                          final String stt,
+                          final String tty ) {
+         _text = text;
+         _tsScore = ts.equals( "P" ) ? 2 : 1;
+         _sttScore = GOOD_STT.contains( stt ) ? 2 : 1;
+//      score = upScore( ISPREF, "Y", score );  // It usually looks reversed.
+//      score = upScore( ISPREF, "N", score, 2 );
+         if ( GREAT_TTY.contains( tty ) ) {
+            _ttyScore = 3;
+         } else if ( GOOD_TTY.contains( tty ) ) {
+            _ttyScore = 2;
+         } else {
+            _ttyScore = 1;
+         }
+         _lengthScore = text.length();
+         // Prefer fewer-word terms - this should be last in a comparison
+         _wordCountScore = 10 - StringUtil.fastSplit( text, ' ' ).length;
+         // Empty text has no first character; score it as not capitalized instead of throwing.
+         _uppercaseScore = !text.isEmpty() && Character.isUpperCase( text.charAt( 0 ) ) ? 1 : 0;
+      }
+
+      /**
+       * @return the raw synonym text.
+       */
+      public String getText() {
+         return _text;
+      }
+
+      /**
+       * @return 2 if the TS value was "P", otherwise 1.
+       */
+      public int getTsScore() {
+         return _tsScore;
+      }
+
+      /**
+       * @return 2 if the STT value was in {@code GOOD_STT}, otherwise 1.
+       */
+      public int getSttScore() {
+         return _sttScore;
+      }
+
+      /**
+       * @return 3, 2 or 1 depending on whether the TTY value was in {@code GREAT_TTY}, {@code GOOD_TTY} or neither.
+       */
+      public int getTtyScore() {
+         return _ttyScore;
+      }
+
+      /**
+       * @return the character length of the text.
+       */
+      public int getLengthScore() {
+         return _lengthScore;
+      }
+
+      /**
+       * @return 10 minus the number of space-separated pieces of the text.
+       */
+      public int getWordCountScore() {
+         return _wordCountScore;
+      }
+
+      /**
+       * @return 1 if the first character of the text is upper case, otherwise 0.
+       */
+      public int getUppercaseScore() {
+         return _uppercaseScore;
+      }
+   }
+
+
+   /**
+    * Orders scored texts so that the greatest element is the preferred text of a term.
+    * Keys are compared in this order: uppercase score, tty score, stt score, ts score, word count score.
+    * The text itself is the final key.  The scored texts are kept in a hash set of objects that do not
+    * override hashCode, so without a final key the choice among equally scored texts depended on
+    * iteration order.
+    */
+   static private final Comparator<ScoredText> prefScorer
+         = Comparator.comparingInt( ScoredText::getUppercaseScore )
+                     // int-specialized like the other keys, which avoids boxing each score
+                     .thenComparingInt( ScoredText::getTtyScore )
+                     .thenComparingInt( ScoredText::getSttScore )
+                     .thenComparingInt( ScoredText::getTsScore )
+                     .thenComparingInt( ScoredText::getWordCountScore )
+                     // deterministic tie-breaker
+                     .thenComparing( ScoredText::getText );
+
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return true if the given value is a {@code CuiTerm} with the same cui code as this term.
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      if ( !(value instanceof CuiTerm) ) {
+         return false;
+      }
+      final CuiTerm other = (CuiTerm)value;
+      return other.getCuiCode() == getCuiCode();
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return the hash of the cui code, consistent with {@link #equals(Object)}.
+    */
+   @Override
+   public int hashCode() {
+      return Long.hashCode( _cuiCode );
+   }
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/CustomTermLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,62 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+
+import org.apache.ctakes.core.util.StringUtil;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 1/8/2020
+ */
+final public class CustomTermLine implements TermLine {
+
+
+ final private String[] _columns;
+
+ public CustomTermLine( final String line ) {
+ _columns = StringUtil.fastSplit( line, '|' );
+ }
+
+ public String getCui() {
+ return _columns[ 0 ];
+ }
+
+ public String getText() {
+ return _columns[ 1 ];
+ }
+
+ public int getPrefScore() {
+ return _columns.length > 2 ? getPrefScore( _columns[ 2 ] ) : TermLine.getHalfScore();
+ }
+
+ public String getSource() {
+ return "CUSTOM";
+ }
+
+ public String getCode() {
+ return getCui();
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public String toString() {
+ return String.join( " | ", _columns );
+ }
+
+
+ private int getPrefScore( final String text ) {
+ if ( text.isEmpty() ) {
+ return TermLine.getHalfScore();
+ }
+ try {
+ final int parseInt = Integer.parseInt( text );
+ return Math.min( Math.max( 0, parseInt ), TermLine.getMaxScore() );
+ } catch ( NumberFormatException nfE ) {
+ return TermLine.getHalfScore();
+ }
+ }
+
+
+}
Added: ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java?rev=1881995&view=auto
==============================================================================
--- ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java (added)
+++ ctakes/trunk/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/cased/term/TermLine.java Fri Sep 25 01:04:47 2020
@@ -0,0 +1,51 @@
+package org.apache.ctakes.gui.dictionary.cased.term;
+
+import org.apache.ctakes.gui.dictionary.util.TextTokenizer;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 1/8/2020
+ */
+public interface TermLine {
+
+ String getCui();
+
+ String getText();
+
+ int getPrefScore();
+
+ String getSource();
+
+ String getCode();
+
+ default String getTokenizedText() {
+ return getTokenizedText( getText() );
+ }
+
+ static int getMaxScore() {
+ // Right now the max score is TS=P SAB=NCI STT=PF TTY=PN + 2 for custom term. = 50
+ return 2 * 3 * 2 * 4 + 2;
+ }
+
+ static int getHalfScore() {
+ return getMaxScore() / 2;
+ }
+
+ static String getTokenizedText( final String text ) {
+ String tokenized = TextTokenizer.getTokenizedText( text );
+ if ( tokenized.endsWith( " nos" ) ) {
+ tokenized = tokenized.substring( 0, tokenized.length() - 4 );
+ if ( tokenized.endsWith( " ," ) ) {
+ tokenized = tokenized.substring( 0, tokenized.length() - 2 );
+ }
+ }
+ if ( tokenized.startsWith( "[ x ] " ) ) {
+ tokenized = tokenized.substring( 6 );
+ } else if ( tokenized.startsWith( "[ d ] " ) ) {
+ tokenized = tokenized.substring( 6 );
+ }
+ return tokenized;
+ }
+
+}