You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2014/02/27 20:19:55 UTC
svn commit: r1572710 [2/2] - in /ctakes/sandbox/dictionarytool: ./ data/
data/default/ lib/ src/ src/META-INF/ src/org/ src/org/apache/
src/org/apache/ctakes/ src/org/apache/ctakes/dictionarytool/
src/org/apache/ctakes/dictionarytool/reader/ src/org/ap...
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,76 @@
+package org.apache.ctakes.dictionarytool.reader;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.RRF_INDEX;
+import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+/**
+ * Collects the formatted texts (term variants) for a set of wanted cuis from a bar-separated rrf file.
+ */
+final public class UmlsTextsForCuisReader {
+
+//   static private final String RRF_PATH = "C:/Spiffy/App/umls/2013AA/2013AA/META/MRCONSO.RRF";
+
+
+   private UmlsTextsForCuisReader() {
+   }
+
+   /**
+    * Reads every row of the rrf file and gathers the formatted texts of rows that belong to a wanted cui.
+    * Only rows that have a text column and whose language column is "ENG" are used.
+    * Progress is printed to standard out while reading.
+    *
+    * @param rrfPath      path to the bar-separated rrf file
+    * @param wantedCuis   cuis for which texts should be collected
+    * @param umlsTermUtil creates the formatted texts for each raw text in the file
+    * @return map of cui to its formatted texts; a cui for which no text was found has no entry
+    */
+   static public Map<String, Collection<String>> readTextsForCuis( final String rrfPath,
+                                                                   final Collection<String> wantedCuis,
+                                                                   final UmlsTermUtil umlsTermUtil ) {
+      System.out.println( "Compiling map of Umls Cuis and Texts" );
+      long lineCount = 0;
+      long textCount = 0;
+      final Map<String, Collection<String>> cuisAndText = new HashMap<String, Collection<String>>( wantedCuis.size() );
+      BufferedReader reader = null;
+      try {
+         reader = FileUtil.createReader( rrfPath );
+         // the single advance expression guarantees that every row is counted and reported exactly once
+         for ( List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
+               tokens != null;
+               tokens = FileUtil.readBsvTokens( reader, rrfPath ) ) {
+            lineCount++;
+            textCount += addTextsForRow( tokens, wantedCuis, umlsTermUtil, cuisAndText );
+            if ( lineCount % 2000 == 0 ) {
+               System.out.print( "." );
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "File Line " + lineCount + "\t Terms " + textCount );
+               }
+            }
+         }
+      } finally {
+         // close in finally so that the file handle is released even if a row causes a runtime exception
+         if ( reader != null ) {
+            try {
+               reader.close();
+            } catch ( IOException ioE ) {
+               System.err.println( ioE.getMessage() );
+            }
+         }
+      }
+      System.out.println( "File Line " + lineCount + "\t Terms " + textCount );
+      return cuisAndText;
+   }
+
+   /**
+    * Adds the formatted texts of a single row to the map if the row is english and belongs to a wanted cui.
+    *
+    * @param tokens       column values of one row
+    * @param wantedCuis   cuis for which texts should be collected
+    * @param umlsTermUtil creates the formatted texts for the row text
+    * @param cuisAndText  map of cui to texts, updated by this method
+    * @return number of texts that were newly added for the cui of the row
+    */
+   static private int addTextsForRow( final List<String> tokens,
+                                      final Collection<String> wantedCuis,
+                                      final UmlsTermUtil umlsTermUtil,
+                                      final Map<String, Collection<String>> cuisAndText ) {
+      if ( tokens.size() <= RRF_INDEX.TEXT._index || !"ENG".equals( tokens.get( RRF_INDEX.LANGUAGE._index ) ) ) {
+         return 0;
+      }
+      final String cui = CuiTuiUtil.getAsCui( tokens.get( RRF_INDEX.CUI._index ) );
+      if ( !wantedCuis.contains( cui ) ) {
+         return 0;
+      }
+      final Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( tokens.get( RRF_INDEX.TEXT._index ) );
+      if ( formattedTexts == null || formattedTexts.isEmpty() ) {
+         return 0;
+      }
+      Collection<String> textsForCui = cuisAndText.get( cui );
+      if ( textsForCui == null ) {
+         // own the stored collection: the one returned by umlsTermUtil must not be mutated by later rows
+         // NOTE(review): a set is used so that a text is counted once per cui - confirm that no caller needs order
+         textsForCui = new java.util.HashSet<String>();
+         cuisAndText.put( cui, textsForCui );
+      }
+      final int oldSize = textsForCui.size();
+      textsForCui.addAll( formattedTexts );
+      return textsForCui.size() - oldSize;
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,75 @@
+package org.apache.ctakes.dictionarytool.reader;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+/**
+ * Collects the tuis for a set of wanted cuis from a bar-separated cui-to-tui file.
+ */
+final public class UmlsTuisForCuisReader {
+
+   private UmlsTuisForCuisReader() {
+   }
+
+//   static private final String CUI_TUI_PATH = "C:/Spiffy/App/umls/2013AA/2013AA/META/MRSTY.RRF";
+
+   static private final int CUI_INDEX = 0;
+   static private final int TUI_INDEX = 1;
+
+   /**
+    * Reads every row of the file and gathers the tuis of rows whose cui is wanted.
+    * Wanted cuis for which no tui was found are reported on standard out.
+    * The given collection of cuis is only read, never modified.
+    *
+    * @param cuiTuiMapPath path to the bar-separated file with the cui in column 0 and the tui in column 1
+    * @param cuis          cuis for which tuis should be collected
+    * @return map of cui to its tuis; a cui for which no tui was found has no entry
+    */
+   static public Map<String, Collection<String>> readUmlsTuisForCuis( final String cuiTuiMapPath,
+                                                                      final Collection<String> cuis ) {
+      System.out.println( "Compiling list of Tuis for wanted Cuis using " + cuiTuiMapPath );
+      long lineCount = 0;
+      final Map<String, Collection<String>> cuisAndTuis = new HashMap<String, Collection<String>>( cuis.size() );
+      BufferedReader reader = null;
+      try {
+         reader = FileUtil.createReader( cuiTuiMapPath );
+         for ( List<String> tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
+               tokens != null;
+               tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath ) ) {
+            lineCount++;
+            if ( tokens.size() > TUI_INDEX ) {
+               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI_INDEX ) );
+               if ( cuis.contains( cui ) ) {
+                  Collection<String> tuis = cuisAndTuis.get( cui );
+                  if ( tuis == null ) {
+                     tuis = new HashSet<String>( 1 );
+                     cuisAndTuis.put( cui, tuis );
+                  }
+                  tuis.add( CuiTuiUtil.getAsTui( tokens.get( TUI_INDEX ) ) );
+               }
+            }
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount + "\t Cuis " + cuisAndTuis.size() );
+            }
+         }
+      } finally {
+         // close in finally so that the file handle is released even if a row causes a runtime exception
+         if ( reader != null ) {
+            try {
+               reader.close();
+            } catch ( IOException ioE ) {
+               System.err.println( ioE.getMessage() );
+            }
+         }
+      }
+      System.out.println( "File Lines " + lineCount + "\t Cuis " + cuisAndTuis.size() );
+      // Report missing cuis by lookup instead of cuis.removeAll(..): removing would strip the caller's
+      // collection of every cui that was found and fail outright on an unmodifiable collection
+      for ( String wantedCui : cuis ) {
+         if ( !cuisAndTuis.containsKey( wantedCui ) ) {
+            System.out.println( "Could not find Tuis for Cui " + wantedCui );
+         }
+      }
+      return cuisAndTuis;
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,152 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import net.jcip.annotations.NotThreadSafe;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 2/26/14
+ */
+/**
+ * Parses the command line arguments of the dictionary creator.
+ * Flags are kept in this object; keyed values are kept in the {@link Option} constants,
+ * so parsed values are shared by everything that uses the enum.
+ */
+@NotThreadSafe
+final public class CreatorProperties {
+
+   static private final String DEFAULT_DATA_DIR = "./data/default";
+   static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesAllTuis.txt";
+   static private final String DEFAULT_SOURCE_FILE = DEFAULT_DATA_DIR + "/CtakesSources.txt";
+
+
+   private boolean _rareWordIndex = true;
+
+   /**
+    * Parses the arguments, prints the help and exits the jvm if help was requested or if required
+    * arguments are missing, and assigns the default paths for options that were not given.
+    *
+    * @param args command line arguments
+    */
+   public CreatorProperties( final String ... args ) {
+      if ( args.length == 0 ) {
+         printHelp();
+         System.exit( 0 );
+      }
+      for ( String arg : args ) {
+         if ( arg.equalsIgnoreCase( "-fw" ) ) {
+            _rareWordIndex = false;
+         } else if ( arg.equals( "-?" ) || arg.equalsIgnoreCase( "-h" ) ) {
+            printHelp();
+            System.exit( 0 );
+         }
+      }
+      for ( Option option : Option.values() ) {
+         option.parseValue( args );
+      }
+      if ( !ensurePropertiesOk() ) {
+         printHelp();
+         System.exit( 1 );
+      }
+      assignCtakesDefaults();
+   }
+
+   /**
+    * Prints the usage text, including one line for each {@link Option}, to standard out.
+    */
+   static private void printHelp() {
+      System.out.println( "Dictionary Creator: Creates a flat file Cui|Text or Database Dictionary from UMLS and Orangebook" );
+      System.out.println( "Database Dictionary can be indexed by each Text's First Word or Rarest Word (for the dictionary)" );
+      System.out.println( "Minimal Usage: DictionaryCreator -umls pathToUmlsRoot -ol pathToFlatFileOutput" );
+      System.out.println( "" );
+      System.out.println( "-fw \t\tCreate First Word Index" );
+//      System.out.println( "-ct \t\tUse cTakes default setup (default)" );
+      for ( Option option : Option.values() ) {
+         System.out.println( option.getHelp() );
+      }
+      System.out.println( "The UMLS Root Directory must be specified" );
+      System.out.println( "One form of output must be specified using either -ol or -db and -tbl" );
+      System.out.println( "The default index type for databases is Rare Word Index" );
+      System.out.println( "If an Orangebook Path is not specified then (orangebook) medication terms are not written" );
+      System.out.println( "If a Format Data Directory is not specified then the default is used: " + DEFAULT_DATA_DIR );
+      System.out.println( "If an Input Tui List Path is not specified then the cTakes Tuis are used: " + DEFAULT_TUI_FILE );
+      System.out.println( "If a Source Type List Path is not specified then Snomed is used: " + DEFAULT_SOURCE_FILE );
+   }
+
+   /**
+    * Checks the required arguments: an output (term list, or database and table) and the umls root.
+    * Every missing requirement is reported on standard error.
+    *
+    * @return true only if all required arguments have a value
+    */
+   private boolean ensurePropertiesOk() {
+      boolean ok = true;
+      if ( !Option.TERM_LIST.hasValue()
+           && (!Option.DATA_BASE.hasValue() || !Option.DATA_TABLE.hasValue()) ) {
+         System.err.println( "Need an output location" );
+         ok = false;
+      }
+      if ( !Option.UMLS_ROOT.hasValue() ) {
+         System.err.println( "Need an UMLS_ROOT root directory" );
+         // the umls root is documented as required, so its absence must fail the check
+         ok = false;
+      }
+      return ok;
+   }
+
+   /**
+    * Assigns the default data directory, tui list and source list to the options that have no value.
+    */
+   private void assignCtakesDefaults() {
+      if ( !Option.FORMAT_DATA.hasValue() ) {
+         Option.FORMAT_DATA.parseValue( Option.FORMAT_DATA.__key, DEFAULT_DATA_DIR );
+      }
+      if ( !Option.TUI_LIST.hasValue() ) {
+         Option.TUI_LIST.parseValue( Option.TUI_LIST.__key, DEFAULT_TUI_FILE );
+      }
+      if ( !Option.SOURCE.hasValue() ) {
+         Option.SOURCE.parseValue( Option.SOURCE.__key, DEFAULT_SOURCE_FILE );
+      }
+   }
+
+   /**
+    * @return true if a rare word indexed dictionary should be created
+    */
+   public boolean isRareWordIndex() {
+      return _rareWordIndex;
+   }
+
+   /**
+    * Command line options that take a value: "key value".
+    * The parsed value is stored in the enum constant itself.
+    */
+   static public enum Option {
+      UMLS_ROOT( "Umls Root Directory", "-umls" ),
+      ORANGE_BOOK( "Orangebook Path", "-ob" ),
+      FORMAT_DATA( "Format Data Directory", "-fd" ),
+      TUI_LIST( "Input Tui List Path", "-tui" ),
+//      SEM_LIST( "Input Semantic Group List Path", "-sem" ),
+      SOURCE( "Source Type List Path", "-src" ),
+      TERM_LIST( "Output Cui and Term List Path", "-ol" ),
+      DATA_BASE( "Output Database Url", "-db" ),
+      DATA_TABLE( "Output Database Table", "-tbl" );
+      final private String __name;
+      final private String __key;
+      private String __value;
+
+      private Option( final String name, final String key ) {
+         __name = name;
+         __key = key;
+      }
+
+      /**
+       * @return human readable name of the option
+       */
+      public String getName() {
+         return __name;
+      }
+
+      /**
+       * @return command line key of the option, for instance "-umls"
+       */
+      public String getKey() {
+         return __key;
+      }
+
+      /**
+       * @return value parsed for the option, or null if none was parsed
+       */
+      public String getValue() {
+         return __value;
+      }
+
+      /**
+       * @return true if a non-empty value has been parsed for the option
+       */
+      public boolean hasValue() {
+         return __value != null && !__value.isEmpty();
+      }
+
+      /**
+       * Looks for the key of this option in the arguments and stores the argument that follows it.
+       * If the key is the very last argument then an error is reported and nothing is stored.
+       * If the key occurs more than once then a warning is reported and the last value wins.
+       *
+       * @param args arguments to search, must not be empty
+       */
+      private void parseValue( final String ... args ) {
+         if ( args[args.length-1].equalsIgnoreCase( __key ) ) {
+            System.err.println( "An argument is needed for " + __name + " (" + __key + ")" );
+            return;
+         }
+         for ( int i=0; i<args.length-1; i++ ) {
+            if ( args[i].equalsIgnoreCase( __key ) ) {
+               if ( hasValue() ) {
+                  System.err.println( __name + " (" + __key + ") has been set more than once" );
+               }
+               __value = args[i+1];
+               // don't break yet, check for repeat setting
+            }
+         }
+      }
+
+      /**
+       * @return help line for the option: the key, two tabs and the name
+       */
+      public String getHelp() {
+         return getKey() + " \t\t" + getName();
+      }
+   }
+
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,29 @@
+package org.apache.ctakes.dictionarytool.util;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+/**
+ * Normalizes cui and tui codes to their upper case form with the leading type letter.
+ */
+final public class CuiTuiUtil {
+
+   private CuiTuiUtil() {
+   }
+
+   /**
+    * @param code a cui, with or without the leading "C", in any case and with optional surrounding whitespace
+    * @return the trimmed, upper case code, starting with "C"
+    */
+   static public String getAsCui( final String code ) {
+      return ensurePrefix( code, "C" );
+   }
+
+   /**
+    * @param code a tui, with or without the leading "T", in any case and with optional surrounding whitespace
+    * @return the trimmed, upper case code, starting with "T"
+    */
+   static public String getAsTui( final String code ) {
+      return ensurePrefix( code, "T" );
+   }
+
+   /**
+    * @param code   code to normalize
+    * @param prefix upper case letter that the normalized code must start with
+    * @return the trimmed, upper case code, with the prefix prepended if it was not already there
+    */
+   static private String ensurePrefix( final String code, final String prefix ) {
+      final String normalized = code.trim().toUpperCase();
+      return normalized.startsWith( prefix ) ? normalized : prefix + normalized;
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,218 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import javax.swing.filechooser.FileSystemView;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+/**
+ * Utilities to resolve paths and to read and write the line based text files used by the dictionary tool.
+ * Most failures are reported on standard error; failure to open a file exits the jvm.
+ */
+final public class FileUtil {
+
+   private FileUtil() {
+   }
+
+   static private final Logger LOGGER = Logger.getLogger( "FileUtil" );
+
+   /**
+    * Resolves shorthand at the start of a path.
+    * <ul>
+    * <li>null, empty and "." resolve to the working directory</li>
+    * <li>a leading "~" is replaced by the user home directory</li>
+    * <li>each leading ".." segment moves one directory up from the working directory;
+    * the rest of the path is kept below the directory that was reached</li>
+    * <li>any other path is returned unchanged</li>
+    * </ul>
+    * Exits the jvm if a ".." segment cannot be resolved.
+    *
+    * @param dirPath path to a directory or file, may be null
+    * @return path with the leading shorthand resolved
+    */
+   static public String parseDirText( final String dirPath ) {
+      if ( dirPath == null || dirPath.isEmpty() || dirPath.equals( "." ) ) {
+         return getWorkingDirPath();
+      }
+      if ( dirPath.startsWith( "~" ) ) {
+         final String userHome = System.getProperty( "user.home" );
+         if ( userHome == null ) {
+            return dirPath;
+         }
+         // Plain concatenation on purpose: String.replaceAll would interpret backslashes and dollar signs
+         // of the home path as regex replacement syntax, and would also replace tildes inside file names
+         return parseDirText( userHome + dirPath.substring( 1 ) );
+      }
+      if ( dirPath.startsWith( ".." ) ) {
+         return resolveParentPath( dirPath );
+      }
+      return dirPath;
+   }
+
+   /**
+    * @return the "user.dir" system property, or the default directory of the file system view if it is not set
+    */
+   static private String getWorkingDirPath() {
+      final String userDir = System.getProperty( "user.dir" );
+      if ( userDir == null || userDir.isEmpty() ) {
+         return FileSystemView.getFileSystemView().getDefaultDirectory().getPath();
+      }
+      return userDir;
+   }
+
+   /**
+    * Resolves a path that starts with one or more ".." segments against the working directory.
+    *
+    * @param dirPath path that starts with ".."
+    * @return path below the ancestor directory that the ".." segments point to
+    */
+   static private String resolveParentPath( final String dirPath ) {
+      File cwd = new File( getWorkingDirPath() );
+      String remainder = dirPath;
+      while ( remainder.equals( ".." ) || remainder.startsWith( "../" ) || remainder.startsWith( "..\\" ) ) {
+         final File parent = cwd.isDirectory() ? cwd.getParentFile() : null;
+         if ( parent == null ) {
+            // not a directory, or already at the file system root
+            LOGGER.severe( "Invalid directory " + dirPath );
+            System.exit( 1 );
+         }
+         cwd = parent;
+         // drop the ".." and the separator that follows it
+         remainder = remainder.length() > 3 ? remainder.substring( 3 ) : "";
+      }
+      // keep whatever follows the ".." segments, otherwise "../data/file.txt" would resolve to a directory
+      return remainder.isEmpty() ? cwd.getPath() : new File( cwd, remainder ).getPath();
+   }
+
+   /**
+    * Opens a file for reading. Exits the jvm if the file cannot be read.
+    *
+    * @param filePath path to the file, resolved with {@link #parseDirText(String)}
+    * @return reader for the file
+    */
+   static public BufferedReader createReader( final String filePath ) {
+      final String formattedPath = parseDirText( filePath );
+      final File file = new File( formattedPath );
+      if ( !file.canRead() ) {
+         System.err.println( "Cannot read file " + filePath );
+         System.exit( 1 );
+      }
+      try {
+         return new BufferedReader( new FileReader( file ) );
+      } catch ( IOException ioE ) {
+         System.err.println( "Cannot create Reader for " + filePath );
+         System.err.println( ioE.getMessage() );
+         System.exit( 1 );
+      }
+      return null;
+   }
+
+   /**
+    * Opens a file for writing in append mode, creating missing parent directories.
+    * Exits the jvm if the file cannot be opened.
+    *
+    * @param filePath path to the file, resolved with {@link #parseDirText(String)}
+    * @return writer that appends to the file
+    */
+   static public BufferedWriter createWriter( final String filePath ) {
+      final String formattedPath = parseDirText( filePath );
+      final File file = new File( formattedPath );
+      if ( file.getParentFile() != null && !file.getParentFile().isDirectory() ) {
+         file.getParentFile().mkdirs();
+      }
+      try {
+         return new BufferedWriter( new FileWriter( file, true ) );
+      } catch ( IOException ioE ) {
+         System.err.println( "Cannot create Writer for " + filePath );
+         System.err.println( ioE.getMessage() );
+         System.exit( 1 );
+      }
+      return null;
+   }
+
+   /**
+    * Closes a reader or writer, reporting a failure on standard error instead of throwing.
+    *
+    * @param closeable reader or writer to close, may be null
+    * @param filePath  path of the file, used for the error message
+    */
+   static private void close( final java.io.Closeable closeable, final String filePath ) {
+      if ( closeable == null ) {
+         return;
+      }
+      try {
+         closeable.close();
+      } catch ( IOException ioE ) {
+         System.err.println( "Error closing file " + filePath );
+         System.err.println( ioE.getMessage() );
+      }
+   }
+
+   /**
+    * Reads the next line that has content, skipping blank lines and lines that start with "//".
+    *
+    * @param reader   reader for the file
+    * @param filePath path of the file, used for the error message
+    * @return the next content line, trimmed, or null at the end of the file or upon a read error
+    */
+   static public String readLine( final BufferedReader reader, final String filePath ) {
+      try {
+         String line = reader.readLine();
+         while ( line != null ) {
+            line = line.trim();
+            if ( !line.isEmpty() && !line.startsWith( "//" ) ) {
+               return line;
+            }
+            line = reader.readLine();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error reading from file " + filePath );
+      }
+      return null;
+   }
+
+   /**
+    * @param reader   reader for the file
+    * @param filePath path of the file, used for error messages
+    * @return items of the next content line as split by TokenUtil.getBsvItems, or null if there is no such line
+    */
+   static public List<String> readBsvTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getBsvItems( line );
+   }
+
+   /**
+    * @param reader   reader for the file
+    * @param filePath path of the file, used for error messages
+    * @return items of the next content line as split by TokenUtil.getCsvItems, or null if there is no such line
+    */
+   static public List<String> readCsvTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getCsvItems( line );
+   }
+
+   /**
+    * @param reader   reader for the file
+    * @param filePath path of the file, used for error messages
+    * @return items of the next content line as split by TokenUtil.getTildeItems, or null if there is no such line
+    */
+   static public List<String> readTildeTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getTildeItems( line );
+   }
+
+   /**
+    * Appends each item of the list as one line to the file.
+    *
+    * @param filePath    path to the output file
+    * @param description name of the written items, used for progress and error messages
+    * @param list        items to write
+    */
+   static public void writeOneColumn( final String filePath, final String description,
+                                      final Collection<String> list ) {
+      System.out.println( "Writing " + description + " to " + filePath );
+      long lineCount = 0;
+      BufferedWriter writer = null;
+      try {
+         writer = createWriter( filePath );
+         for ( String item : list ) {
+            lineCount++;
+            writer.write( item );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing " + description + " on line " + lineCount + " in file " + filePath );
+      } finally {
+         // always close, so that buffered lines are flushed and the handle is released after a failure
+         close( writer, filePath );
+      }
+      System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
+   }
+
+
+   /**
+    * Reads all content lines of a file into a set, so duplicate lines and the line order are not kept.
+    *
+    * @param listFilePath path to the input file
+    * @param description  name of the read items, used for progress messages
+    * @return distinct content lines of the file
+    */
+   static public Collection<String> readOneColumn( final String listFilePath, final String description ) {
+      System.out.println( "Reading " + description + " from " + listFilePath );
+      final Collection<String> listItems = new HashSet<String>();
+      long lineCount = 0;
+      BufferedReader reader = null;
+      try {
+         reader = createReader( listFilePath );
+         String line = readLine( reader, listFilePath );
+         while ( line != null ) {
+            lineCount++;
+            listItems.add( line );
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+            line = readLine( reader, listFilePath );
+         }
+      } finally {
+         close( reader, listFilePath );
+      }
+      System.out.println( "File Lines " + lineCount + "\t " + description + " " + listItems.size() );
+      return listItems;
+   }
+
+
+   /**
+    * Appends one line per map entry to the file: the name and the csv line of its values,
+    * joined by TokenUtil.createBsvLine.
+    *
+    * @param filePath    path to the output file
+    * @param description name of the written items, used for progress and error messages
+    * @param namedSets   map of name to values
+    */
+   static public void writeNamedSets( final String filePath, final String description,
+                                      final Map<String, Collection<String>> namedSets ) {
+      System.out.println( "Writing " + description + " to " + filePath );
+      long lineCount = 0;
+      BufferedWriter writer = null;
+      try {
+         writer = createWriter( filePath );
+         for ( Map.Entry<String, Collection<String>> namedSet : namedSets.entrySet() ) {
+            lineCount++;
+            writer.write( TokenUtil.createBsvLine( namedSet.getKey(),
+                                                   TokenUtil.createCsvLine( namedSet.getValue() ) ) );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing " + description + " on line " + lineCount + " in file " + filePath );
+      } finally {
+         // always close, so that buffered lines are flushed and the handle is released after a failure
+         close( writer, filePath );
+      }
+      System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
+   }
+
+   /**
+    * Reads a file as written by {@link #writeNamedSets(String, String, java.util.Map)}.
+    * Lines that do not split into exactly a name and a value list are reported on standard error and skipped.
+    *
+    * @param filePath    path to the input file
+    * @param description name of the read items, used for progress messages
+    * @return map of name to values
+    */
+   static public Map<String, Collection<String>> readNamedSets( final String filePath, final String description ) {
+      final Collection<String> lines = readOneColumn( filePath, description );
+      final Map<String, Collection<String>> namedSets = new HashMap<String, Collection<String>>( lines.size() );
+      for ( String line : lines ) {
+         final List<String> nameAndList = TokenUtil.getBsvItems( line );
+         if ( nameAndList == null || nameAndList.size() != 2 ) {
+            System.err.println( "Bad line " + line );
+            continue;
+         }
+         namedSets.put( nameAndList.get( 0 ), TokenUtil.getCsvItems( nameAndList.get( 1 ) ) );
+      }
+      return namedSets;
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,69 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.sql.Connection;
+import java.sql.Driver;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/21/14
+ */
+/**
+ * Utilities to connect to the dictionary database and to create sql for it.
+ */
+final public class JdbcUtil {
+
+   private JdbcUtil() {
+   }
+
+   static private final String JDBC_DRIVER = "org.hsqldb.jdbcDriver";
+
+
+   /**
+    * Instantiates the jdbc driver and registers it with the driver manager.
+    * Exits the jvm if the driver cannot be loaded or registered.
+    */
+   static public void registerDriver() {
+      try {
+         Driver driver = (Driver) Class.forName( JDBC_DRIVER ).newInstance();
+         DriverManager.registerDriver( driver );
+      } catch ( Exception e ) {
+         // TODO At least four different exceptions are thrown here, and should be caught and handled individually
+         System.err.println( "Could not register Driver " + JDBC_DRIVER );
+         System.err.println( e.getMessage() );
+         System.exit( 1 );
+      }
+   }
+
+   /**
+    * Registers the driver and connects to the database. Exits the jvm if the connection cannot be made.
+    *
+    * @param url  jdbc url of the database
+    * @param user user name
+    * @param pass password
+    * @return connection to the database
+    */
+   static public Connection createDatabaseConnection( final String url, final String user, final String pass ) {
+      registerDriver();
+      System.out.println( "Connecting to " + url + " as " + user );
+      Connection connection = null;
+      try {
+         connection = DriverManager.getConnection( url, user, pass );
+      } catch ( SQLException sqlE ) {
+         // thrown by DriverManager.getConnection(..) when the database cannot be reached or access is denied
+         System.err.println( "Could not establish connection to " + url + " as " + user );
+         System.err.println( sqlE.getMessage() );
+         System.exit( 1 );
+      }
+      return connection;
+   }
+
+   /**
+    * Creates the sql of a parameterized row insert: one column per field, named by the constant name
+    * of the field, and one "?" placeholder per column.
+    *
+    * @param tableName name of the table to insert into
+    * @param fields    enum constants whose names are the column names, at least one
+    * @return sql of the form "insert into TABLE (A,B)  values (?,?)"
+    * @throws IllegalArgumentException if no field is given, as no valid insert can be created without columns
+    */
+//   static public String createRowInsertSql( final String tableName, final int valueCount ) {
+   static public String createRowInsertSql( final String tableName, final Enum<?> ... fields ) {
+      if ( fields == null || fields.length == 0 ) {
+         throw new IllegalArgumentException( "At least one field is needed to insert into " + tableName );
+      }
+      final StringBuilder columns = new StringBuilder();
+      final StringBuilder placeholders = new StringBuilder();
+      for ( int i = 0; i < fields.length; i++ ) {
+         if ( i > 0 ) {
+            columns.append( ',' );
+            placeholders.append( ',' );
+         }
+         columns.append( fields[i].name() );
+         placeholders.append( '?' );
+      }
+      // the two spaces before "values" are kept so that the created sql is unchanged
+      return "insert into " + tableName + " (" + columns + ")  values (" + placeholders + ")";
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,15 @@
+package org.apache.ctakes.dictionarytool.util;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/23/14
+ */
+/**
+ * Zero-based column positions of the values that are read from a row of a bar-separated rrf file.
+ * NOTE(review): the positions presumably follow the MRCONSO.RRF column layout - confirm against the UMLS release.
+ */
+public enum RRF_INDEX {
+   /** column that holds the cui */
+   CUI( 0 ),
+   /** column that holds the language, for instance "ENG" */
+   LANGUAGE( 1 ),
+   /** column that holds the source */
+   SOURCE( 11 ),
+   /** column that holds the text */
+   TEXT( 14 );
+
+   /** zero-based position of the column within a row */
+   public final int _index;
+
+   RRF_INDEX( final int index ) {
+      this._index = index;
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,171 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+/**
+ * Decides which tokens may serve as the rare word index of a term, and counts how often such tokens occur.
+ */
+final public class RareWordUtil {
+
+   private RareWordUtil() {
+   }
+
+   // LookupDesc for the standard excluded pos tags are
+   //   VB,VBD,VBG,VBN,VBP,VBZ,CC,CD,DT,EX,LS,MD,PDT,POS,PP,PP$,PRP,PRP$,RP,TO,WDT,WP,WPS,WRB
+   // Listing every verb in the language seems a pain, but listing the others is possible.
+   // Verbs should be rare in the dictionaries, excepting perhaps the activity and concept dictionaries
+   //   CD, CC, DT, EX, MD, PDT, PP, PP$, PRP, PRP$, RP, TO, WDT, WP, WPS, WRB
+   // why not WP$ (possessive wh- pronoun "whose")
+   // PP$ is a Brown POS tag, not Penn Treebank (as are the rest)
+
+   /** lower case words of the excluded parts of speech; such words are never used as rare word */
+   static private final Set<String> BAD_POS_TERM_SET = createBadPosTerms();
+
+   /**
+    * @return set of the lower case words that belong to one of the excluded parts of speech
+    */
+   static private Set<String> createBadPosTerms() {
+      return new HashSet<String>( Arrays.asList(
+            // CD  cardinal number
+            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
+            // CC  coordinating conjunction
+            "and", "or", "but", "for", "nor", "so", "yet",
+            // DT  determiner
+            "this", "that", "these", "those", "the",
+            // EX  existential there
+            "there",
+            // MD  modal
+            "can", "should", "will", "may", "might", "must", "could", "would",
+            // PDT  predeterminer
+            "some", "any", "all", "both", "half", "none", "twice",
+            // PP  prepositional phrase (preposition)
+            "at", "before", "after", "behind", "beneath", "beside", "between", "into", "through", "across", "of",
+            "concerning", "like", "except", "with", "without", "toward", "to", "past", "against", "during", "until",
+            "throughout", "below", "besides", "beyond", "from", "inside", "near", "outside", "since", "upon",
+            // PP$  possessive personal pronoun - Brown POS tag, not Penn TreeBank
+            "my", "our",
+            // PRP  personal pronoun
+            "i", "you", "he", "she", "it",
+            // PRP$  possessive pronoun
+            "mine", "yours", "his", "hers", "its", "ours", "theirs",
+            // RP  particle  - this contains some prepositions
+            "about", "off", "up", "along", "away", "back", "by", "down", "forward", "in", "on", "out",
+            "over", "around", "under",
+            // TO  to  - also a preposition
+            "to",
+            // WDT  wh- determiner
+            "what", "whatever", "which", "whichever",
+            // WP, WPS  wh- pronoun, nominative wh- pronoun
+            "who", "whom", "which", "that", "whoever", "whomever",
+            // WRB
+            "how", "where", "when", "however", "wherever", "whenever",
+            // Mine ...
+            "no" ) );
+   }
+
+   /**
+    * A token can be a rare word if it is longer than one character, contains at least one letter
+    * and is not one of the excluded words. The comparison with the excluded words is case sensitive.
+    *
+    * @param token single token of a text
+    * @return true if the token may be used as rare word
+    */
+   static public boolean isRarableToken( final String token ) {
+      if ( token.length() <= 1 ) {
+         return false;
+      }
+      for ( char c : token.toCharArray() ) {
+         if ( Character.isLetter( c ) ) {
+            // the first letter is proof enough, only the excluded words remain to be ruled out
+            return !BAD_POS_TERM_SET.contains( token );
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * Counts the occurrences of each rarable token over all texts of all cuis.
+    * Texts are split into tokens on whitespace.
+    *
+    * @param cuiTexts map of cui to its texts
+    * @return map of rarable token to the number of its occurrences
+    */
+   static public Map<String, Integer> getTokenCounts( final Map<String, Collection<String>> cuiTexts ) {
+      final Map<String, Integer> counts = new HashMap<String, Integer>();
+      for ( Collection<String> textsOfCui : cuiTexts.values() ) {
+         for ( String textOfCui : textsOfCui ) {
+            for ( String word : textOfCui.split( "\\s+" ) ) {
+               if ( !isRarableToken( word ) ) {
+                  continue;
+               }
+               final Integer previous = counts.get( word );
+               counts.put( word, previous == null ? 1 : previous + 1 );
+            }
+         }
+      }
+      return counts;
+   }
+
+
+   //   static public String getRareToken( final Map<String,Integer> tokenCounts, final String text ) {
+   //      final String[] tokens = text.split( "\\s+" );
+   //      int bestIndex = 0;
+   //      int bestCount = Integer.MAX_VALUE;
+   //      for ( int i = 0; i < tokens.length; i++ ) {
+   //         Integer count = tokenCounts.get( tokens[i] );
+   //         if ( count != null && count < bestCount ) {
+   //            bestIndex = i;
+   //            bestCount = count;
+   //         }
+   //      }
+   //      return tokens[bestIndex];
+   //   }
+   //
+   //   static public int getRareTokenIndex( final Map<String,Integer> tokenCounts, final String text ) {
+   //      final String[] tokens = text.split( "\\s+" );
+   //      int bestIndex = 0;
+   //      int bestCount = Integer.MAX_VALUE;
+   //      for ( int i = 0; i < tokens.length; i++ ) {
+   //         Integer count = tokenCounts.get( tokens[i] );
+   //         if ( count != null && count < bestCount ) {
+   //            bestIndex = i;
+   //            bestCount = count;
+   //         }
+   //      }
+   //      return bestIndex;
+   //   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,191 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+/**
+ * Splits dictionary text into space-delimited tokens.
+ * Letters and digits accumulate into words and every other symbol becomes its own token.
+ * A dash stays inside a word only when the word before it is a known prefix ("non-")
+ * or the letters after it form a known suffix ("-fold").
+ */
+final public class TextTokenizer {
+
+   private TextTokenizer() {
+   }
+
+   // Word beginnings that keep their trailing dash, e.g. "non-specific" stays one token
+   static private final String[] PREFIXES = {
+         "e-",
+         "a-",
+         "u-",
+         "x-",
+         "agro-",
+         "ante-",
+         "anti-",
+         "arch-",
+         "be-",
+         "bi-",
+         "bio-",
+         "co-",
+         "counter-",
+         "cross-",
+         "cyber-",
+         "de-",
+         "eco-",
+         "ex-",
+         "extra-",
+         "inter-",
+         "intra-",
+         "macro-",
+         "mega-",
+         "micro-",
+         "mid-",
+         "mini-",
+         "multi-",
+         "neo-",
+         "non-",
+         "over-",
+         "pan-",
+         "para-",
+         "peri-",
+         "post-",
+         "pre-",
+         "pro-",
+         "pseudo-",
+         "quasi-",
+         "re-",
+         "semi-",
+         "sub-",
+         "super-",
+         "tri-",
+         "ultra-",
+         "un-",
+         "uni-",
+         "vice-",
+         // From email from Colin Warner <co...@ldc.upenn.edu> on 7/25/2010
+         "electro-",
+         "gasto-",
+         "homo-",
+         "hetero-",
+         "ortho-",
+         "phospho-",
+   };
+
+   // Word endings that keep their leading dash, e.g. "two-fold" stays one token
+   static private final String[] SUFFIXES = {"-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
+                                             "-o-torium", "-rama", "-wise"};
+
+
+   /**
+    * @param word some text
+    * @return the leading run of letters and digits in the given text, possibly empty
+    */
+   static private String getNextCharTerm( final String word ) {
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( !Character.isLetterOrDigit( c ) ) {
+            return sb.toString();
+         }
+         sb.append( c );
+      }
+      return sb.toString();
+   }
+
+   /**
+    * @param word text that precedes a dash
+    * @return true if the text followed by a dash is one of the known prefixes
+    */
+   static private boolean isPrefix( final String word ) {
+      final String prefixQ = word + "-";
+      for ( String prefix : PREFIXES ) {
+         if ( prefix.equals( prefixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param word       full word being tokenized
+    * @param startIndex index of the first character after a dash
+    * @return true if the letters and digits starting at startIndex, preceded by a dash, form a known suffix
+    */
+   static private boolean isSuffix( final String word, final int startIndex ) {
+      // Nothing follows the dash.  The original comparison was inverted and rejected every input.
+      if ( startIndex >= word.length() ) {
+         return false;
+      }
+      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
+      if ( nextCharTerm.isEmpty() ) {
+         return false;
+      }
+      final String suffixQ = "-" + nextCharTerm;
+      for ( String suffix : SUFFIXES ) {
+         if ( suffix.equals( suffixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * Breaks a single whitespace-free word into tokens.
+    *
+    * @param word text without whitespace
+    * @return words and symbols in their original order; a dash joining a known prefix or suffix stays in its word
+    */
+   static public List<String> getTokens( final String word ) {
+      final List<String> tokens = new ArrayList<String>();
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( Character.isLetterOrDigit( c ) ) {
+            // Appending character to current word
+            sb.append( c );
+            continue;
+         }
+         if ( c != '-' ) {
+            // have a symbol other than dash
+            if ( sb.length() != 0 ) {
+               // add the current word
+               tokens.add( sb.toString() );
+               sb.setLength( 0 );
+            }
+            // add the symbol
+            tokens.add( "" + c );
+            continue;
+         }
+         if ( isPrefix( sb.toString() ) || isSuffix( word, i + 1 ) ) {
+            // what precedes is a prefix or what follows is a suffix,
+            // so the dash belongs to the current word
+            sb.append( c );
+            continue;
+         }
+         // free-standing dash: add the current word, then the dash
+         if ( sb.length() != 0 ) {
+            tokens.add( sb.toString() );
+            sb.setLength( 0 );
+         }
+         tokens.add( "" + c );
+      }
+      if ( sb.length() != 0 ) {
+         // add the final word
+         tokens.add( sb.toString() );
+      }
+      return tokens;
+   }
+
+   /**
+    * Lowercases the text, drops one trailing comma, semicolon or period, and tokenizes each
+    * whitespace-separated word with {@link #getTokens(String)}.
+    *
+    * @param text any text
+    * @return the tokens joined by single spaces; an empty string if the text holds no tokens
+    */
+   static public String getTokenizedText( final String text ) {
+      if ( text.isEmpty() ) {
+         return text;
+      }
+      final String[] splits = text.toLowerCase().split( "\\s+" );
+      if ( splits.length == 0 ) {
+         // text was nothing but whitespace
+         return "";
+      }
+      final String lastSplit = splits[splits.length - 1];
+      if ( lastSplit.endsWith( "," ) || lastSplit.endsWith( ";" ) || lastSplit.endsWith( "." ) ) {
+         // get rid of last comma or semicolon or period
+         splits[splits.length - 1] = lastSplit.substring( 0, lastSplit.length() - 1 );
+      }
+      final StringBuilder sb = new StringBuilder();
+      for ( String split : splits ) {
+         final List<String> tokens = getTokens( split );
+         for ( String token : tokens ) {
+            sb.append( token ).append( " " );
+         }
+      }
+      // remove the space appended after the last token
+      sb.setLength( Math.max( 0, sb.length() - 1 ) );
+      return sb.toString();
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,77 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+/**
+ * Splits and joins lines of character-separated values: bar '|', tilde '~' and comma ','.
+ */
+final public class TokenUtil {
+
+   private TokenUtil() {
+   }
+
+   /**
+    * @param line text with values separated by bar characters
+    * @return the values in order; an empty list for a null or blank line
+    */
+   static public List<String> getBsvItems( final String line ) {
+      return getSeparatedValueItems( line, '|' );
+   }
+
+   /**
+    * @param line text with values separated by tilde characters
+    * @return the values in order; an empty list for a null or blank line
+    */
+   static public List<String> getTildeItems( final String line ) {
+      return getSeparatedValueItems( line, '~' );
+   }
+
+   /**
+    * @param line text with values separated by commas
+    * @return the values in order; an empty list for a null or blank line
+    */
+   static public List<String> getCsvItems( final String line ) {
+      return getSeparatedValueItems( line, ',' );
+   }
+
+   /**
+    * Splits a line on every occurrence of the separator.  Empty values between adjacent separators
+    * and before a leading separator are kept so that value positions match column positions.
+    * Nothing is added for the empty text after a trailing separator.
+    *
+    * @param line      text to split
+    * @param separator character between values
+    * @return the values in order; an empty list for a null or blank line
+    */
+   static private List<String> getSeparatedValueItems( final String line, final char separator ) {
+      if ( line == null || line.trim().isEmpty() ) {
+         return Collections.emptyList();
+      }
+      final List<String> tokens = new ArrayList<String>();
+      int startIndex = 0;
+      int stopIndex = line.indexOf( separator );
+      // indexOf returns -1 when there is no further separator; 0 is a valid hit for a leading separator
+      while ( stopIndex >= 0 ) {
+         tokens.add( line.substring( startIndex, stopIndex ) );
+         startIndex = stopIndex + 1;
+         stopIndex = line.indexOf( separator, startIndex );
+      }
+      // keep whatever follows the last separator, even a single character
+      if ( startIndex < line.length() ) {
+         tokens.add( line.substring( startIndex ) );
+      }
+      return tokens;
+   }
+
+   /**
+    * @param separator text placed between values
+    * @param values    values to join
+    * @return the values joined by the separator; an empty string when there are no values
+    */
+   static private String createSeparatedLine( final String separator, final String... values ) {
+      final StringBuilder sb = new StringBuilder();
+      for ( int i = 0; i < values.length; i++ ) {
+         if ( i > 0 ) {
+            sb.append( separator );
+         }
+         sb.append( values[i] );
+      }
+      return sb.toString();
+   }
+
+
+   /**
+    * @param values values to join
+    * @return the values joined by bar characters; an empty string when there are no values
+    */
+   static public String createBsvLine( final Collection<String> values ) {
+      return createBsvLine( values.toArray( new String[values.size()] ) );
+   }
+
+   /**
+    * @param values values to join
+    * @return the values joined by bar characters; an empty string when there are no values
+    */
+   static public String createBsvLine( final String... values ) {
+      return createSeparatedLine( "|", values );
+   }
+
+   /**
+    * @param values values to join
+    * @return the values joined by commas; an empty string when there are no values
+    */
+   static public String createCsvLine( final Collection<String> values ) {
+      return createCsvLine( values.toArray( new String[values.size()] ) );
+   }
+
+   /**
+    * @param values values to join
+    * @return the values joined by commas; an empty string when there are no values
+    */
+   static public String createCsvLine( final String... values ) {
+      return createSeparatedLine( ",", values );
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,79 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+/**
+ * Filters collections of cuis by the source types listed for them in a bar-separated rrf file.
+ */
+final public class UmlsSourceTypeCuiValidator {
+
+   private UmlsSourceTypeCuiValidator() {}
+
+
+   /**
+    * Can cull the given collection of cuis.
+    * Progress is printed to standard output.  An IOException is reported on standard error and
+    * the cuis collected up to that point are returned.
+    *
+    * @param rrfPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
+    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
+    * @param cuis current list of cuis
+    * @return Subset of cuis that exist in the given sources
+    */
+   static public Collection<String> getSourceTypeValidCuis( final String rrfPath,
+                                                            final Collection<String> sourceTypes,
+                                                            final Collection<String> cuis ) {
+      final Collection<String> validCuis = new HashSet<String>( cuis.size() );
+      long lineCount = 0;
+      try {
+         final BufferedReader reader = FileUtil.createReader( rrfPath );
+         try {
+            List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
+            while ( tokens != null ) {
+               lineCount++;
+               // only lines that have a source column naming one of the wanted source types are of interest
+               if ( tokens.size() > RRF_INDEX.SOURCE._index
+                    && sourceTypes.contains( tokens.get( RRF_INDEX.SOURCE._index ) ) ) {
+                  final String cui = CuiTuiUtil.getAsCui( tokens.get( RRF_INDEX.CUI._index ) );
+                  if ( cuis.contains( cui ) ) {
+                     validCuis.add( cui );
+                  }
+               }
+               // progress: a dot every 2,000 lines and a summary every 100,000 lines
+               if ( lineCount % 2000 == 0 ) {
+                  System.out.print( "." );
+                  if ( lineCount % 100000 == 0 ) {
+                     System.out.println( "File Line " + lineCount + "\t Valid Cuis " + validCuis.size() );
+                  }
+               }
+               tokens = FileUtil.readBsvTokens( reader, rrfPath );
+            }
+         } finally {
+            // release the file even when reading fails part way through
+            reader.close();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( ioE.getMessage() );
+      }
+      System.out.println( "File Lines " + lineCount + "\t Valid Cuis " + validCuis.size() );
+      return validCuis;
+   }
+
+   /**
+    * Given a collection of cuis, returns all of the cuis that don't exist for the given source types
+    *
+    * @param rrfPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
+    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
+    * @param cuis current list of cuis
+    * @return Subset of cuis that don't exist in the given sources
+    */
+   static public Collection<String> getSourceTypeInvalidCuis( final String rrfPath,
+                                                              final Collection<String> sourceTypes,
+                                                              final Collection<String> cuis ) {
+      final Collection<String> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuis );
+      final Collection<String> invalidCuis = new HashSet<String>( cuis.size() - validCuis.size() );
+      for ( String cui : cuis ) {
+         if ( !validCuis.contains( cui ) ) {
+            invalidCuis.add( cui );
+         }
+      }
+      return invalidCuis;
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,469 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+
+
+/**
+ * Contains all the methods used to parse individual text definitions of umls terms
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+/**
+ * Contains all the methods used to parse individual text definitions of umls terms.
+ * The trigger, prefix, suffix and abbreviation lists that drive the parsing are read from
+ * one-column data files when an instance is constructed.
+ */
+final public class UmlsTermUtil {
+
+
+   // File names of the data lists, resolved against a data directory by getDataPath
+   static private enum DATA_FILE {
+      REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
+      REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
+      REMOVAL_COLON_TRIGGERS( "RemovalColonTriggers.txt" ),
+      UNWANTED_PREFIXES( "UnwantedPrefixes.txt" ),
+      UNWANTED_SUFFIXES( "UnwantedSuffixes.txt" ),
+      MODIFIER_SUFFIXES( "ModifierSuffixes.txt" ),
+      RIGHT_ABBREVIATIONS( "RightAbbreviations.txt");
+      final private String __name;
+      private DATA_FILE( final String name ) {
+         __name = name;
+      }
+   }
+
+   /**
+    * @param dataDir  directory that holds the data files
+    * @param dataFile wanted data file
+    * @return the directory and file name joined by a forward slash
+    */
+   static private String getDataPath( final String dataDir, final DATA_FILE dataFile ) {
+      return dataDir + '/' + dataFile.__name;
+   }
+
+   // text starting with, ending with or containing one of these is rejected entirely
+   final private Collection<String> _removalPrefixTriggers;
+   final private Collection<String> _removalSuffixTriggers;
+   final private Collection<String> _removalColonTriggers;
+   // these are stripped from the start / end of text
+   final private Collection<String> _unwantedPrefixes;
+   final private Collection<String> _unwantedSuffixes;
+   // trailing modifiers that are moved to the front of the text
+   final private Collection<String> _modifierSuffixes;
+   // trailing abbreviations that are split off as terms of their own
+   final private Collection<String> _abbreviations;
+
+   /**
+    * Reads all data lists from files with the default names in the given directory.
+    *
+    * @param dataDir directory that holds the data files
+    */
+   public UmlsTermUtil( final String dataDir ) {
+      this( getDataPath( dataDir, DATA_FILE.REMOVAL_PREFIX_TRIGGERS ),
+            getDataPath( dataDir, DATA_FILE.REMOVAL_SUFFIX_TRIGGERS ),
+            getDataPath( dataDir, DATA_FILE.REMOVAL_COLON_TRIGGERS ),
+            getDataPath( dataDir, DATA_FILE.UNWANTED_PREFIXES ),
+            getDataPath( dataDir, DATA_FILE.UNWANTED_SUFFIXES ),
+            getDataPath( dataDir, DATA_FILE.MODIFIER_SUFFIXES ),
+            getDataPath( dataDir, DATA_FILE.RIGHT_ABBREVIATIONS ) );
+   }
+
+   /**
+    * Reads each data list from its own one-column file.
+    *
+    * @param removalPrefixTriggersPath path to the list of text starts that invalidate a text
+    * @param removalSuffixTriggersPath path to the list of text ends that invalidate a text
+    * @param removalColonTriggersPath  path to the list of contained texts that invalidate a text
+    * @param unwantedPrefixesPath      path to the list of text starts to strip
+    * @param unwantedSuffixesPath      path to the list of text ends to strip
+    * @param modifierSuffixesPath      path to the list of trailing modifiers
+    * @param abbreviationsPath         path to the list of trailing abbreviations
+    */
+   public UmlsTermUtil( final String removalPrefixTriggersPath, final String removalSuffixTriggersPath,
+                        final String removalColonTriggersPath,
+                        final String unwantedPrefixesPath, final String unwantedSuffixesPath,
+                        final String modifierSuffixesPath, final String abbreviationsPath ) {
+      _removalPrefixTriggers = FileUtil.readOneColumn( removalPrefixTriggersPath, "term removal Prefix Triggers" );
+      _removalSuffixTriggers = FileUtil.readOneColumn( removalSuffixTriggersPath, "term removal Suffix Triggers" );
+      _removalColonTriggers = FileUtil.readOneColumn( removalColonTriggersPath, "term removal Colon Triggers" );
+      _unwantedPrefixes = FileUtil.readOneColumn( unwantedPrefixesPath, "unwanted Prefixes" );
+      _unwantedSuffixes = FileUtil.readOneColumn( unwantedSuffixesPath, "unwanted Suffixes" );
+      _modifierSuffixes = FileUtil.readOneColumn( modifierSuffixesPath, "modifier Suffixes" );
+      _abbreviations = FileUtil.readOneColumn( abbreviationsPath, "Abbreviations to expand" );
+   }
+
+   /**
+    * Tokenizes, validates and cleans the given text, then expands it into every term that can be
+    * extracted from it.  Extraction strategies are tried in a fixed order and the first one that
+    * yields anything wins.
+    *
+    * @param text raw text of a umls term
+    * @return the extracted terms, or just the cleaned text if nothing could be extracted,
+    *         or an empty collection if the text is invalid or empty after cleaning
+    */
+   public Collection<String> getFormattedTexts( final String text ) {
+      final String tokenizedText = TextTokenizer.getTokenizedText( text );
+      if ( tokenizedText == null || tokenizedText.isEmpty() ) {
+         return Collections.emptyList();
+      }
+      if ( !isTextValid( tokenizedText ) ) {
+         return Collections.emptyList();
+      }
+      final String validText = getValidText( tokenizedText );
+      if ( validText == null || validText.isEmpty() ) {
+         return Collections.emptyList();
+      }
+      // add embedded abbreviations
+      Collection<String> extractedTerms = extractAbbreviations( validText );
+      if ( extractedTerms.isEmpty() ) {
+         extractedTerms = autoExtractAcronyms( validText );
+      }
+      if ( extractedTerms.isEmpty() ) {
+         extractedTerms = extractModifiers( validText );
+      }
+      if ( !extractedTerms.isEmpty() ) {
+         // abbreviations, acronyms and modifiers supplement the full text, they do not replace it
+         extractedTerms.add( validText );
+         return getPluralTerms( extractedTerms );
+      }
+      // Check for embedded and / or terms
+      extractedTerms = autoExtractColonParaTerms( validText );
+      if ( extractedTerms.isEmpty() ) {
+         extractedTerms = autoExtractOrParaTerms( validText );
+      }
+      if ( extractedTerms.isEmpty() ) {
+         extractedTerms = autoExtractColonBracketTerms( validText );
+      }
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractAndBracketTerms( validText );
+//      }
+      if ( extractedTerms.isEmpty() ) {
+         extractedTerms = autoExtractOrBracketTerms( validText );
+      }
+      if ( !extractedTerms.isEmpty() ) {
+//         System.out.println( validText );
+//         for ( String et : extractedTerms ) {
+//            System.out.println("     " + et);
+//         }
+         return getPluralTerms( extractedTerms );
+      } else {
+         Collection<String> texts = new HashSet<String>( 1 );
+         texts.add( validText );
+         return getPluralTerms( texts );
+      }
+   }
+
+   /**
+    * For every text that ends with "( s )" adds the text without that ending and the same with an "s" appended.
+    *
+    * @param texts modifiable collection of texts; it is added to in place
+    * @return the given collection
+    */
+   static private Collection<String> getPluralTerms( final Collection<String> texts ) {
+      final Collection<String> plurals = new HashSet<String>();
+      for ( String text : texts ) {
+         if ( text.endsWith( "( s )" ) ) {
+            final String singular = text.substring( 0, text.length() - 5 ).trim();
+            plurals.add( singular );
+            plurals.add( singular + "s" );
+         }
+      }
+      texts.addAll( plurals );
+      return texts;
+   }
+
+   /**
+    * @param text tokenized text
+    * @return false if the text has a character outside ' ' through '~', splits into more than two pieces
+    *         around '@', is three characters long starting with '(', or matches a removal trigger
+    */
+   private boolean isTextValid( final String text ) {
+      // Check for illegal characters
+      for ( int i = 0; i < text.length(); i++ ) {
+         if ( text.charAt( i ) < ' ' || text.charAt( i ) > '~' ) {
+            return false;
+         }
+      }
+      // Check for auto-created note form
+      if ( text.split( "@" ).length > 2 ) {
+         return false;
+      }
+      if ( text.length() == 3 && text.charAt( 0 ) == '(' ) {
+         return false;
+      }
+      for ( String removalPrefix : _removalPrefixTriggers ) {
+         if ( text.startsWith( removalPrefix ) ) {
+            return false;
+         }
+      }
+      for ( String removalSuffix : _removalSuffixTriggers ) {
+         if ( text.endsWith( removalSuffix ) ) {
+            return false;
+         }
+      }
+      for ( String removalColon : _removalColonTriggers ) {
+         if ( text.contains( removalColon ) ) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Cuts off form underlines and repeatedly strips unwanted prefixes and suffixes.
+    *
+    * @param text tokenized text
+    * @return the stripped text, or an empty string if the stripped text contains both '(' and '['
+    */
+   private String getValidText( final String text ) {
+      // remove form underlines
+      // NOTE(review): when the text has no '(' lastIndexOf yields -1, deleteIndex becomes 0
+      // and the underline is left in place - confirm that this is intended
+      if ( text.contains( "_ _ _" ) ) {
+         final int lastParen = text.lastIndexOf( '(' );
+         final int lastDash = text.indexOf( "_ _ _" );
+         final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
+         if ( deleteIndex > 0 ) {
+            return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+         }
+      }
+      // remove unmatched parentheses, brackets, etc.
+//      if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "[" ) && !text.contains( "]" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "(" ) && text.endsWith( ") or" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 4 ).trim() );
+//      }
+//      if ( text.startsWith( "or (" ) ) {
+//         return getValidText( text.substring( 2 ).trim() );
+//      }
+//      if ( text.startsWith( "\"" ) && text.endsWith( "\"" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "(" ) && text.endsWith( ")" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+//      }
+//      if ( text.startsWith( "[" ) && text.endsWith( "]" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+//      }
+//      if ( text.startsWith( "&" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.endsWith( "]" ) && !text.contains( "[" ) ) {
+//         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+//      }
+//      if ( text.endsWith( ")" ) && !text.contains( "(" ) ) {
+//         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+//      }
+      String strippedText = text.trim();
+      // Text in umls can have multiple suffixes and/or prefixes.  Stripping just once doesn't do the trick
+      int lastLength = Integer.MAX_VALUE;
+      while ( lastLength != strippedText.length() ) {
+         lastLength = strippedText.length();
+         for ( String prefix : _unwantedPrefixes ) {
+            if ( strippedText.startsWith( prefix ) ) {
+               strippedText = strippedText.substring( prefix.length() ).trim();
+            }
+         }
+         for ( String suffix : _unwantedSuffixes ) {
+            if ( strippedText.endsWith( suffix ) ) {
+               strippedText = strippedText.substring( 0, strippedText.length() - suffix.length() ).trim();
+            }
+         }
+      }
+      if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
+         return "";
+      }
+      return strippedText;
+   }
+
+
+   /**
+    * Splits off a known abbreviation at the end of the text.
+    * Text containing ":", " of " or " for " is not split.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return the text without the abbreviation plus the abbreviation stripped of symbols, or an empty list
+    */
+   private Collection<String> extractAbbreviations( final String tokenizedText ) {
+      for ( String abbreviation : _abbreviations ) {
+         if ( tokenizedText.endsWith( abbreviation )
+              && !tokenizedText.contains( ":" ) && !tokenizedText.contains( " of " )
+              && !tokenizedText.contains( " for " ) ) {
+            final String noAbbrTerm
+                  = tokenizedText.substring( 0, tokenizedText.length() - abbreviation.length() ).trim();
+            final String abbrTerm
+                  = abbreviation.replace( ":", "" ).replace( "(", "" ).replace( ")", "" ).replace( "-", "" )
+                                .replace( "[", "" ).replace( "]", "" ).replace( "&", "" ).trim();
+            final Collection<String> extractedAbbreviations = new HashSet<String>( 2 );
+            extractedAbbreviations.add( noAbbrTerm );
+            extractedAbbreviations.add( abbrTerm );
+            return extractedAbbreviations;
+         }
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Moves a known modifier from the end of the text to its start.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return the modifier without parentheses followed by the rest of the text, or an empty list
+    */
+   private Collection<String> extractModifiers( final String tokenizedText ) {
+      for ( String modifier : _modifierSuffixes ) {
+         if ( tokenizedText.endsWith( modifier ) ) {
+            final String mainText = tokenizedText.substring( 0, tokenizedText.length() - modifier.length() ).trim();
+            final String modifierText = modifier.replace( "(", "" ).replace( ")", "" ).trim();
+            final Collection<String> modifiedTexts = new HashSet<String>( 2 );
+            modifiedTexts.add( modifierText + " " + mainText );
+            return modifiedTexts;
+         }
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Recognizes text of the form "ABC - DEF" where ABC is a short acronym of the definition DEF.
+    * The acronym must have as many characters as the definition has words, and its first and last
+    * characters must match the first characters of the first and last definition words.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return the acronym and the definition, or an empty list
+    */
+   private Collection<String> autoExtractAcronyms( final String tokenizedText ) {
+      final int dashIndex = tokenizedText.indexOf( '-' );
+      if ( dashIndex > 1 ) {
+         // have text ABC - DEF, check for acronym
+         final String acronym = tokenizedText.substring( 0, dashIndex - 1 ).trim();
+         if ( acronym.isEmpty() || acronym.length() > 8 || acronym.equals( "dose" ) ) {
+            return Collections.emptyList();
+         }
+         final String[] splits = acronym.split( "\\s+" );
+         if ( (splits.length == 1 && acronym.length() > 6) || splits.length > 2 ) {
+            return Collections.emptyList();
+         }
+         final String definition = tokenizedText.substring( dashIndex + 1 ).trim();
+         if ( definition.isEmpty() ) {
+            return Collections.emptyList();
+         }
+         if ( (acronym.charAt( 0 ) != definition.charAt( 0 ) && !definition.contains( "' s" )) ) {
+            return Collections.emptyList();
+         }
+         final String[] definitionSplits = definition.split( "\\s+" );
+         if ( acronym.length() != definitionSplits.length
+              || definitionSplits[definitionSplits.length - 1].charAt( 0 ) != acronym.charAt(
+               acronym.length() - 1 ) ) {
+            return Collections.emptyList();
+         }
+         final Collection<String> extractedAbbreviations = new HashSet<String>( 2 );
+         extractedAbbreviations.add( acronym );
+         extractedAbbreviations.add( definition );
+         return extractedAbbreviations;
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Expands text of the form "thing : [ a ] or [ b ]" into "a thing", "thing a", "b thing", "thing b".
+    * Alternatives that are nos, nec, unspecified, other or empty yield the thing alone.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return formatted texts of all combinations, or an empty collection if the form does not match
+    */
+   private Collection<String> autoExtractColonBracketTerms( final String tokenizedText ) {
+      final int colonIndex = tokenizedText.indexOf( ':' );
+      if ( colonIndex < 0 ) {
+         return Collections.emptyList();
+      }
+      final int orIndex = tokenizedText.indexOf( "] or [" );
+      final int andOrIndex = tokenizedText.indexOf( "] & / or [" );
+      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
+         return Collections.emptyList();
+      }
+      String splitter = "\\] or \\[";
+      if ( andOrIndex > 0 ) {
+         splitter = "\\] & / or \\[";
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      // everything before the colon; trim removes the separating space and a colon at index 0 gives ""
+      final String thing = tokenizedText.substring( 0, colonIndex ).trim();
+      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
+      final String[] splits = types.split( splitter );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
+              || split.equals( "other" ) || split.isEmpty() ) {
+            extractedTerms.addAll( getFormattedTexts( thing ) );
+         } else {
+            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "thing ( & [ a ] or [ b ]" into the thing and its combinations with a and b.
+    * Currently unused: the call in getFormattedTexts is commented out.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return the thing plus formatted texts of all combinations, or an empty list if the form does not match
+    */
+   private Collection<String> autoExtractAndBracketTerms( final String tokenizedText ) {
+      final int andIndex = tokenizedText.indexOf( "( &" );
+      if ( andIndex < 0 || tokenizedText.indexOf( "] or [" ) < andIndex ) {
+         return Collections.emptyList();
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 3 );
+      // everything before "( &"; trim removes the separating space and a match at index 0 gives ""
+      final String thing = tokenizedText.substring( 0, andIndex ).trim();
+      extractedTerms.add( thing );
+      final String types = tokenizedText.substring( andIndex + 3 ).trim();
+      final String[] splits = types.split( "\\] or \\[" );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+         extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "[ a ] or [ b ]" or "[ a ] & / or [ b ]" into its alternatives.
+    * A trailing " of ..." after the last bracket is appended to every alternative.
+    * The alternatives "operation", "therapy" and "provision of" are skipped.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return formatted texts of the alternatives, or an empty collection if the form does not match
+    */
+   private Collection<String> autoExtractOrBracketTerms( final String tokenizedText ) {
+      if ( !tokenizedText.contains( "] or [" ) && !tokenizedText.contains( "] & / or [" ) ) {
+         return Collections.emptyList();
+      }
+      final int lastOf = tokenizedText.lastIndexOf( " of " );
+      if ( lastOf > tokenizedText.lastIndexOf( ']' ) ) {
+         final String ofTerm = tokenizedText.substring( lastOf ).trim();
+         final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0, lastOf ).trim() );
+         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         for ( String ofText : ofExtractions ) {
+            ofTexts.add( ofText + " " + ofTerm );
+         }
+         return ofTexts;
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      String splitter = "\\] or \\[";
+      if ( tokenizedText.contains( "] & / or [" ) ) {
+         splitter = "\\] & / or \\[";
+      }
+      final String[] splits = tokenizedText.split( splitter );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
+            extractedTerms.addAll( getFormattedTexts( split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "( a ) or ( b )" or "( a ) & / or ( b )" into its alternatives.
+    * A trailing " of ..." after the last parenthesis is appended to every alternative.
+    * The alternatives "operation", "therapy" and "provision of" are skipped.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return formatted texts of the alternatives, or an empty collection if the form does not match
+    */
+   private Collection<String> autoExtractOrParaTerms( final String tokenizedText ) {
+      if ( !tokenizedText.contains( ") or (" ) && !tokenizedText.contains( ") & / or (" ) ) {
+         return Collections.emptyList();
+      }
+      final int lastOf = tokenizedText.lastIndexOf( " of " );
+      if ( lastOf > tokenizedText.lastIndexOf( ')' ) ) {
+         final String ofTerm = tokenizedText.substring( lastOf ).trim();
+         // must recurse into the parenthesis variant; the bracket variant never matches this text
+         final Collection<String> ofExtractions = autoExtractOrParaTerms( tokenizedText.substring( 0, lastOf ).trim() );
+         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         for ( String ofText : ofExtractions ) {
+            ofTexts.add( ofText + " " + ofTerm );
+         }
+         return ofTexts;
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      String splitter = "\\) or \\(";
+      if ( tokenizedText.contains( ") & / or (" ) ) {
+         splitter = "\\) & / or \\(";
+      }
+      final String[] splits = tokenizedText.split( splitter );
+      for ( String split : splits ) {
+         split = trimParaText( split );
+         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
+            extractedTerms.addAll( getFormattedTexts( split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "thing : ( a ) or ( b )" into "a thing", "thing a", "b thing", "thing b".
+    * The colon must come before the first parenthesis.
+    * Alternatives that are nos, nec, unspecified, other or empty yield the thing alone.
+    *
+    * @param tokenizedText cleaned, tokenized text
+    * @return formatted texts of all combinations, or an empty collection if the form does not match
+    */
+   private Collection<String> autoExtractColonParaTerms( final String tokenizedText ) {
+      final int colonIndex = tokenizedText.indexOf( ':' );
+      if ( colonIndex < 0 || colonIndex > tokenizedText.indexOf( '(' ) ) {
+         return Collections.emptyList();
+      }
+      final int orIndex = tokenizedText.indexOf( ") or (" );
+      final int andOrIndex = tokenizedText.indexOf( ") & / or (" );
+      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
+         return Collections.emptyList();
+      }
+      String splitter = "\\) or \\(";
+      if ( andOrIndex > 0 ) {
+         splitter = "\\) & / or \\(";
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      // everything before the colon; trim removes the separating space and a colon at index 0 gives ""
+      final String thing = tokenizedText.substring( 0, colonIndex ).trim();
+      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
+      final String[] splits = types.split( splitter );
+      for ( String split : splits ) {
+         split = trimParaText( split );
+         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
+              || split.equals( "other" ) || split.isEmpty() ) {
+            extractedTerms.addAll( getFormattedTexts( thing ) );
+         } else {
+            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * @param paraText one alternative cut out of parenthesized text
+    * @return the text without a leading "(", without a trailing ")" and without a trailing
+    *         "nos", "nec" or ", unspecified" qualifier, trimmed
+    */
+   static private String trimParaText( String paraText ) {
+      if ( paraText.startsWith( "(" ) ) {
+         paraText = paraText.substring( 1 );
+      }
+      if ( paraText.endsWith( " nos " ) || paraText.endsWith( " nec " ) ) {
+         return paraText.substring( 0, paraText.length()-4 ).trim();
+      } else if ( paraText.endsWith( ", unspecified " ) ) {
+         return paraText.substring( 0, paraText.length() - 14 ).trim();
+      } else if ( paraText.endsWith( " nos )" ) || paraText.endsWith( " nec )" ) ) {
+         return paraText.substring( 0, paraText.length() - 5 ).trim();
+      } else if ( paraText.endsWith( ", unspecified )" ) ) {
+         return paraText.substring( 0, paraText.length() - 15 ).trim();
+      } else if ( paraText.endsWith( ")" ) ) {
+         return paraText.substring( 0, paraText.length()-1 ).trim();
+      }
+      return paraText.trim();
+   }
+
+   /**
+    * @param bracketText one alternative cut out of bracketed text
+    * @return the text without a leading "[", without a trailing "]" and without a trailing
+    *         "nos", "nec" or ", unspecified" qualifier, trimmed
+    */
+   static private String trimBracketText( String bracketText ) {
+      if ( bracketText.startsWith( "[" ) ) {
+         bracketText = bracketText.substring( 1 );
+      }
+      if ( bracketText.endsWith( " nos " ) || bracketText.endsWith( " nec " ) ) {
+         return bracketText.substring( 0, bracketText.length()-4 ).trim();
+      } else if ( bracketText.endsWith( ", unspecified " ) ) {
+         return bracketText.substring( 0, bracketText.length() - 14 ).trim();
+      } else if ( bracketText.endsWith( " nos ]" ) || bracketText.endsWith( " nec ]" ) ) {
+         return bracketText.substring( 0, bracketText.length() - 5 ).trim();
+      } else if ( bracketText.endsWith( ", unspecified ]" ) ) {
+         return bracketText.substring( 0, bracketText.length() - 15 ).trim();
+      } else if ( bracketText.endsWith( "]" ) ) {
+         return bracketText.substring( 0, bracketText.length()-1 ).trim();
+      }
+      return bracketText.trim();
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,46 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+/**
+ * Writes a map of cuis and their texts to a bar-separated file, one cui|text pair per line.
+ */
+final public class CuiTextsMapWriter {
+
+   private CuiTextsMapWriter() {
+   }
+
+   /**
+    * Writes one line per cui and text.  Progress is printed to standard output every 100,000 lines.
+    * An IOException is reported on standard error and ends the writing.
+    *
+    * @param termFilePath path to the file to write
+    * @param cuiTexts     map of cuis to all texts for each cui
+    */
+   static public void writeCuiTexts( final String termFilePath, final Map<String, Collection<String>> cuiTexts ) {
+      System.out.println( "Writing map of Cuis and Texts to " + termFilePath );
+      long lineCount = 0;
+      try {
+         final BufferedWriter writer = FileUtil.createWriter( termFilePath );
+         try {
+            for ( Map.Entry<String,Collection<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
+               final String cui = cuiTextsEntry.getKey();
+               for ( String text : cuiTextsEntry.getValue() ) {
+                  lineCount++;
+                  writer.write( TokenUtil.createBsvLine( cui, text ) );
+                  writer.newLine();
+                  if ( lineCount % 100000 == 0 ) {
+                     System.out.println( "File Line " + lineCount );
+                  }
+               }
+            }
+         } finally {
+            // flush and release the file even when writing fails part way through
+            writer.close();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing Term on line " + lineCount + " in file " + termFilePath
+                             + ": " + ioE.getMessage() );
+      }
+      System.out.println( "Wrote " + lineCount + " terms to " + termFilePath );
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,25 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Static utility that writes a map of Cuis and their Tuis by way of {@link FileUtil#writeNamedSets}.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class CuiTuiMapWriter {
+
+   private CuiTuiMapWriter() {
+   }
+
+   /**
+    * Passes the map to {@link FileUtil#writeNamedSets} with the description "map of Cuis and Tuis".
+    * The method is public, like the entry points of the other writers in this package;
+    * as a private method of a non-instantiable class it could never be called.
+    *
+    * @param cuiTuiFilePath path forwarded to {@link FileUtil#writeNamedSets}
+    * @param cuisAndTuis    map of cui to the collection of tuis for that cui
+    */
+   static public void writeCuiTuiMap( final String cuiTuiFilePath,
+                                      final Map<String, Collection<String>> cuisAndTuis ) {
+      FileUtil.writeNamedSets( cuiTuiFilePath, "map of Cuis and Tuis", cuisAndTuis );
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,92 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Static utility that writes terms to a database table with one row per (cui, text) pair,
+ * storing the first whitespace-delimited word of the text in its own column.
+ * The columns written match the following table definition:
+ * <p>
+ * CREATE CACHED TABLE UMLS_MS_2011AB (
+ * CUI VARCHAR_IGNORECASE(8) NOT NULL,
+ * FWORD VARCHAR_IGNORECASE(80) NOT NULL,
+ * TEXT VARCHAR_IGNORECASE(2048) NOT NULL,
+ * CODE VARCHAR_IGNORECASE(45) NOT NULL,
+ * SOURCETYPE VARCHAR_IGNORECASE(45) NOT NULL,
+ * TUI VARCHAR_IGNORECASE(4) NOT NULL
+ * );
+ * CREATE INDEX IDX_UMLS_MS_2011AB ON UMLS_MS_2011AB( FWORD );
+ * COMMIT;
+ * </p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class FirstWordDbWriter {
+
+   private FirstWordDbWriter() {
+   }
+
+
+   /**
+    * Columns of the insert statement, each with its 1-based parameter index.
+    */
+   static private enum FIELD {
+      CUI( 1 ), FWORD( 2 ), TEXT( 3 ), CODE( 4 ), SOURCETYPE( 5 ), TUI( 6 );
+      final private int __index;
+
+      FIELD( final int index ) {
+         __index = index;
+      }
+   }
+
+
+   /**
+    * Inserts one row for every text of every cui that has an entry in cuiTuis, then issues a "commit".
+    * Cuis without an entry in cuiTuis are skipped.  The cui is written to both the CUI and CODE columns,
+    * SOURCETYPE is always "UMLS_ROOT" and only a single tui is written per row.
+    * The running row count is echoed to standard out every 100000 rows.
+    * An SQLException is not propagated: its message is printed on standard error and writing stops.
+    * Both statements are closed whether or not an exception occurred.
+    *
+    * @param cuiTuis   map of cui to the collection of tuis for that cui
+    * @param cuiTexts  map of cui to the collection of texts for that cui
+    * @param url       database url forwarded to {@link JdbcUtil#createDatabaseConnection}
+    * @param user      database user forwarded to {@link JdbcUtil#createDatabaseConnection}
+    * @param pass      database password forwarded to {@link JdbcUtil#createDatabaseConnection}
+    * @param tableName name of the table to insert into
+    */
+   static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
+                                      final Map<String, Collection<String>> cuiTexts,
+                                      final String url, final String user, final String pass, final String tableName ) {
+      // NOTE(review): the connection is not closed here because it is unclear whether JdbcUtil
+      // shares or caches it - confirm ownership and close it if this method is the sole user.
+      final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+      final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
+      System.out.println( "Writing to " + tableName );
+      PreparedStatement rowInsertSql = null;
+      Statement statement = null;
+      try {
+         rowInsertSql = connection.prepareStatement( sql );
+         long lineCount = 0;
+         for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+            final String cui = cuiTextEntry.getKey();
+            final Collection<String> tuis = cuiTuis.get( cui );
+            if ( tuis == null ) {
+               continue;
+            }
+            // the tui does not depend on the text, so resolve it once per cui
+            final String tui = getSingleTui( tuis );
+            for ( String text : cuiTextEntry.getValue() ) {
+               final String[] tokens = text.split( "\\s+" );
+               rowInsertSql.setString( FIELD.CUI.__index, cui );
+               rowInsertSql.setString( FIELD.FWORD.__index, tokens[0] );
+               rowInsertSql.setString( FIELD.TEXT.__index, text );
+               rowInsertSql.setString( FIELD.CODE.__index, cui );
+               rowInsertSql.setString( FIELD.SOURCETYPE.__index, "UMLS_ROOT" );
+               rowInsertSql.setString( FIELD.TUI.__index, tui );
+               rowInsertSql.executeUpdate();
+               lineCount++;
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "DB Row " + lineCount );
+               }
+            }
+         }
+         System.out.println( "DB Rows " + lineCount );
+
+         statement = connection.createStatement();
+         statement.execute( "commit" );
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      } finally {
+         // release the statements on every path; previously a failed insert leaked the prepared
+         // statement and the commit statement was never closed at all
+         closeStatement( statement );
+         closeStatement( rowInsertSql );
+      }
+   }
+
+
+   /**
+    * Closes the statement if it was created, reporting rather than propagating any failure.
+    *
+    * @param statement statement to close, may be null
+    */
+   static private void closeStatement( final Statement statement ) {
+      if ( statement == null ) {
+         return;
+      }
+      try {
+         statement.close();
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      }
+   }
+
+
+   /**
+    * @param tuis collection of tuis
+    * @return the first tui in iteration order, or "T000" if the collection is empty
+    */
+   static private String getSingleTui( final Collection<String> tuis ) {
+      return tuis.isEmpty() ? "T000" : tuis.iterator().next();
+   }
+
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,93 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+import org.apache.ctakes.dictionarytool.util.RareWordUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Collection;
+import java.util.Map;
+
+
+/**
+ * <p>
+ * CREATE CACHED TABLE CTAKES_UMLS (
+ * CUI VARCHAR_IGNORECASE(12),
+ * TUI VARCHAR_IGNORECASE(48),
+ * RINDEX INTEGER,
+ * TCOUNT INTEGER,
+ * TEXT VARCHAR_IGNORECASE(255),
+ * RWORD VARCHAR_IGNORECASE(48)
+ * );
+ * CREATE INDEX IDX_CTAKES_UMLS ON CTAKES_UMLS( RWORD );
+ * COMMIT;
+ * </p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class RareWordDbWriter {
+
+ private RareWordDbWriter() {}
+
+ static private enum FIELD {
+ CUI(1),TUI(2),RINDEX(3),TCOUNT(4),TEXT(5),RWORD(6);
+ final private int __index;
+ FIELD( final int index ) {
+ __index = index;
+ }
+ }
+
+
+ static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
+ final Map<String, Collection<String>> cuiTexts,
+ final String url, final String user, final String pass, final String tableName ) {
+ final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+ final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
+ try {
+ final PreparedStatement rowInsertSql = connection.prepareStatement( sql );
+ final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( cuiTexts );
+ long lineCount = 0;
+ for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+ final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
+ if ( tuis == null ) {
+ continue;
+ }
+ for ( String text : cuiTextEntry.getValue() ) {
+ final String[] tokens = text.split( "\\s+" );
+ int bestIndex = 0;
+ int bestCount = Integer.MAX_VALUE;
+ for ( int i = 0; i < tokens.length; i++ ) {
+ Integer count = tokenCounts.get( tokens[i] );
+ if ( count != null && count < bestCount ) {
+ bestIndex = i;
+ bestCount = count;
+ }
+ }
+ rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
+ rowInsertSql.setString( FIELD.TUI.__index, TokenUtil.createCsvLine( tuis ) );
+ rowInsertSql.setInt( FIELD.RINDEX.__index, bestIndex );
+ rowInsertSql.setInt( FIELD.TCOUNT.__index, tokens.length );
+ rowInsertSql.setString( FIELD.TEXT.__index, text );
+ rowInsertSql.setString( FIELD.RWORD.__index, tokens[bestIndex] );
+ rowInsertSql.executeUpdate();
+ lineCount++;
+ if ( lineCount % 100000 == 0 ) {
+ System.out.println( "DB Row " + lineCount );
+ }
+ }
+ }
+ System.out.println( "DB Rows " + lineCount );
+
+ final Statement statement = connection.createStatement();
+ statement.execute( "commit" );
+ rowInsertSql.close();
+ } catch ( SQLException sqlE ) {
+ System.err.println( sqlE.getMessage() );
+ }
+ }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,20 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.util.Collection;
+
+/**
+ * Non-instantiable holder for the static method that writes a collection of Tuis
+ * by way of {@link FileUtil#writeOneColumn}.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+public final class TuiListWriter {
+
+   private TuiListWriter() {
+   }
+
+   /**
+    * Passes the Tuis to {@link FileUtil#writeOneColumn} with the description "list of Tuis".
+    *
+    * @param tuiFilePath path forwarded to {@link FileUtil#writeOneColumn}
+    * @param typeTuis    collection of tuis to write
+    */
+   public static void writeTuiList( final String tuiFilePath, final Collection<String> typeTuis ) {
+      FileUtil.writeOneColumn( tuiFilePath, "list of Tuis", typeTuis );
+   }
+
+}
Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java
------------------------------------------------------------------------------
svn:eol-style = native