You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/07/09 21:39:02 UTC
svn commit: r1690150 - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src:
main/java/org/apache/ctakes/dictionary/lookup2/concept/
main/java/org/apache/ctakes/dictionary/lookup2/consumer/
main/java/org/apache/ctakes/dictionary/lookup2/dictionary/ main/...
Author: seanfinan
Date: Thu Jul 9 19:39:02 2015
New Revision: 1690150
URL: http://svn.apache.org/r1690150
Log:
CTAKES-368 Adding capability to use custom cui prefixes
Added:
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java
Modified:
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java Thu Jul 9 19:39:02 2015
@@ -35,18 +35,13 @@ final public class BsvConceptFactory imp
}
public BsvConceptFactory( final String name, final String bsvFilePath ) {
-// this( name, new File( bsvFilePath ) );
-// }
-//
-// public BsvConceptFactory( final String name, final File bsvFile ) {
-// final Collection<CuiTuiTerm> cuiTuiTerms = parseBsvFile( bsvFile );
final Collection<CuiTuiTerm> cuiTuiTerms = parseBsvFile( bsvFilePath );
final Map<Long, Concept> conceptMap = new HashMap<>( cuiTuiTerms.size() );
for ( CuiTuiTerm cuiTuiTerm : cuiTuiTerms ) {
final CollectionMap<ConceptCode, String, ? extends Collection<String>> codes
= new EnumSetMap<>( ConceptCode.class );
codes.placeValue( ConceptCode.TUI, TuiCodeUtil.getAsTui( cuiTuiTerm.getTui() ) );
- conceptMap.put( CuiCodeUtil.getCuiCode( cuiTuiTerm.getCui() ),
+ conceptMap.put( CuiCodeUtil.getInstance().getCuiCode( cuiTuiTerm.getCui() ),
new Concept( cuiTuiTerm.getCui(), cuiTuiTerm.getPrefTerm(), codes ) );
}
_delegateFactory = new MemConceptFactory( name, conceptMap );
@@ -98,7 +93,6 @@ final public class BsvConceptFactory imp
* @param bsvFilePath file containing term rows and bsv columns
* @return collection of all valid terms read from the bsv file
*/
-// static private Collection<CuiTuiTerm> parseBsvFile( final File bsvFile ) {
static private Collection<CuiTuiTerm> parseBsvFile( final String bsvFilePath ) {
File bsvFile = null;
try {
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java Thu Jul 9 19:39:02 2015
@@ -43,7 +43,6 @@ public class JdbcConceptFactory extends
static private final String ICD10_TABLE = "icd10Table";
- // final private Connection _connection;
private PreparedStatement _selectTuiCall;
private PreparedStatement _selectPrefTermCall;
private PreparedStatement _selectSnomedCall;
@@ -117,7 +116,7 @@ public class JdbcConceptFactory extends
if ( _selectIcd10Call != null ) {
codes.addAllValues( ICD10PCS, getStringCodes( _selectIcd10Call, cuiCode ) );
}
- return new Concept( CuiCodeUtil.getAsCui( cuiCode ), prefTerm, codes );
+ return new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ), prefTerm, codes );
}
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java Thu Jul 9 19:39:02 2015
@@ -27,7 +27,7 @@ final public class MemConceptFactory ext
public Concept createConcept( final Long cuiCode ) {
Concept concept = _conceptMap.get( cuiCode );
if ( concept == null ) {
- concept = new Concept( CuiCodeUtil.getAsCui( cuiCode ) );
+ concept = new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ) );
_conceptMap.put( cuiCode, concept );
}
return concept;
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java Thu Jul 9 19:39:02 2015
@@ -29,6 +29,7 @@ import org.apache.ctakes.typesystem.type
import org.apache.ctakes.typesystem.type.textsem.*;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
@@ -64,7 +65,8 @@ final public class DefaultTermConsumer e
for ( Map.Entry<TextSpan, ? extends Collection<Long>> spanCuis : textSpanCuis ) {
umlsConceptList.clear();
for ( Long cuiCode : spanCuis.getValue() ) {
- umlsConceptList.addAll( createUmlsConcepts( jcas, defaultScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
+ umlsConceptList
+ .addAll( createUmlsConcepts( jcas, defaultScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
}
final FSArray conceptArr = new FSArray( jcas, umlsConceptList.size() );
int arrIdx = 0;
@@ -80,10 +82,9 @@ final public class DefaultTermConsumer e
annotation.setOntologyConceptArr( conceptArr );
annotation.addToIndexes();
}
- } catch ( Exception e ) {
- // TODO Poor form - refactor
+ } catch ( CASRuntimeException crtE ) {
// What is really thrown? The jcas "throwFeatMissing" is not a great help
- throw new AnalysisEngineProcessException( e );
+ throw new AnalysisEngineProcessException( crtE );
}
}
@@ -119,7 +120,7 @@ final public class DefaultTermConsumer e
final Collection<Concept> concepts = conceptMap.getCollection( cuiCode );
if ( concepts == null || concepts.isEmpty() ) {
return Arrays.asList( createUmlsConcept( jcas, defaultScheme,
- CuiCodeUtil.getAsCui( cuiCode ), null, null, null ) );
+ CuiCodeUtil.getInstance().getAsCui( cuiCode ), null, null, null ) );
}
final Collection<UmlsConcept> umlsConcepts = new HashSet<>();
for ( Concept concept : concepts ) {
@@ -155,7 +156,8 @@ final public class DefaultTermConsumer e
}
}
if ( concepts.isEmpty() ) {
- concepts.add( createUmlsConcept( jcas, defaultScheme, concept.getCui(), tui, concept.getPreferredText(), null ) );
+ concepts.add( createUmlsConcept( jcas, defaultScheme, concept.getCui(), tui, concept
+ .getPreferredText(), null ) );
}
return concepts;
}
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java Thu Jul 9 19:39:02 2015
@@ -363,7 +363,7 @@ final public class RareWordTermMapCreato
public CuiTerm( final String cui, final String term ) {
__term = getTokenizedTerm( term );
- __cui = CuiCodeUtil.getCuiCode( cui );
+ __cui = CuiCodeUtil.getInstance().getCuiCode( cui );
__hashcode = (__cui + "_" + __term).hashCode();
}
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java Thu Jul 9 19:39:02 2015
@@ -1,60 +1,117 @@
package org.apache.ctakes.dictionary.lookup2.util;
-import java.util.Collection;
-import java.util.HashSet;
+import org.apache.log4j.Logger;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
/**
* Author: SPF
* Affiliation: CHIP-NLP
* Date: 9/5/2014
*/
-final public class CuiCodeUtil {
+public enum CuiCodeUtil {
+ INSTANCE;
- private CuiCodeUtil() {
+ static public CuiCodeUtil getInstance() {
+ return INSTANCE;
}
- static public String getAsCui( final Long code ) {
- final StringBuilder sb = new StringBuilder( 8 );
- sb.append( code );
- return getAsCui( sb );
- }
+ static private final Logger LOGGER = Logger.getLogger( "CuiCodeUtil" );
+ static private final long PREFIX_MULTIPLIER = 100000000;
+
+ final private List<PrefixerPair> _prefixerPairList = new ArrayList<>();
- static public String getAsCui( final String code ) {
- if ( code.length() == 8 && code.startsWith( "C" ) ) {
- return code;
- }
- final StringBuilder sb = new StringBuilder( 8 );
- sb.append( code.replace( "C", "" ) );
- return getAsCui( sb );
+ private CuiCodeUtil() {
+ // Add the standard C as the default encoding prefix
+ _prefixerPairList.add( new PrefixerPair( "C0000000" ) );
}
- static private String getAsCui( final StringBuilder sb ) {
- while ( sb.length() < 7 ) {
- sb.insert( 0, '0' );
+ public String getAsCui( final Long code ) {
+ final long multiplier = code / PREFIX_MULTIPLIER;
+ if ( code < 0 || multiplier < 0 || multiplier >= _prefixerPairList.size() ) {
+ LOGGER.error( "Could not create Cui String for " + code );
+ return "" + code;
}
- sb.insert( 0, 'C' );
- return sb.toString();
+ return _prefixerPairList.get( (int)multiplier ).getAsCui( code % PREFIX_MULTIPLIER );
}
- static public Long getCuiCode( final String cui ) {
- final String cuiText = getAsCui( cui );
- final String cuiNum = cuiText.substring( 1, cuiText.length() );
- try {
- return Long.parseLong( cuiNum );
- } catch ( NumberFormatException nfE ) {
- System.err.println( "Could not create Cui Code for " + cui );
+ public Long getCuiCode( final String cui ) {
+ final PrefixerPair prefixerPair = new PrefixerPair( cui );
+ int prefixerIndex = _prefixerPairList.indexOf( prefixerPair );
+ if ( prefixerIndex < 0 ) {
+ prefixerIndex = _prefixerPairList.size();
+ _prefixerPairList.add( prefixerPair );
}
- return -1l;
+ return prefixerPair.getCuiCode( cui, prefixerIndex );
}
- static public Collection<Long> getCuiCodes( final Collection<String> cuis ) {
- final Collection<Long> cuiCodes = new HashSet<>( cuis.size() );
- for ( String cui : cuis ) {
- cuiCodes.add( getCuiCode( cui ) );
+
+ static private final class PrefixerPair {
+ final private int __digitCount;
+ final private char[] __prefix;
+ final private int __hashCode;
+
+ private PrefixerPair( final String cui ) {
+ final char[] chars = cui.toCharArray();
+ int digitCount = 0;
+ while ( digitCount < chars.length
+ && digitCount < 7
+ && Character.isDigit( chars[ chars.length - 1 - digitCount ] ) ) {
+ digitCount++;
+ }
+ __digitCount = digitCount;
+ __prefix = Arrays.copyOfRange( chars, 0, chars.length - digitCount );
+ __hashCode = digitCount + Arrays.hashCode( __prefix );
+ }
+
+ private Long getCuiCode( final String cui, final int multiplier ) {
+ final String cuiNum = cui.substring( cui.length() - __digitCount, cui.length() );
+ try {
+ return PREFIX_MULTIPLIER * multiplier + Long.parseLong( cuiNum );
+ } catch ( NumberFormatException nfE ) {
+ LOGGER.error( "Could not create Cui Code for " + cui );
+ }
+ return -1l;
+ }
+
+ /**
+ * Rebuilds the cui text for a numeric code: the code is left-padded with zeros
+ * up to this pair's digit count and this pair's prefix is placed in front of it.
+ *
+ * @param code numeric part of a cui; the caller passes the code with the prefix multiplier removed
+ * @return prefix followed by the zero-padded code. If the code has more digits than
+ * this pair's digit count, an error is logged and prefix + code is returned unpadded
+ */
+ private String getAsCui( final Long code ) {
+ final char[] codeChars = String.valueOf( code ).toCharArray();
+ if ( codeChars.length > __digitCount ) {
+ // __prefix is a char[]; concatenating it directly would log "[C@<hash>" instead of its text
+ final String prefix = String.valueOf( __prefix );
+ LOGGER.error( "Invalid code " + code + " for prefix " + prefix
+ + " has more than " + __digitCount + " digits" );
+ return prefix + String.valueOf( codeChars );
+ }
+ final int cuiLength = __prefix.length + __digitCount;
+ final char[] cuiChars = new char[ cuiLength ];
+ // prefix goes at the front, the code digits are right-aligned at the end
+ System.arraycopy( __prefix, 0, cuiChars, 0, __prefix.length );
+ System.arraycopy( codeChars, 0, cuiChars, cuiLength - codeChars.length, codeChars.length );
+ // fill the gap between prefix and code digits with zeros
+ for ( int i = __prefix.length; i < cuiLength - codeChars.length; i++ ) {
+ cuiChars[ i ] = '0';
+ }
+ return String.valueOf( cuiChars );
+ }
+
+ public int hashCode() {
+ return __hashCode;
+ }
+
+ public boolean equals( final Object other ) {
+ return other instanceof PrefixerPair
+ && __hashCode == ((PrefixerPair)other).__hashCode
+ && __digitCount == ((PrefixerPair)other).__digitCount
+ && Arrays.equals( __prefix, ((PrefixerPair)other).__prefix );
}
- return cuiCodes;
}
+ // todo
+ // todo switch to int: 32 bit signed, max = 2,147,483,647
+ // todo if we keep final 7 digits for the numerical then we have 213 possible prefixes
+ // todo
+ // todo can probably change the code and the db will be fine, change the db too
+ // todo
+
}
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java Thu Jul 9 19:39:02 2015
@@ -56,7 +56,7 @@ final public class DefaultDictionarySpec
}
public Concept createConcept( final Long cuiCode ) {
- return new Concept( CuiCodeUtil.getAsCui( cuiCode ) );
+ return new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ) );
}
public Map<Long, Concept> createConcepts( final Collection<Long> cuiCodes ) {
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java?rev=1690150&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java Thu Jul 9 19:39:02 2015
@@ -0,0 +1,61 @@
+package org.apache.ctakes.dictionary.lookup2.util;
+
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/9/2015
+ */
+final public class CuiCodeUtilTester {
+
+
+ @Test
+ public void testGetAsCuiDefault() {
+ assertEquals( "Standard Prefix \'C\' not preppended as default",
+ "C0000123", CuiCodeUtil.getInstance().getAsCui( 123l ) );
+ }
+
+ @Test
+ public void testGetAsCuiCustom() {
+ final long bing123 = CuiCodeUtil.getInstance().getCuiCode( "BING123" );
+ final long binger = bing123 - 123;
+ assertEquals( "Custom Prefix \'BING\' not preppended",
+ "BING123", CuiCodeUtil.getInstance().getAsCui( bing123 ) );
+ assertEquals( "Custom Prefix \'BING\' not preppended",
+ "BING004", CuiCodeUtil.getInstance().getAsCui( binger + 4 ) );
+
+ final long bang123 = CuiCodeUtil.getInstance().getCuiCode( "BANG123" );
+ final long banger = bang123 - 123;
+ assertEquals( "Custom Prefix \'BANG\' not preppended",
+ "BANG123", CuiCodeUtil.getInstance().getAsCui( bang123 ) );
+ assertEquals( "Custom Prefix \'BANG\' not preppended",
+ "BANG004", CuiCodeUtil.getInstance().getAsCui( banger + 4 ) );
+ }
+
+ @Test
+ public void breakCodeTooLarge() {
+ final long bing123 = CuiCodeUtil.getInstance().getCuiCode( "BING123" );
+ final long binger = bing123 - 123;
+ assertEquals( "Cui length not expanded for large code",
+ "BING1004", CuiCodeUtil.getInstance().getAsCui( binger + 1004 ) );
+ }
+
+
+ @Test
+ public void breakCodeIsNegative() {
+ assertEquals( "Negative code did not return as-is",
+ "-1", CuiCodeUtil.getInstance().getAsCui( -1l ) );
+ }
+
+ @Test
+ public void breakPrefixUnknown() {
+ assertEquals( "Huge code did not return as-is",
+ "10000000000000", CuiCodeUtil.getInstance().getAsCui( 10000000000000l ) );
+ }
+
+
+}