You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/07/09 21:39:02 UTC

svn commit: r1690150 - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src: main/java/org/apache/ctakes/dictionary/lookup2/concept/ main/java/org/apache/ctakes/dictionary/lookup2/consumer/ main/java/org/apache/ctakes/dictionary/lookup2/dictionary/ main/...

Author: seanfinan
Date: Thu Jul  9 19:39:02 2015
New Revision: 1690150

URL: http://svn.apache.org/r1690150
Log:
CTAKES-368  Adding capability to use custom cui prefixes

Added:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java
Modified:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java Thu Jul  9 19:39:02 2015
@@ -35,18 +35,13 @@ final public class BsvConceptFactory imp
    }
 
    public BsvConceptFactory( final String name, final String bsvFilePath ) {
-//      this( name, new File( bsvFilePath ) );
-//   }
-//
-//   public BsvConceptFactory( final String name, final File bsvFile ) {
-//      final Collection<CuiTuiTerm> cuiTuiTerms = parseBsvFile( bsvFile );
       final Collection<CuiTuiTerm> cuiTuiTerms = parseBsvFile( bsvFilePath );
       final Map<Long, Concept> conceptMap = new HashMap<>( cuiTuiTerms.size() );
       for ( CuiTuiTerm cuiTuiTerm : cuiTuiTerms ) {
          final CollectionMap<ConceptCode, String, ? extends Collection<String>> codes
                = new EnumSetMap<>( ConceptCode.class );
          codes.placeValue( ConceptCode.TUI, TuiCodeUtil.getAsTui( cuiTuiTerm.getTui() ) );
-         conceptMap.put( CuiCodeUtil.getCuiCode( cuiTuiTerm.getCui() ),
+         conceptMap.put( CuiCodeUtil.getInstance().getCuiCode( cuiTuiTerm.getCui() ),
                new Concept( cuiTuiTerm.getCui(), cuiTuiTerm.getPrefTerm(), codes ) );
       }
       _delegateFactory = new MemConceptFactory( name, conceptMap );
@@ -98,7 +93,6 @@ final public class BsvConceptFactory imp
     * @param bsvFilePath file containing term rows and bsv columns
     * @return collection of all valid terms read from the bsv file
     */
-//   static private Collection<CuiTuiTerm> parseBsvFile( final File bsvFile ) {
    static private Collection<CuiTuiTerm> parseBsvFile( final String bsvFilePath ) {
       File bsvFile = null;
       try {

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java Thu Jul  9 19:39:02 2015
@@ -43,7 +43,6 @@ public class JdbcConceptFactory extends
    static private final String ICD10_TABLE = "icd10Table";
 
 
-   //   final private Connection _connection;
    private PreparedStatement _selectTuiCall;
    private PreparedStatement _selectPrefTermCall;
    private PreparedStatement _selectSnomedCall;
@@ -117,7 +116,7 @@ public class JdbcConceptFactory extends
       if ( _selectIcd10Call != null ) {
          codes.addAllValues( ICD10PCS, getStringCodes( _selectIcd10Call, cuiCode ) );
       }
-      return new Concept( CuiCodeUtil.getAsCui( cuiCode ), prefTerm, codes );
+      return new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ), prefTerm, codes );
    }
 
 

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java Thu Jul  9 19:39:02 2015
@@ -27,7 +27,7 @@ final public class MemConceptFactory ext
    public Concept createConcept( final Long cuiCode ) {
       Concept concept = _conceptMap.get( cuiCode );
       if ( concept == null ) {
-         concept = new Concept( CuiCodeUtil.getAsCui( cuiCode ) );
+         concept = new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ) );
          _conceptMap.put( cuiCode, concept );
       }
       return concept;

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java Thu Jul  9 19:39:02 2015
@@ -29,6 +29,7 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.*;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASRuntimeException;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
 
@@ -64,7 +65,8 @@ final public class DefaultTermConsumer e
          for ( Map.Entry<TextSpan, ? extends Collection<Long>> spanCuis : textSpanCuis ) {
             umlsConceptList.clear();
             for ( Long cuiCode : spanCuis.getValue() ) {
-               umlsConceptList.addAll( createUmlsConcepts( jcas, defaultScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
+               umlsConceptList
+                     .addAll( createUmlsConcepts( jcas, defaultScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
             }
             final FSArray conceptArr = new FSArray( jcas, umlsConceptList.size() );
             int arrIdx = 0;
@@ -80,10 +82,9 @@ final public class DefaultTermConsumer e
             annotation.setOntologyConceptArr( conceptArr );
             annotation.addToIndexes();
          }
-      } catch ( Exception e ) {
-         // TODO Poor form - refactor
+      } catch ( CASRuntimeException crtE ) {
          // What is really thrown?  The jcas "throwFeatMissing" is not a great help
-         throw new AnalysisEngineProcessException( e );
+         throw new AnalysisEngineProcessException( crtE );
       }
    }
 
@@ -119,7 +120,7 @@ final public class DefaultTermConsumer e
       final Collection<Concept> concepts = conceptMap.getCollection( cuiCode );
       if ( concepts == null || concepts.isEmpty() ) {
          return Arrays.asList( createUmlsConcept( jcas, defaultScheme,
-               CuiCodeUtil.getAsCui( cuiCode ), null, null, null ) );
+               CuiCodeUtil.getInstance().getAsCui( cuiCode ), null, null, null ) );
       }
       final Collection<UmlsConcept> umlsConcepts = new HashSet<>();
       for ( Concept concept : concepts ) {
@@ -155,7 +156,8 @@ final public class DefaultTermConsumer e
          }
       }
       if ( concepts.isEmpty() ) {
-         concepts.add( createUmlsConcept( jcas, defaultScheme, concept.getCui(), tui, concept.getPreferredText(), null ) );
+         concepts.add( createUmlsConcept( jcas, defaultScheme, concept.getCui(), tui, concept
+               .getPreferredText(), null ) );
       }
       return concepts;
    }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java Thu Jul  9 19:39:02 2015
@@ -363,7 +363,7 @@ final public class RareWordTermMapCreato
 
       public CuiTerm( final String cui, final String term ) {
          __term = getTokenizedTerm( term );
-         __cui = CuiCodeUtil.getCuiCode( cui );
+         __cui = CuiCodeUtil.getInstance().getCuiCode( cui );
          __hashcode = (__cui + "_" + __term).hashCode();
       }
 

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java Thu Jul  9 19:39:02 2015
@@ -1,60 +1,117 @@
 package org.apache.ctakes.dictionary.lookup2.util;
 
-import java.util.Collection;
-import java.util.HashSet;
+import org.apache.log4j.Logger;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 /**
  * Author: SPF
  * Affiliation: CHIP-NLP
  * Date: 9/5/2014
  */
-final public class CuiCodeUtil {
+public enum CuiCodeUtil {
+   INSTANCE;
 
-   private CuiCodeUtil() {
+   static public CuiCodeUtil getInstance() {
+      return INSTANCE;
    }
 
-   static public String getAsCui( final Long code ) {
-      final StringBuilder sb = new StringBuilder( 8 );
-      sb.append( code );
-      return getAsCui( sb );
-   }
+   static private final Logger LOGGER = Logger.getLogger( "CuiCodeUtil" );
+   static private final long PREFIX_MULTIPLIER = 100000000;
+
+   final private List<PrefixerPair> _prefixerPairList = new ArrayList<>();
 
-   static public String getAsCui( final String code ) {
-      if ( code.length() == 8 && code.startsWith( "C" ) ) {
-         return code;
-      }
-      final StringBuilder sb = new StringBuilder( 8 );
-      sb.append( code.replace( "C", "" ) );
-      return getAsCui( sb );
+   private CuiCodeUtil() {
+      // Add the standard C as the default encoding prefix
+      _prefixerPairList.add( new PrefixerPair( "C0000000" ) );
    }
 
-   static private String getAsCui( final StringBuilder sb ) {
-      while ( sb.length() < 7 ) {
-         sb.insert( 0, '0' );
+   public String getAsCui( final Long code ) {
+      final long multiplier = code / PREFIX_MULTIPLIER;
+      if ( code < 0 || multiplier < 0 || multiplier >= _prefixerPairList.size() ) {
+         LOGGER.error( "Could not create Cui String for " + code );
+         return "" + code;
       }
-      sb.insert( 0, 'C' );
-      return sb.toString();
+      return _prefixerPairList.get( (int)multiplier ).getAsCui( code % PREFIX_MULTIPLIER );
    }
 
 
-   static public Long getCuiCode( final String cui ) {
-      final String cuiText = getAsCui( cui );
-      final String cuiNum = cuiText.substring( 1, cuiText.length() );
-      try {
-         return Long.parseLong( cuiNum );
-      } catch ( NumberFormatException nfE ) {
-         System.err.println( "Could not create Cui Code for " + cui );
+   public Long getCuiCode( final String cui ) {
+      final PrefixerPair prefixerPair = new PrefixerPair( cui );
+      int prefixerIndex = _prefixerPairList.indexOf( prefixerPair );
+      if ( prefixerIndex < 0 ) {
+         prefixerIndex = _prefixerPairList.size();
+         _prefixerPairList.add( prefixerPair );
       }
-      return -1l;
+      return prefixerPair.getCuiCode( cui, prefixerIndex );
    }
 
-   static public Collection<Long> getCuiCodes( final Collection<String> cuis ) {
-      final Collection<Long> cuiCodes = new HashSet<>( cuis.size() );
-      for ( String cui : cuis ) {
-         cuiCodes.add( getCuiCode( cui ) );
+
+   static private final class PrefixerPair {
+      final private int __digitCount;
+      final private char[] __prefix;
+      final private int __hashCode;
+
+      private PrefixerPair( final String cui ) { // splits cui into a leading prefix and a count of trailing digits
+         final char[] chars = cui.toCharArray();
+         int digitCount = 0;
+         while ( digitCount < chars.length // walk backward from the last character of the cui ...
+                 && digitCount < 7 // ... counting at most 7 digits ...
+                 && Character.isDigit( chars[ chars.length - 1 - digitCount ] ) ) { // ... until a non-digit is met
+            digitCount++;
+         }
+         __digitCount = digitCount;
+         __prefix = Arrays.copyOfRange( chars, 0, chars.length - digitCount ); // everything before the counted digits
+         __hashCode = digitCount + Arrays.hashCode( __prefix ); // computed once here and returned by hashCode()
+      }
+
+      private Long getCuiCode( final String cui, final int multiplier ) { // code = PREFIX_MULTIPLIER * multiplier + trailing number
+         final String cuiNum = cui.substring( cui.length() - __digitCount, cui.length() ); // last __digitCount characters of cui
+         try {
+            return PREFIX_MULTIPLIER * multiplier + Long.parseLong( cuiNum );
+         } catch ( NumberFormatException nfE ) {
+            LOGGER.error( "Could not create Cui Code for " + cui );
+         }
+         return -1l; // reached only when cuiNum could not be parsed as a long
+      }
+
+      // Builds the cui text: prefix followed by code, left-padded with '0' to __digitCount digits.
+      private String getAsCui( final Long code ) {
+         final char[] codeChars = String.valueOf( code ).toCharArray();
+         if ( codeChars.length > __digitCount ) { // too wide to pad: log, then return prefix + code as-is
+            // String.valueOf is required: concatenating the char[] itself would log "[C@<hash>"
+            LOGGER.error( "Invalid code " + code + " for prefix " + String.valueOf( __prefix )
+                          + " has more than " + __digitCount + " digits" );
+            return String.valueOf( __prefix ) + String.valueOf( codeChars );
+         }
+         final int cuiLength = __prefix.length + __digitCount;
+         final char[] cuiChars = new char[ cuiLength ];
+         System.arraycopy( __prefix, 0, cuiChars, 0, __prefix.length );
+         System.arraycopy( codeChars, 0, cuiChars, cuiLength - codeChars.length, codeChars.length );
+         Arrays.fill( cuiChars, __prefix.length, cuiLength - codeChars.length, '0' ); // zero padding
+         return String.valueOf( cuiChars );
+      }
+
+      public int hashCode() {
+         return __hashCode; // value precomputed in the constructor from the digit count and prefix
+      }
+
+      public boolean equals( final Object other ) { // equal when digit count and prefix characters both match
+         return other instanceof PrefixerPair
+                && __hashCode == ((PrefixerPair)other).__hashCode // cheap check before the array comparison
+                && __digitCount == ((PrefixerPair)other).__digitCount
+                && Arrays.equals( __prefix, ((PrefixerPair)other).__prefix );
+      }
-      return cuiCodes;
    }
 
+   // todo
+   // todo switch to int: 32 bit signed, max = 2,147,483,647
+   // todo if we keep final 7 digits for the numerical then we have 213 possible prefixes
+   // todo
+   // todo can probably change the code and the db will be fine, change the db too
+   // todo
+
 
 }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java?rev=1690150&r1=1690149&r2=1690150&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DefaultDictionarySpec.java Thu Jul  9 19:39:02 2015
@@ -56,7 +56,7 @@ final public class DefaultDictionarySpec
       }
 
       public Concept createConcept( final Long cuiCode ) {
-         return new Concept( CuiCodeUtil.getAsCui( cuiCode ) );
+         return new Concept( CuiCodeUtil.getInstance().getAsCui( cuiCode ) );
       }
 
       public Map<Long, Concept> createConcepts( final Collection<Long> cuiCodes ) {

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java?rev=1690150&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/test/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtilTester.java Thu Jul  9 19:39:02 2015
@@ -0,0 +1,61 @@
+package org.apache.ctakes.dictionary.lookup2.util;
+
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/9/2015
+ */
+final public class CuiCodeUtilTester { // JUnit tests for CuiCodeUtil cui text <-> numeric code conversion
+
+
+   @Test
+   public void testGetAsCuiDefault() { // code 123 is expected to render as "C0000123"
+      assertEquals( "Standard Prefix \'C\' not preppended as default",
+            "C0000123", CuiCodeUtil.getInstance().getAsCui( 123l ) );
+   }
+
+   @Test
+   public void testGetAsCuiCustom() { // custom prefixes must survive a cui -> code -> cui round trip
+      final long bing123 = CuiCodeUtil.getInstance().getCuiCode( "BING123" );
+      final long binger = bing123 - 123; // the BING code with its numeric part removed
+      assertEquals( "Custom Prefix \'BING\' not preppended",
+            "BING123", CuiCodeUtil.getInstance().getAsCui( bing123 ) );
+      assertEquals( "Custom Prefix \'BING\' not preppended",
+            "BING004", CuiCodeUtil.getInstance().getAsCui( binger + 4 ) );
+
+      final long bang123 = CuiCodeUtil.getInstance().getCuiCode( "BANG123" );
+      final long banger = bang123 - 123; // the BANG code with its numeric part removed
+      assertEquals( "Custom Prefix \'BANG\' not preppended",
+            "BANG123", CuiCodeUtil.getInstance().getAsCui( bang123 ) );
+      assertEquals( "Custom Prefix \'BANG\' not preppended",
+            "BANG004", CuiCodeUtil.getInstance().getAsCui( banger + 4 ) );
+   }
+
+   @Test
+   public void breakCodeTooLarge() { // 1004 has four digits while "BING123" supplied only three
+      final long bing123 = CuiCodeUtil.getInstance().getCuiCode( "BING123" );
+      final long binger = bing123 - 123;
+      assertEquals( "Cui length not expanded for large code",
+            "BING1004", CuiCodeUtil.getInstance().getAsCui( binger + 1004 ) );
+   }
+
+
+   @Test
+   public void breakCodeIsNegative() { // a negative code is expected back as its plain decimal text
+      assertEquals( "Negative code did not return as-is",
+            "-1", CuiCodeUtil.getInstance().getAsCui( -1l ) );
+   }
+
+   @Test
+   public void breakPrefixUnknown() { // a code too large for any registered prefix is expected back as plain text
+      assertEquals( "Huge code did not return as-is",
+            "10000000000000", CuiCodeUtil.getInstance().getAsCui( 10000000000000l ) );
+   }
+
+
+}