You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2017/08/17 01:44:26 UTC

svn commit: r1805250 - /ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java

Author: seanfinan
Date: Thu Aug 17 01:44:25 2017
New Revision: 1805250

URL: http://svn.apache.org/viewvc?rev=1805250&view=rev
Log:
Add blacklist of terms

Modified:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java?rev=1805250&r1=1805249&r2=1805250&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java Thu Aug 17 01:44:25 2017
@@ -18,7 +18,9 @@
  */
 package org.apache.ctakes.dictionary.lookup2.consumer;
 
+import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.core.util.collection.CollectionMap;
+import org.apache.ctakes.core.util.collection.HashSetMap;
 import org.apache.ctakes.dictionary.lookup2.concept.Concept;
 import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
 import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
@@ -26,12 +28,17 @@ import org.apache.ctakes.dictionary.look
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
 import org.apache.ctakes.typesystem.type.textsem.*;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CASRuntimeException;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
 
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
 import java.util.*;
 
 import static org.apache.ctakes.typesystem.type.constants.CONST.*;
@@ -43,9 +50,16 @@ import static org.apache.ctakes.typesyst
  * Date: 1/9/14
  */
 final public class DefaultTermConsumer extends AbstractTermConsumer {
+   static private final Logger LOGGER = Logger.getLogger( "DefaultTermConsumer" );
+
+   static private final String BLACKLIST_KEY = "Blacklist";
 
    final private UmlsConceptCreator _umlsConceptCreator;
 
+   // Blacklisted texts (trimmed, lowercased) keyed by the semantic type code parsed from the blacklist file.
+   private final CollectionMap<Integer, String, Set<String>> _blacklists = new HashSetMap<>();
+
+
    public DefaultTermConsumer( final UimaContext uimaContext, final Properties properties ) {
       this( uimaContext, properties, new DefaultUmlsConceptCreator() );
    }
@@ -54,8 +68,69 @@ final public class DefaultTermConsumer e
                                final UmlsConceptCreator umlsConceptCreator ) {
       super( uimaContext, properties );
       _umlsConceptCreator = umlsConceptCreator;
+      String blacklistPath = EnvironmentVariable.getEnv( BLACKLIST_KEY, uimaContext );
+      if ( blacklistPath == null || blacklistPath.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+         blacklistPath = properties.getProperty( BLACKLIST_KEY );
+      }
+      if ( blacklistPath != null && !blacklistPath.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+         loadBlacklist( blacklistPath );
+      }
+   }
+
+   /**
+    * Loads {@code <semanticType>|<text>} lines into the blacklist, skipping blank lines and // or # comments.
+    * @param blacklistPath path to file containing text that should be blacklisted from the dictionary
+    */
+   private void loadBlacklist( final String blacklistPath ) {
+      LOGGER.info( "Loading Term Blacklist " + blacklistPath );
+      try ( BufferedReader reader = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( blacklistPath ) ) ) ) {
+         String line;
+         while ( (line = reader.readLine()) != null ) {
+            line = line.trim();
+            if ( line.isEmpty() || line.startsWith( "//" ) || line.startsWith( "#" ) ) {
+               continue;
+            }
+            final String[] splits = line.split( "\\|" );
+            if ( splits.length != 2 ) {
+               LOGGER.warn( "Blacklist line is not correct <semanticType>|<text> format " + line );
+               continue;
+            }
+            final Integer key = attemptParseInt( splits[ 0 ].trim() ); // trim so "5 |text" still parses as 5
+            _blacklists.placeValue( key, splits[ 1 ].trim().toLowerCase() );
+         }
+      } catch ( IOException ioE ) {
+         LOGGER.error( "Could not load blacklist " + blacklistPath, ioE );
+      }
    }
 
+   /**
+    * Parses the semantic type column of a blacklist line, ignoring surrounding whitespace.
+    *
+    * @param value String value parsed from file
+    * @return the value as an Integer, or CONST.NE_TYPE_ID_UNKNOWN if an Integer could not be resolved
+    */
+   static private Integer attemptParseInt( final String value ) {
+      try {
+         return Integer.valueOf( value == null ? null : value.trim() );
+      } catch ( NumberFormatException nfE ) {
+         return CONST.NE_TYPE_ID_UNKNOWN;
+      }
+   }
+
+   /**
+    * Checks the text covered by a span against the blacklisted terms of one semantic type.
+    * @param cTakesSemantic semantic code integer
+    * @param jCas cas holding the document text
+    * @param textSpan span of candidate text
+    * @return true if the trimmed, lowercased candidate text is in the blacklist for the semantic type
+    */
+   private boolean inBlacklist( final int cTakesSemantic, final JCas jCas, final TextSpan textSpan ) {
+      if ( _blacklists.containsKey( cTakesSemantic ) ) {
+         final String covered = jCas.getDocumentText().substring( textSpan.getStart(), textSpan.getEnd() );
+         return _blacklists.containsValue( cTakesSemantic, covered.trim().toLowerCase() );
+      }
+      return false;
+   }
 
    /**
     * {@inheritDoc}
@@ -70,6 +145,9 @@ final public class DefaultTermConsumer e
       try {
          for ( Map.Entry<TextSpan, ? extends Collection<Long>> spanCuis : textSpanCuis ) {
             umlsConceptList.clear();
+            if ( inBlacklist( cTakesSemantic, jcas, spanCuis.getKey() ) ) {
+               continue;
+            }
             for ( Long cuiCode : spanCuis.getValue() ) {
                umlsConceptList.addAll(
                      createUmlsConcepts( jcas, codingScheme, cTakesSemantic, cuiCode, cuiConcepts ) );