You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2020/09/25 00:59:37 UTC
svn commit: r1881994 [1/3] - in
/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased:
./ ae/ annotation/ dictionary/ encoder/ lookup/ table/ table/column/ util/
util/bsv/ util/jdbc/ util/textspan/ util/tokenize/ ...
Author: seanfinan
Date: Fri Sep 25 00:59:37 2020
New Revision: 1881994
URL: http://svn.apache.org/viewvc?rev=1881994&view=rev
Log:
New Case Sensitive Dictionary Lookup
Added:
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,440 @@
+package org.apache.ctakes.dictionary.cased.ae;
+
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.annotation.AlikeSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.AnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.NonSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.SemanticSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.dictionary.*;
+import org.apache.ctakes.dictionary.cased.encoder.*;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupEngine;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import static org.apache.ctakes.core.pipeline.PipeBitInfo.TypeProduct.*;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/12/2020
+ */
+@PipeBitInfo(
+ name = "CasedAnnotationFinder",
+ description = "Finds all-uppercase or normal terms in text.",
+ role = PipeBitInfo.Role.ANNOTATOR,
+ dependencies = { BASE_TOKEN, SENTENCE },
+ products = IDENTIFIED_ANNOTATION
+)
+final public class CasedAnnotationFinder extends JCasAnnotator_ImplBase {
+
+ static private final Logger LOGGER = Logger.getLogger( "CasedAnnotationFinder" );
+
+ // Suffix appended to a dictionary name to form the name of the setting that holds that dictionary's type.
+ static public final String DICTIONARY_TYPE = "_type";
+ // Suffix appended to an encoder name to form the name of the setting that holds that encoder's type.
+ // NOTE(review): this is the same string as DICTIONARY_TYPE, so a dictionary and an encoder that share a
+ // name resolve to the same "<name>_type" setting - confirm that this is intended.
+ static public final String ENCODER_TYPE = "_type";
+
+
+ // dictionaries accepts a comma-separated list
+ @ConfigurationParameter( name = "dictionaries", mandatory = true,
+ description = "Dictionaries to use for lookup." )
+ private String[] _dictionaries;
+
+ static private final String snomed_rxnorm_2020aa_type = "Jdbc";
+
+
+ // https://www.eecis.udel.edu/~vijay/cis889/ie/pos-set.pdf
+
+ static private final String[] VERB_POS = { "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+ "VV", "VVD", "VVG", "VVN", "VVP", "VVZ" };
+ @ConfigurationParameter( name = "lookupVerbs", mandatory = false,
+ description = "Use Verb parts of speech for lookup." )
+ private String _lookupVerbs = "yes";
+
+ static private final String[] NOUN_POS = { "NN", "NNS", "NP", "NPS", "NNP", "NNPS" };
+ @ConfigurationParameter( name = "lookupNouns", mandatory = false,
+ description = "Use Noun parts of speech for lookup." )
+ private String _lookupNouns = "yes";
+
+ static private final String[] ADJECTIVE_POS = { "JJ", "JJR", "JJS" };
+ @ConfigurationParameter( name = "lookupAdjectives", mandatory = false,
+ description = "Use Adjective parts of speech for lookup." )
+ private String _lookupAdjectives = "yes";
+
+ static private final String[] ADVERB_POS = { "RB", "RBR", "RBS" };
+ @ConfigurationParameter( name = "lookupAdverbs", mandatory = false,
+ description = "Use Adverb parts of speech for lookup." )
+ private String _lookupAdverbs = "yes";
+
+ @ConfigurationParameter( name = "otherLookups", mandatory = false,
+ description = "List of other parts of speech for lookup." )
+ private String[] _otherLookups = {};
+
+ // minimum span required to accept a term
+ @ConfigurationParameter( name = JCasTermAnnotator.PARAM_MIN_SPAN_KEY, mandatory = false,
+ description = "Minimum number of characters for a term." )
+ protected int _minLookupSpan = JCasTermAnnotator.DEFAULT_MINIMUM_SPAN;
+
+
+ @ConfigurationParameter( name = "allowWordSkips", mandatory = false,
+ description = "Terms may include words that do not match. So-called loose matching." )
+ protected String _allowSkips = "no";
+
+ static private final String CONS_SKIP_PRP_KEY = "consecutiveSkips";
+ @ConfigurationParameter( name = CONS_SKIP_PRP_KEY, mandatory = false,
+ description = "Number of consecutive non-comma tokens that can be skipped." )
+ private int _consecutiveSkipMax = 2;
+
+ static private final String TOTAL_SKIP_PRP_KEY = "totalSkips";
+ @ConfigurationParameter( name = TOTAL_SKIP_PRP_KEY, mandatory = false,
+ description = "Number of total tokens that can be skipped." )
+ private int _totalSkipMax = 4;
+
+
+ @ConfigurationParameter( name = "subsume", mandatory = false,
+ description = "Subsume contained terms of the same semantic group.", defaultValue = "yes" )
+ private String _subsume = "yes";
+
+ @ConfigurationParameter( name = "subsumeSemantics", mandatory = false,
+ description = "Subsume contained terms of the same and certain other semantic groups.", defaultValue = "yes" )
+ private String _subsumeSemantics = "yes";
+
+
+ @ConfigurationParameter( name = "reassignSemantics", mandatory = false,
+ description = "Reassign Semantic Types (TUIs) to non-default Semantic Groups." )
+ private String[] _reassignSemanticList = {};
+
+
+ // code lists accepts a comma-separated list
+ @ConfigurationParameter( name = "encoders", mandatory = true,
+ description = "Term Encoders with schemas and schema codes." )
+ private String[] _encoders;
+
+
+ private boolean _allowSkipping;
+
+ private AnnotationCreator _annotationCreator;
+
+ final private Collection<String> _lookupPos = new HashSet<>();
+
+ final private Map<SemanticTui, SemanticGroup> _semanticReassignment = new HashMap<>();
+
+ /**
+  * Picks the annotation creator from the subsumption parameters, then sets up the dictionaries,
+  * term encoders, lookup parts of speech and semantic reassignments.
+  * {@inheritDoc}
+  */
+ @Override
+ public void initialize( final UimaContext context ) throws ResourceInitializationException {
+    LOGGER.info( "Initializing Dictionary Lookup ..." );
+    super.initialize( context );
+
+    // "subsumeSemantics" wins over "subsume"; "subsume" is only consulted when the former is off.
+    _annotationCreator = isParameterTrue( _subsumeSemantics )
+                         ? new SemanticSubsumingAnnotationCreator()
+                         : isParameterTrue( _subsume )
+                           ? new AlikeSubsumingAnnotationCreator()
+                           : new NonSubsumingAnnotationCreator();
+
+    setupDictionaries( context );
+    setupEncoders( context );
+    setupPos();
+    setupReassignSemantics();
+ }
+
+
+ /**
+  * @param value text of a yes/no style parameter; may be null.
+  * @return true if the value is "yes" or "true", ignoring case. A null value is treated as false.
+  */
+ static private boolean isParameterTrue( final String value ) {
+    // Constant-first comparison cannot throw a NullPointerException for a missing value.
+    return "yes".equalsIgnoreCase( value ) || "true".equalsIgnoreCase( value );
+ }
+
+ private void setupDictionaries( final UimaContext context ) throws ResourceInitializationException {
+ if ( _dictionaries.length == 0 ) {
+ LOGGER.error( "Dictionary List is empty. Consider using the default cTAKES Dictionary." +
+ " If you are using a piper file, add the line \"load sno_rx_16ab_settings\"" );
+ throw new ResourceInitializationException();
+ }
+ for ( String name : _dictionaries ) {
+ final CasedDictionary dictionary = createDictionary( name, context );
+ if ( dictionary == null ) {
+ LOGGER.error( "Could not create Dictionary for " + name );
+ throw new ResourceInitializationException();
+ }
+ DictionaryStore.getInstance().addDictionary( dictionary );
+ }
+ }
+
+
+ /**
+  * Creates the dictionary with the given name. The kind of dictionary is read from the setting
+  * named <code>name + DICTIONARY_TYPE</code>.
+  *
+  * @param name    name of the dictionary.
+  * @param context UIMA context, used to read the type setting and passed on to the dictionary constructor.
+  * @return the new dictionary, or null if the type is missing, unknown, or creation failed with an SQLException.
+  */
+ private CasedDictionary createDictionary( final String name, final UimaContext context ) {
+    final String type = EnvironmentVariable.getEnv( name + DICTIONARY_TYPE, context );
+    if ( type == null || type.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+       LOGGER.error(
+             "No Dictionary Type specified for " + name + ". Please set parameter " + name + DICTIONARY_TYPE );
+       return null;
+    }
+    try {
+       // Locale.ROOT keeps the match independent of the default locale; in a Turkish locale
+       // a plain toUpperCase() turns 'i' into a dotted capital and the case labels no longer match.
+       switch ( type.toUpperCase( Locale.ROOT ) ) {
+          case JdbcDictionary.DICTIONARY_TYPE:
+             return new JdbcDictionary( name, context );
+          case BsvDictionary.DICTIONARY_TYPE:
+             return new BsvDictionary( name, context );
+          case BsvListDictionary.DICTIONARY_TYPE:
+             return new BsvListDictionary( name, context );
+          default:
+             LOGGER.error( "Unknown Dictionary type " + type + " specified for " + name );
+       }
+    } catch ( SQLException multE ) {
+       // Log the exception itself so that the stack trace is not lost.
+       LOGGER.error( multE.getMessage(), multE );
+    }
+    return null;
+ }
+
+
+ private void setupEncoders( final UimaContext context ) throws ResourceInitializationException {
+ if ( _encoders.length == 0 ) {
+ LOGGER.error( "Term Encoder List is empty. Consider using the default cTAKES Term Encoder." +
+ " If you are using a piper file, add the line \"load sno_rx_2020aa_settings\"" );
+ throw new ResourceInitializationException();
+ }
+ for ( String name : _encoders ) {
+ final TermEncoder encoder = createEncoder( name, context );
+ if ( encoder == null ) {
+ LOGGER.error( "Could not create Term Encoder for " + name );
+ throw new ResourceInitializationException();
+ }
+ EncoderStore.getInstance().addEncoder( encoder );
+ }
+ }
+
+
+ /**
+  * Creates the term encoder with the given name. The kind of encoder is read from the setting
+  * named <code>name + ENCODER_TYPE</code>.
+  *
+  * @param name    name of the term encoder.
+  * @param context UIMA context, used to read the type setting and passed on to the encoder constructor.
+  * @return the new encoder, or null if the type is missing, unknown, or creation failed with an SQLException.
+  */
+ private TermEncoder createEncoder( final String name, final UimaContext context ) {
+    final String type = EnvironmentVariable.getEnv( name + ENCODER_TYPE, context );
+    if ( type == null || type.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+       LOGGER.error(
+             "No Term Encoder Type specified for " + name + ". Please set parameter " + name + ENCODER_TYPE );
+       return null;
+    }
+    try {
+       // Locale.ROOT keeps the match independent of the default locale; in a Turkish locale
+       // a plain toUpperCase() turns 'i' into a dotted capital and the case labels no longer match.
+       switch ( type.toUpperCase( Locale.ROOT ) ) {
+          case JdbcEncoder.ENCODER_TYPE:
+             return new JdbcEncoder( name, context );
+          case BsvEncoder.ENCODER_TYPE:
+             return new BsvEncoder( name, context );
+          case BsvListEncoder.ENCODER_TYPE:
+             return new BsvListEncoder( name, context );
+          default:
+             LOGGER.error( "Unknown Term Encoder type " + type + " specified for " + name );
+       }
+    } catch ( SQLException multE ) {
+       // Log the exception itself so that the stack trace is not lost.
+       LOGGER.error( multE.getMessage(), multE );
+    }
+    return null;
+ }
+
+
+ private void setupPos() throws ResourceInitializationException {
+ if ( isTrue( _lookupVerbs ) ) {
+ _lookupPos.addAll( Arrays.asList( VERB_POS ) );
+ }
+ if ( isTrue( _lookupNouns ) ) {
+ _lookupPos.addAll( Arrays.asList( NOUN_POS ) );
+ }
+ if ( isTrue( _lookupAdjectives ) ) {
+ _lookupPos.addAll( Arrays.asList( ADJECTIVE_POS ) );
+ }
+ if ( isTrue( _lookupAdverbs ) ) {
+ _lookupPos.addAll( Arrays.asList( ADVERB_POS ) );
+ }
+ if ( _otherLookups.length != 0 ) {
+ _lookupPos.addAll( Arrays.asList( _otherLookups ) );
+ }
+ if ( _lookupPos.isEmpty() ) {
+ LOGGER.error( "No Parts of Speech indicated for Lookup. At least one Part of Speech must be used." );
+ throw new ResourceInitializationException();
+ }
+ LOGGER.info( "Using Parts of Speech " + String.join( ", ", _lookupPos ) );
+ }
+
+ /**
+  * Parses the "reassignSemantics" entries, each of the form <code>tui : group</code>, into the
+  * semantic reassignment map. Entries that do not split into exactly two parts are skipped with a warning.
+  */
+ private void setupReassignSemantics() {
+    // Guard the parameter list: the reassignment map is a final, always-initialized field.
+    if ( _reassignSemanticList == null || _reassignSemanticList.length == 0 ) {
+       return;
+    }
+    for ( String keyValue : _reassignSemanticList ) {
+       final String[] splits = StringUtil.fastSplit( keyValue, ':' );
+       if ( splits.length != 2 ) {
+          LOGGER.warn( "Improper Key : Value pair for Semantic Reassignment " + keyValue );
+          continue;
+       }
+       final SemanticTui tui = SemanticTui.getTui( splits[ 0 ].trim() );
+       final SemanticGroup group = SemanticGroup.getGroup( splits[ 1 ].trim() );
+       _semanticReassignment.put( tui, group );
+    }
+    LOGGER.info( "Reassigned Semantics: "
+                 + _semanticReassignment.entrySet()
+                                        .stream()
+                                        .map( e -> e.getKey().getSemanticType() + " : " + e.getValue().getLongName() )
+                                        .collect( Collectors.joining( ", " ) ) );
+ }
+
+
+ /**
+  * Discovers terms sentence by sentence, collects the encodings of each discovered term and
+  * hands everything to the annotation creator.
+  * {@inheritDoc}
+  */
+ @Override
+ public void process( final JCas jCas ) throws AnalysisEngineProcessException {
+    LOGGER.info( "Finding Named Entities ..." );
+
+    // Get all BaseTokens, grouped by Sentence.
+    final Map<Sentence, Collection<BaseToken>> sentenceBaseTokens
+          = JCasUtil.indexCovered( jCas, Sentence.class, BaseToken.class );
+
+    // Discover Terms in text, grouped by text span.
+    final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap = new HashMap<>();
+    try {
+       // Using foreach loop because try/catch in a stream is terrible.
+       for ( Collection<BaseToken> baseTokens : sentenceBaseTokens.values() ) {
+          allDiscoveredTermsMap.putAll( getDiscoveredTerms( baseTokens ) );
+       }
+    } catch ( ArrayIndexOutOfBoundsException iobE ) {
+       // JCasHashMap will throw this every once in a while. Assume the windows are done and move on.
+       LOGGER.warn( iobE.getMessage() );
+    }
+
+    // Get all encodings (schemas and codes) of the discovered terms.
+    // distinct() keeps toMap from throwing an IllegalStateException when an equal term
+    // shows up at more than one text span, and avoids looking up its encodings twice.
+    final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap
+          = allDiscoveredTermsMap.values()
+                                 .stream()
+                                 .flatMap( Collection::stream )
+                                 .distinct()
+                                 .collect( Collectors.toMap( Function.identity(), this::getEncodings ) );
+
+    createAnnotations( jCas, allDiscoveredTermsMap, termEncodingMap );
+ }
+
+
+ /**
+  * Delegates annotation creation to the configured annotation creator, adding the semantic reassignments.
+  *
+  * @param jCas                  CAS passed to the annotation creator.
+  * @param allDiscoveredTermsMap discovered terms, grouped by text span.
+  * @param termEncodingMap       encodings for each discovered term.
+  */
+ private void createAnnotations(
+       final JCas jCas,
+       final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+       final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap ) {
+    _annotationCreator.createAnnotations(
+          jCas,
+          allDiscoveredTermsMap,
+          termEncodingMap,
+          _semanticReassignment );
+ }
+
+
+ private Collection<TermEncoding> getEncodings( final DiscoveredTerm discoveredTerm ) {
+ return EncoderStore.getInstance()
+ .getEncoders()
+ .stream()
+ .map( e -> e.getEncodings( discoveredTerm ) )
+ .filter( Objects::nonNull )
+ .flatMap( Collection::stream )
+ .collect( Collectors.toSet() );
+ }
+
+
+ /**
+  * Finds terms in the given tokens with every registered dictionary and merges the per-dictionary
+  * results into a single map.
+  *
+  * @param baseTokens tokens to search, e.g. the tokens of one sentence.
+  * @return map of text spans to the terms discovered at those text spans, over all dictionaries.
+  */
+ public Map<Pair<Integer>, Collection<DiscoveredTerm>> getDiscoveredTerms( final Collection<BaseToken> baseTokens ) {
+    final Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> discoveredTermsMap
+          = findTerms( baseTokens );
+
+    return discoveredTermsMap.values()
+                             .stream()
+                             .map( Map::entrySet )
+                             .flatMap( Collection::stream )
+                             .collect( Collectors.toMap( Map.Entry::getKey, Map.Entry::getValue,
+                                   // Two dictionaries may find terms at the same span. Without a merge
+                                   // function toMap throws an IllegalStateException on the duplicate key.
+                                   ( terms1, terms2 ) -> {
+                                      final Collection<DiscoveredTerm> merged = new HashSet<>( terms1 );
+                                      merged.addAll( terms2 );
+                                      return merged;
+                                   } ) );
+ }
+
+
+ /**
+  * Turns the wanted tokens into lookup tokens, ordered by begin offset, and runs the lookup engine
+  * over them once for each registered dictionary.
+  *
+  * @param baseTokens -
+  * @return dictionaries to map of text spans to terms discovered at those text spans.
+  */
+ private Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> findTerms(
+       final Collection<BaseToken> baseTokens ) {
+    final Collection<CasedDictionary> dictionaries = DictionaryStore.getInstance().getDictionaries();
+
+    // Keep only the wanted tokens and put them in text order.
+    final List<BaseToken> wantedTokens = new ArrayList<>();
+    for ( BaseToken baseToken : baseTokens ) {
+       if ( isWantedToken.test( baseToken ) ) {
+          wantedTokens.add( baseToken );
+       }
+    }
+    wantedTokens.sort( Comparator.comparingInt( Annotation::getBegin ) );
+
+    final List<LookupToken> lookupTokens = new ArrayList<>( wantedTokens.size() );
+    for ( BaseToken wantedToken : wantedTokens ) {
+       lookupTokens.add( toLookupToken.apply( wantedToken ) );
+    }
+
+    final LookupEngine engine = getLookupEngine();
+    final Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> dictionaryTermsMap
+          = new HashMap<>( dictionaries.size() );
+    for ( CasedDictionary dictionary : dictionaries ) {
+       dictionaryTermsMap.put( dictionary,
+             engine.findTerms( dictionary, lookupTokens, _consecutiveSkipMax, _totalSkipMax ) );
+    }
+    return dictionaryTermsMap;
+ }
+
+ static private final Predicate<BaseToken> isWantedToken = t -> !(t instanceof NewlineToken);
+
+ private final Function<BaseToken, LookupToken> toLookupToken = b -> new LookupToken( b, isValidLookup( b ) );
+
+
+ /**
+  * @param baseToken token to check.
+  * @return true if the token is a WordToken of at least the minimum lookup span whose part of speech is
+  * either unset or one of the parts of speech selected for lookup.
+  */
+ private boolean isValidLookup( final BaseToken baseToken ) {
+    // Only -words- can be lookup tokens.
+    if ( !(baseToken instanceof WordToken) ) {
+       return false;
+    }
+    // Words shorter than the minimum span are not looked up.
+    final int length = baseToken.getEnd() - baseToken.getBegin();
+    if ( length < _minLookupSpan ) {
+       return false;
+    }
+    // A word without a part of speech is accepted, otherwise the part of speech must be wanted.
+    final String partOfSpeech = baseToken.getPartOfSpeech();
+    if ( partOfSpeech == null ) {
+       return true;
+    }
+    return _lookupPos.contains( partOfSpeech );
+ }
+
+
+ private LookupEngine getLookupEngine() {
+ return new LookupEngine();
+ }
+
+
+ /**
+  * Reads an int from a value that may be an Integer or a String.
+  *
+  * @param value        Integer or String holding the number.
+  * @param name         name of the value, used in the warning message.
+  * @param defaultValue value to return if no int can be obtained.
+  * @return the int value, or the default (after logging a warning) if the value is of another type or is
+  * a String that does not parse as an integer.
+  */
+ static protected int parseInt( final Object value, final String name, final int defaultValue ) {
+    if ( value instanceof Integer ) {
+       return (Integer)value;
+    }
+    if ( value instanceof String ) {
+       try {
+          return Integer.parseInt( (String)value );
+       } catch ( NumberFormatException ignored ) {
+          // Not a number: fall through to the shared warning below.
+       }
+    }
+    LOGGER.warn( "Could not parse " + name + " " + value + " as an integer" );
+    return defaultValue;
+ }
+
+
+ /**
+  * @param text text of a yes/no style parameter; may be null.
+  * @return true if the text is "yes" or "true", ignoring case. A null text is treated as false.
+  */
+ static private boolean isTrue( final String text ) {
+    // Constant-first comparison cannot throw a NullPointerException for a missing value.
+    return "yes".equalsIgnoreCase( text ) || "true".equalsIgnoreCase( text );
+ }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,97 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.cased.wsd.WsdUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+@Immutable
+final public class AlikeSubsumingAnnotationCreator implements AnnotationCreator {
+
+ static private final Logger LOGGER = Logger.getLogger( "AlikeSubsumingAnnotationCreator" );
+
+ public AlikeSubsumingAnnotationCreator() {
+ }
+
+
+ public void createAnnotations( final JCas jCas,
+ final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+ final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+ final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+
+ final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap
+ = AnnotationCreatorUtil.mapSemanticTerms( termEncodingMap, reassignSemantics );
+
+ final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap
+ = AnnotationCreatorUtil.mapTermSpans( allDiscoveredTermsMap );
+
+
+ for ( SemanticGroup subsumingGroup : SemanticGroup.values() ) {
+ final Collection<DiscoveredTerm> semanticTerms = semanticTermsMap.get( subsumingGroup );
+ if ( semanticTerms == null || semanticTerms.isEmpty() ) {
+ continue;
+ }
+
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTermsMap
+ = getSubsumedSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTerms : subsumedTermsMap.entrySet() ) {
+ allDiscoveredTermsMap.getOrDefault( subsumedTerms.getKey().toIntPair(), new HashSet<>() )
+ .removeAll( subsumedTerms.getValue() );
+ semanticTerms.removeAll( subsumedTerms.getValue() );
+ }
+
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTermsMap
+ = WsdUtil.getSemanticWsdSpanTerms( semanticTerms, termSpanMap );
+
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTerms : wsdedTermsMap.entrySet() ) {
+ allDiscoveredTermsMap.getOrDefault( wsdedTerms.getKey().toIntPair(), new HashSet<>() )
+ .removeAll( wsdedTerms.getValue() );
+ }
+
+ }
+
+ allDiscoveredTermsMap.forEach(
+ ( k, v ) -> AnnotationCreatorUtil.createAnnotations( jCas, k, v, termEncodingMap, reassignSemantics ) );
+ }
+
+
+ static public Map<MagicTextSpan, Collection<DiscoveredTerm>> getSubsumedSpanTerms(
+ final SemanticGroup subsumingGroup,
+ final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+ final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedSpanTermsMap = new HashMap<>();
+ // Get subsuming spans and their corresponding terms.
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumingSpanTermsMap
+ = SubsumptionUtil.mapSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+ if ( subsumingSpanTermsMap.isEmpty() ) {
+ // No subsuming Spans.
+ return Collections.emptyMap();
+ }
+ // List of spans for subsuming terms, sorted by end character index.
+ final List<MagicTextSpan> subsumingSpans = new ArrayList<>( subsumingSpanTermsMap.keySet() );
+ subsumingSpans.sort( Comparator.comparingInt( MagicTextSpan::getEnd ) );
+ // Remove smaller terms of the same semantic group
+ if ( subsumingSpanTermsMap.size() > 1 ) {
+ subsumedSpanTermsMap.putAll( SubsumptionUtil.mapFullySubsumedTermSpans( subsumingSpans, subsumingSpanTermsMap ) );
+ }
+ return subsumedSpanTermsMap;
+ }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,25 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public interface AnnotationCreator {
+
+ void createAnnotations( final JCas jCas,
+ final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+ final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+ final Map<SemanticTui, SemanticGroup> reassignSemantics );
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.CodeSchema;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.ContiguousTextSpan;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.TuiCodeUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+import java.util.*;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+
+/**
+ * Static helpers shared by the {@link AnnotationCreator} implementations in this package.
+ * Groups discovered terms by text span and by semantic group, and builds the
+ * {@link IdentifiedAnnotation} / {@link UmlsConcept} objects that are added to the cas.
+ * <p>
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal.ir.annotations},
+ * a JDK-internal package.  Nashorn was removed from the JDK in Java 15 (JEP 372), so that import
+ * should be replaced before this class is built on a newer JDK.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+@Immutable
+final public class AnnotationCreatorUtil {
+
+   static private final Logger LOGGER = Logger.getLogger( "AnnotationCreatorUtil" );
+
+   private AnnotationCreatorUtil() {
+   }
+
+
+   /**
+    * Invert a span-to-terms map into a term-to-spans map.
+    *
+    * @param allDiscoveredTermsMap discovered terms keyed by text span offsets
+    * @return for each discovered term, the set of spans at which it was discovered
+    */
+   static public Map<DiscoveredTerm, Collection<MagicTextSpan>> mapTermSpans(
+         final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap ) {
+      final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap = new HashMap<>();
+      for ( Map.Entry<Pair<Integer>, Collection<DiscoveredTerm>> spanTerms : allDiscoveredTermsMap.entrySet() ) {
+         final MagicTextSpan textSpan = new ContiguousTextSpan( spanTerms.getKey() );
+         spanTerms.getValue().forEach( t -> termSpanMap.computeIfAbsent( t, s -> new HashSet<>() ).add( textSpan ) );
+      }
+      return termSpanMap;
+   }
+
+
+   /**
+    * Create one annotation per discovered term at the given text span.
+    * Every discovered term must have an entry in {@code termEncodingMap}; a missing entry
+    * surfaces as a {@link NullPointerException} when the encodings are streamed.
+    *
+    * @param jcas              cas in which annotations are created
+    * @param textSpan          begin (value1) and end (value2) offsets for the annotations
+    * @param discoveredTerms   terms discovered at the text span
+    * @param termEncodingMap   encodings for each discovered term
+    * @param reassignSemantics overrides mapping a tui to a semantic group, consulted before the tui's own group
+    */
+   static public void createAnnotations( final JCas jcas,
+                                         final Pair<Integer> textSpan,
+                                         final Collection<DiscoveredTerm> discoveredTerms,
+                                         final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      discoveredTerms.forEach( t
+            -> createAnnotation( jcas, textSpan, t, termEncodingMap.get( t ), reassignSemantics ) );
+   }
+
+   /**
+    * Create a single annotation for a discovered term and add it to the cas indexes.
+    * The annotation type is chosen from the best semantic group of the term's tui encodings.
+    * One {@link UmlsConcept} is created for every combination of tui and code encoding,
+    * where a code encoding is any encoding that is neither a tui nor a preferred text.
+    *
+    * @param jcas              cas in which the annotation is created
+    * @param textSpan          begin (value1) and end (value2) offsets for the annotation
+    * @param discoveredTerm    term supplying the cui
+    * @param termEncodings     encodings for the term
+    * @param reassignSemantics overrides mapping a tui to a semantic group
+    */
+   static private void createAnnotation( final JCas jcas,
+                                         final Pair<Integer> textSpan,
+                                         final DiscoveredTerm discoveredTerm,
+                                         final Collection<TermEncoding> termEncodings,
+                                         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final SemanticGroup bestGroup
+            = SemanticGroup.getBestGroup( getSemanticGroups( termEncodings, reassignSemantics ) );
+      final IdentifiedAnnotation annotation = bestGroup
+            .getCreator()
+            .apply( jcas );
+      annotation.setTypeID( bestGroup.getCode() );
+      annotation.setBegin( textSpan.getValue1() );
+      annotation.setEnd( textSpan.getValue2() );
+      annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+
+      final String cui = CuiCodeUtil.getInstance().getAsCui( discoveredTerm.getCuiCode() );
+      Collection<String> tuis = getTuis( termEncodings );
+      if ( tuis.isEmpty() ) {
+         tuis = Collections.singletonList( SemanticTui.UNKNOWN.name() );
+      }
+      final String prefText = getPreferredText( termEncodings );
+
+      // The code encodings do not depend on the tui, so filter them once rather than once per tui.
+      final List<TermEncoding> codeEncodings = termEncodings.stream()
+                                                            .filter( isPrefTextEncoding.negate() )
+                                                            .filter( isTuiEncoding.negate() )
+                                                            .collect( Collectors.toList() );
+      final Collection<UmlsConcept> umlsConcepts = new HashSet<>();
+      for ( String tui : tuis ) {
+         for ( TermEncoding codeEncoding : codeEncodings ) {
+            umlsConcepts.add( createUmlsConcept( jcas, cui, tui, prefText, codeEncoding ) );
+         }
+      }
+      final FSArray conceptArr = new FSArray( jcas, umlsConcepts.size() );
+      int arrIdx = 0;
+      for ( UmlsConcept umlsConcept : umlsConcepts ) {
+         conceptArr.set( arrIdx, umlsConcept );
+         arrIdx++;
+      }
+      annotation.setOntologyConceptArr( conceptArr );
+      annotation.addToIndexes();
+      // This runs for every annotation created.  It is diagnostic output, not a warning,
+      // so log at debug and skip building the message when debug is disabled.
+      if ( LOGGER.isDebugEnabled() ) {
+         LOGGER.debug( "Created Annotation " + annotation.getCoveredText()
+                       + " of " + bestGroup.getName()
+                       + " with " + termEncodings.stream()
+                                                 .map( t -> t.getSchema() + " " + t.getSchemaCode() )
+                                                 .collect( Collectors.joining( ";" ) )
+                       + " tuis " + String.join( ",", tuis ) );
+      }
+   }
+
+
+   /**
+    * @param termEncodings encodings for a term
+    * @return the distinct preferred text values joined by ";", or an empty string if there are none
+    */
+   static private String getPreferredText( final Collection<TermEncoding> termEncodings ) {
+      return termEncodings.stream()
+                          .filter( CodeSchema.PREFERRED_TEXT::isSchema )
+                          .map( TermEncoding::getSchemaCode )
+                          .map( Object::toString )
+                          .distinct()
+                          .collect( Collectors.joining( ";" ) );
+   }
+
+   /**
+    * True for encodings that hold a preferred text.
+    */
+   static private final Predicate<TermEncoding> isPrefTextEncoding
+         = CodeSchema.PREFERRED_TEXT::isSchema;
+
+
+   /**
+    * Not referenced elsewhere in this class; {@link #getTuis(Collection)} is what is used.
+    *
+    * @param termEncodings encodings for a term
+    * @return the distinct tuis joined by ";", or an empty string if there are none
+    */
+   static private String getTui( final Collection<TermEncoding> termEncodings ) {
+      return termEncodings.stream()
+                          .filter( CodeSchema.TUI::isSchema )
+                          .map( TermEncoding::getSchemaCode )
+                          .map( AnnotationCreatorUtil::parseTuiValue )
+                          .map( TuiCodeUtil::getAsTui )
+                          .distinct()
+                          .collect( Collectors.joining( ";" ) );
+   }
+
+   /**
+    * @param termEncodings encodings for a term
+    * @return the set of tuis among the encodings; empty if there are none
+    */
+   static private Collection<String> getTuis( final Collection<TermEncoding> termEncodings ) {
+      return termEncodings.stream()
+                          .filter( CodeSchema.TUI::isSchema )
+                          .map( TermEncoding::getSchemaCode )
+                          .map( AnnotationCreatorUtil::parseTuiValue )
+                          .map( TuiCodeUtil::getAsTui )
+                          .collect( Collectors.toSet() );
+   }
+
+   /**
+    * True for encodings that hold a tui.
+    */
+   static private final Predicate<TermEncoding> isTuiEncoding = CodeSchema.TUI::isSchema;
+
+
+   /**
+    * Create a concept for a single code encoding.
+    *
+    * @param jcas          cas in which the concept is created
+    * @param cui           cui for the concept
+    * @param tui           tui for the concept; not set when null
+    * @param preferredText preferred text for the concept; not set when null or empty
+    * @param termEncoding  encoding supplying the coding scheme and code
+    * @return a new concept; it is not added to any index here
+    */
+   static private UmlsConcept createUmlsConcept( final JCas jcas,
+                                                 final String cui,
+                                                 final String tui,
+                                                 final String preferredText,
+                                                 final TermEncoding termEncoding ) {
+      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
+      umlsConcept.setCodingScheme( termEncoding.getSchema() );
+      umlsConcept.setCui( cui );
+      if ( tui != null ) {
+         umlsConcept.setTui( tui );
+      }
+      if ( preferredText != null && !preferredText.isEmpty() ) {
+         umlsConcept.setPreferredText( preferredText );
+      }
+      umlsConcept.setCode( termEncoding.getSchemaCode().toString() );
+      return umlsConcept;
+   }
+
+
+   /**
+    * Group discovered terms by the semantic groups of their tui encodings.
+    * A term can appear under more than one group; a term without tui encodings is
+    * placed under {@link SemanticGroup#UNKNOWN}.
+    *
+    * @param termEncodingMap   encodings for each discovered term
+    * @param reassignSemantics overrides mapping a tui to a semantic group
+    * @return discovered terms keyed by semantic group
+    */
+   static public Map<SemanticGroup, Collection<DiscoveredTerm>> mapSemanticTerms(
+         final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermMap = new EnumMap<>( SemanticGroup.class );
+      for ( Map.Entry<DiscoveredTerm, Collection<TermEncoding>> discoveredEncodings : termEncodingMap.entrySet() ) {
+         getSemanticGroups( discoveredEncodings.getValue(), reassignSemantics )
+               .forEach( g -> semanticTermMap.computeIfAbsent( g, s -> new HashSet<>() )
+                                             .add( discoveredEncodings.getKey() ) );
+      }
+      return semanticTermMap;
+   }
+
+
+   /**
+    * @param termEncodings     encodings for a term
+    * @param reassignSemantics overrides mapping a tui to a semantic group
+    * @return semantic groups for the tui encodings, or a singleton of {@link SemanticGroup#UNKNOWN}
+    * when there are no tui encodings
+    */
+   static private Collection<SemanticGroup> getSemanticGroups(
+         final Collection<TermEncoding> termEncodings,
+         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Collection<SemanticGroup> groups = termEncodings.stream()
+                                                            .filter( CodeSchema.TUI::isSchema )
+                                                            .map( e -> getSemanticGroup( e, reassignSemantics ) )
+                                                            .collect( Collectors.toSet() );
+      if ( groups.isEmpty() ) {
+         return Collections.singletonList( SemanticGroup.UNKNOWN );
+      }
+      return groups;
+   }
+
+
+   /**
+    * @param tuiEncoding       encoding whose schema code is a tui, either an Integer or something parseable as one
+    * @param reassignSemantics overrides mapping a tui to a semantic group
+    * @return semantic group for the tui encoding
+    */
+   static private SemanticGroup getSemanticGroup( final TermEncoding tuiEncoding,
+                                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Object object = tuiEncoding.getSchemaCode();
+      if ( object instanceof Integer ) {
+         return getSemanticGroup( (Integer)object, reassignSemantics );
+      }
+      return getSemanticGroup( parseTuiValue( object ), reassignSemantics );
+   }
+
+   /**
+    * @param tuiCode           numeric tui code
+    * @param reassignSemantics overrides mapping a tui to a semantic group
+    * @return the override group for the tui if there is one, otherwise the tui's own group
+    */
+   static private SemanticGroup getSemanticGroup( final int tuiCode,
+                                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final SemanticTui tui = SemanticTui.getTui( tuiCode );
+      if ( !reassignSemantics.isEmpty() ) {
+         final SemanticGroup reassignGroup = reassignSemantics.get( tui );
+         if ( reassignGroup != null ) {
+            return reassignGroup;
+         }
+      }
+      return tui.getGroup();
+   }
+
+
+   /**
+    * @param object value whose string form should be an integer tui code
+    * @return the parsed code, or the code of {@link SemanticTui#UNKNOWN} if the value is not an integer
+    */
+   static private int parseTuiValue( final Object object ) {
+      try {
+         return Integer.parseInt( object.toString() );
+      } catch ( NumberFormatException nfE ) {
+         return SemanticTui.UNKNOWN.getCode();
+      }
+   }
+
+
+//   static private Map<DiscoveredTerm, Collection<SemanticGroup>> mapTermSemantics(
+//         final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+//         final Map<SemanticTui,SemanticGroup> reassignSemantics ) {
+//      final Map<DiscoveredTerm, Collection<SemanticGroup>> termSemanticsMap = new HashMap<>( termEncodingMap.size() );
+//      termEncodingMap.forEach( (k,v) -> termSemanticsMap.put( k, getSemanticGroups( v, reassignSemantics) ) );
+//      return termSemanticsMap;
+//   }
+
+
+//   static private Map<TermEncoding,SemanticGroup> mapEncodingSemantics( final Collection<TermEncoding> termEncodings,
+//                                                                        final Map<SemanticTui,SemanticGroup> reassignSemantics ) {
+//      return termEncodings.stream()
+//                          .collect( Collectors.toMap( Function.identity(),
+//                                                      e -> getSemanticGroup( e, reassignSemantics ) ) );
+//   }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,38 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * {@link AnnotationCreator} that performs no subsumption: every discovered term at every span
+ * is handed to {@link AnnotationCreatorUtil} for annotation creation.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+@Immutable
+final public class NonSubsumingAnnotationCreator implements AnnotationCreator {
+
+   static private final Logger LOGGER = Logger.getLogger( "NonSubsumingAnnotationCreator" );
+
+   public NonSubsumingAnnotationCreator() {
+   }
+
+
+   /**
+    * Create annotations for all discovered terms, span by span.
+    *
+    * @param jCas                  cas in which annotations are created
+    * @param allDiscoveredTermsMap discovered terms keyed by text span offsets
+    * @param termEncodingMap       encodings for each discovered term
+    * @param reassignSemantics     overrides mapping a tui to a semantic group
+    */
+   public void createAnnotations( final JCas jCas,
+                                  final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                                  final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      for ( Map.Entry<Pair<Integer>, Collection<DiscoveredTerm>> spanTerms : allDiscoveredTermsMap.entrySet() ) {
+         final Pair<Integer> span = spanTerms.getKey();
+         final Collection<DiscoveredTerm> terms = spanTerms.getValue();
+         AnnotationCreatorUtil.createAnnotations( jCas, span, terms, termEncodingMap, reassignSemantics );
+      }
+   }
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,163 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.cased.wsd.WsdUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+
+import static org.apache.ctakes.core.util.annotation.SemanticGroup.*;
+
+/**
+ * {@link AnnotationCreator} that discards discovered terms that are subsumed by other terms before
+ * creating annotations.  Subsumption is decided per semantic group: spans of a group can subsume
+ * other spans of the same group, and spans of the groups listed for it in {@link #SUBSUME_MAP}.
+ * A word sense disambiguation pass ({@link WsdUtil}) is applied per group as well.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+@Immutable
+final public class SemanticSubsumingAnnotationCreator implements AnnotationCreator {
+
+ static private final Logger LOGGER = Logger.getLogger( "SemanticSubsumingAnnotationCreator" );
+
+ public SemanticSubsumingAnnotationCreator() {
+ }
+
+
+ /**
+ * Key: a subsuming semantic group.  Value: the other semantic groups whose terms it is allowed to subsume.
+ * Groups without an entry only subsume terms of their own group.
+ */
+ static private final Map<SemanticGroup, Collection<SemanticGroup>> SUBSUME_MAP
+ = new EnumMap<>( SemanticGroup.class );
+
+ static {
+ //
+ SUBSUME_MAP.put( DRUG, EnumSet.of( LAB, PHENOMENON, ENTITY, EVENT ) );
+ //
+ SUBSUME_MAP.put( DISORDER, EnumSet.of( DRUG, FINDING, LAB, PHENOMENON, ENTITY, EVENT ) );
+ //
+ SUBSUME_MAP.put( FINDING, EnumSet.of( LAB, PHENOMENON, ENTITY, EVENT ) );
+ // "Oral Surgery"
+ SUBSUME_MAP.put( PROCEDURE, EnumSet.of( LAB, PHENOMENON, EVENT ) );
+ //
+ SUBSUME_MAP.put( ANATOMY, EnumSet.of( DRUG, DISORDER, FINDING, LAB, PHENOMENON, ENTITY ) );
+ //
+// SUBSUME_MAP.put( CLINICAL_ATTRIBUTE, EnumSet.of( ENTITY ) );
+ // may be wanted even within procedure, procedure probably wanted within device. Maybe Anatomy?
+// SUBSUME_MAP.put( DEVICE, EnumSet.of( ENTITY ) );
+ //
+// SUBSUME_MAP.put( LAB, EnumSet.of( PHENOMENON, ENTITY, EVENT ) );
+ //
+// SUBSUME_MAP.put( PHENOMENON, EnumSet.of( ENTITY ) );
+ // SUBJECT
+ // TITLE
+ // EVENT
+ // ENTITY
+ // TIME
+ // MODIFIER
+ // LAB_MODIFIER
+ }
+
+
+ /**
+ * Remove subsumed and wsd-rejected terms, then create annotations for the terms that remain.
+ * The collections inside {@code allDiscoveredTermsMap} are modified in place.
+ *
+ * @param jCas cas in which annotations are created
+ * @param allDiscoveredTermsMap discovered terms keyed by text span offsets; terms are removed from its values
+ * @param termEncodingMap encodings for each discovered term
+ * @param reassignSemantics overrides mapping a tui to a semantic group
+ */
+ public void createAnnotations( final JCas jCas,
+ final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+ final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+ final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+
+ final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap
+ = AnnotationCreatorUtil.mapSemanticTerms( termEncodingMap, reassignSemantics );
+
+ final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap
+ = AnnotationCreatorUtil.mapTermSpans( allDiscoveredTermsMap );
+
+
+ // Groups are handled in SemanticGroup.values() order.  Terms removed from semanticTermsMap while
+ // handling one group are no longer seen when later groups are handled, so the order is significant.
+ for ( SemanticGroup subsumingGroup : SemanticGroup.values() ) {
+ final Collection<DiscoveredTerm> semanticTerms = semanticTermsMap.get( subsumingGroup );
+ if ( semanticTerms == null || semanticTerms.isEmpty() ) {
+ continue;
+ }
+ final Collection<SemanticGroup> subsumedGroups
+ = SUBSUME_MAP.getOrDefault( subsumingGroup, Collections.emptyList() );
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTermsMap
+ = getSemanticSubsumedSpanTerms(
+ subsumingGroup, subsumedGroups, semanticTermsMap, termSpanMap );
+
+ // Drop the subsumed terms from the span map, from this group and from each subsumable group.
+ // getOrDefault with a throwaway set turns removeAll into a no-op when the key is absent.
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTerms : subsumedTermsMap.entrySet() ) {
+ allDiscoveredTermsMap.getOrDefault( subsumedTerms.getKey().toIntPair(), new HashSet<>() )
+ .removeAll( subsumedTerms.getValue() );
+ semanticTerms.removeAll( subsumedTerms.getValue() );
+ for ( SemanticGroup subsumedGroup : subsumedGroups ) {
+ semanticTermsMap.getOrDefault( subsumedGroup, new HashSet<>() ).removeAll( subsumedTerms.getValue() );
+ }
+ }
+
+ // WSD
+ // The terms returned per span are removed from the span map, so they are presumably the
+ // rejected senses - TODO confirm against WsdUtil.  They are not removed from semanticTermsMap.
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTermsMap
+ = WsdUtil.getSemanticWsdSpanTerms( semanticTerms, termSpanMap );
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTerms : wsdedTermsMap.entrySet() ) {
+ allDiscoveredTermsMap.getOrDefault( wsdedTerms.getKey().toIntPair(), new HashSet<>() )
+ .removeAll( wsdedTerms.getValue() );
+ }
+
+ }
+
+ // Whatever survived subsumption and wsd becomes an annotation.
+ allDiscoveredTermsMap.forEach(
+ ( k, v ) -> AnnotationCreatorUtil.createAnnotations( jCas, k, v, termEncodingMap, reassignSemantics ) );
+ }
+
+
+ /**
+ * Collect the terms, by span, that are subsumed by spans of the subsuming group.
+ * Two kinds of subsumed spans are collected: spans of the subsuming group itself (via
+ * {@link SubsumptionUtil#mapFullySubsumedTermSpans}), and spans of the subsumed groups (via
+ * {@link SubsumptionUtil#mapSubsumedOrSameTermSpans}).
+ *
+ * @param subsumingGroup semantic group whose spans do the subsuming
+ * @param subsumedGroups other semantic groups whose spans may be subsumed; may be empty
+ * @param semanticTermsMap discovered terms keyed by semantic group
+ * @param termSpanMap spans for each discovered term
+ * @return subsumed terms keyed by span; empty if the subsuming group has no spans
+ */
+ static public Map<MagicTextSpan, Collection<DiscoveredTerm>> getSemanticSubsumedSpanTerms(
+ final SemanticGroup subsumingGroup,
+ final Collection<SemanticGroup> subsumedGroups,
+ final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+ final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedSpanTermsMap = new HashMap<>();
+ // Get subsuming spans and their corresponding terms.
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumingSpanTermsMap
+ = SubsumptionUtil.mapSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+ if ( subsumingSpanTermsMap.isEmpty() ) {
+ // No subsuming Spans.
+ return Collections.emptyMap();
+ }
+ // List of spans for subsuming terms.  The sort by end character index is disabled, so order is the map's key order.
+ final List<MagicTextSpan> subsumingSpans = new ArrayList<>( subsumingSpanTermsMap.keySet() );
+// subsumingSpans.sort( Comparator.comparingInt( MagicTextSpan::getEnd ) );
+ // Remove smaller terms of the same semantic group
+ if ( subsumingSpanTermsMap.size() > 1 ) {
+ subsumedSpanTermsMap.putAll( SubsumptionUtil.mapFullySubsumedTermSpans( subsumingSpans, subsumingSpanTermsMap ) );
+ if ( subsumedGroups.isEmpty() ) {
+ return subsumedSpanTermsMap;
+ }
+ // A span that is itself subsumed is not used as a subsumer for the other groups.
+ subsumingSpans.removeAll( subsumedSpanTermsMap.keySet() );
+ }
+
+ // Remove smaller or the same span terms of the other semantic groups
+ // First gather every span / term of the subsumable groups into a single map.
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumableSpanTermsMap = new HashMap<>();
+ for ( SemanticGroup group : subsumedGroups ) {
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedGroupSpanTermsMap
+ = SubsumptionUtil.mapSpanTerms( group, semanticTermsMap, termSpanMap );
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumedGroupSpanTerms
+ : subsumedGroupSpanTermsMap.entrySet() ) {
+ subsumableSpanTermsMap.computeIfAbsent( subsumedGroupSpanTerms.getKey(),
+ t -> new HashSet<>() ).addAll( subsumedGroupSpanTerms.getValue() );
+ }
+ }
+
+ if ( subsumableSpanTermsMap.isEmpty() ) {
+ return subsumedSpanTermsMap;
+ }
+ // Merge the other-group results into the same-group results collected above.
+ final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedGroupsSpanTermsMap
+ = SubsumptionUtil.mapSubsumedOrSameTermSpans( subsumingSpans, subsumableSpanTermsMap );
+ for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumedGroupsSpanTerms
+ : subsumedGroupsSpanTermsMap.entrySet() ) {
+ subsumedSpanTermsMap.computeIfAbsent( subsumedGroupsSpanTerms.getKey(),
+ t -> new HashSet<>() ).addAll( subsumedGroupsSpanTerms.getValue() );
+ }
+ return subsumedSpanTermsMap;
+ }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,118 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+
+import java.util.*;
+
+/**
+ * Static helpers that decide which text spans, and therefore which discovered terms,
+ * are subsumed by other text spans.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/24/2020
+ */
+@Immutable
+final public class SubsumptionUtil {
+
+
+   private SubsumptionUtil() {
+   }
+
+   /**
+    * Map the spans of a single semantic group to the terms of that group found at each span.
+    *
+    * @param semanticGroup    group of interest
+    * @param semanticTermsMap discovered terms keyed by semantic group
+    * @param termSpanMap      spans for each discovered term
+    * @return terms of the group keyed by span; an empty map if the group has no terms
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSpanTerms(
+         final SemanticGroup semanticGroup,
+         final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Collection<DiscoveredTerm> groupTerms = semanticTermsMap.get( semanticGroup );
+      if ( groupTerms != null && !groupTerms.isEmpty() ) {
+         return mapSpanTerms( groupTerms, termSpanMap );
+      }
+      return Collections.emptyMap();
+   }
+
+   /**
+    * Invert term-to-spans into span-to-terms for the given terms.  Terms without spans are skipped.
+    *
+    * @param discoveredTerms terms of interest
+    * @param termSpanMap     spans for each discovered term
+    * @return the given terms keyed by span
+    */
+   static private Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSpanTerms(
+         final Collection<DiscoveredTerm> discoveredTerms,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> termsBySpan = new HashMap<>();
+      for ( DiscoveredTerm term : discoveredTerms ) {
+         final Collection<MagicTextSpan> spans = termSpanMap.get( term );
+         if ( spans != null ) {
+            for ( MagicTextSpan span : spans ) {
+               termsBySpan.computeIfAbsent( span, s -> new HashSet<>() ).add( term );
+            }
+         }
+      }
+      return termsBySpan;
+   }
+
+
+   /**
+    * Select the entries whose span is fully contained by at least one subsuming span.
+    *
+    * @param subsumingSpans         spans that may subsume
+    * @param subsumableSpanTermsMap candidate terms keyed by span
+    * @return a new map holding only the entries whose span is subsumed
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapFullySubsumedTermSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumableSpanTermsMap ) {
+      final List<MagicTextSpan> candidates = new ArrayList<>( subsumableSpanTermsMap.keySet() );
+//      candidates.sort( Comparator.comparingInt( MagicTextSpan::getBegin ) );
+      return copyRetaining( subsumableSpanTermsMap, getFullySubsumedSpans( subsumingSpans, candidates ) );
+   }
+
+   /**
+    * Select the entries whose span is contained by, or the same as, at least one subsuming span.
+    *
+    * @param subsumingSpans         spans that may subsume
+    * @param subsumableSpanTermsMap candidate terms keyed by span
+    * @return a new map holding only the entries whose span is subsumed
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSubsumedOrSameTermSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumableSpanTermsMap ) {
+      final List<MagicTextSpan> candidates = new ArrayList<>( subsumableSpanTermsMap.keySet() );
+//      candidates.sort( Comparator.comparingInt( MagicTextSpan::getBegin ) );
+      return copyRetaining( subsumableSpanTermsMap, getSubsumedOrSameSpans( subsumingSpans, candidates ) );
+   }
+
+   /**
+    * Copy a span-to-terms map, keeping only the given spans.  The value collections are shared with the source.
+    *
+    * @param spanTermsMap terms keyed by span
+    * @param keptSpans    spans to keep
+    * @return a shallow copy of the map restricted to the kept spans
+    */
+   static private Map<MagicTextSpan, Collection<DiscoveredTerm>> copyRetaining(
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> spanTermsMap,
+         final Collection<MagicTextSpan> keptSpans ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> retained = new HashMap<>( spanTermsMap );
+      retained.keySet().retainAll( keptSpans );
+      return retained;
+   }
+
+   /**
+    * Find the candidate spans for which some subsuming span reports {@code fullyContainsAll}.
+    * Used to keep the most specific variation of a term, e.g. "colon cancer" instead of "cancer".
+    * Every subsuming span is compared with every candidate; the lists are not required to be sorted.
+    *
+    * @param subsumingSpans        spans that may subsume
+    * @param possiblySubsumedSpans candidate spans
+    * @return the candidate spans that are subsumed
+    */
+   static private Collection<MagicTextSpan> getFullySubsumedSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final List<MagicTextSpan> possiblySubsumedSpans ) {
+      final Collection<MagicTextSpan> subsumed = new HashSet<>();
+      for ( MagicTextSpan subsumer : subsumingSpans ) {
+         possiblySubsumedSpans.stream()
+                              .filter( candidate -> subsumer.fullyContainsAll( candidate ) )
+                              .forEach( subsumed::add );
+      }
+      return subsumed;
+   }
+
+
+   /**
+    * Find the candidate spans for which some subsuming span reports {@code containsAll}.
+    * Used to keep the most specific variation of a term, e.g. "headache medicine" instead of "headache".
+    * Every subsuming span is compared with every candidate; the lists are not required to be sorted.
+    *
+    * @param subsumingSpans        spans that may subsume
+    * @param possiblySubsumedSpans candidate spans
+    * @return the candidate spans that are subsumed or the same
+    */
+   static public Collection<MagicTextSpan> getSubsumedOrSameSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final List<MagicTextSpan> possiblySubsumedSpans ) {
+      final Collection<MagicTextSpan> subsumed = new HashSet<>();
+      for ( MagicTextSpan subsumer : subsumingSpans ) {
+         possiblySubsumedSpans.stream()
+                              .filter( candidate -> subsumer.containsAll( candidate ) )
+                              .forEach( subsumed::add );
+      }
+      return subsumed;
+   }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,102 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.bsv.BsvFileParser;
+import org.apache.ctakes.dictionary.cased.util.bsv.BsvObjectCreator;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTermMapper;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * {@link CasedDictionary} whose terms are read from a bar-separated-value (bsv) file.
+ * The parsed terms are loaded into an {@link InMemoryDictionary}, to which all lookups are delegated.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class BsvDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "BSV";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvDictionary" );
+
+   final private CasedDictionary _delegateDictionary;
+
+   /**
+    * The bsv file path is read from the parameter named {@code name + "_file"}.
+    *
+    * @param name        unique name for dictionary
+    * @param uimaContext context consulted for the file path parameter
+    */
+   public BsvDictionary( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_file", uimaContext ) );
+   }
+
+   /**
+    * If the file cannot be read the error is logged and the dictionary is created without terms.
+    *
+    * @param name    unique name for dictionary
+    * @param bsvPath path to bsv file containing synonyms and cuis
+    */
+   public BsvDictionary( final String name, final String bsvPath ) {
+      final Collection<TokenizedTerm> tokenizedTerms = parseBsvFile( bsvPath );
+      final Map<String, Collection<CandidateTerm>> upperWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> mixedWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> lowerWordTermMap = new HashMap<>();
+      TokenizedTermMapper.createTermMap( tokenizedTerms, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+      _delegateDictionary = new InMemoryDictionary( name, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegateDictionary.getName();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      return _delegateDictionary.getCandidateTerms( lookupToken );
+   }
+
+   /**
+    * Create a collection of {@link TokenizedTerm} Objects
+    * by parsing a bsv file.  Each row must have exactly two columns:
+    * <p>
+    * CUI|Text
+    * </p>
+    * Reading is best effort: an {@link IOException} is logged with its stack trace and the file path,
+    * and an empty collection is returned.
+    *
+    * @param bsvFilePath path to file containing term rows and bsv columns
+    * @return collection of all valid terms read from the bsv file; empty if the file could not be read
+    */
+   static private Collection<TokenizedTerm> parseBsvFile( final String bsvFilePath ) {
+      try {
+         return BsvFileParser.parseBsvFile( bsvFilePath, new TokenizedTermCreator() );
+      } catch ( IOException ioE ) {
+         // Keep the cause and the offending path; the message alone is often not enough to diagnose.
+         LOGGER.error( "Could not parse bsv file " + bsvFilePath + " : " + ioE.getMessage(), ioE );
+      }
+      return Collections.emptyList();
+   }
+
+
+   /**
+    * Creates a {@link TokenizedTerm} from the two trimmed columns of a bsv row.
+    */
+   static private class TokenizedTermCreator implements BsvObjectCreator<TokenizedTerm> {
+      /**
+       * @param columns columns of a single bsv row
+       * @return a term built from the two columns, or null when the row does not have exactly two columns
+       */
+      public TokenizedTerm createBsvObject( final String[] columns ) {
+         if ( columns.length != 2 ) {
+            return null;
+         }
+         return new TokenizedTerm( columns[ 0 ].trim(), columns[ 1 ].trim() );
+      }
+   }
+
+
+}
+
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,96 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTermMapper;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.util.*;
+
+
+/**
+ * {@link CasedDictionary} whose terms are given inline as a single bar-separated list of
+ * {@code key : value} pairs rather than in a file.
+ * The parsed terms are loaded into an {@link InMemoryDictionary}, to which all lookups are delegated.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class BsvListDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "BSV_LIST";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvListDictionary" );
+
+   final private CasedDictionary _delegateDictionary;
+
+   /**
+    * The term list is read from the parameter named {@code name + "_list"}.
+    *
+    * @param name        unique name for dictionary
+    * @param uimaContext context consulted for the list parameter
+    */
+   public BsvListDictionary( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_list", uimaContext ) );
+   }
+
+   /**
+    * A null or empty list is logged as an error and the dictionary is created without terms.
+    *
+    * @param name    unique name for dictionary
+    * @param bsvList list containing synonyms and cuis
+    */
+   public BsvListDictionary( final String name, final String bsvList ) {
+      final Collection<TokenizedTerm> tokenizedTerms = parseList( name, bsvList );
+      LOGGER.info( "Parsed " + tokenizedTerms.size() + " terms for dictionary " + name );
+      final Map<String, Collection<CandidateTerm>> upperWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> mixedWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> lowerWordTermMap = new HashMap<>();
+      TokenizedTermMapper.createTermMap( tokenizedTerms, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+      _delegateDictionary = new InMemoryDictionary( name, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegateDictionary.getName();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      return _delegateDictionary.getCandidateTerms( lookupToken );
+   }
+
+   /**
+    * Create a collection of {@link TokenizedTerm} Objects by parsing an inline list.
+    * Entries are separated by '|' and each entry is a pair separated by ':'
+    * <p>
+    * Key : Value|Key : Value
+    * </p>
+    * Presumably the key is the cui and the value the text, as with the CUI|Text rows of a bsv file -
+    * TODO confirm against {@link TokenizedTerm}.  Entries that do not split into exactly two parts
+    * are skipped with a warning.
+    *
+    * @param name     name of the dictionary, used in log messages
+    * @param termList list containing synonyms and cuis; null is treated the same as empty
+    * @return collection of all valid terms read from the list
+    */
+   static private Collection<TokenizedTerm> parseList( final String name, final String termList ) {
+      // The list can come straight from a parameter lookup or a caller, so guard against null as well as empty.
+      if ( termList == null || termList.isEmpty() ) {
+         LOGGER.error( "List of terms is empty for " + name );
+         return Collections.emptyList();
+      }
+      final Collection<TokenizedTerm> tokenizedTerms = new HashSet<>();
+      for ( String term : StringUtil.fastSplit( termList, '|' ) ) {
+         final String[] keyValue = StringUtil.fastSplit( term, ':' );
+         if ( keyValue.length != 2 ) {
+            LOGGER.warn( "Improper Key : Value pair for Dictionary Term " + term );
+            continue;
+         }
+         tokenizedTerms.add( new TokenizedTerm( keyValue[ 0 ].trim(), keyValue[ 1 ].trim() ) );
+      }
+      return tokenizedTerms;
+   }
+
+
+}
+
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,35 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+
+import java.util.Collection;
+
+/**
+ * Dictionary used to look up terms by the most rare word within them.
+ * Given a single lookup token, an implementation returns the candidate terms that contain that token.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+public interface CasedDictionary {
+
+
+ /**
+ * The name is used to maintain a collection of dictionaries:
+ * {@link DictionaryStore#addDictionary(CasedDictionary)} refuses a dictionary whose name equals the name
+ * of a dictionary that is already stored, so the name should be unique for each dictionary.
+ * Implementations may also declare a type identifier (e.g. {@code JdbcDictionary.DICTIONARY_TYPE}),
+ * but only the name is exposed through this interface.
+ *
+ * @return simple name for the dictionary
+ */
+ String getName();
+
+ /**
+ * Any single token can exist in zero or more terms in the dictionary.
+ * The implementations in this package consult both the exact-case text of the token
+ * ({@link LookupToken#getText()}) and its lowercased text ({@link LookupToken#getLowerText()}).
+ *
+ * @param lookupToken a single-word token
+ * @return zero or more terms that contain the lookup token.
+ * NOTE(review): this contract does not state whether "no match" is an empty collection or null;
+ * callers should tolerate both until that is confirmed.
+ */
+ Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken );
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,42 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Singleton registry of the {@link CasedDictionary} instances that have been added for lookup.
+ * Dictionaries are identified by name and a given name can be stored only once.
+ * <p>
+ * Both adding and reading synchronize on the internal collection, and the internal collection
+ * itself is never handed out, so the unique-name rule cannot be bypassed by callers.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+public enum DictionaryStore {
+   INSTANCE;
+
+   /**
+    * @return the single store instance
+    */
+   static public DictionaryStore getInstance() {
+      return INSTANCE;
+   }
+
+
+   // Always accessed while holding its own monitor.
+   private final Collection<CasedDictionary> _dictionaries = new ArrayList<>();
+
+   /**
+    * Adds a dictionary unless one with the same name is already stored.
+    *
+    * @param dictionary dictionary to store; its name is read once, before the lock is taken
+    * @return true if the dictionary was added, false if a dictionary with the same name already exists
+    */
+   public boolean addDictionary( final CasedDictionary dictionary ) {
+      final String name = dictionary.getName();
+      synchronized ( _dictionaries ) {
+         final boolean present = _dictionaries.stream()
+                                              .map( CasedDictionary::getName )
+                                              .anyMatch( name::equals );
+         if ( present ) {
+            // Dictionary with given name already exists.
+            return false;
+         }
+         _dictionaries.add( dictionary );
+         return true;
+      }
+   }
+
+
+   /**
+    * Returns a snapshot, not a live view: dictionaries added after this call do not appear in the
+    * returned collection, and modifying the returned collection does not affect the store.
+    * The copy is taken under the same lock used by {@link #addDictionary(CasedDictionary)} so that
+    * iterating the result cannot collide with a concurrent addition.
+    *
+    * @return a copy of the dictionaries stored at the time of the call, in the order they were added
+    */
+   public Collection<CasedDictionary> getDictionaries() {
+      synchronized ( _dictionaries ) {
+         return new ArrayList<>( _dictionaries );
+      }
+   }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,70 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * {@link CasedDictionary} that answers lookups from three maps held in memory,
+ * one each for uppercase, mixed case and lowercase index words.
+ * The maps are used as given to the constructor; they are not copied.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class InMemoryDictionary implements CasedDictionary {
+
+   private final String _name;
+
+   // Map of rare tokens to terms that contain those tokens. Used like "First Word Token Lookup" but faster.
+   // Consulted with the exact token text, only for tokens that are all uppercase.
+   private final Map<String, Collection<CandidateTerm>> _upperTermMap;
+   // Consulted with the exact token text, only for tokens that are neither all uppercase nor all lowercase.
+   private final Map<String, Collection<CandidateTerm>> _mixedTermMap;
+   // Consulted with the lowercased token text whenever the maps above produced nothing.
+   private final Map<String, Collection<CandidateTerm>> _lowerTermMap;
+
+   /**
+    * @param name         unique name for dictionary
+    * @param upperTermMap Map with a case-sensitive Rare Word (tokens) as key, and CandidateTerm Collection as value
+    * @param mixedTermMap Map with a case-sensitive Rare Word (tokens) as key, and CandidateTerm Collection as value
+    * @param lowerTermMap Map with a lowercase Rare Word (tokens) as key, and CandidateTerm Collection as value
+    */
+   public InMemoryDictionary( final String name,
+                              final Map<String, Collection<CandidateTerm>> upperTermMap,
+                              final Map<String, Collection<CandidateTerm>> mixedTermMap,
+                              final Map<String, Collection<CandidateTerm>> lowerTermMap ) {
+      _name = name;
+      _upperTermMap = upperTermMap;
+      _mixedTermMap = mixedTermMap;
+      _lowerTermMap = lowerTermMap;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    * <p>
+    * The first map that has an entry wins: an all-uppercase token is tried in the uppercase map,
+    * a token that is neither all-uppercase nor all-lowercase is tried in the mixed case map,
+    * and anything not found that way is looked up by its lowercased text in the lowercase map.
+    * An all-uppercase token never consults the mixed case map.
+    * </p>
+    *
+    * @return the terms mapped to the token, or an empty collection (never null) when no map has an entry
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      if ( lookupToken.isAllUpperCase() ) {
+         final Collection<CandidateTerm> cased = _upperTermMap.get( lookupToken.getText() );
+         if ( cased != null ) {
+            return cased;
+         }
+      } else if ( !lookupToken.isAllLowerCase() ) {
+         final Collection<CandidateTerm> mixed = _mixedTermMap.get( lookupToken.getText() );
+         if ( mixed != null ) {
+            return mixed;
+         }
+      }
+      final Collection<CandidateTerm> lower = _lowerTermMap.get( lookupToken.getLowerText() );
+      if ( lower == null ) {
+         // Map.get yields null for a missing key; the interface promises "zero or more terms".
+         return java.util.Collections.emptyList();
+      }
+      return lower;
+   }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import static org.apache.ctakes.dictionary.cased.table.column.Synonym.*;
+import static org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil.*;
+
+
+/**
+ * {@link CasedDictionary} that looks up candidate terms in three database tables through JDBC:
+ * one table for uppercase terms, one for mixed case terms and one for lowercase terms.
+ * A prepared statement is created per table when the dictionary is constructed and reused for every lookup.
+ * <p>
+ * NOTE(review): the prepared statements are shared by all callers of an instance and no synchronization
+ * is done in this class - confirm that an instance is only used by one thread at a time.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+final public class JdbcDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "JDBC";
+
+   static private final Logger LOGGER = Logger.getLogger( "JdbcDictionary" );
+
+   private final String _name;
+
+
+   private final PreparedStatement _selectUpperCall;
+   private final PreparedStatement _selectMixedCall;
+   private final PreparedStatement _selectLowerCall;
+
+
+   /**
+    * Creates a dictionary whose connection settings are resolved by {@code getParameterValue} from the
+    * given context, using the keys "driver", "url", "upper", "mixed", "lower", "user" and "pass".
+    * The last argument of each {@code getParameterValue} call is presumably the fallback used when the
+    * context has no value - confirm against its declaration.
+    *
+    * @param name        unique name for dictionary
+    * @param uimaContext context handed to {@code getParameterValue} for each setting
+    * @throws SQLException if a prepared statement cannot be created for one of the tables
+    */
+   public JdbcDictionary( final String name, final UimaContext uimaContext ) throws SQLException {
+      this( name,
+            getParameterValue( name, "driver", uimaContext, HSQL_DRIVER ),
+            getParameterValue( name, "url", uimaContext, "" ),
+            getParameterValue( name, "upper", uimaContext, UPPER_TABLE ),
+            getParameterValue( name, "mixed", uimaContext, MIXED_TABLE ),
+            getParameterValue( name, "lower", uimaContext, LOWER_TABLE ),
+            getParameterValue( name, "user", uimaContext, DEFAULT_USER ),
+            getParameterValue( name, "pass", uimaContext, DEFAULT_PASS ) );
+   }
+
+   /**
+    * Creates one prepared statement per table, in the order upper, mixed, lower,
+    * logging an info message after each statement has been created.
+    *
+    * @param name       unique name for dictionary
+    * @param jdbcDriver driver passed to {@link JdbcUtil#createPreparedStatement}
+    * @param jdbcUrl    database url passed to {@link JdbcUtil#createPreparedStatement}
+    * @param upperName  Name of table containing uppercase-only terms
+    * @param mixedName  Name of table containing mixed case terms
+    * @param lowerName  Name of table containing lowercase-only terms
+    * @param jdbcUser   database user passed to {@link JdbcUtil#createPreparedStatement}
+    * @param jdbcPass   database password passed to {@link JdbcUtil#createPreparedStatement}
+    * @throws SQLException if a prepared statement cannot be created for one of the tables
+    */
+   public JdbcDictionary( final String name,
+                          final String jdbcDriver,
+                          final String jdbcUrl,
+                          final String upperName,
+                          final String mixedName,
+                          final String lowerName,
+                          final String jdbcUser,
+                          final String jdbcPass ) throws SQLException {
+      _name = name;
+      _selectUpperCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, upperName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + upperName );
+      _selectMixedCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, mixedName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + mixedName );
+      _selectLowerCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, lowerName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + lowerName );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    * <p>
+    * Results from every applicable table are combined into one set rather than stopping at the first
+    * table that has a hit: the uppercase table is queried for all-uppercase tokens, the mixed case table
+    * for any token that is not all-lowercase, and the lowercase table always, using the lowercased text.
+    * </p>
+    *
+    * @return a new set holding the combined candidates; empty when no table has a match
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      final Collection<CandidateTerm> candidates = new HashSet<>();
+      if ( lookupToken.isAllUpperCase() ) {
+         candidates.addAll( getUpperTerms( lookupToken.getText() ) );
+      }
+      if ( !lookupToken.isAllLowerCase() ) {
+         candidates.addAll( getMixedTerms( lookupToken.getText() ) );
+      }
+      candidates.addAll( getLowerTerms( lookupToken.getLowerText() ) );
+      return candidates;
+   }
+
+
+   /**
+    * @param text to lookup
+    * @return uppercase candidate terms
+    */
+   public Collection<CandidateTerm> getUpperTerms( final String text ) {
+      return selectTerms( _selectUpperCall, text, true, false );
+   }
+
+   /**
+    * @param text to lookup
+    * @return mixed case candidate terms
+    */
+   public Collection<CandidateTerm> getMixedTerms( final String text ) {
+      return selectTerms( _selectMixedCall, text, false, false );
+   }
+
+
+   /**
+    * @param text to lookup
+    * @return lowercase candidate terms
+    */
+   public Collection<CandidateTerm> getLowerTerms( final String text ) {
+      return selectTerms( _selectLowerCall, text, false, true );
+   }
+
+   /**
+    * Runs one of the prepared select statements for the given text and converts each row to a
+    * {@link CandidateTerm}.
+    * A {@link SQLException} is logged and not rethrown; terms read before the failure are still returned.
+    *
+    * @param selectCall prepared statement for the table to query
+    * @param text       to lookup
+    * @param upperFlag  fifth {@link CandidateTerm} constructor argument; true only for the uppercase table
+    * @param lowerFlag  sixth {@link CandidateTerm} constructor argument; true only for the lowercase table
+    * @return candidate terms built from the rows returned by the query; empty if there are none
+    */
+   private Collection<CandidateTerm> selectTerms( final PreparedStatement selectCall,
+                                                  final String text,
+                                                  final boolean upperFlag,
+                                                  final boolean lowerFlag ) {
+      final List<CandidateTerm> candidateTerms = new ArrayList<>();
+      try {
+         JdbcUtil.fillSelectCall( selectCall, text );
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ... historically some drivers have not done so.
+         // try-with-resources closes the ResultSet even when reading a row fails.
+         try ( ResultSet resultSet = selectCall.executeQuery() ) {
+            while ( resultSet.next() ) {
+               final CandidateTerm candidateTerm = new CandidateTerm(
+                     resultSet.getLong( CUI.getColumn() ),
+                     resultSet.getString( PREFIX.getColumn() ),
+                     resultSet.getString( INDEX_WORD.getColumn() ),
+                     resultSet.getString( SUFFIX.getColumn() ),
+                     upperFlag,
+                     lowerFlag,
+                     resultSet.getInt( RANK.getColumn() ),
+                     resultSet.getInt( INSTANCES.getColumn() ) );
+               candidateTerms.add( candidateTerm );
+            }
+         }
+      } catch ( SQLException e ) {
+         // The looked-up text is left out of the message on purpose; the exception is passed for its stack trace.
+         LOGGER.error( "Lookup failed in dictionary " + _name + ": " + e.getMessage(), e );
+      }
+      return candidateTerms;
+   }
+
+
+}