You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2020/09/25 00:59:37 UTC
svn commit: r1881994 [3/3] - in
/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased:
./ ae/ annotation/ dictionary/ encoder/ lookup/ table/ table/column/ util/
util/bsv/ util/jdbc/ util/textspan/ util/tokenize/ ...
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.cased.util.tokenize;
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.log4j.Logger;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * Given a collection of {@link TokenizedTerm} Objects, this factory sorts each term by letter case
+ * into one of three Maps of {@link CandidateTerm} collections, each indexed by the term's rare word.
+ * The rare word of a term is its eligible token that occurs least often across all of the given terms.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class TokenizedTermMapper {
+
+   static private final Logger LOGGER = Logger.getLogger( "TokenizedTermMapper" );
+
+   private TokenizedTermMapper() {
+   }
+
+
+   // The standard excluded pos tags (per the LookupDesc) are
+   // VB,VBD,VBG,VBN,VBP,VBZ,CC,CD,DT,EX,LS,MD,PDT,POS,PP,PP$,PRP,PRP$,RP,TO,WDT,WP,WPS,WRB
+   // Listing every verb in the language seems a pain, but listing the others is possible.
+   // Verbs should be rare in the dictionaries, excepting perhaps the activity and concept dictionaries.
+   // Listed here: CD, CC, DT, EX, IN, MD, PDT, PP, PP$, PRP, PRP$, RP, TO, WDT, WP, WPS, WP$, WRB
+   // IN and WP$ (possessive wh- pronoun "whose") are listed although they are not standard exclusions.
+   // PP$ is a Brown POS tag, not Penn Treebank (as are the rest)
+   static private final Collection<String> BAD_POS_TERMS = Collections.unmodifiableSet( new HashSet<>( Arrays.asList(
+         // CD cardinal number
+         "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
+         // CC coordinating conjunction
+         "and", "or", "but", "for", "nor", "so", "yet", "both",
+         // DT determiner
+         "this", "that", "these", "those", "the", "all", "an", "another", "any", "each",
+         "either", "many", "much", "neither", "no", "some", "such", "that", "the", "them", "these", "this", "those",
+         // EX existential there
+         "there",
+         // IN preposition or subordinating conjunction
+         "among", "upon", "in", "into", "below", "atop", "until", "over", "under", "towards", "to",
+         "whether", "despite", "if",
+         // MD modal
+         "can", "should", "will", "may", "might", "must", "could", "would", "need", "ought", "shall",
+         "cannot", "shouldn",
+         // PDT predeterminer
+         "some", "any", "all", "both", "half", "none", "twice",
+         // PP prepositional phrase (preposition)
+         "at", "before", "after", "behind", "beneath", "beside", "between", "into", "through", "across", "of",
+         "concerning", "like", "except", "with", "without", "toward", "to", "past", "against", "during", "until",
+         "throughout", "below", "besides", "beyond", "from", "inside", "near", "outside", "since", "upon",
+         // PP$ possessive personal pronoun - Brown POS tag, not Penn TreeBank
+         "my", "our",
+         // PRP personal pronoun
+         "i", "you", "he", "she", "it", "him", "himself", "we",
+         // PRP$ possessive pronoun
+         "mine", "yours", "his", "hers", "its", "our", "ours", "theirs",
+         // RP particle - this contains some prepositions
+         "about", "off", "up", "along", "away", "back", "by", "down", "forward", "in", "on", "out",
+         "over", "around", "under",
+         // TO to - also a preposition
+         "to",
+         // WDT wh- determiner
+         "what", "whatever", "which", "whichever", "that",
+         // WP, WPS, WP$ wh- pronoun, nominative wh- pronoun, possessive wh- pronoun
+         "who", "whom", "which", "that", "whoever", "whomever", "whose",
+         // WRB wh- adverb
+         "how", "where", "when", "however", "wherever", "whenever", "wherein", "why" ) ) );
+
+   // Upper case copies of BAD_POS_TERMS.  Locale.ROOT keeps the casing independent of the default locale.
+   static private final Collection<String> BAD_UPPER_POS_TERMS
+         = BAD_POS_TERMS.stream()
+                        .map( term -> term.toUpperCase( Locale.ROOT ) )
+                        .collect( Collectors.toSet() );
+
+
+   /**
+    * Sorts the given terms by case into the three given maps, keying each term on its rare word.
+    * A term that {@link TokenizedTerm#isAllUpperCase()} goes to upperTerms, otherwise a term that
+    * {@link TokenizedTerm#isAllLowerCase()} goes to lowerTerms, and any other term goes to mixedTerms.
+    * A term without tokens has no rare word; it is logged with a warning and skipped.
+    *
+    * @param tokenizedTerms terms to index.
+    * @param upperTerms     receives candidate terms for all upper case terms, keyed by rare word.
+    * @param mixedTerms     receives candidate terms for mixed case terms, keyed by rare word.
+    * @param lowerTerms     receives candidate terms for all lower case terms, keyed by rare word.
+    */
+   static public void createTermMap( final Collection<TokenizedTerm> tokenizedTerms,
+                                     final Map<String, Collection<CandidateTerm>> upperTerms,
+                                     final Map<String, Collection<CandidateTerm>> mixedTerms,
+                                     final Map<String, Collection<CandidateTerm>> lowerTerms ) {
+      final Map<String, Long> tokenCountMap = createTokenCountMap( tokenizedTerms );
+      for ( TokenizedTerm tokenizedTerm : tokenizedTerms ) {
+         final String[] tokens = tokenizedTerm.getTokens();
+         final int rareWordIndex = getRareWordIndex( tokens, tokenCountMap );
+         if ( rareWordIndex < 0 ) {
+            LOGGER.warn( "Bad Rare Word Index for " + String.join( " ", tokens ) );
+            continue;
+         }
+         final Map<String, Collection<CandidateTerm>> termMap
+               = tokenizedTerm.isAllUpperCase() ? upperTerms
+                 : tokenizedTerm.isAllLowerCase() ? lowerTerms : mixedTerms;
+         termMap.computeIfAbsent( tokens[ rareWordIndex ], l -> new ArrayList<>() )
+                .add( new CandidateTerm( tokenizedTerm, rareWordIndex ) );
+      }
+   }
+
+
+   /**
+    * Counts how often each rare word eligible token occurs across all of the given terms.
+    *
+    * @param tokenizedTerms terms whose tokens should be counted.
+    * @return occurrence counts keyed by token; tokens rejected by {@link #isRarableToken} are absent.
+    */
+   static private Map<String, Long> createTokenCountMap( final Collection<TokenizedTerm> tokenizedTerms ) {
+      return tokenizedTerms.stream()
+                           .map( TokenizedTerm::getTokens )
+                           .flatMap( Arrays::stream )
+                           .filter( TokenizedTermMapper::isRarableToken )
+                           .collect( Collectors.groupingBy( Function.identity(), Collectors.counting() ) );
+   }
+
+
+   /**
+    * Finds the position of the token that should be used as the index key of a term.
+    *
+    * @param tokens        tokens of a single term.
+    * @param tokenCountMap occurrence counts of eligible tokens, from {@link #createTokenCountMap}.
+    * @return index of the eligible token with the lowest count, the first such token winning ties;
+    * 0 if there is only one token or if no token is eligible; -1 if there are no tokens at all.
+    */
+   static private int getRareWordIndex( final String[] tokens, final Map<String, Long> tokenCountMap ) {
+      if ( tokens.length == 0 ) {
+         // Nothing to index.  The caller logs and skips a negative index instead of failing on tokens[ 0 ].
+         return -1;
+      }
+      int bestIndex = 0;
+      // Counts are longs, so start from the long maximum rather than the int maximum.
+      long bestCount = Long.MAX_VALUE;
+      for ( int i = 0; i < tokens.length; i++ ) {
+         final Long count = isRarableToken( tokens[ i ] ) ? tokenCountMap.get( tokens[ i ] ) : null;
+         if ( count != null && count < bestCount ) {
+            bestIndex = i;
+            bestCount = count;
+         }
+      }
+      return bestIndex;
+   }
+
+
+   /**
+    * @param token a single token of a term.
+    * @return true if the token is longer than one character, contains at least one letter
+    * and is not one of the excluded part of speech terms in its lower case or upper case form.
+    */
+   static private boolean isRarableToken( final String token ) {
+      if ( token.length() <= 1 || token.chars().noneMatch( Character::isLetter ) ) {
+         return false;
+      }
+      return !BAD_POS_TERMS.contains( token ) && !BAD_UPPER_POS_TERMS.contains( token );
+   }
+
+
+}
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,58 @@
+package org.apache.ctakes.dictionary.cased.wsd;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+
+import java.util.*;
+import java.util.function.Function;
+
+/**
+ * Word sense disambiguation among terms that were discovered on the same text span.
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/23/2020
+ */
+final public class WsdUtil {
+
+   private WsdUtil() {
+   }
+
+   // Ranking scores for terms that share a span.  A higher score is better.
+   static private final Function<DiscoveredTerm, Integer> caseCompared = d -> d.matchesLookupCase() ? 1 : 0;
+   static private final Function<DiscoveredTerm, Integer> skipCompared = d -> 100 - d.getTotalSkips();
+   static private final Function<DiscoveredTerm, Integer> consecutiveSkipCompared = d -> 100 - d.getConsecutiveSkips();
+   static private final Function<DiscoveredTerm, Integer> rankCompared = d -> 1000 - d.getRank();
+   static private final Comparator<DiscoveredTerm> WORST_TO_BEST
+         = Comparator.comparing( caseCompared ).thenComparing( skipCompared )
+                     .thenComparing( consecutiveSkipCompared )
+                     .thenComparing( DiscoveredTerm::getInstances ).thenComparing( rankCompared );
+
+   /**
+    * Finds, for every span covered by two or more of the given terms, the terms that lost to the best one.
+    * @param semanticTerms terms to disambiguate.
+    * @param termSpanMap   spans of each term.  A term without an entry is skipped rather than failing.
+    * @return for each span with two or more terms, all of its terms except the single best term.
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> getSemanticWsdSpanTerms(
+         final Collection<DiscoveredTerm> semanticTerms,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> spanTermsMap = new HashMap<>();
+      for ( DiscoveredTerm term : semanticTerms ) {
+         for ( MagicTextSpan span : termSpanMap.getOrDefault( term, Collections.emptyList() ) ) {
+            spanTermsMap.computeIfAbsent( span, s -> new HashSet<>() ).add( term );
+         }
+      }
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> wsdRemovals = new HashMap<>();
+      for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> spanTerms : spanTermsMap.entrySet() ) {
+         final Collection<DiscoveredTerm> terms = spanTerms.getValue();
+         if ( terms.size() < 2 ) {
+            continue;
+         }
+         final Collection<DiscoveredTerm> losers = new HashSet<>( terms );
+         terms.stream().max( WORST_TO_BEST ).ifPresent( losers::remove );
+         wsdRemovals.put( spanTerms.getKey(), losers );
+      }
+      return wsdRemovals;
+   }
+}