You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/07/23 19:59:24 UTC
svn commit: r1692423 -
/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
Author: seanfinan
Date: Thu Jul 23 17:59:23 2015
New Revision: 1692423
URL: http://svn.apache.org/r1692423
Log:
CTAKES-371 Automatic pre-tokenization of custom dictionary terms is now closer to the cTAKES PTB (Penn Treebank style) tokenizer
Modified:
ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java?rev=1692423&r1=1692422&r2=1692423&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java Thu Jul 23 17:59:23 2015
@@ -25,10 +25,7 @@ import org.apache.ctakes.dictionary.look
import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
import org.apache.log4j.Logger;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
/**
* Given a collection of {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordTermMapCreator.CuiTerm} Objects,
@@ -47,7 +44,7 @@ final public class RareWordTermMapCreato
private RareWordTermMapCreator() {
}
- static private final String[] PREFIXES = {
+ static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
"e-",
"a-",
"u-",
@@ -101,10 +98,23 @@ final public class RareWordTermMapCreato
"homo-",
"hetero-",
"ortho-",
- "phospho-",
- };
- static private final String[] SUFFIXES = { "-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
- "-o-torium", "-rama", "-wise" };
+ "phospho-" ) );
+// static private final String[] SUFFIXES = { "-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
+// "-o-torium", "-rama", "-wise" };
+
+ static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
+ "-esque",
+ "-ette",
+ "-fest",
+ "-fold",
+ "-gate",
+ "-itis",
+ "-less",
+ "-most",
+ "-o-torium",
+ "-rama",
+ "-wise" ) );
+
// LookupDesc for the standard excluded pos tags are
// VB,VBD,VBG,VBN,VBP,VBZ,CC,CD,DT,EX,LS,MD,PDT,POS,PP,PP$,PRP,PRP$,RP,TO,WDT,WP,WPS,WRB
@@ -113,7 +123,7 @@ final public class RareWordTermMapCreato
// CD, CC, DT, EX, MD, PDT, PP, PP$, PRP, PRP$, RP, TO, WDT, WP, WPS, WRB
// why not WP$ (possessive wh- pronoun "whose")
// PP$ is a Brown POS tag, not Penn Treebank (as are the rest)
- static private final String[] BAD_POS_TERMS = {
+ static private final Collection<String> BAD_POS_TERMS = new HashSet<>( Arrays.asList(
// CD cardinal number
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
// CC coordinating conjunction
@@ -146,23 +156,23 @@ final public class RareWordTermMapCreato
// WP, WPS wh- pronoun, nominative wh- pronoun
"who", "whom", "which", "that", "whoever", "whomever",
// WRB
- "how", "where", "when", "however", "wherever", "whenever",
- };
+ "how", "where", "when", "however", "wherever", "whenever" ) );
static public CollectionMap<String, RareWordTerm, List<RareWordTerm>> createRareWordTermMap(
final Iterable<CuiTerm> cuiTerms ) {
final CollectionMap<String, RareWordTerm, List<RareWordTerm>> rareWordTermMap = new ArrayListMap<>();
final Map<String, Integer> tokenCountMap = createTokenCountMap( cuiTerms );
for ( CuiTerm cuiTerm : cuiTerms ) {
- final String rareWord = getRareWord( cuiTerm.getTerm(), tokenCountMap );
- final int wordIndex = getWordIndex( cuiTerm.getTerm(), rareWord );
- final int tokenCount = getTokenCount( cuiTerm.getTerm() );
+ final String term = cuiTerm.getTerm();
+ final String rareWord = getRareWord( term, tokenCountMap );
+ final int wordIndex = getWordIndex( term, rareWord );
+ final int tokenCount = getTokenCount( term );
if ( wordIndex < 0 ) {
- LOGGER.warn( "Bad Rare Word Index for " + rareWord + " in " + cuiTerm.getTerm() );
+ LOGGER.warn( "Bad Rare Word Index for " + rareWord + " in " + term );
continue;
}
- rareWordTermMap.placeValue( rareWord, new RareWordTerm( cuiTerm.getTerm(), cuiTerm.__cui,
- rareWord, wordIndex, tokenCount ) );
+ final RareWordTerm rareWordTerm = new RareWordTerm( term, cuiTerm.__cui, rareWord, wordIndex, tokenCount );
+ rareWordTermMap.placeValue( rareWord, rareWordTerm );
}
return rareWordTermMap;
}
@@ -218,12 +228,7 @@ final public class RareWordTermMapCreato
if ( !hasLetter ) {
return false;
}
- for ( String badPosTerm : BAD_POS_TERMS ) {
- if ( token.equals( badPosTerm ) ) {
- return false;
- }
- }
- return true;
+ return !BAD_POS_TERMS.contains( token );
}
static private int getWordIndex( final String tokenizedTerm, final String word ) {
@@ -282,26 +287,22 @@ final public class RareWordTermMapCreato
sb.append( c );
continue;
}
- if ( c != '-' ) {
+ if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
+ // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
+ sb.append( c );
+ continue;
+ }
+ if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
+ || (c == '.' && isNumberDecimal( word, i + 1 )) ) {
+ // what follows is a possessive 's or a single decimal digit (.#), so emit the preceding text as its own token, start a new token with this symbol, and move on
if ( sb.length() != 0 ) {
tokens.add( sb.toString() );
sb.setLength( 0 );
}
- tokens.add( "" + c );
- continue;
- }
- final boolean isPrefix = isPrefix( sb.toString() );
- if ( isPrefix ) {
- // what precedes is a prefix, so append the dash and move on
- sb.append( '-' );
- continue;
- }
- final boolean isSuffix = isSuffix( word, i + 1 );
- if ( isSuffix ) {
- // what follows is a suffix, so append the dash and move on
- sb.append( '-' );
+ sb.append( c );
continue;
}
+ // Not one of the special symbols handled above, so add the preceding text and the symbol as two separate tokens
if ( sb.length() != 0 ) {
tokens.add( sb.toString() );
sb.setLength( 0 );
@@ -315,30 +316,27 @@ final public class RareWordTermMapCreato
}
static private boolean isPrefix( final String word ) {
- final String prefixQ = word + "-";
- for ( String prefix : PREFIXES ) {
- if ( prefix.equals( prefixQ ) ) {
- return true;
- }
- }
- return false;
+ return PREFIXES.contains( word + "-" );
}
static private boolean isSuffix( final String word, final int startIndex ) {
- if ( word.length() >= startIndex ) {
+ if ( word.length() <= startIndex ) {
return false;
}
final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
if ( nextCharTerm.isEmpty() ) {
return false;
}
- final String suffixQ = "-" + nextCharTerm;
- for ( String suffix : SUFFIXES ) {
- if ( suffix.equals( suffixQ ) ) {
- return true;
- }
- }
- return false;
+ return SUFFIXES.contains( "-" + nextCharTerm );
+ }
+
+ static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
+ return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
+ }
+
+ static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
+ // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
+ return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
}
static private String getNextCharTerm( final String word ) {