You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/09/16 19:43:50 UTC
svn commit: r1703438 [2/2] - in /ctakes/sandbox/dictionarytool/src:
META-INF/ org/apache/ctakes/dictionarytool/
org/apache/ctakes/dictionarytool/reader/
org/apache/ctakes/dictionarytool/util/
org/apache/ctakes/dictionarytool/util/token/ org/apache/ctak...
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,1316 @@
+package org.apache.ctakes.dictionarytool.util.token;
+
+import java.util.*;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static java.lang.Character.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/15/2015
+ */
+public class TextTokenizerCtakesPTB {
+
+ // Logger registered under the plain class name.
+ static private final Logger LOGGER = Logger.getLogger( "TextTokenizerCtakesPTB" );
+
+
+ // Sentinel used for a token length that has not been determined yet.
+ static private final int NOT_SET_INDICATOR = Integer.MIN_VALUE;
+
+
+ // Characters and character sequences that get special treatment while tokenizing.
+ // NOTE(review): DASH and HYPHEN_OR_MINUS_SIGN are the same character.
+ static private final char DASH = '-';
+ static final char APOSTROPHE = '\'';
+ static private final char PERIOD = '.';
+ static private final char HYPHEN_OR_MINUS_SIGN = '-';
+ static private final char NEWLINE = '\n';
+ static private final char CR = '\r';
+ static private final char COMMA = ',';
+ static private final String ELLIPSIS = "...";
+
+
+ static public String getTokenizedText( final String text ) {
+ if ( text.isEmpty() ) {
+ return text;
+ }
+ final List<String> tokens = tokenizeTextSegment( text );
+ final StringBuilder sb = new StringBuilder();
+ for ( String token : tokens ) {
+ sb.append( token ).append( " " );
+ }
+ // trim whitespace
+ sb.setLength( Math.max( 0, sb.length() - 1 ) );
+ return sb.toString();
+ }
+
+
+ /**
+  * Tokenizes a segment of text using Penn Treebank style rules.
+  * The segment is trimmed and lower-cased first, so all returned tokens are lower case.
+  *
+  * @param textSegment the text to tokenize, may be null
+  * @return the tokens in text order; an empty list if the text is null or only whitespace
+  * @throws RuntimeException if no token of positive length can be determined at some position
+  */
+ static public List<String> tokenizeTextSegment( final String textSegment ) {
+    if ( textSegment == null || textSegment.trim().isEmpty() ) {
+       return Collections.emptyList();
+    }
+
+    final List<String> tokenList = new ArrayList<>();
+    final String lowerCaseText = textSegment.trim().toLowerCase();
+    // find first character of a token
+    int currentPosition = findFirstCharOfNextToken( lowerCaseText, 0 );
+    while ( currentPosition >= 0 ) {
+       // number of characters consumed starting at currentPosition; 1 covers all one-character tokens
+       int tokenLength = 1;
+       // true when getLetterOrDigitTokenLength has already appended the token(s) it found
+       boolean tokensAlreadyAdded = false;
+       // a character at the end of the input or followed by whitespace is a one-character token
+       final boolean isLoneCharacter = currentPosition + 1 >= lowerCaseText.length()
+                                       || isWhitespace( lowerCaseText.charAt( currentPosition + 1 ) );
+       if ( !isLoneCharacter ) {
+          final char firstCharOfToken = lowerCaseText.charAt( currentPosition );
+          final int symbolSuffixLength = getSymbolSuffixLength( firstCharOfToken, lowerCaseText, currentPosition );
+          if ( symbolSuffixLength >= 0 ) {
+             // period, hyphen/minus or apostrophe
+             tokenLength = symbolSuffixLength;
+          } else if ( !isPunctuation( firstCharOfToken ) && isLetterOrDigit( firstCharOfToken ) ) {
+             // words, numbers, contractions, urls ... ; the helper appends its own tokens
+             tokenLength = getLetterOrDigitTokenLength( tokenList, lowerCaseText, currentPosition, NOT_SET_INDICATOR );
+             tokensAlreadyAdded = true;
+          }
+          // else some other punctuation or symbol.  Since not processing 'web-text', no need to look
+          // for things like :) so it is a 1-character token
+       }
+       // a non-positive length would never advance currentPosition, so fail instead of looping forever
+       if ( tokenLength < 1 ) {
+          throw new RuntimeException( "tokenLength = " + tokenLength + " currentPosition = " + currentPosition );
+       }
+       if ( !tokensAlreadyAdded ) {
+          tokenList.add( lowerCaseText.substring( currentPosition, currentPosition + tokenLength ) );
+       }
+       currentPosition = findFirstCharOfNextToken( lowerCaseText, currentPosition + tokenLength );
+    }
+    return tokenList;
+ }
+
+
+ /**
+  * Determines the length of a token that starts with a period, a hyphen/minus sign or an apostrophe.
+  *
+  * @param firstCharOfToken the character at currentPosition
+  * @param lowerCasedText   text being tokenized
+  * @param currentPosition  index in lowerCasedText of firstCharOfToken
+  * @return the token length, or -1 if firstCharOfToken is none of the three characters handled here
+  */
+ static private int getSymbolSuffixLength( final char firstCharOfToken,
+                                           final String lowerCasedText, final int currentPosition ) {
+    switch ( firstCharOfToken ) {
+       case PERIOD: {
+          // check if decimal number without the leading digits
+          final int numberLength = getLengthIfIsNumberThatStartsWithPeriod( currentPosition, lowerCasedText );
+          if ( numberLength > 0 ) {
+             return numberLength;
+          }
+          if ( isEllipsis( currentPosition, lowerCasedText ) ) {
+             return ELLIPSIS.length();
+          }
+          // Abbreviation does not start with period, and not part of some other token, so it is punctuation
+          return 1;
+       }
+       case HYPHEN_OR_MINUS_SIGN: {
+          // If it's the first character of a token, then this is not a hyphenated term that
+          // was supposed to be kept as one token, or we would have included it in the previous token.
+          // Also telephone numbers do not start with a dash.
+          // So assume the hyphen/dash char is a one-character token like in 5-6 or in -400
+          return 1;
+       }
+       case APOSTROPHE: {
+          // "can't" is not part of this case because the n is the start of the second token.
+          // The 've part of should've is not handled here, when something like should've or he'll
+          // is found, 2 tokens are created (elsewhere).
+          // Check if start of a Name
+          final int nameLength = getLengthIfNameStartingWithApostrophe( currentPosition, lowerCasedText );
+          if ( nameLength > 0 ) {
+             return nameLength;
+          }
+          if ( isContractionThatStartsWithApostrophe( currentPosition, lowerCasedText ) ) {
+             // 'tis and 'twas which get tokenized as "'t is" and "'t was";
+             // the "is" or "was" part will become a token on the next iteration
+             return 2;
+          }
+          // is separate punctuation mark
+          return 1;
+       }
+    }
+    return -1;
+ }
+
+
+ /**
+  * Handles text whose first character is a letter or a digit.
+  * Every token found is appended to tokenList.  That is usually a single token, but contractions
+  * such as "cannot" or "can't" are split into several tokens.
+  *
+  * @param tokenList          list to which the token(s) found are appended
+  * @param textSegment        text being tokenized; the caller passes it already lower-cased
+  * @param currentIndex       index in textSegment of the first character to handle
+  * @param currentTokenLength token length known so far, NOT_SET_INDICATOR if none
+  * @return the total number of characters covered by the appended token(s), counted from currentIndex
+  * @throws RuntimeException if a negative token length is computed
+  */
+ static private int getLetterOrDigitTokenLength( final List<String> tokenList,
+                                                 final String textSegment, final int currentIndex,
+                                                 final int currentTokenLength ) {
+    final String lowerCasedText = textSegment;
+    int tokenLength = currentTokenLength;
+    // First check the easy case - if just letters and digits until next whitespace (or until end of segment)
+    // then that is a word or a number, can skip all the other logic to check for hyphens
+    // or contractions etc
+    final TokenIndexHolder tokenIndexHolder = new TokenIndexHolder( lowerCasedText, currentIndex );
+
+    // start of the token that is still to be appended; moves forward when leading pieces are split off
+    int currentPosition = currentIndex;
+    if ( tokenIndexHolder.obviouslyIsNumber ) {
+       tokenLength = tokenIndexHolder.nextWhitespaceOrEndOfSegment - currentPosition;
+    } else if ( tokenIndexHolder.obviouslyIsWord ) {
+       // Check for things like "cannot" and "gonna" that appear to be one token but
+       // are supposed to be more than one according to PTB rules.
+       final String lowerCasedSubstring
+             = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+       int len = lenOfFirstTokenInContraction( lowerCasedSubstring );
+       if ( len > 0 ) {
+          // is a contraction that doesn't contain an apostrophe, like "gonna": append the first part now
+          tokenLength = len;
+          tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+          currentPosition += tokenLength;
+          len = lenOfSecondTokenInContraction( lowerCasedSubstring );
+          tokenLength = len;
+          len = lenOfThirdTokenInContraction( lowerCasedSubstring );
+          if ( len > 0 ) { // if there is a 3rd, append the 2nd and leave the 3rd as the final token
+             tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+             currentPosition += tokenLength;
+             tokenLength = len;
+          }
+       } else {
+          tokenLength = tokenIndexHolder.nextWhitespaceOrEndOfSegment - currentPosition;
+       }
+    } else {
+       // Starts with a letter or digit but is not obviously a number or a word.
+       // The token could extend to the end of an email address or URL, through the next period
+       // (abbreviation), to the next hyphen or beyond, to the next apostrophe (most contractions),
+       // until "n't", to the next other punctuation symbol or beyond (80's),
+       // or could include some punctuation like 3,245.51
+       int len;
+
+       // Need to check for things like 80's before checking for contractions or else 80's looks like a contraction
+       if ( tokenIndexHolder.nextNonLetterOrNonDigit < lowerCasedText.length()
+            && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterOrNonDigit ) == APOSTROPHE ) {
+          final String lowerCasedSubstring
+                = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+          len = tokenLengthCheckingForSingleQuoteWordsToKeepTogether( lowerCasedSubstring );
+          if ( len > tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition ) {
+             // if keeping the apostrophe attached
+             tokenLength = len;
+          }
+          // else let contraction checking later determine what to do
+       }
+       if ( tokenLength == NOT_SET_INDICATOR ) { // not found yet
+          final LengthPair lengthPair
+                = getLengthIfNextApostIsMiddleOfContraction( currentPosition,
+                tokenIndexHolder.nextNonLetterOrNonDigit, lowerCasedText );
+          if ( lengthPair != null ) {
+             len = lengthPair.getRootLength();
+             tokenLength = len;
+             final char c = lowerCasedText.charAt( currentPosition + len );
+             if ( c == 'n' || c == APOSTROPHE ) {
+                // if a "n't" contraction or a contraction where contraction token starts with '
+                if ( tokenLength < 0 ) {
+                   throw new RuntimeException( "c = " + c + "tokenLength = " + tokenLength
+                                               + " currentPosition = " + currentPosition );
+                }
+                // First append the word part (no apostrophe)
+                if ( tokenLength > 0 ) {
+                   tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+                   currentPosition += tokenLength;
+                }
+                // The contraction part ("n't", "'ve", ...) becomes the final token of this call,
+                // so that "n't" does not have to be recognized again on a later pass
+                tokenLength = lengthPair.getSuffixLength();
+             } else {
+                throw new RuntimeException(
+                      "ERROR: getLengthIfNextApostIsMiddleOfContraction returned " + len + " but the character (" +
+                      c + ") after that is not 'n' or apostrophe " );
+             }
+          } else {
+             len = getCodificationLength( currentPosition, lowerCasedText,
+                   tokenIndexHolder.nextNonTelephoneOrPostalChar, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+             if ( len > 0 ) {
+                tokenLength = len;
+             } else {
+                // Not obviously a word or number, and not a telephone number, postal code, Url,
+                // EmailAddress, or Abbreviation.
+                // There could be a hyphen or a symbol before the next white space,
+                // or apostrophe like in 80's or P'yongyang (one token each) or James' or Ted's (2 tokens each)
+                if ( tokenIndexHolder.nextNonLetterOrNonDigit < lowerCasedText.length()
+                     && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterOrNonDigit ) == HYPHEN_OR_MINUS_SIGN ) {
+                   // telephone numbers and postal codes handled above already
+                   final String lowerCasedSubstring
+                         = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+                   len = tokenLengthCheckingForHyphenatedTerms( lowerCasedSubstring );
+                   tokenLength = len;
+                   if ( tokenLength < 0 ) {
+                      throw new RuntimeException(
+                            "tokenLength = " + tokenLength + " currentPosition = " + currentPosition +
+                            " nextNonLetterOrNonDigit = " + tokenIndexHolder.nextNonLetterOrNonDigit );
+                   }
+                } else if ( tokenIndexHolder.nextNonNumericChar > 0
+                            && (len = lenIfIsNumberContainingComma( currentPosition, lowerCasedText,
+                      tokenIndexHolder.nextNonNumericChar )) > 0 ) {
+                   tokenLength = len;
+                } else if ( tokenIndexHolder.nextNonLetterDigitApostrophe < lowerCasedText.length()
+                            && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterDigitApostrophe ) == PERIOD ) {
+                   // see if is a number with a decimal place (comma-containing numbers are handled above)
+                   if ( tokenIndexHolder.nextNonDigit == lowerCasedText.length() - 1 ) {
+                      // end of sentence, don't include the period as part of the number,
+                      // count it as end of sentence marker (punctuation)
+                      tokenLength = tokenIndexHolder.nextNonDigit - currentPosition;
+                   } else if ( tokenIndexHolder.nextNonLetterDigitApostrophe == tokenIndexHolder.nextNonDigit ) {
+                      // if not end of sentence, do include period (decimal point) in the number
+                      tokenLength = tokenIndexHolder.nextNonDigit + 1
+                                    + getLenToNextNonDigit( lowerCasedText, tokenIndexHolder.nextNonDigit + 1 )
+                                    - currentPosition;
+                   } else {
+                      // something like 2J3. which is not a number or 2'3.
+                      tokenLength = tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition;
+                   }
+                } else {
+                   // breaking character is not - character and not ' character, so stop there
+                   tokenLength = tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition;
+                }
+             }
+          }
+       }
+    }
+    if ( tokenLength < 0 ) {
+       throw new RuntimeException( "tokenLength = " + tokenLength + " currentPosition = " + currentPosition );
+    }
+    // append the final (often the only) token; never append an empty token
+    if ( tokenLength > 0 ) {
+       tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+    }
+    // report everything that was consumed, including pieces split off above
+    return currentPosition + tokenLength - currentIndex;
+ }
+
+
+
+ /**
+  * Tries, in this order, telephone number, postal code, url, email address and abbreviation
+  * and reports the length of the first one that applies.
+  *
+  * @param currentPosition              index of the first character of the candidate token
+  * @param lowerCasedText               text being tokenized
+  * @param nextNonTelephoneOrPostalChar end index used for the telephone number and postal code checks
+  * @param nextWhitespaceOrEndOfSegment end index used for the url, email address and abbreviation checks
+  * @return length of the first kind of token recognized, or -1 if none is
+  */
+ static private int getCodificationLength( final int currentPosition, final String lowerCasedText,
+                                           final int nextNonTelephoneOrPostalChar,
+                                           final int nextWhitespaceOrEndOfSegment ) {
+    int length = lenIfIsTelephoneNumber( currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar );
+    if ( length <= 0 ) {
+       length = lenIfIsPostalCode( currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar );
+    }
+    if ( length <= 0 ) {
+       length = lenIfIsUrl( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+    }
+    if ( length <= 0 ) {
+       length = lenIfIsEmailAddress( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+    }
+    if ( length <= 0 ) {
+       length = lenIfIsAbbreviation( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+    }
+    return length > 0 ? length : -1;
+ }
+
+ /**
+  * Determines the length of a number containing commas, such as 4,012.67 or 4,000,153,
+  * that starts at currentPosition.
+  * A comma is only kept as part of the number when it is followed by exactly three digits.
+  * A decimal part is included when a period directly follows the whole-number part
+  * and that period is not the last character of text.
+  *
+  * @param currentPosition index in text of the first character of the candidate number
+  * @param text text being tokenized
+  * @param nextNonNumericChar exclusive end index of the span that is examined
+  * @return length of the number, or -1 if the examined span has no comma that can be part of a number
+  */
+ static private int lenIfIsNumberContainingComma( int currentPosition, String text, int nextNonNumericChar ) {
+ final String s = text.substring( 0, nextNonNumericChar ); // use substring so don't search until end of entire document
+ final int commaPosition = s.indexOf( COMMA, currentPosition );
+ if ( commaPosition < 0 || commaPosition > nextNonNumericChar ) {
+ return -1;
+ }
+ int len = -1;
+
+ // the whole-number part ends at the first period, or at the end of the examined span if there is none
+ final int periodPosition = s.indexOf( PERIOD, currentPosition );
+ int endOfWholeNumberPart = periodPosition;
+ if ( endOfWholeNumberPart < 0 ) {
+ endOfWholeNumberPart = s.length();
+ }
+ // the whole number part can contain commas as long as there are exactly 3 digits after each comma
+ if ( commaPosition == 0 || commaPosition > endOfWholeNumberPart ) {
+ return -1; // if comma is start or appears after the decimal point, then no commas in the whole-number-part
+ }
+ int position = commaPosition;
+
+ boolean didNotFindExactlyThreeDigitsAfterComma = false;
+
+ // each pass first records the length accepted so far, then tries to accept one more ",ddd" group
+ while ( !didNotFindExactlyThreeDigitsAfterComma ) {
+ len = position - currentPosition; // don't include the comma unless also can include next 3 digits
+ if ( position < endOfWholeNumberPart && s.charAt( position ) == COMMA ) {
+ position++;
+ }
+ for ( int i = 0; i < 3; i++ ) { // 3 digits after the comma if comma is part of a number
+ if ( position < endOfWholeNumberPart && isDigit( s.charAt( position ) ) ) {
+ position++;
+ } else {
+ didNotFindExactlyThreeDigitsAfterComma = true;
+ }
+ }
+ if ( position < endOfWholeNumberPart && isDigit( s.charAt( position ) ) ) {
+ // can't have 4 digits after comma like 3,4567
+ didNotFindExactlyThreeDigitsAfterComma = true;
+ }
+ }
+ if ( len <= 0 ) {
+ return -1;
+ }
+ // See if there is a decimal point that can continue the number, such as 3,456.56 or 4,012.
+ // But if the sentences ends with the period that follows the whole_number_part, count it as the sentence marker
+ // not as part of the number
+ if ( periodPosition != text.length() - 1 // not the final period of a sentence
+ && periodPosition == currentPosition + len ) { // but the period does appear right after the whole_number_part
+ len++;
+ // take the digits after the decimal point
+ while ( len < nextNonNumericChar - currentPosition && isDigit( s.charAt( currentPosition + len ) ) ) {
+ len++;
+ }
+ }
+ return len;
+ }
+
+
+// static private final Pattern TEN_DIGIT_ZIP_CODE = Pattern.compile( "^\\d{5}-\\d{4}" );
+
+ /**
+  * Checks whether the text from currentPosition up to nextNonPostalCodeChar is a postal code
+  * of the form ddddd-dddd (5 digits, a dash, 4 digits).
+  *
+  * @param currentPosition       index in text of the first character of the candidate
+  * @param text                  text being tokenized
+  * @param nextNonPostalCodeChar exclusive end index of the candidate, negative if there is none
+  * @return 10 if the candidate is such a postal code, otherwise -1
+  */
+ static private int lenIfIsPostalCode( final int currentPosition, final String text, final int nextNonPostalCodeChar ) {
+    if ( nextNonPostalCodeChar < 0 || nextNonPostalCodeChar - currentPosition != 10 ) {
+       return -1;
+    }
+    final String zipCode = text.substring( currentPosition, nextNonPostalCodeChar );
+    // The dash must be the 6th character of the candidate itself.
+    // Searching the whole text would only work for a postal code at the very start of the text.
+    if ( zipCode.indexOf( DASH ) != 5 ) {
+       return -1;
+    }
+    int digitCount = 0;
+    for ( char c : zipCode.toCharArray() ) {
+       if ( isDigit( c ) ) {
+          digitCount++;
+       }
+    }
+    // 10 characters with the first dash at index 5 and 9 digits means everything but the dash is a digit
+    if ( digitCount != 9 ) {
+       return -1;
+    }
+    return 10;
+ }
+
+ // Accepted shapes: 4-5555 555-1212 1-507-555-1212 , 507-555-1212 02-2348-2192
+ static private final Pattern PHONE_PATTERN
+       = Pattern.compile( "^(\\d-\\d{4})|(\\d{3}-\\d{4})|((\\d-)?\\d{3}-\\d{3}-\\d{4})|(\\d{2}-\\d{4}-\\d{4})$" );
+
+ /**
+  * Checks whether the text from currentPosition up to nextNonTelephoneNumberChar is a telephone number.
+  *
+  * @param currentPosition            index in text of the first character of the candidate
+  * @param text                       text being tokenized
+  * @param nextNonTelephoneNumberChar exclusive end index of the candidate
+  * @return the length of the candidate if all of it matches PHONE_PATTERN, -1 if it does not,
+  * or nextNonTelephoneNumberChar itself if that is negative
+  */
+ static private int lenIfIsTelephoneNumber( int currentPosition, String text, int nextNonTelephoneNumberChar ) {
+    if ( nextNonTelephoneNumberChar < 0 ) {
+       return nextNonTelephoneNumberChar;
+    }
+    final String candidate = text.substring( currentPosition, nextNonTelephoneNumberChar );
+    return PHONE_PATTERN.matcher( candidate ).matches() ? candidate.length() : -1;
+ }
+
+ static private int getLenToNextNonDigit( String s, int startingPosition ) {
+ char ch;
+ int i = 0;
+ while ( startingPosition + i < s.length() ) {
+ ch = s.charAt( startingPosition + i );
+ if ( !isDigit( ch ) ) {
+ return i;
+ }
+ i++;
+ }
+ return s.length() - startingPosition;
+ }
+
+
+
+
+ /**
+  * Checks for an ellipsis without copying the remainder of the text.
+  *
+  * @param currentPosition index in textSegment at which to look
+  * @param textSegment     text being tokenized
+  * @return true if ELLIPSIS starts at currentPosition; false otherwise, also for an index outside the text
+  */
+ static private boolean isEllipsis( final int currentPosition, final String textSegment ) {
+    return textSegment.startsWith( ELLIPSIS, currentPosition );
+ }
+
+
+ static private final String[] NAME_STARTING_WITH_APOSTROPHE = { "'assad", "'awarta", "'ashira", };
+
+ static private int getLengthIfNameStartingWithApostrophe( final int currentPosition, final String lowerCaseText ) {
+ final String textLowerCased = lowerCaseText.substring( currentPosition );
+ if ( textLowerCased.length() == 1 ) {
+ return -1; // if no more chars after the apostrophe, it's a 1-char token
+ }
+ if ( !isLetter( lowerCaseText.charAt( currentPosition + 1 ) ) ) {
+ return -1;
+ }
+ // Could be the start of a quoted string like "'The boy ran', she said" or could be the start of a name like 'Assad
+ for ( String s : NAME_STARTING_WITH_APOSTROPHE ) {
+ if ( textLowerCased.startsWith( s ) ) {
+ return s.length();
+ }
+ }
+ return -1;
+ }
+
+
+ static private int getLengthIfIsNumberThatStartsWithPeriod( int currentPosition, String textSegment ) {
+ int len = textSegment.length() - currentPosition;
+ if ( len < 2 ) {
+ return -1;
+ }
+ int index = currentPosition + 1;
+ char ch = textSegment.charAt( index );
+ if ( !isDigit( ch ) ) {
+ return -1;
+ }
+ index++;
+ while ( index < currentPosition + len ) {
+ ch = textSegment.charAt( index );
+ if ( !isDigit( ch ) ) {
+ return index - currentPosition;
+ }
+ index++;
+ }
+
+ return len; // all rest were digits
+ }
+
+
+ /**
+  * Determines the length of an abbreviation such as e.g. or A.D. that starts at currentPosition.
+  * Assumes no white space between currentPosition and afterEndOfInputToConsider.
+  * If the last character of the text is a period, then that period is not included with the abbreviation,
+  * it is counted as punctuation.
+  * That way we don't have to differentiate between "mg." being an abbreviation and "me." being simply
+  * the end of a sentence.
+  * Only letters and periods are accepted: any other character found before a period ends the check with -1.
+  * Text that starts with "www." is never treated as an abbreviation.
+  *
+  * @param currentPosition index in mixedCaseText of the first character of the candidate
+  * @param mixedCaseText text being tokenized
+  * @param afterEndOfInputToConsider exclusive end index of the candidate
+  * @return the length of the abbreviation, or -1 if the candidate is not an abbreviation
+  */
+ static private int lenIfIsAbbreviation( int currentPosition, String mixedCaseText, int afterEndOfInputToConsider ) {
+ // Determine if all up to endOfInputToConsider contains at least 1 letter and ends with period
+ // Note input is known to contain at least 1 letter or otherwise would have already been determined to be a number
+ boolean containsLetter = false;
+ // consider as single abbreviation things like e.g. but for things like
+ // www.nlm.nih.gov (without the http) count as separate tokens
+ if ( afterEndOfInputToConsider - currentPosition >= 4 &&
+ mixedCaseText.substring( currentPosition, currentPosition + 4 ).toLowerCase().equals( "www." ) ) {
+ return -1;
+ }
+ for ( int i = currentPosition; i < afterEndOfInputToConsider; i++ ) {
+ char ch = mixedCaseText.charAt( i );
+ // character after the current one, a blank if the current one is the last of the candidate
+ char peekAhead;
+ if ( i + 1 < afterEndOfInputToConsider ) {
+ peekAhead = mixedCaseText.charAt( i + 1 );
+ } else {
+ peekAhead = ' ';
+ }
+
+ if ( isLetter( ch ) ) {
+ containsLetter = true;
+ } else if ( ch != PERIOD ) { // if any symbol is found before the period, not considering it an abbreviation
+ return -1;
+ } else if ( !containsLetter || (i + 1 == mixedCaseText.length()) ) {
+ return -1; // no letter, or last character of sentence is this period, in which case period is end of sentence marker, not part of abbreviation
+ } else { // is a period and there was a letter before it and this period is not last char in sentence
+ // If before the period there are alphanums with at least one letter, and we are
+ // not at the end of the sentence, consider the period to be part of the preceding
+ // If there are more alphanums after, also terminated by period, include that too
+ // like in A.D. or e.g.
+ // soFar is the length up to and including this period
+ int soFar = (i + 1 - currentPosition);
+ int len = lenIfIsAbbreviation( i + 1, mixedCaseText, afterEndOfInputToConsider );
+ // If what's after the period satisfies abbreviation definition itself
+ if ( len > 0 ) {
+ return (soFar + len);
+ }
+ // else len<=0 and so what's after the period is not more abbreviation
+
+ if ( Character.isWhitespace( peekAhead ) || isPossibleFinalPunctuation( peekAhead ) ) {
+ // "e.g. edema" does have the abbreviation e.g. within it
+ return soFar;
+ } else if ( !isLetterOrDigit( peekAhead ) ) { // "e.g.[1]" does have the abbreviation e.g. within it
+ // NOTE(review): soFar - 1 leaves out the period at index i - confirm that this is intended
+ return soFar - 1;
+ }
+
+ // "e.g.abc" is not an abbreviation because the abc follows the . immediately
+ return -1; // period is end of sentence or is between alphanums
+
+ }
+ }
+
+ // No period found - just all letters
+ return -1;
+
+ }
+
+ // Punctuation characters that may directly follow the final period of an abbreviation
+ static private final String POSSIBLE_FINAL_PUNCTUATION = "?!:";
+
+ /**
+  * @param c character to check
+  * @return true if c is one of the characters in POSSIBLE_FINAL_PUNCTUATION
+  */
+ static private boolean isPossibleFinalPunctuation( char c ) {
+    final int index = POSSIBLE_FINAL_PUNCTUATION.indexOf( c );
+    return index >= 0;
+ }
+
+ static private final String VALID_OTHER_EMAIL_ADDRESS_CHARACTERS = "!#$%&'*+/=?^_`{|}~-";
+ // those that can be used without quoting or escaping them
+
+ /**
+ * Assumes no white space between currentPosition and endOfInputToConsider
+ *
+ * @param currentPosition
+ * @param lowerCasedText
+ * @param endOfInputToConsider
+ * @return
+ */
+ static private int lenIfIsEmailAddress( int currentPosition, String lowerCasedText, int endOfInputToConsider ) {
+
+ int maxLenLocalPart = 64;
+ int maxTotalLen = 320;
+ int len = -1;
+ // (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]
+ // @
+ // (?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])
+
+ char AT = '@';
+ char PERIOD = '.'; // as String not char
+ int indexOfAt = lowerCasedText.substring( currentPosition, endOfInputToConsider ).indexOf( AT );
+ if ( indexOfAt < 1 || currentPosition + indexOfAt + 1 == endOfInputToConsider || indexOfAt >
+ maxLenLocalPart ) { // '@' can't be the first character, but must be present, and can't be last char
+ // if no @ sign, or not in a valid position, don't bother doing anything more complicated, can't be an email address
+ return -1;
+ }
+
+
+ // @see http://tools.ietf.org/html/rfc3696#section-3
+
+ // ignoring quoted or escape chars
+ // ignoring ability to use IP address in square brackets for domain part
+
+ // First validate the local part (the part before the @ sign)
+ //String localPart = textSegment.substring(currentPosition, currentPosition+indexOfAt);
+ for ( int i = currentPosition; i < currentPosition + indexOfAt; i++ ) {
+ char ch = lowerCasedText.charAt( i );
+ CharSequence cs = lowerCasedText.subSequence( i, i + 1 );
+ if ( !isLetterOrDigit( ch ) && !VALID_OTHER_EMAIL_ADDRESS_CHARACTERS.contains( cs ) ) {
+ return -1;
+ }
+ if ( ch == PERIOD && (i == currentPosition ||
+ i == currentPosition + indexOfAt - 1) ) { // first and last of local name can't be period
+ return -1;
+ }
+ }
+
+ char prev = '@';
+ // The local part appears to be the right format for a valid email address, validate the domain part
+ for ( int i = currentPosition + indexOfAt + 1; i < endOfInputToConsider; i++ ) {
+ char ch = lowerCasedText.charAt( i );
+ //CharSequence cs = textSegment.subSequence(i, i+1);
+ if ( isLetterOrDigit( ch ) ) {
+ ; // fine, continue
+ } else if ( ch == HYPHEN_OR_MINUS_SIGN || ch == PERIOD ) {
+ // either stop one earlier, or error, or include at least one more char
+ // Is there at least one more valid character?
+ if ( i + 1 < endOfInputToConsider && isLetterOrDigit( lowerCasedText.charAt( i + 1 ) ) ) {
+ ; // keep going
+ } else if ( isLetterOrDigit( prev ) ) {
+ return i - currentPosition - 1;
+ } else {
+ return -1;
+ }
+ } else { //something else that ends the token, like an exclamation point
+ if ( isLetterOrDigit( prev ) ) {
+ return i - currentPosition - 1;
+ } else {
+ return -1;
+ }
+ }
+ }
+
+ len = endOfInputToConsider - currentPosition;
+ if ( len > maxTotalLen ) {
+ return -1;
+ }
+ return len;
+ }
+
+
+ // Schemes that mark the start of a url, for instance http://host:port/path?search#fragment
+ // or mailto:joe@example.com
+ static private final String[] URL_PREFIXES = { "http://", "https://", "ftp://", "mailto:" };
+
+ /**
+  * Checks whether the text from currentPosition up to endOfInputToConsider is a url.
+  *
+  * @param currentPosition      index in lowerCasedText of the first character of the candidate
+  * @param lowerCasedText       text being tokenized
+  * @param endOfInputToConsider exclusive end index of the candidate
+  * @return the length of the candidate if it starts with one of URL_PREFIXES and is longer than
+  * that prefix, otherwise -1
+  */
+ static private int lenIfIsUrl( final int currentPosition, final String lowerCasedText, final int endOfInputToConsider ) {
+    final String candidate = lowerCasedText.substring( currentPosition, endOfInputToConsider );
+    final int candidateLength = candidate.length();
+    for ( int i = 0; i < URL_PREFIXES.length; i++ ) {
+       final String prefix = URL_PREFIXES[ i ];
+       // a prefix with nothing after it is not a url
+       if ( candidateLength > prefix.length() && candidate.startsWith( prefix ) ) {
+          return candidateLength;
+       }
+    }
+    return -1;
+ }
+
+
+ /*
+ * Find the index of the first character of the next token, where
+ * the index is >= startPosition, and the previous token ended at
+ * startPosition-1 (or there was no previous token for the 1st time)
+ * Returns -1 if there are no more tokens (eof or all white space
+ * but no newlines)
+ */
+ static public int findFirstCharOfNextToken( final String lowerCaseText, final int startPosition ) {
+ for ( int i = startPosition; i < lowerCaseText.length(); i++ ) {
+ // find a non-whitespace character
+ if ( !isWhitespace( lowerCaseText.charAt( i ) ) ) {
+ // the only token that can start with whitespace is a NewlineToken
+ return i;
+ }
+ }
+ // reached end of line
+ return -1;
+ }
+
+
+
+ /**
+ * @return s.length() or index of nonalphanumeric character
+ * Note does NOT return -1 if the rest are all alphanumeric, returns s.length in that case
+ * Returns -1 if s == null. returns s.length() if fromIndex is too big
+ */
+ static public int findNextNonAlphaNum( final String lowerCaseText, final int fromIndex ) {
+ if ( lowerCaseText == null ) {
+ throw new IndexOutOfBoundsException( "s==null, fromIndex = " + fromIndex );
+ }
+ for ( int i = fromIndex; i < lowerCaseText.length(); i++ ) {
+ if ( !Character.isLetterOrDigit( lowerCaseText.charAt( i ) ) ) {
+ return i;
+ }
+ }
+ return lowerCaseText.length();
+ }
+
+ /**
+  * Checks whether s starts with compareTo as a complete word part, for example 'tis in "'tis true"
+  * but not in "'tissue".
+  *
+  * @param s         text to check
+  * @param compareTo the required start of s
+  * @return true if s starts with compareTo and either that is all of s
+  * or the next character is not a letter
+  */
+ static private boolean startsWithWithoutBeingFollowedByLetter( String s, String compareTo ) {
+    if ( !s.startsWith( compareTo ) ) {
+       return false;
+    }
+    if ( s.length() == compareTo.length() ) {
+       return true;
+    }
+    // a letter right behind compareTo means that s only starts with a longer word
+    final char next = s.charAt( compareTo.length() );
+    return !isLetter( next );
+ }
+
+ // Copied isPunctuation from edu.mayo.bmi.nlp.tokenizer.Tokenizer
+ // Returns true if c is contained in PUNCTUATION_LOOKUP.
+ // NOTE(review): PUNCTUATION_LOOKUP is declared elsewhere in this class, presumably a collection of Character - confirm
+ static private boolean isPunctuation( final char c ) {
+ return PUNCTUATION_LOOKUP.contains( c );
+ }
+
+
+
+
+ /**
+ * Determine if the text starting at 'position' within 'text' is the start of a
+ * contraction such as "should've" or "hasn't" or "it's" by looking at whether
+ * there is a letter before the apostrophe, and the appropriate letters after the
+ * apostrophe (or in the case of "n't", verify the letter before is an 'n').
+ * Note that if the text starting at 'position' is something like "n't" which
+ * isn't a complete word, returns null.
+ * The first apostrophe at or after 'position' is the one examined; the endings that are
+ * recognized are those listed in POSSIBLE_CONTRACTION_ENDINGS.
+ *
+ * @param position first char of next token
+ * @param nextNonLetterDigit index supplied by the caller (presumably the next character that is neither
+ * letter nor digit); null is returned unless it equals the index of the apostrophe
+ * @param lowerCaseText text into which parameter position is an index into
+ * @return null if no contraction is recognized, otherwise a LengthPair whose root length is
+ * the length of the WordToken part of the contraction and whose suffix length is the length of the
+ * contraction ending. Note the root length is not always the position of the
+ * apostrophe. For example, for can't, which is tokenized as ca n't the
+ * root length is 2 and the suffix length is 3. For "it's", the root length is also 2 and the suffix length is 2.
+ * @see #lenOfFirstTokenInContraction for handling contractions like "cannot" that don't have an apostrophe
+ */
+ public static LengthPair getLengthIfNextApostIsMiddleOfContraction( final int position, final int nextNonLetterDigit,
+ final String lowerCaseText ) {
+ if ( position < 0 ) {
+ return null;
+ }
+ if ( lowerCaseText.length() < position + 3 ) {
+ return null; // need at least one letter after the apostrophe and one before ('tis and 'twas handled elsewhere)
+ }
+ final int apostrophePosition = lowerCaseText.indexOf( APOSTROPHE, position );
+ // System.out.println("getLengthIfNextApostIsMiddleOfContraction: " + position + " " + nextNonLetterDigit + " " + lowerCasedText);
+
+ // if a token break is found before the apostrophe or no apostrophe found
+ // or there is no character after the apostrophe (out of input)
+ // or no letter before the apostrophe or no letter before "n't"
+ if ( nextNonLetterDigit != apostrophePosition ) {
+ return null;
+ }
+ // NOTE(review): startsWith( "n't" ) tests the beginning of the whole text, not the text at 'position' -- confirm intended
+ if ( apostrophePosition < 1
+ || apostrophePosition >= lowerCaseText.length() - 1
+ || lowerCaseText.startsWith( "n't" ) ) {
+ return null;
+ }
+ // First just check the one character after the apostrophe before we start checking in more detail
+ // because we can rule out a lot of things this way
+ final String letterAfterApostrophe = lowerCaseText.substring( apostrophePosition + 1, apostrophePosition + 2 );
+ if ( !LETTERS_AFTER_APOSTROPHE_FOR_MIDDLE_OF_CONTRACTION.contains( letterAfterApostrophe ) ) {
+ return null;
+ }
+ // the apostrophe plus the run of characters after it, up to the index returned by findNextNonAlphaNum
+ final int subseqentNonAlphaNum = findNextNonAlphaNum( lowerCaseText, apostrophePosition + 1 );
+ final String restStartingWithApostrophe = lowerCaseText.substring( apostrophePosition, subseqentNonAlphaNum );
+ // "'n he could do" or 'n or 've or 'll or 't
+ final char prev = lowerCaseText.charAt( apostrophePosition - 1 ); // needed for checking for "n't"
+
+ // Try each known ending in order; the first one that matches determines the split
+ for ( String s : POSSIBLE_CONTRACTION_ENDINGS ) {
+ int lenAfterApostrophe = s.length() - 1; // don't count the apostrophe itself
+ if ( s.equals( "n't" ) ) {
+ lenAfterApostrophe--; // adjust for the "n" in "n't"
+ }
+ if ( lowerCaseText.length() < apostrophePosition + lenAfterApostrophe ) {
+ continue; // not enough text for this POSSIBLE_CONTRACTION_ENDINGS to be a match
+ }
+
+ // if (s.equals("'t")) { // different in that the contraction token includes character before the apostrophe
+ // if (rest.equals(s) && (prev=='n' || prev=='N')) {
+ // throw new UnsupportedOperationException("the n't case is supposed to be handled elsewhere");
+ // //return text.length()-3; // TBD how to tell it to go back 1 for the n't case? -- don't -- the n't case is handled elsewhere
+ // } else {
+ // continue loop;
+ // }
+ // }
+
+ // if exact match with rest (end of sentence)
+ if ( s.equals( "n't" ) && prev == 'n' && lowerCaseText.charAt( apostrophePosition + 1 ) == 't'
+ && lowerCaseText.length() == apostrophePosition + 1 + 1 ) {
+ // n't : the 'n' before the apostrophe belongs to the suffix, so the root is one shorter and the suffix is 3 long
+ return new LengthPair( apostrophePosition - 1 - position, 3 );
+ } else if ( restStartingWithApostrophe.equals( s ) ) {
+ return new LengthPair( apostrophePosition - position, s.length() );
+ }
+ // there's at least one character after, check that it isn't a letter or number, which would be part of the same token
+ // and would mean the apostrophe wasn't part of a contraction after all. for example "he'dr. smith" and "can'they" are
+ // more likely the end of quoted sentences and the start of a new sentence than a misspelled contraction
+
+ // we checked exact match above.
+ // If same length as exact match but not an exact match, done with this one, go on
+ if ( lowerCaseText.length() == apostrophePosition + lenAfterApostrophe + 1 ) {
+ continue; // if not an exact match but has same length as exact match would, then not the right one
+ }
+
+ // 'after' is the character following the candidate ending, or the NUL character when the index is out of range.
+ // NOTE(review): 'position' is an offset into lowerCaseText while restStartingWithApostrophe begins at the
+ // apostrophe -- confirm that indexing it with 'position' is intended
+ char after;
+ if ( restStartingWithApostrophe.length() <= position + lenAfterApostrophe + 1 ) {
+ after = '\00';
+ } else {
+ after = restStartingWithApostrophe.charAt( position + lenAfterApostrophe + 1 );
+ }
+ if ( restStartingWithApostrophe.startsWith( s )
+ && Character.isLetter( prev ) && !Character.isLetter( after ) ) {
+ // there was at least one letter before the apostrophe and after the apostrophe, and non letter after the contraction
+ return new LengthPair( apostrophePosition - position, s.length() );
+ } else if ( s.equals( "n't" ) && prev == 'n' && restStartingWithApostrophe.startsWith( "'t" )
+ && !Character.isLetter( after ) ) {
+ // n't
+ return new LengthPair( apostrophePosition - 1 - position, 3 );
+ }
+ }
+ // none of the known endings matched
+ return null;
+ }
+
+
+ /**
+ * Looks up the length of the first token of a word that is split like a contraction
+ * even though it has no apostrophe, such as "cannot".
+ *
+ * @param s word to look up in WORD_LENGTH_PAIRS
+ * @return the root length registered for s, or -1 if s is not registered
+ */
+ static int lenOfFirstTokenInContraction( String s ) {
+    final LengthPair lengths = WORD_LENGTH_PAIRS.get( s );
+    return lengths == null ? -1 : lengths.getRootLength();
+ }
+
+ /**
+ * Looks up the length of the second token of a word that is split like a contraction
+ * even though it has no apostrophe, such as "cannot".
+ *
+ * @param s word to look up in WORD_LENGTH_PAIRS
+ * @return the suffix length registered for s, or -1 if s is not registered
+ */
+ static int lenOfSecondTokenInContraction( String s ) {
+    final LengthPair lengths = WORD_LENGTH_PAIRS.get( s );
+    return lengths == null ? -1 : lengths.getSuffixLength();
+ }
+
+ /**
+ * Computes the length of the third token of a word that is split like a contraction
+ * even though it has no apostrophe. The third token is whatever remains of the word after
+ * the first (root) and second (suffix) tokens registered in WORD_LENGTH_PAIRS, so it is 0 for
+ * two-token words such as "cannot" (3 + 3), 2 for "whaddya" (3 + 2 + 2) and 3 for "whatcha" (3 + 1 + 3).
+ *
+ * @param s word to look up in WORD_LENGTH_PAIRS
+ * @return number of characters of s left after the first and second tokens, or -1 if s is not registered
+ */
+ static int lenOfThirdTokenInContraction( String s ) {
+    final LengthPair lengthPair = WORD_LENGTH_PAIRS.get( s );
+    if ( lengthPair == null ) {
+       return -1;
+    }
+    // remainder = whole word minus root minus suffix (subtracting the suffix twice miscounts, e.g. "whatcha")
+    return s.length() - lengthPair.getRootLength() - lengthPair.getSuffixLength();
+ }
+
+
+
+// Find the 3 characters that are the next possible token breaks (look for next 3 whitespace, punctuation, but (*) count contiguous whitespace as one)
+// We are most interested in those that sometimes cause a split and sometimes don't -- apostrophes and hyphens.
+// 80's-esque should be one token according to the 2 rules. (potential break characters for that example
+// are the apostrophe, the hyphen, and the whitespace)
+// salon-o-torium should be one token, with a single contraction token and a single word token (potential
+// break characters for that example are hyphen hyphen whitespace)
+
+
+// Cases where first non alphanum is an apostrophe:
+// 1st nonalphanum 2nd nonalphanum 3rd nonalphanum
+
+// apostr hyphen apostr take at most up to 3rd break
+// apostr hyphen hyphen test for -o-torium, otherwise take at most up to 3rd break (ignore case of o-torium followed by something more meaningful)
+// apostr hyphen whtspc take at most up to 3rd break
+// apostr hyphen other take at most up to 3rd break
+
+// apostr apostr any take at most up to 2nd break
+
+// apostr whtspc any* take at most up to 2nd break
+
+// apostr other any take at most up to 2nd break
+
+// Note that an exception prefix between apostrophe and 1st hyphen does not avoid the break at the hyphen
+// so the logic is not something that can be broken down into just looking at the not splitting at the
+// apostrophe followed by normal hyphen processing.
+
+
+ private static String[] FULL_APOSTROPHE_WORDS = { "p'yongyang", };
+
+ static private final Collection<String> FULL_APOSTROPHE_WORDS_LOOKUP
+ = new HashSet<>( Arrays.asList( FULL_APOSTROPHE_WORDS ) );
+
+ /**
+ * Assumes apostrophe is not first character.... that case is handled elsewhere
+ * Assumes <code>s</code> is lower case.
+ */
+ static private boolean breakAtApostrophe( final String lowerCaseText, final int positionOfApostropheToTest ) {
+ if ( lowerCaseText.length() == positionOfApostropheToTest + 1 ) {
+ return true; // James'
+ }
+ if ( positionOfApostropheToTest == 0 ) {
+ throw new UnsupportedOperationException( "positionOfApostropheToTest==0" );
+ }
+ // First check for things like 80's that are all digits followed by 's and immediately
+ // after the s there can't be an alphanum
+ if ( allDigits( lowerCaseText.substring( 0, positionOfApostropheToTest ) )
+ && lowerCaseText.charAt( positionOfApostropheToTest + 1 ) == 's' ) {
+ if ( lowerCaseText.length() < positionOfApostropheToTest + 3 ) {
+ return false; // 80's<end_of_input>
+ }
+ // Check that after the 's there aren't more letters or digits which would be unknown like 'st or 's2
+ // and therefore don't want to assume ' should be kept together with rest.
+ final char after = lowerCaseText.charAt( positionOfApostropheToTest + 2 );
+ return Character.isLetterOrDigit( after );
+ }
+ // if not one of the exceptions above, break at the apostrophe
+ return !FULL_APOSTROPHE_WORDS_LOOKUP.contains( lowerCaseText );
+ }
+
+ /**
+ * @param lowerCaseText text to test, may be null
+ * @return true if the text has at least one character and every character is a digit
+ */
+ static private boolean allDigits( final String lowerCaseText ) {
+    if ( lowerCaseText == null || lowerCaseText.isEmpty() ) {
+       return false;
+    }
+    for ( int i = 0; i < lowerCaseText.length(); i++ ) {
+       if ( !Character.isDigit( lowerCaseText.charAt( i ) ) ) {
+          return false;
+       }
+    }
+    return true;
+ }
+
+
+ /**
+ * for a word like 80's or P'yongyang or James' or Sean's or 80's-like or 80's-esque
+ * (or can't or haven't, which are to be split)
+ * determine whether the singlequote(apostrophe)
+ * needs to be kept with the surrounding letters/numbers
+ * and what to do about hyphenated afterwards if there is a hyphen after....
+ * For possessives, do split.
+ * Note that things that start with an apostrophe like 'Assad were handled elsewhere
+ *
+ * @return len of how much to keep: len to apostrophe, or to next breaking char (the space after s for "80's ") or end of hyphenated suffix that should also remain attached, or -1
+ */
+ static private int tokenLengthCheckingForSingleQuoteWordsToKeepTogether( final String lowerCasedText ) {
+ if ( lowerCasedText == null ) {
+ throw new UnsupportedOperationException( "no quote/apostrophe char found in (null)" );
+ }
+ final int firstBreak = lowerCasedText.indexOf( APOSTROPHE );
+ if ( firstBreak < 0 ) {
+ throw new UnsupportedOperationException( "no quote/apostrophe char found in '" + lowerCasedText + "'" );
+ }
+ if ( firstBreak == 0 ) {
+ return -1;
+ }
+ if ( firstBreak + 1 == lowerCasedText.length() ) {
+ return firstBreak;
+ }
+ if ( breakAtApostrophe( lowerCasedText, firstBreak ) ) {
+ return firstBreak;
+ }
+ // else going to keep at least past the apostrophe, but if there's a hyphenated word or a hyphenated suffix that should not be split,
+ // keep that much too
+ final int secondBreak = findNextNonAlphaNum( lowerCasedText, firstBreak + 1 );
+ if ( secondBreak == lowerCasedText.length() ) {
+ return secondBreak; // no more text, must stop here
+ }
+ // See if there are hyphenated suffix(es) that should also remain attached
+ if ( lowerCasedText.charAt( secondBreak ) != HYPHEN_OR_MINUS_SIGN ) {
+ return secondBreak;
+ }
+ // have to determine whether to keep the hyphen and how many hyphens
+ // 80's-esque
+ final int len = lenIfHyphenatedSuffix( lowerCasedText, secondBreak );
+ if ( len > 0 ) {
+ return secondBreak + len;
+ }
+ return secondBreak;
+ }
+
+
+
+ // Words without an apostrophe that are nevertheless split into several tokens.
+ // Each value holds the length of the first token and the length of the second token.
+ static private final Map<String,LengthPair> WORD_LENGTH_PAIRS = createWordLengthPairs();
+
+ // Builds the lookup of multi-token words; called once while the class is initialized
+ static private Map<String,LengthPair> createWordLengthPairs() {
+    final Map<String,LengthPair> pairs = new HashMap<>( 7 );
+    pairs.put( "cannot", new LengthPair( 3, 3 ) );
+    pairs.put( "gonna", new LengthPair( 3, 2 ) );
+    pairs.put( "gotta", new LengthPair( 3, 2 ) );
+    pairs.put( "lemme", new LengthPair( 3, 2 ) );
+    pairs.put( "wanna", new LengthPair( 3, 2 ) );
+    pairs.put( "whaddya", new LengthPair( 3, 2 ) );
+    pairs.put( "whatcha", new LengthPair( 3, 1 ) );
+    return pairs;
+ }
+
+
+ // Endings that mark the second token of a contraction containing an apostrophe.
+ // more'n *n't // for can't and shouldn't etc.
+ // Every entry starts with the apostrophe except "n't", whose token also takes the 'n' before the apostrophe.
+ static private final String[] POSSIBLE_CONTRACTION_ENDINGS = { "'s", "'ve", "'re", "'ll", "'d", "'n", "n't" };
+ // note 't is different in that n't is the contraction token
+ // The characters that may directly follow the apostrophe in the endings above; used as a quick pre-check.
+ static private final String LETTERS_AFTER_APOSTROPHE_FOR_MIDDLE_OF_CONTRACTION = "svrldnt";
+
+
+ // Contractions whose first character is the apostrophe
+ static private final String[] CONTRACTIONS_STARTING_WITH_APOSTROPHE = { "'tis", "'twas", };
+
+ /**
+ * Checks whether the text at the given position begins with one of
+ * CONTRACTIONS_STARTING_WITH_APOSTROPHE, as judged by startsWithWithoutBeingFollowedByLetter.
+ *
+ * @param currentPosition index within textSegment at which to look
+ * @param textSegment text being tokenized
+ * @return true if one of the apostrophe-first contractions matches at currentPosition
+ */
+ static boolean isContractionThatStartsWithApostrophe( final int currentPosition, final String textSegment ) {
+    final String remainder = textSegment.substring( currentPosition );
+    boolean found = false;
+    // stop at the first contraction that matches
+    for ( int i = 0; i < CONTRACTIONS_STARTING_WITH_APOSTROPHE.length && !found; i++ ) {
+       found = startsWithWithoutBeingFollowedByLetter( remainder, CONTRACTIONS_STARTING_WITH_APOSTROPHE[ i ] );
+    }
+    return found;
+ }
+
+
+ // Hyphenated interjections and affixes in the following list are not split into multiple tokens.
+ // For example, uh-oh and e-mail are both single tokens: uh-oh, e-mail.
+
+ // hyphenated prefixes to not split
+ // Each entry is lower case and includes its trailing hyphen.
+ static private final String[] HYPHENATED_PREFIXES = {
+ "e-",
+ "a-",
+ "u-",
+ "x-",
+ "agro-",
+ "ante-",
+ "anti-",
+ "arch-",
+ "be-",
+ "bi-",
+ "bio-",
+ "co-",
+ "counter-",
+ "cross-",
+ "cyber-",
+ "de-",
+ "eco-",
+ "ex-",
+ "extra-",
+ "inter-",
+ "intra-",
+ "macro-",
+ "mega-",
+ "micro-",
+ "mid-",
+ "mini-",
+ "multi-",
+ "neo-",
+ "non-",
+ "over-",
+ "pan-",
+ "para-",
+ "peri-",
+ "post-",
+ "pre-",
+ "pro-",
+ "pseudo-",
+ "quasi-",
+ "re-",
+ "semi-",
+ "sub-",
+ "super-",
+ "tri-",
+ "ultra-",
+ "un-",
+ "uni-",
+ "vice-",
+ // From email from Colin Warner <co...@ldc.upenn.edu> on 7/25/2010
+ "electro-",
+ // NOTE(review): "gasto-" may have been meant as "gastro-" -- confirm against the original list
+ "gasto-",
+ "homo-",
+ "hetero-",
+ "ortho-",
+ "phospho-",
+ };
+ // Set view of HYPHENATED_PREFIXES for constant-time membership tests
+ static private final Collection<String> HYPHENATED_PREFIXES_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_PREFIXES ) );
+
+ // hyphenated suffixes to not split
+ // Each entry is lower case and includes its leading hyphen; "-o-torium" is the only entry with two hyphens.
+ static private final String[] HYPHENATED_SUFFIXES
+ = { "-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most", "-o-torium", "-rama", "-wise" };
+ // Set view of HYPHENATED_SUFFIXES for constant-time membership tests
+ static private final Collection<String> HYPHENATED_SUFFIXES_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_SUFFIXES ) );
+
+ // complete words including hyphen
+ static private final String[] HYPHENATED_WORDS = { "mm-hm", "mm-mm", "o-kay", "uh-huh", "uh-oh" };
+ // Set view of HYPHENATED_WORDS for constant-time membership tests
+ static private final Collection<String> HYPHENATED_WORDS_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_WORDS ));
+
+ // Same character as the DASH and HYPHEN_OR_MINUS_SIGN constants declared at the top of this class
+ static private final char MINUS_OR_HYPHEN = '-';
+
+ // Characters treated as punctuation by isPunctuation
+ static private final Character[] PUNCTUATION = { ';', ':', ',', '.', '(', ')', '[', ']', '{', '}', '<', '>', '\'',
+ '"', '/', '\\', '-' };
+ // Set view of PUNCTUATION for constant-time membership tests
+ static private final Collection<Character> PUNCTUATION_LOOKUP = new HashSet<>( Arrays.asList( PUNCTUATION ));
+
+ // The same characters as PUNCTUATION, held in a single String
+ static private final String PUNCTUATION_2 = ";:,.()[]{}<>\'\"/\\-";
+
+
+
+ /**
+ * Works out how much of a hyphenated term should stay together as one token.
+ * There is a fixed list of hyphenated words that are not split (HYPHENATED_WORDS_LOOKUP), and
+ * affixes that stay attached. Some made-up examples of words using such affixes:
+ * chronic-itis 1 suffix
+ * mega-huge 1 prefix
+ * e-game-fest 1 prefix and 1 suffix
+ * salon-o-torium 1 suffix that contains 2 hyphens
+ * urban-esque-wise 2 suffixes
+ * <p/>
+ * The first hyphen, the break after it and, when there is one, the break after that are located;
+ * lenIncludingHyphensToKeep then decides how far the token extends. Two hyphens are considered
+ * only when the second break is itself a hyphen (needed for "-o-torium" and prefix plus suffix);
+ * for an apostrophe, whitespace or any other character at the second break only the first hyphen is considered.
+ * The result does not imply a split at a later hyphen; the remaining text needs to be checked again.
+ *
+ * @param lowerCaseString lower case text containing at least one hyphen
+ * @return number of characters to keep together; the index of the first hyphen when the text ends
+ * with that hyphen (mega- by itself is mega and -); -1 if the hyphen is the first character
+ * @throws UnsupportedOperationException if the text is null or contains no hyphen
+ */
+ public static int tokenLengthCheckingForHyphenatedTerms( final String lowerCaseString ) {
+    if ( lowerCaseString == null ) {
+       throw new UnsupportedOperationException( "no hyphen found in (null)" );
+    }
+    final int hyphenIndex = lowerCaseString.indexOf( MINUS_OR_HYPHEN );
+    if ( hyphenIndex < 0 ) {
+       throw new UnsupportedOperationException( "no hyphen found in '" + lowerCaseString + "'" );
+    }
+    if ( hyphenIndex == 0 ) {
+       return -1;
+    }
+    final int textLength = lowerCaseString.length();
+    if ( hyphenIndex + 1 == textLength ) {
+       // trailing hyphen is not part of the token
+       return hyphenIndex;
+    }
+    final int secondBreak = findNextNonAlphaNum( lowerCaseString, hyphenIndex + 1 );
+    if ( secondBreak == textLength ) {
+       // nothing after the second part; only question is whether to split at the first hyphen
+       return lenIncludingHyphensToKeep( lowerCaseString, hyphenIndex, 1, secondBreak, -1 );
+    }
+    final int thirdBreak = findNextNonAlphaNum( lowerCaseString, secondBreak + 1 );
+    // hyphen hyphen: test for -o-torium or prefix + suffix, may keep up to the third break
+    // hyphen followed by apostrophe, whitespace or anything else: keep at most up to the second break
+    final int hyphensToConsider = lowerCaseString.charAt( secondBreak ) == MINUS_OR_HYPHEN ? 2 : 1;
+    return lenIncludingHyphensToKeep( lowerCaseString, hyphenIndex, hyphensToConsider, secondBreak, thirdBreak );
+ }
+
+ /**
+ * Decides how many leading characters of a hyphenated term stay together, given the already located breaks.
+ * If there is 1 hyphen: prefix, suffix, or word like uh-oh
+ * If there are 2 hyphens: o-torium or prefix and suffix like mega-huge-esque
+ * The checks run in a fixed order and the first one that matches wins: two-hyphen suffix, one-hyphen
+ * suffix (optionally followed by a second suffix), hyphenated word plus suffix, prefix plus hyphenated word,
+ * prefix (optionally plus suffix), hyphenated word alone.
+ *
+ * @param s text to examine; compared against the lower case lookup lists
+ * @param indexOfFirstHyphen index of the first hyphen in s
+ * @param numberOfHyphensToConsiderKeeping 1 or 2
+ * @param secondBreak index of the break following the first hyphen (may equal the length of s)
+ * @param thirdBreak index of the break following secondBreak; the caller in this class passes -1 when there is none
+ * @return thirdBreak, secondBreak or indexOfFirstHyphen, depending on how much is to be kept together
+ * @throws UnsupportedOperationException if numberOfHyphensToConsiderKeeping is not 1 or 2
+ */
+ private static int lenIncludingHyphensToKeep( final String s, final int indexOfFirstHyphen,
+ final int numberOfHyphensToConsiderKeeping,
+ final int secondBreak, final int thirdBreak ) {
+ String possibleSuffix;
+ boolean lookup;
+ if ( numberOfHyphensToConsiderKeeping > 2 || numberOfHyphensToConsiderKeeping < 1 ) {
+ throw new UnsupportedOperationException(
+ "Not ready to handle numberOfHyphensToConsiderKeeping = " + numberOfHyphensToConsiderKeeping );
+ }
+ // FIRST CONSIDER suffixes
+ // Of the suffixes, first check those that have 2 hyphens (-o-torium)
+ if ( numberOfHyphensToConsiderKeeping == 2 ) {
+ possibleSuffix = s.substring( indexOfFirstHyphen, thirdBreak );
+ lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+ if ( lookup ) {
+ return thirdBreak;
+ }
+ }
+
+ // Now either numberOfHyphensToConsiderKeeping==1 or was ==2 but no 2-hyphen suffix was found to match
+ // Try one-hyphen suffixes, either just 1 of them or 2 of them (but not yet checking for 1 with a prefix too, see prefixes section below for that...
+ possibleSuffix = s.substring( indexOfFirstHyphen, secondBreak );
+ lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+ if ( lookup ) { // First hyphen is start of a suffix that should not be split off
+ // could be numberOfHyphensToConsiderKeeping==1 here, or could be 2 separate 1-hyphen suffixes are both
+ // used, so do need to check for a second suffix....
+ // Check if a second 1-hyphen suffix
+ // (the comparison also skips this when thirdBreak is -1, i.e. there is no third break)
+ if ( thirdBreak > secondBreak ) {
+ possibleSuffix = s.substring( secondBreak, thirdBreak );
+ lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+ if ( lookup ) {
+ return thirdBreak; // 2 1-hyphen suffixes that all should be kept together
+ }
+ }
+ return secondBreak; // just 1 1-hyphen suffix from the list of exceptions
+ }
+
+ // Now consider HYPHENATED_WORDS_LOOKUP plus a suffix such as uh-oh-X
+ if ( numberOfHyphensToConsiderKeeping > 1 ) {
+ String possibleHyphenatedWordsLookupMatch = s.substring( 0, secondBreak );
+ possibleSuffix = s.substring( secondBreak, thirdBreak );
+ lookup = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch ) &&
+ HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+ if ( lookup ) {
+ return thirdBreak;
+ }
+ }
+
+
+ // NOW CONSIDER prefixes
+
+ // the candidate prefix includes the hyphen itself, matching the entries of HYPHENATED_PREFIXES_LOOKUP
+ String possiblePrefix = s.substring( 0, indexOfFirstHyphen + 1 );
+
+ lookup = HYPHENATED_PREFIXES_LOOKUP.contains( possiblePrefix );
+
+ // First consider prefix + one of the HYPHENATED_WORDS_LOOKUP
+ // Do this before considering just prefix so we get both if both are present.
+ if ( lookup && numberOfHyphensToConsiderKeeping > 1 ) {
+ String possibleHyphenatedWordsLookupMatch = s.substring( indexOfFirstHyphen + 1, thirdBreak ); // e.g. uh-oh
+ boolean lookup2 = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch );
+ if ( lookup2 ) {
+ return thirdBreak;
+ }
+ }
+
+ if ( numberOfHyphensToConsiderKeeping == 1 ) {
+ if ( lookup ) {
+ return secondBreak;
+ }
+ }
+
+ if ( numberOfHyphensToConsiderKeeping == 2 ) {
+ if ( lookup ) { // a prefix was found that should not be split
+ // check for a one-hyphen suffix to go with the one-hyphen prefix
+ possibleSuffix = s.substring( secondBreak, thirdBreak );
+ boolean lookup2 = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+ if ( lookup2 ) {
+ return thirdBreak; // both a prefix and a suffix that are not to be split, keep all together
+ }
+ return secondBreak; // just a prefix that should not be split, split before second hyphen
+ } else { // not a prefix, or is a prefix that should be split
+ // Already checked for a 2-hyphen suffix without a prefix
+ // And already checked for word like uh-oh with a suffix
+ // And apparently neither of those, so don't check anything else, fall through to next check
+ //String m = "This condition checked already in other if-else " + indexOfFirstHyphen + COMMA + secondBreak + COMMA + thirdBreak + COMMA + s;
+ //throw new UnsupportedOperationException(m);
+ }
+
+ }
+
+ // Finally consider just HYPHENATED_WORDS_LOOKUP, without an affix, such as "uh-oh"
+ String possibleHyphenatedWordsLookupMatch = s.substring( 0, secondBreak );
+ lookup = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch );
+ if ( lookup ) {
+ return secondBreak;
+ }
+
+ return indexOfFirstHyphen; // if the first hyphen is not eligible to keep, keep just up to it.
+ }
+
+ // if character at position is a hyphen and starts a hyphenated suffix that is an exception
+ // and should not be split from the rest of teh word, return length of the suffix
+ // return -1 if not an exception suffix
+ static int lenIfHyphenatedSuffix( String lowerCasedString, int position ) {
+ lowerCasedString = lowerCasedString.toLowerCase();
+ int next = findNextNonAlphaNum( lowerCasedString, position + 1 );
+ String possibleSuffix = lowerCasedString.substring( position, next );
+ if ( lowerCasedString.substring( position ).startsWith( "-o-" ) ) { // check for -o-torium
+ next = findNextNonAlphaNum( lowerCasedString, position + 3 );
+ possibleSuffix = lowerCasedString.substring( position, next );
+ }
+ boolean lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+
+ if ( lookup ) {
+ return possibleSuffix.length();
+ }
+ return -1;
+ }
+
+
+
+
+
+
+
+
+
+
+
+ /**
+ * Immutable pair of token lengths describing how a contraction is split:
+ * the length of the leading (root) token and the length of the token that follows it (suffix).
+ */
+ static private class LengthPair {
+ final private int __rootLength;
+ final private int __suffixLength;
+
+ /**
+ * @param rootLength length of the first token
+ * @param suffixLength length of the second token
+ */
+ private LengthPair( final int rootLength, final int suffixLength ) {
+ __rootLength = rootLength;
+ __suffixLength = suffixLength;
+ }
+ /**
+ * @return length of the first token
+ */
+ public int getRootLength() {
+ return __rootLength;
+ }
+ /**
+ * @return length of the second token
+ */
+ public int getSuffixLength() {
+ return __suffixLength;
+ }
+ }
+
+
+
+}
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,116 @@
+package org.apache.ctakes.dictionarytool.util.token;
+
+import static java.lang.Character.isLetter;
+import static java.lang.Character.isDigit;
+import static java.lang.Character.isWhitespace;
+
+/**
+* Scans text from a starting position up to the next whitespace (or the end of the text) and records
+* the index of the first character of several kinds along the way. An index that was never
+* encountered during the scan is set to the index where the scan stopped, except nextNonDigit,
+* which stays at -1 when the text runs out without whitespace before any non-digit is seen.
+*
+* @author SPF , chip-nlp
+* @version %I%
+* @since 7/22/2015
+*/
+final public class TokenIndexHolder {
+   // true until a character that is neither letter nor digit is found before a whitespace
+   boolean obviouslyIsWord = true;
+   // true until a character that is not a digit is found before a whitespace
+   boolean obviouslyIsNumber = true;
+   // index of the whitespace that stopped the scan, or the text length
+   int nextWhitespaceOrEndOfSegment = -1;
+   // index of the first character that is neither letter nor digit
+   int nextNonLetterOrNonDigit = -1;
+   // index of the first character that is neither letter, digit nor apostrophe
+   int nextNonLetterDigitApostrophe = -1;
+   // index of the first letter, or first symbol other than dash (aka hyphen) and apostrophe
+   int nextNonTelephoneOrPostalChar = -1;
+   // index of the first letter, or first symbol other than comma, period and apostrophe;
+   // 9,876.012345 is an example with all the numeric chars
+   int nextNonNumericChar = -1;
+   // index of the first character that is not a digit
+   int nextNonDigit = -1;
+
+   /**
+    * @param lowerCasedText text to scan
+    * @param currentPosition index at which the scan starts
+    */
+   public TokenIndexHolder( final String lowerCasedText, final int currentPosition ) {
+      final int textLength = lowerCasedText.length();
+      for ( int index = currentPosition; index < textLength; index++ ) {
+         final char c = lowerCasedText.charAt( index );
+         if ( isWhitespace( c ) ) {
+            setAsWhitespace( index );
+            break;
+         }
+         if ( isLetter( c ) ) {
+            setAsCharacter( index );
+         } else if ( !isDigit( c ) ) {
+            // not whitespace, not letter, not digit, therefore symbol.
+            // Scanning continues so that the next whitespace is still located.
+            setAsSymbol( c, index );
+         }
+         // digits change none of the flags or indices
+      }
+      if ( nextWhitespaceOrEndOfSegment < 0 ) {
+         // ran off the end of the text without meeting whitespace
+         initializeAsLastToken( textLength );
+      }
+   }
+
+   // Whitespace ends everything: any index still unknown becomes the whitespace index
+   private void setAsWhitespace( final int i ) {
+      nextWhitespaceOrEndOfSegment = i;
+      nextNonLetterOrNonDigit = keepIfSet( nextNonLetterOrNonDigit, i );
+      nextNonLetterDigitApostrophe = keepIfSet( nextNonLetterDigitApostrophe, i );
+      nextNonDigit = keepIfSet( nextNonDigit, i );
+      nextNonTelephoneOrPostalChar = keepIfSet( nextNonTelephoneOrPostalChar, i );
+      nextNonNumericChar = keepIfSet( nextNonNumericChar, i );
+   }
+
+   // A symbol rules out "obviously word" and "obviously number". An apostrophe only marks the
+   // non-digit and non-letter-or-digit indices; other symbols also mark the remaining indices they do not belong to.
+   private void setAsSymbol( final char c, final int i ) {
+      obviouslyIsWord = false;
+      obviouslyIsNumber = false;
+      nextNonDigit = keepIfSet( nextNonDigit, i );
+      nextNonLetterOrNonDigit = keepIfSet( nextNonLetterOrNonDigit, i );
+      if ( c == TextTokenizerCtakesPTB.APOSTROPHE ) {
+         return;
+      }
+      nextNonLetterDigitApostrophe = keepIfSet( nextNonLetterDigitApostrophe, i );
+      if ( !isTelephoneNumberChar( c ) ) {
+         nextNonTelephoneOrPostalChar = keepIfSet( nextNonTelephoneOrPostalChar, i );
+      }
+      if ( !isNumericChar( c ) ) {
+         nextNonNumericChar = keepIfSet( nextNonNumericChar, i );
+      }
+   }
+
+   // A letter rules out "obviously number" (scientific notation is not taken into account).
+   // A letter is never a telephone or numeric character, so those indices are marked unconditionally.
+   private void setAsCharacter( final int i ) {
+      obviouslyIsNumber = false;
+      nextNonDigit = keepIfSet( nextNonDigit, i );
+      nextNonTelephoneOrPostalChar = keepIfSet( nextNonTelephoneOrPostalChar, i );
+      nextNonNumericChar = keepIfSet( nextNonNumericChar, i );
+   }
+
+   // End of text reached: any index still unknown becomes the text length (nextNonDigit is not touched here)
+   private void initializeAsLastToken( final int textLength ) {
+      nextWhitespaceOrEndOfSegment = keepIfSet( nextWhitespaceOrEndOfSegment, textLength );
+      nextNonLetterOrNonDigit = keepIfSet( nextNonLetterOrNonDigit, textLength );
+      nextNonLetterDigitApostrophe = keepIfSet( nextNonLetterDigitApostrophe, textLength );
+      nextNonTelephoneOrPostalChar = keepIfSet( nextNonTelephoneOrPostalChar, textLength );
+      nextNonNumericChar = keepIfSet( nextNonNumericChar, textLength );
+   }
+
+   /**
+    * @param c character to test
+    * @return true if in 0123456789-
+    */
+   static private boolean isTelephoneNumberChar( final char c ) {
+      return c == '-' || isDigit( c );
+   }
+
+   /**
+    * @param c character to test
+    * @return true if in ,.0123456789
+    */
+   static private boolean isNumericChar( final char c ) {
+      return c == ',' || c == '.' || isDigit( c );
+   }
+
+   /**
+    * @param currentValue index recorded so far, negative when not yet known
+    * @param index index to use when nothing has been recorded yet
+    * @return currentValue if it is already known (not negative), otherwise index
+    */
+   static private int keepIfSet( final int currentValue, final int index ) {
+      return currentValue >= 0 ? currentValue : index;
+   }
+}
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,64 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Logger;
+
+/**
+ * Writes a bar-separated-value file with one line per combination of cui, tui and text.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/16/2015
+ */
+final public class CuiTuiTextsMapWriter {
+
+   static private final Logger LOGGER = Logger.getLogger( "CuiTuiTextsMapWriter" );
+
+
+   // utility class, not to be instantiated
+   private CuiTuiTextsMapWriter() {
+   }
+
+   /**
+    * Writes one line for every text of every cui, repeated for each tui registered for that cui.
+    * Cuis that have no tuis are reported at severe level and skipped. An IOException is logged, not
+    * propagated; the number of lines counted so far is logged in either case.
+    *
+    * @param bsvFilePath path of the file to write
+    * @param validCuisAndTuis tui codes keyed by cui code
+    * @param cuiTexts texts keyed by cui code
+    */
+   static public void writeCuiTuiTexts( final String bsvFilePath,
+                                        final HashSetMap<Long, Integer> validCuisAndTuis,
+                                        final HashSetMap<Long, String> cuiTexts ) {
+      System.out.println( "Writing map of Cuis and Tuis and Texts to " + bsvFilePath );
+      long lineCount = 0;
+      // try-with-resources so the writer is closed even when a write fails part way through
+      try ( BufferedWriter writer = FileUtil.createWriter( bsvFilePath ) ) {
+         for ( Map.Entry<Long, Set<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
+            final Long code = cuiTextsEntry.getKey();
+            final String cui = CuiTuiUtil.getAsCui( code );
+            final Collection<Integer> tuiCodes = validCuisAndTuis.get( code );
+            if ( tuiCodes == null ) {
+               LOGGER.severe( "No Tuis for " + code );
+               continue;
+            }
+            for ( Integer tuiCode : tuiCodes ) {
+               final String tui = CuiTuiUtil.getAsTui( tuiCode );
+               for ( String text : cuiTextsEntry.getValue() ) {
+                  lineCount++;
+                  writer.write( TokenUtil.createBsvLine( cui, tui, text ) );
+                  writer.newLine();
+                  // progress report for large dictionaries
+                  if ( lineCount % 100000 == 0 ) {
+                     LOGGER.info( "File Line " + lineCount );
+                  }
+               }
+            }
+         }
+      } catch ( IOException ioE ) {
+         // pass the exception along so the cause of the failure is not lost
+         LOGGER.log( java.util.logging.Level.SEVERE,
+                     "Error writing Term on line " + lineCount + " in file " + bsvFilePath, ioE );
+      }
+      LOGGER.info( "Wrote " + lineCount + " terms to " + bsvFilePath );
+   }
+
+
+
+}