You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2015/09/16 19:43:50 UTC

svn commit: r1703438 [2/2] - in /ctakes/sandbox/dictionarytool/src: META-INF/ org/apache/ctakes/dictionarytool/ org/apache/ctakes/dictionarytool/reader/ org/apache/ctakes/dictionarytool/util/ org/apache/ctakes/dictionarytool/util/token/ org/apache/ctak...

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,1316 @@
+package org.apache.ctakes.dictionarytool.util.token;
+
+import java.util.*;
+import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static java.lang.Character.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/15/2015
+ */
+public class TextTokenizerCtakesPTB {
+
+   // Logger registered under the simple class name.
+   static private final Logger LOGGER = Logger.getLogger( "TextTokenizerCtakesPTB" );
+
+
+   // Sentinel meaning "no token length has been determined yet".
+   static private final int NOT_SET_INDICATOR = Integer.MIN_VALUE;
+
+
+   // Characters that receive special treatment while tokenizing.
+   // DASH and HYPHEN_OR_MINUS_SIGN are the same character under two names.
+   static private final char DASH = '-';
+   // Not declared private, so it is visible to the rest of the package.
+   static  final char APOSTROPHE = '\'';
+   static private final char PERIOD = '.';
+   static private final char HYPHEN_OR_MINUS_SIGN = '-';
+   static private final char NEWLINE = '\n';
+   static private final char CR = '\r';
+   static private final char COMMA = ',';
+   // Three consecutive periods are kept together as a single token.
+   static private final String ELLIPSIS = "...";
+
+
+   /**
+    * Tokenizes the given text and joins the resulting tokens with single spaces.
+    *
+    * @param text the text to tokenize; may be null
+    * @return the tokens separated by single spaces; an empty string if text is null,
+    * or the text itself if it is empty
+    */
+   static public String getTokenizedText( final String text ) {
+      if ( text == null ) {
+         // tokenizeTextSegment tolerates null, so this entry point does too instead of throwing
+         return "";
+      }
+      if ( text.isEmpty() ) {
+         return text;
+      }
+      final List<String> tokens = tokenizeTextSegment( text );
+      final StringBuilder sb = new StringBuilder();
+      for ( String token : tokens ) {
+         // place the separator between tokens so that nothing needs to be trimmed afterwards
+         if ( sb.length() > 0 ) {
+            sb.append( ' ' );
+         }
+         sb.append( token );
+      }
+      return sb.toString();
+   }
+
+
+   /**
+    * Tokenize text that starts at offset offsetAdjustment within the complete text
+    *
+    * @param textSegment               the text to tokenize
+    * @return the list of new tokens
+    */
+   static public List<String> tokenizeTextSegment( final String textSegment ) {
+      if ( textSegment == null || textSegment.trim().isEmpty() ) {
+         return Collections.emptyList();
+      }
+
+      final List<String> tokenList = new ArrayList<>();
+      final String lowerCaseText = textSegment.trim().toLowerCase();
+      // find first character of a token
+      int currentPosition = findFirstCharOfNextToken( lowerCaseText, 0 );
+      while ( currentPosition >= 0 ) {
+         // get current character and the one after that, which is used in making a number
+         // of decisions. if at the end of the input, use '\0' to represent the non-existent
+         // character after the current one just to avoid dealing with null
+         int tokenLength = NOT_SET_INDICATOR;
+         if ( currentPosition + 1 >= lowerCaseText.length()
+              || isWhitespace( lowerCaseText.charAt( currentPosition + 1 ) ) ) {
+            // we found the start of a token, but it was the last character in the input,
+            // so it is a 1-character token
+            //
+            // else we have at least 2 characters to consider
+            // Since the following character is whitespace, and the current character
+            // is the first character of a token, the current character is a one-character token
+            tokenLength = 1;
+            // add the token created
+            tokenList.add( lowerCaseText.substring( currentPosition, currentPosition + tokenLength ) );
+            currentPosition = findFirstCharOfNextToken( lowerCaseText, currentPosition + tokenLength );
+            continue;
+         }
+         final char firstCharOfToken = lowerCaseText.charAt( currentPosition );
+         final int symbolSuffixLength = getSymbolSuffixLength( firstCharOfToken, lowerCaseText, currentPosition );
+         if ( symbolSuffixLength >= 0 ) {
+            tokenLength = symbolSuffixLength;
+         } else if ( isPunctuation( firstCharOfToken ) ) { // other than any handled above
+            // Already handled minus sign and leading period (which could be part of a decimal)
+            // Since not processing 'web-text', no need to look for things like :)
+            // so is some type of 1-character punctuation token
+            tokenLength = 1;
+         } else if ( isLetterOrDigit( firstCharOfToken ) ) {
+            tokenLength = getLetterOrDigitTokenLength( tokenList, lowerCaseText, currentPosition, tokenLength );
+         } else { // some other symbol or punctuation not included in isPunctuation
+            // Since not processing 'web-text', no need to look for things like :)
+            // so it is some type of 1-character symbol token
+            tokenLength = 1;
+         }
+         // add the token created
+         if ( tokenLength < 0 ) {
+            throw new RuntimeException( "tokenLength = " + tokenLength + " currentPosition = " + currentPosition );
+         }
+         tokenList.add( lowerCaseText.substring( currentPosition, currentPosition + tokenLength ) );
+         currentPosition = findFirstCharOfNextToken( lowerCaseText, currentPosition + tokenLength );
+      }
+      return tokenList;
+   }
+
+
+   static private int getSymbolSuffixLength( final char firstCharOfToken,
+                                             final String lowerCasedText, final int currentPosition ) {
+      switch ( firstCharOfToken ) {
+         case PERIOD: {
+            // check if decimal number without the leading digits
+            final int numberLength = getLengthIfIsNumberThatStartsWithPeriod( currentPosition, lowerCasedText );
+            if ( numberLength > 0 ) {
+               return numberLength;
+            }
+            if ( isEllipsis( currentPosition, lowerCasedText ) ) {
+               return ELLIPSIS.length();
+            }
+            // Abbreviation does not start with period, and not part of some other token, so it is punctuation
+            return 1;
+         }
+         case HYPHEN_OR_MINUS_SIGN: {
+            // If it's the first character of a token, then this is not a hyphenated term that
+            // was supposed to be kept as one token, or we would have included it in the previous token
+            // Also telephone numbers do not start with a dash
+            // So assume the hyphen/dash char is a one-character token like in 5-6 or in -400
+            return 1;
+         }
+         case APOSTROPHE: {
+            // "can't" is not part of this case because the n is the start of the second token
+            // The 've part of should've is not handled here, when something like should've or he'll
+            // is found, 2 tokens are created (elsewhere)
+            // Check if start of a Name
+            final int nameLength = getLengthIfNameStartingWithApostrophe( currentPosition, lowerCasedText );
+            if ( nameLength > 0 ) {
+               return nameLength;
+            }
+            if ( isContractionThatStartsWithApostrophe( currentPosition, lowerCasedText ) ) {
+               // 'tis and 'twas which get tokenized as  "'t is"  and  "'t was"
+               return 2;
+               // the "is" or "was" part will become a token on the next iteration
+            }
+            // is separate punctuation mark
+            return 1;
+         }
+      }
+      return -1;
+   }
+
+
+   // todo : mine from huge loopo above
+   static private int getLetterOrDigitTokenLength( final List<String> tokenList,
+                                                   final String textSegment, final int currentIndex,
+                                                   final int currentTokenLength ) {
+      final String lowerCasedText = textSegment.toLowerCase();
+      int tokenLength = currentTokenLength;
+      // First check the easy case - if just letters and digits until next whitespace (or until end of segment)
+      // then that is a word or a number, can skip all the other logic to check for +hyphens
+      // or contractions etc
+      final TokenIndexHolder tokenIndexHolder = new TokenIndexHolder( lowerCasedText, currentIndex );
+
+      int currentPosition = currentIndex;
+      if ( tokenIndexHolder.obviouslyIsNumber ) {
+         tokenLength = tokenIndexHolder.nextWhitespaceOrEndOfSegment - currentPosition;
+      } else if ( tokenIndexHolder.obviouslyIsWord ) {
+         // Check for things like "cannot" and "gonna" that appear to be one token but
+         // are supposed to be more than one according to PTB rules.
+         final String lowerCasedSubstring = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+         int len = lenOfFirstTokenInContraction( lowerCasedSubstring );
+         if ( len > 0 ) {
+         // is a contraction that doesn't contain an apostrophe, like "gonna", create WordToken for first part,
+            // and create ContractionToken for other token(s)
+            tokenLength = len;
+            tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+            currentPosition += tokenLength; // currentPosition
+            len = lenOfSecondTokenInContraction( lowerCasedSubstring );
+            tokenLength = len;
+            len = lenOfThirdTokenInContraction( lowerCasedSubstring );
+            if ( len > 0 ) { // if there is a 3rd, create the 2nd and set up for the 3rd to be created later
+               tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+               currentPosition += tokenLength; // currentPosition
+               tokenLength = len;
+            }
+         } else {
+            tokenLength = tokenIndexHolder.nextWhitespaceOrEndOfSegment - currentPosition;
+         }
+      } else {
+         // Still within the "isLetterOrDigit(firstCharOfToken)" but not obviously number or word
+         int len;
+
+         // Not sure what the token is, the token could extend to
+         // include all to the end of an email address,
+         // or include all to the end of a URL,
+         // or include all to the end of a URL,
+         // or through the next period (for an abbreviation)
+         // or to the next hyphen,
+         // or beyond,
+         // or to the next whitespace (note already handle case of all alphanums to whitespace
+         // or to the end of input (note already handle case of all alphanums to end of input
+         // or the next apostrophe (for a most contractions)
+         // or until "n't" for such contractions
+         // or the next other punctuation symbol
+         // or beyond (for 80's)
+         // or could include some punctuation like 3,245.51
+
+         // Need to check for things like 80's before checking for contractions or else 80's looks like a contraction
+         if ( tokenIndexHolder.nextNonLetterOrNonDigit < lowerCasedText.length()
+              && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterOrNonDigit ) == APOSTROPHE ) {
+            String lowerCasedSubstring = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+            len = tokenLengthCheckingForSingleQuoteWordsToKeepTogether( lowerCasedSubstring );
+            if ( len > tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition ) {
+               // if keeping the apostrophe attached
+               tokenLength = len;
+            }
+            // else let contraction checking later determine what to do
+         }
+         if ( tokenLength == NOT_SET_INDICATOR ) { // not found yet
+            final LengthPair lengthPair
+                  = getLengthIfNextApostIsMiddleOfContraction( currentPosition, tokenIndexHolder.nextNonLetterOrNonDigit, lowerCasedText );
+            if ( lengthPair != null ) {
+               len = lengthPair.getRootLength();
+               tokenLength = len;
+               char c = lowerCasedText.charAt( currentPosition + len );
+               if ( c == 'n' || c == APOSTROPHE ) {
+                    // if a "n't" contraction or a contraction where contraction token starts with '
+                  if ( tokenLength < 0 ) {
+                     throw new RuntimeException( "c = " + c + "tokenLength = " + tokenLength
+                                                 + " currentPosition = " + currentPosition );
+                  }
+                  // First create the WordToken (no apostrophe)
+                  if ( tokenLength > 0 ) {
+                     tokenList.add( lowerCasedText.substring( currentPosition, currentPosition + tokenLength ) );
+                     currentPosition += tokenLength;
+                  }
+                  // Set up to create the second token, for other contractions, the next token will start with an
+                  // apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
+                  // so just go ahead and handle it here instead of having to keep track of previous
+                  // and handle n't in next loop.
+                  tokenLength = lengthPair.getSuffixLength();
+               } else {
+                  throw new RuntimeException(
+                        "ERROR: getLengthIfNextApostIsMiddleOfContraction returned " + len + " but the character (" +
+                        c + ") after that is not 'n' or apostrophe " );
+               }
+            } else {
+               len = getCodificationLength( currentPosition, lowerCasedText,
+                     tokenIndexHolder.nextNonTelephoneOrPostalChar, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+               if ( len > 0 ) {
+                  tokenLength = len;
+               } else {
+
+                  // Still within the "isLetterOrDigit(firstCharOfToken)".
+                  // not obviously a word or number (already checked those)
+                  // and not Url, EmailAddress, or Abbreviation
+                  // There could be a hyphen before the next white space,
+                  // or a symbol before the next whitespace
+                  // or apostrophe like in 80's or P'yongyang (one token each) or James' or Ted's (2 tokens each)
+                  // Take alphanums, but consider hyphenated words and names with apostrophes
+                  // and consider tele numbers and postal codes
+
+                  if ( tokenIndexHolder.nextNonLetterOrNonDigit < lowerCasedText.length()
+                       && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterOrNonDigit ) == HYPHEN_OR_MINUS_SIGN ) {
+                     // telephone numbers and postal codes handled above already
+                     final String lowerCasedSubstring
+                           = lowerCasedText.substring( currentPosition, tokenIndexHolder.nextWhitespaceOrEndOfSegment );
+                     len = tokenLengthCheckingForHyphenatedTerms( lowerCasedSubstring );
+                     tokenLength = len;
+                     if ( tokenLength < 0 ) {
+                        throw new RuntimeException(
+                              "tokenLength = " + tokenLength + " currentPosition = " + currentPosition +
+                              " nextNonLetterOrNonDigit = " + tokenIndexHolder.nextNonLetterOrNonDigit );
+                     }
+                  } else if ( tokenIndexHolder.nextNonNumericChar > 0
+                              && (len
+                        = lenIfIsNumberContainingComma( currentPosition, lowerCasedText, tokenIndexHolder.nextNonNumericChar )) >
+                                 0 ) {
+                     tokenLength = len;
+                  } else if ( tokenIndexHolder.nextNonLetterDigitApostrophe < lowerCasedText.length()
+                              && lowerCasedText.charAt( tokenIndexHolder.nextNonLetterDigitApostrophe ) == PERIOD ) {
+                     // see if is a number with a decimal place (without commas, comma-containing numbers are handled above)
+                     if ( tokenIndexHolder.nextNonDigit == lowerCasedText.length() - 1 ) {
+                        // end of sentence, don't include the period as part of the number, count it as end of sentence marker (punctuation)
+                        tokenLength = tokenIndexHolder.nextNonDigit - currentPosition;
+                        //if (tokenLength<1) throw new RuntimeException("Period at end of sentence " + nextNonDigit + " " + nextNonLetterDigitApostrophe+" "+tokenLength+ " " + lowerCasedText);
+                     } else if ( tokenIndexHolder.nextNonLetterDigitApostrophe == tokenIndexHolder.nextNonDigit ) {
+                        // if not end of sentence, do include period (decimal point) in the NumToken
+                        tokenLength = tokenIndexHolder.nextNonDigit + 1 + getLenToNextNonDigit( lowerCasedText, tokenIndexHolder.nextNonDigit + 1 ) -
+                                      currentPosition;
+                     } else {
+                        // something like 2J3. which is not a number or 2'3.
+                        tokenLength = tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition;
+                     }
+                  } else {
+                     // breaking character is not - character and not ' character, so stop there
+                     tokenLength = tokenIndexHolder.nextNonLetterOrNonDigit - currentPosition;
+                  }
+                  //} else {
+                  //    throw new UnsupportedOperationException("nextNonLetterDigitApostrophe = " + nextNonLetterDigitApostrophe);
+                  //}
+               }
+            }
+         }
+      }
+      return currentPosition;
+   }
+
+
+
+   /**
+    * Tries, in this order, to recognise a telephone number, a postal code, a url, an email address
+    * and an abbreviation starting at currentPosition. The first check that reports a positive length wins,
+    * and later checks are not evaluated.
+    *
+    * @param currentPosition              index of the first character of the candidate token
+    * @param lowerCasedText               the lower-cased text being tokenized
+    * @param nextNonTelephoneOrPostalChar end index handed to the telephone number and postal code checks
+    * @param nextWhitespaceOrEndOfSegment end index handed to the url, email address and abbreviation checks
+    * @return the positive length reported by the first successful check, or -1 if none succeeded
+    */
+   static private int getCodificationLength( final int currentPosition, final String lowerCasedText,
+                                         final int nextNonTelephoneOrPostalChar,
+                                         final int nextWhitespaceOrEndOfSegment ) {
+      int length = lenIfIsTelephoneNumber( currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar );
+      if ( length <= 0 ) {
+         length = lenIfIsPostalCode( currentPosition, lowerCasedText, nextNonTelephoneOrPostalChar );
+      }
+      if ( length <= 0 ) {
+         length = lenIfIsUrl( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+      }
+      if ( length <= 0 ) {
+         length = lenIfIsEmailAddress( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+      }
+      if ( length <= 0 ) {
+         length = lenIfIsAbbreviation( currentPosition, lowerCasedText, nextWhitespaceOrEndOfSegment );
+      }
+      return length > 0 ? length : -1;
+   }
+
+   /**
+    * Measures a number whose whole-number part contains at least one comma, such as 4,000,153 or 3,456.56
+    * A comma is only accepted as part of the number when it is followed by exactly three digits.
+    * If no comma is found between currentPosition and nextNonNumericChar then -1 is returned,
+    * so plain numbers such as 5 or 5.5 are not matched by this method.
+    *
+    * @param currentPosition    index in text at which the candidate number starts
+    * @param text               the text being tokenized
+    * @param nextNonNumericChar exclusive end index of the region to examine
+    * @return the length of the number, or -1 if there is no comma-containing number at currentPosition
+    */
+   static private int lenIfIsNumberContainingComma( int currentPosition, String text, int nextNonNumericChar ) {
+      final String s = text.substring( 0, nextNonNumericChar ); // use substring so don't search until end of entire document
+      final int commaPosition = s.indexOf( COMMA, currentPosition );
+      if ( commaPosition < 0 || commaPosition > nextNonNumericChar ) {
+         return -1;
+      }
+      int len = -1;
+
+      // the whole number part ends at the first period, or at the end of the examined region if there is no period
+      final int periodPosition = s.indexOf( PERIOD, currentPosition );
+      int endOfWholeNumberPart = periodPosition;
+      if ( endOfWholeNumberPart < 0 ) {
+         endOfWholeNumberPart = s.length();
+      }
+      // the whole number part can contain commas as long as there are exactly 3 digits after each comma
+      if ( commaPosition == 0 || commaPosition > endOfWholeNumberPart ) {
+         return -1; // if comma is start or appears after the decimal point, then no commas in the whole-number-part
+      }
+      int position = commaPosition;
+
+      boolean didNotFindExactlyThreeDigitsAfterComma = false;
+
+      // Each pass records the length accepted so far, then tries to accept one more ",ddd" group.
+      // The loop stops at the first group that is not a comma followed by exactly three digits.
+      while ( !didNotFindExactlyThreeDigitsAfterComma ) {
+         len = position - currentPosition; // don't include the comma unless also can include next 3 digits
+         if ( position < endOfWholeNumberPart && s.charAt( position ) == COMMA ) {
+            position++;
+         }
+         for ( int i = 0; i < 3; i++ ) { // 3 digits after the comma if comma is part of a number
+            if ( position < endOfWholeNumberPart && isDigit( s.charAt( position ) ) ) {
+               position++;
+            } else {
+               didNotFindExactlyThreeDigitsAfterComma = true;
+            }
+         }
+         if ( position < endOfWholeNumberPart && isDigit( s.charAt( position ) ) ) {
+            // can't have 4 digits after comma like 3,4567
+            didNotFindExactlyThreeDigitsAfterComma = true;
+         }
+      }
+      if ( len <= 0 ) {
+         return -1;
+      }
+      // See if there is a decimal point that can continue the number, such as  3,456.56  or 4,012.
+      // But if the sentences ends with the period that follows the whole_number_part, count it as the sentence marker
+      // not as part of the number
+      if ( periodPosition != text.length() - 1 // not the final period of a sentence
+           && periodPosition == currentPosition + len ) { // but the period does appear right after the whole_number_part
+         len++;
+         // include the digits that follow the decimal point
+         while ( len < nextNonNumericChar - currentPosition && isDigit( s.charAt( currentPosition + len ) ) ) {
+            len++;
+         }
+      }
+      return len;
+   }
+
+
+//   static private final Pattern TEN_DIGIT_ZIP_CODE = Pattern.compile( "^\\d{5}-\\d{4}" );
+
+   /**
+    * Checks whether the 10 characters starting at currentPosition form a postal code
+    * made of five digits, a dash and four digits, such as 55905-0001
+    *
+    * @param currentPosition       index in text at which the candidate postal code starts
+    * @param text                  the text being tokenized
+    * @param nextNonPostalCodeChar exclusive end index of the candidate postal code
+    * @return 10 if the candidate is such a postal code, otherwise -1
+    */
+   static private int lenIfIsPostalCode( final int currentPosition, final String text, final int nextNonPostalCodeChar ) {
+      if ( nextNonPostalCodeChar < 0 || nextNonPostalCodeChar - currentPosition != 10 ) {
+         return -1;
+      }
+      final String zipCode = text.substring( currentPosition, nextNonPostalCodeChar );
+      // The dash must be looked up within the candidate itself. Searching the whole text would only work
+      // if the first dash of the entire text happened to be at absolute index 5.
+      final int dashIndex = zipCode.indexOf( DASH );
+      if ( dashIndex != 5 ) {
+         return -1;
+      }
+      // 10 characters with 9 digits and the first dash at index 5 means the form ddddd-dddd
+      int digitCount = 0;
+      for ( char c : zipCode.toCharArray() ) {
+         if ( isDigit( c ) ) {
+            digitCount++;
+         }
+      }
+      if ( digitCount != 9 ) {
+         return -1;
+      }
+      return 10;
+   }
+
+   // Accepted forms:  4-5555 , 555-1212 , 1-507-555-1212 , 507-555-1212 , 02-2348-2192
+   static private final Pattern PHONE_PATTERN
+         = Pattern.compile( "^(\\d-\\d{4})|(\\d{3}-\\d{4})|((\\d-)?\\d{3}-\\d{3}-\\d{4})|(\\d{2}-\\d{4}-\\d{4})$" );
+
+   /**
+    * Checks whether the characters from currentPosition up to nextNonTelephoneNumberChar are,
+    * as a whole, one of the telephone number forms of PHONE_PATTERN.
+    *
+    * @param currentPosition            index in text at which the candidate telephone number starts
+    * @param text                       the text being tokenized
+    * @param nextNonTelephoneNumberChar exclusive end index of the candidate telephone number
+    * @return the length of the telephone number, -1 if the candidate is not a telephone number,
+    * or nextNonTelephoneNumberChar itself if that value is negative
+    */
+   static private int lenIfIsTelephoneNumber( int currentPosition, String text, int nextNonTelephoneNumberChar ) {
+      if ( nextNonTelephoneNumberChar < 0 ) {
+         return nextNonTelephoneNumberChar;
+      }
+      final String candidate = text.substring( currentPosition, nextNonTelephoneNumberChar );
+      return PHONE_PATTERN.matcher( candidate ).matches()
+             ? candidate.length()
+             : -1;
+   }
+
+   static private int getLenToNextNonDigit( String s, int startingPosition ) {
+      char ch;
+      int i = 0;
+      while ( startingPosition + i < s.length() ) {
+         ch = s.charAt( startingPosition + i );
+         if ( !isDigit( ch ) ) {
+            return i;
+         }
+         i++;
+      }
+      return s.length() - startingPosition;
+   }
+
+
+
+
+   static private boolean isEllipsis( final int currentPosition, final String textSegment ) {
+      return textSegment.substring( currentPosition ).startsWith( ELLIPSIS );
+   }
+
+
+   // Lower-cased names that begin with an apostrophe and are kept together as one token.
+   static private final String[] NAME_STARTING_WITH_APOSTROPHE = { "'assad", "'awarta", "'ashira", };
+
+   /**
+    * An apostrophe could be the start of a quoted string like "'The boy ran', she said"
+    * or could be the start of a name like 'Assad. Only the names listed in
+    * NAME_STARTING_WITH_APOSTROPHE are recognised.
+    *
+    * @param currentPosition index of the apostrophe in lowerCaseText
+    * @param lowerCaseText   the lower-cased text being tokenized
+    * @return the length of the name including its apostrophe, or -1 if no listed name starts at currentPosition
+    */
+   static private int getLengthIfNameStartingWithApostrophe( final int currentPosition, final String lowerCaseText ) {
+      final String remainder = lowerCaseText.substring( currentPosition );
+      // Nothing after the apostrophe, or something other than a letter, means it cannot start a name
+      if ( remainder.length() == 1 || !isLetter( lowerCaseText.charAt( currentPosition + 1 ) ) ) {
+         return -1;
+      }
+      int nameLength = -1;
+      for ( int i = 0; i < NAME_STARTING_WITH_APOSTROPHE.length && nameLength < 0; i++ ) {
+         final String name = NAME_STARTING_WITH_APOSTROPHE[ i ];
+         if ( remainder.startsWith( name ) ) {
+            nameLength = name.length();
+         }
+      }
+      return nameLength;
+   }
+
+
+   static private int getLengthIfIsNumberThatStartsWithPeriod( int currentPosition, String textSegment ) {
+      int len = textSegment.length() - currentPosition;
+      if ( len < 2 ) {
+         return -1;
+      }
+      int index = currentPosition + 1;
+      char ch = textSegment.charAt( index );
+      if ( !isDigit( ch ) ) {
+         return -1;
+      }
+      index++;
+      while ( index < currentPosition + len ) {
+         ch = textSegment.charAt( index );
+         if ( !isDigit( ch ) ) {
+            return index - currentPosition;
+         }
+         index++;
+      }
+
+      return len; // all rest were digits
+   }
+
+
+   /**
+    * Measures an abbreviation such as e.g. or A.D. that starts at currentPosition.
+    * Assumes no white space between currentPosition and afterEndOfInputToConsider
+    * If the last character of the text is a period, then don't include the period with the abbreviation,
+    * count it as punctuation.
+    * That way we don't have to differentiate between "mg." being an abbreviation and "me." being simply
+    * the end of a sentence
+    * Anything that starts with www. is never treated as an abbreviation.
+    * This method calls itself to measure the parts that follow each period.
+    *
+    * @param currentPosition           index of the first character to examine
+    * @param mixedCaseText             the text being tokenized; the www. check lower-cases it itself
+    * @param afterEndOfInputToConsider exclusive end index of the region to examine
+    * @return the length of the abbreviation, or -1 if there is no abbreviation at currentPosition
+    */
+   static private int lenIfIsAbbreviation( int currentPosition, String mixedCaseText, int afterEndOfInputToConsider ) {
+      // Determine if all up to endOfInputToConsider contains at least 1 letter and ends with period
+      // Note input is known to contain at least 1 letter or otherwise would have already been determined to be a number
+      boolean containsLetter = false;
+      // consider as single abbreviation things like e.g. but for things like
+      // www.nlm.nih.gov (without the http) count as separate tokens
+      if ( afterEndOfInputToConsider - currentPosition >= 4 &&
+           mixedCaseText.substring( currentPosition, currentPosition + 4 ).toLowerCase().equals( "www." ) ) {
+         return -1;
+      }
+      for ( int i = currentPosition; i < afterEndOfInputToConsider; i++ ) {
+         char ch = mixedCaseText.charAt( i );
+         // the character after ch, or a space if ch is the last character of the region
+         char peekAhead;
+         if ( i + 1 < afterEndOfInputToConsider ) {
+            peekAhead = mixedCaseText.charAt( i + 1 );
+         } else {
+            peekAhead = ' ';
+         }
+
+         if ( isLetter( ch ) ) {
+            containsLetter = true;
+         } else if ( ch != PERIOD ) { // if any symbol is found before the period, not considering it an abbreviation
+            return -1;
+         } else if ( !containsLetter || (i + 1 == mixedCaseText.length()) ) {
+            return -1; // no letter, or last character of sentence is this period, in which case period is end of sentence marker, not part of abbreviation
+         } else { // is a period and there was a letter before it and this period is not last char in sentence
+            // If before the period there are alphanums with at least one letter, and we are
+            // not at the end of the sentence, consider the period to be part of the preceding
+            // If there are more alphanums after, also terminated by period, include that too
+            // like in A.D. or e.g.
+            int soFar = (i + 1 - currentPosition);
+            int len = lenIfIsAbbreviation( i + 1, mixedCaseText, afterEndOfInputToConsider );
+            // If what's after the period satisfies abbreviation definition itself
+            if ( len > 0 ) {
+               return (soFar + len);
+            }
+            // else len<=0 and so what's after the period is not more abbreviation
+
+            if ( Character.isWhitespace( peekAhead ) || isPossibleFinalPunctuation( peekAhead ) ) {
+               // "e.g. edema" does have the abbreviation e.g. within it
+               return soFar;
+            } else if ( !isLetterOrDigit( peekAhead ) ) { // "e.g.[1]" does have the abbreviation e.g. within it
+               // NOTE(review): the returned length stops before this period - confirm that is intended
+               return soFar - 1;
+            }
+
+            // "e.g.abc" is not an abbreviation because the abc follows the . immediately
+            return -1; // period is end of sentence or is between alphanums
+
+         }
+      }
+
+      // No period found - just all letters
+      return -1;
+
+   }
+
+   // Punctuation marks, other than the period, that are checked for directly after an abbreviation.
+   static private final String POSSIBLE_FINAL_PUNCTUATION = "?!:";
+
+   /**
+    * @param c the character to check
+    * @return true if c is a question mark, an exclamation point or a colon
+    */
+   static private boolean isPossibleFinalPunctuation( char c ) {
+      return POSSIBLE_FINAL_PUNCTUATION.indexOf( c ) >= 0;
+   }
+
+   // Characters other than letters and digits that are accepted in the local part of an email address,
+   // those that can be used without quoting or escaping them
+   static private final String VALID_OTHER_EMAIL_ADDRESS_CHARACTERS = "!#$%&'*+/=?^_`{|}~-";
+         // those that can be used without quoting or escaping them
+
+   /**
+    * Measures an email address that starts at currentPosition.
+    * Assumes no white space between currentPosition and endOfInputToConsider
+    * The local part (before the @ sign) may contain letters, digits and the characters in
+    * VALID_OTHER_EMAIL_ADDRESS_CHARACTERS. The domain part may contain letters, digits, hyphens and periods.
+    *
+    * @param currentPosition      index of the first character of the candidate email address
+    * @param lowerCasedText       the lower-cased text being tokenized
+    * @param endOfInputToConsider exclusive end index of the region to examine
+    * @return the length of the email address, or -1 if there is no email address at currentPosition
+    */
+   static private int lenIfIsEmailAddress( int currentPosition, String lowerCasedText, int endOfInputToConsider ) {
+
+      int maxLenLocalPart = 64;
+      int maxTotalLen = 320;
+      int len = -1;
+      // (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]
+      // @
+      // (?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])
+
+      char AT = '@';
+      char PERIOD = '.'; // local variable that shadows the class-level PERIOD constant of the same value
+      // index of the @ sign relative to currentPosition
+      int indexOfAt = lowerCasedText.substring( currentPosition, endOfInputToConsider ).indexOf( AT );
+      if ( indexOfAt < 1 || currentPosition + indexOfAt + 1 == endOfInputToConsider || indexOfAt >
+                                                                                       maxLenLocalPart ) { // '@' can't be the first character, but must be present, and can't be last char
+         // if no @ sign, or not in a valid position, don't bother doing anything more complicated, can't be an email address
+         return -1;
+      }
+
+
+      // @see http://tools.ietf.org/html/rfc3696#section-3
+
+      // ignoring quoted or escape chars
+      // ignoring ability to use IP address in square brackets for domain part
+
+      // First validate the local part (the part before the @ sign)
+      //String localPart = textSegment.substring(currentPosition, currentPosition+indexOfAt);
+      for ( int i = currentPosition; i < currentPosition + indexOfAt; i++ ) {
+         char ch = lowerCasedText.charAt( i );
+         CharSequence cs = lowerCasedText.subSequence( i, i + 1 );
+         if ( !isLetterOrDigit( ch ) && !VALID_OTHER_EMAIL_ADDRESS_CHARACTERS.contains( cs ) ) {
+            return -1;
+         }
+         // NOTE(review): a period is not a letter or digit and is not in VALID_OTHER_EMAIL_ADDRESS_CHARACTERS,
+         // so the check above already returns for any period and this check cannot be reached.
+         // As written, a local part such as first.last is rejected - confirm whether that is intended.
+         if ( ch == PERIOD && (i == currentPosition ||
+                               i == currentPosition + indexOfAt - 1) ) { // first and last of local name can't be period
+            return -1;
+         }
+      }
+
+      // NOTE(review): prev is never reassigned in the loop below, so isLetterOrDigit( prev ) is always false
+      // and both branches that test it end up returning -1 - confirm whether prev was meant to track
+      // the previous character.
+      char prev = '@';
+      // The local part appears to be the right format for a valid email address, validate the domain part
+      for ( int i = currentPosition + indexOfAt + 1; i < endOfInputToConsider; i++ ) {
+         char ch = lowerCasedText.charAt( i );
+         //CharSequence cs = textSegment.subSequence(i, i+1);
+         if ( isLetterOrDigit( ch ) ) {
+            ; // fine, continue
+         } else if ( ch == HYPHEN_OR_MINUS_SIGN || ch == PERIOD ) {
+            // either stop one earlier, or error, or include at least one more char
+            // Is there at least one more valid character?
+            if ( i + 1 < endOfInputToConsider && isLetterOrDigit( lowerCasedText.charAt( i + 1 ) ) ) {
+               ; // keep going
+            } else if ( isLetterOrDigit( prev ) ) {
+               return i - currentPosition - 1;
+            } else {
+               return -1;
+            }
+         } else { //something else that ends the token, like an exclamation point
+            if ( isLetterOrDigit( prev ) ) {
+               return i - currentPosition - 1;
+            } else {
+               return -1;
+            }
+         }
+      }
+
+      // the whole region is a valid address as long as it is not longer than the maximum total length
+      len = endOfInputToConsider - currentPosition;
+      if ( len > maxTotalLen ) {
+         return -1;
+      }
+      return len;
+   }
+
+
+   /** Lower case prefixes that mark the start of a url, e.g. http://host:port/path?search#fragment or mailto:joe@example.com */
+   static private final String[] URL_PREFIXES = { "http://", "https://", "ftp://", "mailto:" };
+
+   /**
+    * Treats the whole window as a url when it starts with one of the {@link #URL_PREFIXES}
+    * and has at least one more character after that prefix.
+    *
+    * @param currentPosition       index of the first character of the candidate token
+    * @param lowerCasedText        text into which the two indices point
+    * @param endOfInputToConsider  exclusive end index of the window to examine
+    * @return the length of the window if it is a url, otherwise -1
+    */
+   static private int lenIfIsUrl( final int currentPosition, final String lowerCasedText, final int endOfInputToConsider ) {
+      final String candidate = lowerCasedText.substring( currentPosition, endOfInputToConsider );
+      final int candidateLength = candidate.length();
+      for ( int i = 0; i < URL_PREFIXES.length; i++ ) {
+         final String prefix = URL_PREFIXES[ i ];
+         if ( candidateLength <= prefix.length() ) {
+            // nothing would follow the prefix
+            continue;
+         }
+         if ( candidate.startsWith( prefix ) ) {
+            return candidateLength;
+         }
+      }
+      return -1;
+   }
+
+
+   /*
+    * Find the index of the first character of the next token, where
+    * the index is >= startPosition, and the previous token ended at
+    * startPosition-1 (or there was no previous token for the 1st time)
+    * Returns -1 if there are no more tokens (eof or all white space
+    * but no newlines)
+    */
+   static public int findFirstCharOfNextToken( final String lowerCaseText, final int startPosition ) {
+      for ( int i = startPosition; i < lowerCaseText.length(); i++ ) {
+         // find a non-whitespace character
+         if ( !isWhitespace( lowerCaseText.charAt( i ) ) ) {
+            // the only token that can start with whitespace is a NewlineToken
+            return i;
+         }
+      }
+      // reached end of line
+      return -1;
+   }
+
+
+
+   /**
+    * @return s.length() or index of nonalphanumeric character
+    * Note does NOT return -1 if the rest are all alphanumeric, returns s.length in that case
+    * Returns -1 if s == null. returns s.length() if fromIndex is too big
+    */
+   static public int findNextNonAlphaNum( final String lowerCaseText, final int fromIndex ) {
+      if ( lowerCaseText == null ) {
+         throw new IndexOutOfBoundsException( "s==null, fromIndex = " + fromIndex );
+      }
+      for ( int i = fromIndex; i < lowerCaseText.length(); i++ ) {
+         if ( !Character.isLetterOrDigit( lowerCaseText.charAt( i ) ) ) {
+            return i;
+         }
+      }
+      return lowerCaseText.length();
+   }
+
+   /**
+    * Checks whether s starts with compareTo as a complete word, e.g. "'tis" or "'tis true"
+    * but not "'tissue".
+    *
+    * @param s         text to test
+    * @param compareTo required beginning of s, such as 'tis
+    * @return true if s starts with compareTo and either that is all of s
+    * or the character right after it is not a letter
+    */
+   static private boolean startsWithWithoutBeingFollowedByLetter( String s, String compareTo ) {
+      if ( !s.startsWith( compareTo ) ) {
+         return false;
+      }
+      if ( s.length() == compareTo.length() ) {
+         return true;
+      }
+      // a following letter means compareTo is only the beginning of some longer word
+      final char next = s.charAt( compareTo.length() );
+      return !isLetter( next );
+   }
+
+   // Copied isPunctuation from edu.mayo.bmi.nlp.tokenizer.Tokenizer
+   static private boolean isPunctuation( final char c ) {
+      return PUNCTUATION_LOOKUP.contains( c );
+   }
+
+
+
+
+   /**
+    * Determine if the text starting at 'position' within 'lowerCaseText' is the start of a
+    * contraction such as "should've" or "hasn't" or "it's" by looking at whether
+    * there is a letter before the apostrophe, and the appropriate letters after the
+    * apostrophe (or in the case of "n't", verify the letter before is an 'n').
+    * Contractions that start with the apostrophe ('tis and 'twas) are handled elsewhere.
+    *
+    * @param position           first char of next token
+    * @param nextNonLetterDigit index that is compared with the position of the first apostrophe found at or
+    *                           after position; if they differ null is returned.
+    *                           Presumably the index of the next non-letter, non-digit character as computed
+    *                           by the caller - confirm against the caller.
+    * @param lowerCaseText      text into which parameter position is an index
+    * @return null if no contraction is recognized, otherwise a pair holding the length of the word part of
+    * the contraction (measured from position) and the length of the contraction ending.
+    * Note the word length is not always the distance to the apostrophe.
+    * For example, for can't, which is tokenized as ca n't, the pair is (2, 3).
+    * For "it's", the pair is (2, 2).
+    * @see #lenOfFirstTokenInContraction for handling contractions like "cannot" that don't have an apostrophe
+    */
+   public static LengthPair getLengthIfNextApostIsMiddleOfContraction( final int position, final int nextNonLetterDigit,
+                                                                              final String lowerCaseText ) {
+      if ( position < 0 ) {
+         return null;
+      }
+      if ( lowerCaseText.length() < position + 3 ) {
+         return null; // need at least one letter after the apostrophe and one before ('tis and 'twas handled elsewhere)
+      }
+      final int apostrophePosition = lowerCaseText.indexOf( APOSTROPHE, position );
+
+      // Give up if a token break is found before the apostrophe or no apostrophe is found at all
+      // (indexOf returns -1, which only equals nextNonLetterDigit if the caller passed -1 as well)
+      if ( nextNonLetterDigit != apostrophePosition ) {
+         return null;
+      }
+      // Also give up if there is no character before the apostrophe, no character after it (out of input),
+      // or the text starts with "n't".
+      // NOTE(review): startsWith tests the beginning of the whole text, not the text at 'position' - confirm intent.
+      if ( apostrophePosition < 1
+           || apostrophePosition >= lowerCaseText.length() - 1
+           || lowerCaseText.startsWith( "n't" ) ) {
+         return null;
+      }
+      // First just check the one character after the apostrophe before we start checking in more detail
+      // because we can rule out a lot of things this way
+      final String letterAfterApostrophe = lowerCaseText.substring( apostrophePosition + 1, apostrophePosition + 2 );
+      if ( !LETTERS_AFTER_APOSTROPHE_FOR_MIDDLE_OF_CONTRACTION.contains( letterAfterApostrophe ) ) {
+         return null;
+      }
+      // The apostrophe plus the run of letters and digits that directly follows it,
+      // e.g. 'n or 've or 'll or 't
+      final int subseqentNonAlphaNum = findNextNonAlphaNum( lowerCaseText, apostrophePosition + 1 );
+      final String restStartingWithApostrophe = lowerCaseText.substring( apostrophePosition, subseqentNonAlphaNum );
+      final char prev = lowerCaseText.charAt( apostrophePosition - 1 ); // needed for checking for "n't"
+
+      for ( String s : POSSIBLE_CONTRACTION_ENDINGS ) {
+         int lenAfterApostrophe = s.length() - 1; // don't count the apostrophe itself
+         if ( s.equals( "n't" ) ) {
+            lenAfterApostrophe--; // adjust for the "n" in "n't"
+         }
+         if ( lowerCaseText.length() < apostrophePosition + lenAfterApostrophe ) {
+            continue; // not enough text for this POSSIBLE_CONTRACTION_ENDINGS to be a match
+         }
+
+         // The "n't" ending differs from the others in that the contraction token includes the character
+         // before the apostrophe, so the word part is one character shorter and the ending is 3 long.
+
+         // if exact match with rest (end of text)
+         if ( s.equals( "n't" ) && prev == 'n' && lowerCaseText.charAt( apostrophePosition + 1 ) == 't'
+              && lowerCaseText.length() == apostrophePosition + 1 + 1 ) {
+            // n't
+            return new LengthPair( apostrophePosition - 1 - position, 3 );
+         } else if ( restStartingWithApostrophe.equals( s ) ) {
+            return new LengthPair( apostrophePosition - position, s.length() );
+         }
+         // there's at least one character after, check that it isn't a letter or number, which would be part of the same token
+         // and would mean the apostrophe wasn't part of a contraction after all. for example "he'dr. smith"  and "can'they" are
+         // more likely the end of quoted sentences and the start of a new sentence than a misspelled contraction
+
+         // we checked exact match above.
+         // If same length as exact match but not an exact match, done with this one, go on
+         if ( lowerCaseText.length() == apostrophePosition + lenAfterApostrophe + 1 ) {
+            continue; // if not an exact match but has same length as exact match would, then not the right one
+         }
+
+         // NOTE(review): 'position' is an index into lowerCaseText, yet it is used here as an offset into
+         // restStartingWithApostrophe, which begins at the apostrophe - confirm this is intended.
+         char after;
+         if ( restStartingWithApostrophe.length() <= position + lenAfterApostrophe + 1 ) {
+            after = '\00';
+         } else {
+            after = restStartingWithApostrophe.charAt( position + lenAfterApostrophe + 1 );
+         }
+         if ( restStartingWithApostrophe.startsWith( s )
+              && Character.isLetter( prev ) && !Character.isLetter( after ) ) {
+            // there was at least one letter before the apostrophe and after the apostrophe, and non letter after the contraction
+            return new LengthPair( apostrophePosition - position, s.length() );
+         } else if ( s.equals( "n't" ) && prev == 'n' && restStartingWithApostrophe.startsWith( "'t" )
+                     && !Character.isLetter( after ) ) {
+            // n't
+            return new LengthPair( apostrophePosition - 1 - position, 3 );
+         }
+      }
+      return null;
+   }
+
+
+   /**
+    * Looks up the length of the first token of a contraction that has no apostrophe, such as "cannot".
+    *
+    * @param s lower case word to look up in WORD_LENGTH_PAIRS
+    * @return the root length registered for s, or -1 if s is not registered
+    */
+   static int lenOfFirstTokenInContraction( String s ) {
+      final LengthPair pair = WORD_LENGTH_PAIRS.get( s );
+      return pair == null ? -1 : pair.getRootLength();
+   }
+
+   /**
+    * Looks up the length of the second token of a contraction that has no apostrophe, such as "cannot".
+    *
+    * @param s lower case word to look up in WORD_LENGTH_PAIRS
+    * @return the suffix length registered for s, or -1 if s is not registered
+    */
+   static int lenOfSecondTokenInContraction( String s ) {
+      final LengthPair pair = WORD_LENGTH_PAIRS.get( s );
+      return pair == null ? -1 : pair.getSuffixLength();
+   }
+
+   /**
+    * Computes the length of the third token of a contraction that has no apostrophe,
+    * such as "whaddya" (wha dd ya) or "whatcha" (wha t cha).
+    *
+    * @param s lower case word to look up in WORD_LENGTH_PAIRS
+    * @return the number of characters left in s after its first (root) and second (suffix) tokens,
+    * which is 0 for words that only have two tokens, or -1 if s is not registered
+    */
+   static int lenOfThirdTokenInContraction( String s ) {
+      final LengthPair lengthPair = WORD_LENGTH_PAIRS.get( s );
+      if ( lengthPair == null ) {
+         return -1;
+      }
+      // whatever remains once the root and the suffix have been taken off
+      return s.length() - lengthPair.getRootLength() - lengthPair.getSuffixLength();
+   }
+
+
+
+// Find the 3 characters that are the next possible token breaks (look for next 3 whitespace, punctuation, but (*) count contiguous whitespace as one)
+// We are most interested in those that sometimes cause a split and sometimes don't -- apostrophes and hyphens.
+// 80's-esque should be one token according to the 2 rules. (potential break characters for that example
+// are the apostrophe, the hyphen, and the whitespace)
+// salon-o-torium should be one token, with a single contraction token and a single word token (potential
+// break characters for that example are hyphen hyphen whitespace)
+
+
+// Cases where first non alphanum is an apostrophe:
+// 1st nonalphanum    2nd nonalphanum    3rd nonalphanum
+
+//   apostr           hyphen             apostr        take at most up to 3rd break
+//   apostr           hyphen             hyphen        test for -o-torium, otherwise take at most up to 3rd break (ignore case of o-torium followed by something more meaningful)
+//   apostr           hyphen             whtspc        take at most up to 3rd break
+//   apostr           hyphen             other         take at most up to 3rd break
+
+//   apostr           apostr             any           take at most up to 2nd break
+
+//   apostr           whtspc             any*          take at most up to 2nd break
+
+//   apostr           other              any           take at most up to 2nd break
+
+// Note that an exception prefix between apostrophe and 1st hyphen does not avoid the break at the hyphen
+// so the logic is not something that can be broken down into just looking at the not splitting at the
+// apostrophe followed by normal hyphen processing.
+
+
+   private static String[] FULL_APOSTROPHE_WORDS = { "p'yongyang", };
+
+   static private final Collection<String> FULL_APOSTROPHE_WORDS_LOOKUP
+         = new HashSet<>( Arrays.asList( FULL_APOSTROPHE_WORDS ) );
+
+   /**
+    * Assumes apostrophe is not first character.... that case is handled elsewhere
+    * Assumes <code>s</code> is lower case.
+    */
+   static private boolean breakAtApostrophe( final String lowerCaseText, final int positionOfApostropheToTest ) {
+      if ( lowerCaseText.length() == positionOfApostropheToTest + 1 ) {
+         return true; // James'
+      }
+      if ( positionOfApostropheToTest == 0 ) {
+         throw new UnsupportedOperationException( "positionOfApostropheToTest==0" );
+      }
+      // First check for things like 80's that are all digits followed by 's and immediately
+      // after the s there can't be an alphanum
+      if ( allDigits( lowerCaseText.substring( 0, positionOfApostropheToTest ) )
+           && lowerCaseText.charAt( positionOfApostropheToTest + 1 ) == 's' ) {
+         if ( lowerCaseText.length() < positionOfApostropheToTest + 3 ) {
+            return false; // 80's<end_of_input>
+         }
+         // Check that after the 's there aren't more letters or digits which would be unknown like 'st or 's2
+         // and therefore don't want to assume ' should be kept together with rest.
+         final char after = lowerCaseText.charAt( positionOfApostropheToTest + 2 );
+         return Character.isLetterOrDigit( after );
+      }
+      // if not one of the exceptions above, break at the apostrophe
+      return !FULL_APOSTROPHE_WORDS_LOOKUP.contains( lowerCaseText );
+   }
+
+   // If at least 1 char long and all chars are digits
+   static private boolean allDigits( final String lowerCaseText ) {
+      if ( lowerCaseText == null || lowerCaseText.length() < 1 ) {
+         return false;
+      }
+      for ( char c : lowerCaseText.toCharArray() ) {
+         if ( !isDigit( c ) ) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+
+   /**
+    * for a word like 80's or P'yongyang or James' or Sean's or 80's-like or 80's-esque
+    * (or can't or haven't, which are to be split)
+    * determine whether the singlequote(apostrophe)
+    * needs to be kept with the surrounding letters/numbers
+    * and what to do about hyphenated afterwards if there is a hyphen after....
+    * For possessives, do split.
+    * Note that things that start with an apostrophe like 'Assad were handled elsewhere
+    *
+    * @return len of how much to keep: len to apostrophe, or to next breaking char (the space after s for "80's ") or end of hyphenated suffix that should also remain attached, or -1
+    */
+   static private int tokenLengthCheckingForSingleQuoteWordsToKeepTogether( final String lowerCasedText ) {
+      if ( lowerCasedText == null ) {
+         throw new UnsupportedOperationException( "no quote/apostrophe char found in (null)" );
+      }
+      final int firstBreak = lowerCasedText.indexOf( APOSTROPHE );
+      if ( firstBreak < 0 ) {
+         throw new UnsupportedOperationException( "no quote/apostrophe char found in '" + lowerCasedText + "'" );
+      }
+      if ( firstBreak == 0 ) {
+         return -1;
+      }
+      if ( firstBreak + 1 == lowerCasedText.length() ) {
+         return firstBreak;
+      }
+      if ( breakAtApostrophe( lowerCasedText, firstBreak ) ) {
+         return firstBreak;
+      }
+      // else going to keep at least past the apostrophe, but if there's a hyphenated word or a hyphenated suffix that should not be split,
+      // keep that much too
+      final int secondBreak = findNextNonAlphaNum( lowerCasedText, firstBreak + 1 );
+      if ( secondBreak == lowerCasedText.length() ) {
+         return secondBreak; // no more text, must stop here
+      }
+      // See if there are hyphenated suffix(es) that should also remain attached
+      if ( lowerCasedText.charAt( secondBreak ) != HYPHEN_OR_MINUS_SIGN ) {
+         return secondBreak;
+      }
+      // have to determine whether to keep the hyphen and how many hyphens
+      // 80's-esque
+      final int len = lenIfHyphenatedSuffix( lowerCasedText, secondBreak );
+      if ( len > 0 ) {
+         return secondBreak + len;
+      }
+      return secondBreak;
+   }
+
+
+
+   // Contractions without an apostrophe, mapped to the lengths of their first (root) and second (suffix) token
+   static private final Map<String,LengthPair> WORD_LENGTH_PAIRS = createWordLengthPairs();
+
+   // Builds the lookup behind WORD_LENGTH_PAIRS
+   static private Map<String,LengthPair> createWordLengthPairs() {
+      final Map<String,LengthPair> pairs = new HashMap<>( 7 );
+      final LengthPair threeAndTwo = new LengthPair( 3, 2 );
+      pairs.put( "cannot", new LengthPair( 3, 3 ) );
+      for ( String word : new String[]{ "gonna", "gotta", "lemme", "wanna", "whaddya" } ) {
+         pairs.put( word, threeAndTwo );
+      }
+      pairs.put( "whatcha", new LengthPair( 3, 1 ) );
+      return pairs;
+   }
+
+
+   // Endings that can follow the word part of a contraction, e.g. 'n as in more'n
+   // and n't as in can't and shouldn't etc.
+   static private final String[] POSSIBLE_CONTRACTION_ENDINGS = { "'s", "'ve", "'re", "'ll", "'d", "'n", "n't" };
+         // note 't is different in that n't is the contraction token
+   // The characters that may directly follow the apostrophe of a contraction; used as a quick first check
+   static private final String LETTERS_AFTER_APOSTROPHE_FOR_MIDDLE_OF_CONTRACTION = "svrldnt";
+
+
+   // contractions in which the apostrophe is the first character
+   static private final String[] CONTRACTIONS_STARTING_WITH_APOSTROPHE = { "'tis", "'twas", };
+
+   /**
+    * @param currentPosition index within textSegment at which to look
+    * @param textSegment     text to test
+    * @return true if the text at currentPosition passes startsWithWithoutBeingFollowedByLetter
+    * for one of the CONTRACTIONS_STARTING_WITH_APOSTROPHE
+    */
+   static boolean isContractionThatStartsWithApostrophe( final int currentPosition, final String textSegment ) {
+      final String remainder = textSegment.substring( currentPosition );
+      boolean found = false;
+      for ( int i = 0; !found && i < CONTRACTIONS_STARTING_WITH_APOSTROPHE.length; i++ ) {
+         found = startsWithWithoutBeingFollowedByLetter( remainder, CONTRACTIONS_STARTING_WITH_APOSTROPHE[ i ] );
+      }
+      return found;
+   }
+
+
+   //    Hyphenated interjections and affixes in the following lists are not split into multiple tokens.
+   //    For example, uh-oh and e-mail are both single tokens: uh-oh, e-mail.
+
+   // hyphenated prefixes to not split; each entry is lower case and includes its trailing hyphen
+   static private final String[] HYPHENATED_PREFIXES = {
+         "e-",
+         "a-",
+         "u-",
+         "x-",
+         "agro-",
+         "ante-",
+         "anti-",
+         "arch-",
+         "be-",
+         "bi-",
+         "bio-",
+         "co-",
+         "counter-",
+         "cross-",
+         "cyber-",
+         "de-",
+         "eco-",
+         "ex-",
+         "extra-",
+         "inter-",
+         "intra-",
+         "macro-",
+         "mega-",
+         "micro-",
+         "mid-",
+         "mini-",
+         "multi-",
+         "neo-",
+         "non-",
+         "over-",
+         "pan-",
+         "para-",
+         "peri-",
+         "post-",
+         "pre-",
+         "pro-",
+         "pseudo-",
+         "quasi-",
+         "re-",
+         "semi-",
+         "sub-",
+         "super-",
+         "tri-",
+         "ultra-",
+         "un-",
+         "uni-",
+         "vice-",
+         // From email from Colin Warner <co...@ldc.upenn.edu> on 7/25/2010
+         "electro-",
+         // NOTE(review): "gasto-" is possibly a typo for "gastro-" - confirm against the source list before changing
+         "gasto-",
+         "homo-",
+         "hetero-",
+         "ortho-",
+         "phospho-",
+   };
+   // the same prefixes as a set, for lookup
+   static private final Collection<String> HYPHENATED_PREFIXES_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_PREFIXES ) );
+
+   // hyphenated suffixes to not split; each entry is lower case and includes its leading hyphen.
+   // "-o-torium" is the only entry that contains two hyphens
+   static private final String[] HYPHENATED_SUFFIXES
+         = { "-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most", "-o-torium", "-rama", "-wise" };
+   // the same suffixes as a set, for lookup
+   static private final Collection<String> HYPHENATED_SUFFIXES_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_SUFFIXES ) );
+
+   // complete words including hyphen that are not to be split
+   static private final String[] HYPHENATED_WORDS = { "mm-hm", "mm-mm", "o-kay", "uh-huh", "uh-oh" };
+   // the same words as a set, for lookup
+   static private final Collection<String> HYPHENATED_WORDS_LOOKUP = new HashSet<>( Arrays.asList( HYPHENATED_WORDS ));
+
+   // the character that is searched for when looking for hyphenated terms
+   static private final char MINUS_OR_HYPHEN = '-';
+
+   // the characters for which isPunctuation returns true
+   static private final Character[] PUNCTUATION = { ';', ':', ',', '.', '(', ')', '[', ']', '{', '}', '<', '>', '\'',
+                                               '"', '/', '\\', '-' };
+   // the same characters as a set, for lookup
+   static private final Collection<Character> PUNCTUATION_LOOKUP = new HashSet<>( Arrays.asList( PUNCTUATION ));
+
+   // the same characters as PUNCTUATION, in the same order, as a single String
+   static private final String PUNCTUATION_2 = ";:,.()[]{}<>\'\"/\\-";
+
+
+
+   /**
+    * Determines how much of a text containing a hyphen should be kept together as one token.
+    * There is the fixed list of hyphenated words to not be split (HYPHENATED_WORDS_LOOKUP).
+    * <p/>
+    * And here are some made-up examples of words using affixes to keep together
+    * chronic-itis      1 suffix
+    * mega-huge         1 prefix
+    * e-game-fest       1 prefix and 1 suffix
+    * salon-o-torium    1 suffix that contains 2 hyphens
+    * urban-esque-wise  2 suffixes
+    *
+    * @param lowerCaseString lower case text containing a hyphen; because of "-o-torium" it might contain more than 1 hyphen....
+    * @return number of characters to keep together, as far as we know, or -1 if the text starts with the hyphen.
+    * Does not mean to split at the n+1 hyphen... that one needs to be rechecked by the caller.
+    * @throws UnsupportedOperationException if the text is null or there's no hyphen
+    */
+   public static int tokenLengthCheckingForHyphenatedTerms( final String lowerCaseString ) {
+      if ( lowerCaseString == null ) {
+         throw new UnsupportedOperationException( "no hyphen found in (null)" );
+      }
+      final int hyphenIndex = lowerCaseString.indexOf( MINUS_OR_HYPHEN );
+      if ( hyphenIndex < 0 ) {
+         throw new UnsupportedOperationException( "no hyphen found in '" + lowerCaseString + "'" );
+      }
+      if ( hyphenIndex == 0 ) {
+         return -1;
+      }
+      final int length = lowerCaseString.length();
+      if ( hyphenIndex + 1 == length ) {
+         // if ends with hyphen, don't include the hyphen in the token.   mega-  by itself should be mega and -
+         return hyphenIndex;
+      }
+
+      // Look at the next possible token breaks (non-alphanumeric characters) after the first hyphen.
+      // We are most interested in those that sometimes cause a split and sometimes don't -- apostrophes and hyphens.
+      //
+      // 1st break   2nd break            handling
+      //   hyphen    (end of text)        decide whether to split at the first hyphen or not
+      //   hyphen    hyphen               test for -o-torium and other two-hyphen combinations,
+      //                                  otherwise take at most up to the 2nd break
+      //   hyphen    apostr/whtspc/other  take at most up to the 2nd break
+      final int secondBreak = findNextNonAlphaNum( lowerCaseString, hyphenIndex + 1 );
+      if ( secondBreak == length ) {
+         return lenIncludingHyphensToKeep( lowerCaseString, hyphenIndex, 1, secondBreak, -1 );
+      }
+      final int thirdBreak = findNextNonAlphaNum( lowerCaseString, secondBreak + 1 );
+      // only a second hyphen makes it worth considering two hyphens
+      final int hyphensToConsider = lowerCaseString.charAt( secondBreak ) == MINUS_OR_HYPHEN ? 2 : 1;
+      return lenIncludingHyphensToKeep( lowerCaseString, hyphenIndex, hyphensToConsider, secondBreak, thirdBreak );
+   }
+
+   // If there is 1 hyphen:  prefix, suffix, or word like uh-oh
+   // If there are 2 hyphens: o-torium or prefix and suffix like mega-huge-esque
+   private static int lenIncludingHyphensToKeep( final String s, final int indexOfFirstHyphen,
+                                                 final int numberOfHyphensToConsiderKeeping,
+                                                 final int secondBreak, final int thirdBreak ) {
+      String possibleSuffix;
+      boolean lookup;
+      if ( numberOfHyphensToConsiderKeeping > 2 || numberOfHyphensToConsiderKeeping < 1 ) {
+         throw new UnsupportedOperationException(
+               "Not ready to handle numberOfHyphensToConsiderKeeping = " + numberOfHyphensToConsiderKeeping );
+      }
+      // FIRST CONSIDER suffixes
+      // Of the suffixes, first check those that have 2 hyphens (-o-torium)
+      if ( numberOfHyphensToConsiderKeeping == 2 ) {
+         possibleSuffix = s.substring( indexOfFirstHyphen, thirdBreak );
+         lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+         if ( lookup ) {
+            return thirdBreak;
+         }
+      }
+
+      // Now either numberOfHyphensToConsiderKeeping==1 or was ==2 but no 2-hyphen suffix was found to match
+      // Try one-hyphen suffixes, either just 1 of them or 2 of them (but not yet checking for 1 with a prefix too, see prefixes section below for that...
+      possibleSuffix = s.substring( indexOfFirstHyphen, secondBreak );
+      lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+      if ( lookup ) { // First hyphen is start of a suffix that should not be split off
+         // could be numberOfHyphensToConsiderKeeping==1 here, or could be 2 separate 1-hyphen suffixes are both
+         // used, so do need to check for a second suffix....
+         // Check if a second 1-hyphen suffix
+         if ( thirdBreak > secondBreak ) {
+            possibleSuffix = s.substring( secondBreak, thirdBreak );
+            lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+            if ( lookup ) {
+               return thirdBreak; // 2 1-hyphen suffixes that all should be kept together
+            }
+         }
+         return secondBreak; // just 1 1-hyphen suffix from the list of exceptions
+      }
+
+      // Now consider  HYPHENATED_WORDS_LOOKUP plus a suffix   such as uh-oh-X
+      if ( numberOfHyphensToConsiderKeeping > 1 ) {
+         String possibleHyphenatedWordsLookupMatch = s.substring( 0, secondBreak );
+         possibleSuffix = s.substring( secondBreak, thirdBreak );
+         lookup = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch ) &&
+                  HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+         if ( lookup ) {
+            return thirdBreak;
+         }
+      }
+
+
+      //  NOW CONSIDER prefixes
+
+      String possiblePrefix = s.substring( 0, indexOfFirstHyphen + 1 );
+
+      lookup = HYPHENATED_PREFIXES_LOOKUP.contains( possiblePrefix );
+
+      // First consider prefix + one of the HYPHENATED_WORDS_LOOKUP
+      // Do this before considering just prefix so we get both if both are present.
+      if ( lookup && numberOfHyphensToConsiderKeeping > 1 ) {
+         String possibleHyphenatedWordsLookupMatch = s.substring( indexOfFirstHyphen + 1, thirdBreak ); // e.g. uh-oh
+         boolean lookup2 = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch );
+         if ( lookup2 ) {
+            return thirdBreak;
+         }
+      }
+
+      if ( numberOfHyphensToConsiderKeeping == 1 ) {
+         if ( lookup ) {
+            return secondBreak;
+         }
+      }
+
+      if ( numberOfHyphensToConsiderKeeping == 2 ) {
+         if ( lookup ) { // a prefix was found that should not be split
+            // check for a one-hyphen suffix to go with the one-hyphen prefix
+            possibleSuffix = s.substring( secondBreak, thirdBreak );
+            boolean lookup2 = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+            if ( lookup2 ) {
+               return thirdBreak; // both a prefix and a suffix that are not to be split, keep all together
+            }
+            return secondBreak; // just a prefix that should not be split, split before second hyphen
+         } else { // not a prefix, or is a prefix that should be split
+            // Already checked for a 2-hyphen suffix without a prefix
+            // And already checked for word like uh-oh with a suffix
+            // And apparently neither of those, so don't check anything else, fall through to next check
+            //String m = "This condition checked already in other if-else " + indexOfFirstHyphen + COMMA + secondBreak + COMMA + thirdBreak + COMMA + s;
+            //throw new UnsupportedOperationException(m);
+         }
+
+      }
+
+      // Finally consider just HYPHENATED_WORDS_LOOKUP, without an affix, such as "uh-oh"
+      String possibleHyphenatedWordsLookupMatch = s.substring( 0, secondBreak );
+      lookup = HYPHENATED_WORDS_LOOKUP.contains( possibleHyphenatedWordsLookupMatch );
+      if ( lookup ) {
+         return secondBreak;
+      }
+
+      return indexOfFirstHyphen; // if the first hyphen is not eligible to keep, keep just up to it.
+   }
+
+   // if character at position is a hyphen and starts a hyphenated suffix that is an exception
+   // and should not be split from the rest of teh word, return length of the suffix
+   // return -1 if not an exception suffix
+   static int lenIfHyphenatedSuffix( String lowerCasedString, int position ) {
+      lowerCasedString = lowerCasedString.toLowerCase();
+      int next = findNextNonAlphaNum( lowerCasedString, position + 1 );
+      String possibleSuffix = lowerCasedString.substring( position, next );
+      if ( lowerCasedString.substring( position ).startsWith( "-o-" ) ) { // check for -o-torium
+         next = findNextNonAlphaNum( lowerCasedString, position + 3 );
+         possibleSuffix = lowerCasedString.substring( position, next );
+      }
+      boolean lookup = HYPHENATED_SUFFIXES_LOOKUP.contains( possibleSuffix );
+
+      if ( lookup ) {
+         return possibleSuffix.length();
+      }
+      return -1;
+   }
+
+
+
+
+
+
+
+
+
+
+
+   /**
+    * Immutable holder for two lengths: that of the root (first token)
+    * and that of the suffix (second token) of a word that is split in two.
+    */
+   static private class LengthPair {
+      private final int rootLength;
+      private final int suffixLength;
+
+      private LengthPair( final int rootLength, final int suffixLength ) {
+         this.rootLength = rootLength;
+         this.suffixLength = suffixLength;
+      }
+
+      /**
+       * @return length of the root, as given upon construction
+       */
+      public int getRootLength() {
+         return this.rootLength;
+      }
+
+      /**
+       * @return length of the suffix, as given upon construction
+       */
+      public int getSuffixLength() {
+         return this.suffixLength;
+      }
+   }
+
+
+
+}

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,116 @@
+package org.apache.ctakes.dictionarytool.util.token;
+
+import static java.lang.Character.isLetter;
+import static java.lang.Character.isDigit;
+import static java.lang.Character.isWhitespace;
+
+/**
+ * Index bookkeeping for one token candidate.  Starting at a given position the text is scanned up to
+ * the next whitespace (or the end of the text), and for each kind of character run the index of the
+ * first character that breaks that run is recorded.  An index that is never broken before the end
+ * of the text is set to the text length.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/22/2015
+ */
+final public class TokenIndexHolder {
+   boolean obviouslyIsWord = true; // until we find a non alphanum before a whitespace
+   boolean obviouslyIsNumber = true; // until we find a non digit before a whitespace
+   int nextWhitespaceOrEndOfSegment = -1;
+   int nextNonLetterOrNonDigit = -1;
+   int nextNonLetterDigitApostrophe = -1;
+   int nextNonTelephoneOrPostalChar = -1; // digits and dash aka hyphen
+   int nextNonNumericChar = -1; // 9,876.012345  is an example with all the numeric chars
+   int nextNonDigit = -1;
+
+   /**
+    * Scans the text from the current position and fills in every index and flag of this holder.
+    *
+    * @param lowerCasedText  text to scan
+    * @param currentPosition index at which the scan starts
+    */
+   public TokenIndexHolder( final String lowerCasedText, final int currentPosition ) {
+      for ( int i=currentPosition; i<lowerCasedText.length(); i++ ) {
+         final char c = lowerCasedText.charAt( i );
+         if ( isWhitespace( c ) ) {
+            setAsWhitespace( i );
+            break;
+         }
+         final boolean isLetter = isLetter( c );
+         final boolean isDigit = isDigit( c );
+         if ( !isLetter && !isDigit ) {
+            // not whitespace, not letter, not digit, therefore symbol
+            setAsSymbol( c, i );
+            // don't break here though, keep going to set nextWhitespace correctly for other uses
+         } else if ( isLetter ) {
+            setAsCharacter( i );
+         }
+         // else is a digit, none of the flags need to be set for digit characters.
+      }
+      if ( nextWhitespaceOrEndOfSegment < 0 ) {
+         // reached the end of the text
+         initializeAsLastToken( lowerCasedText.length() );
+      }
+   }
+
+
+   /**
+    * Whitespace ends every run: any index still unknown becomes the whitespace index.
+    *
+    * @param i index of the whitespace character
+    */
+   private void setAsWhitespace( final int i ) {
+      nextNonLetterOrNonDigit = setUnknownToIndex( nextNonLetterOrNonDigit, i );
+      nextNonLetterDigitApostrophe = setUnknownToIndex( nextNonLetterDigitApostrophe, i );
+      nextNonDigit = setUnknownToIndex( nextNonDigit, i );
+      nextNonTelephoneOrPostalChar = setUnknownToIndex( nextNonTelephoneOrPostalChar, i );
+      nextNonNumericChar = setUnknownToIndex( nextNonNumericChar, i );
+      nextWhitespaceOrEndOfSegment = i;
+   }
+
+   /**
+    * Records a character that is neither whitespace, letter nor digit.
+    *
+    * @param c the symbol
+    * @param i index of the symbol
+    */
+   private void setAsSymbol( final char c, final int i ) {
+      obviouslyIsWord = false; // not sure if it will be word all the way to whitespace
+      obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
+      nextNonDigit = setUnknownToIndex( nextNonDigit, i );
+      nextNonLetterOrNonDigit = setUnknownToIndex( nextNonLetterOrNonDigit, i );
+      if ( c != TextTokenizerCtakesPTB.APOSTROPHE ) {
+         nextNonLetterDigitApostrophe = setUnknownToIndex( nextNonLetterDigitApostrophe, i );
+      }
+      // The two tests below must not depend on the apostrophe test above:
+      // an apostrophe is neither in 0123456789- nor in ,.0123456789 so it ends both kinds of run.
+      if ( !isTelephoneNumberChar( c ) ) {
+         nextNonTelephoneOrPostalChar = setUnknownToIndex( nextNonTelephoneOrPostalChar, i );
+      }
+      if ( !isNumericChar( c ) ) {
+         nextNonNumericChar = setUnknownToIndex( nextNonNumericChar, i );
+      }
+   }
+
+   /**
+    * Records a letter.
+    *
+    * @param i index of the letter
+    */
+   private void setAsCharacter( final int i ) {
+      obviouslyIsNumber = false; // not sure if it will be number all the way to whitespace
+      // The above -should- be incorrect when considering scientific numbers
+      // nextNonLetterOrNonDigit is not changed here
+      // nextNonLetterDigitApostrophe is not changed here
+      nextNonDigit = setUnknownToIndex( nextNonDigit, i );
+      // a letter is never a telephone or numeric character, so it always ends both of those runs
+      nextNonTelephoneOrPostalChar = setUnknownToIndex( nextNonTelephoneOrPostalChar, i );
+      nextNonNumericChar = setUnknownToIndex( nextNonNumericChar, i );
+   }
+
+   /**
+    * Called when the scan reached the end of the text without meeting whitespace:
+    * any index still unknown becomes the text length.
+    *
+    * @param textLength length of the scanned text
+    */
+   private void initializeAsLastToken( final int textLength ) {
+      nextWhitespaceOrEndOfSegment = setUnknownToIndex( nextWhitespaceOrEndOfSegment, textLength );
+      nextNonLetterOrNonDigit = setUnknownToIndex( nextNonLetterOrNonDigit, textLength );
+      nextNonLetterDigitApostrophe = setUnknownToIndex( nextNonLetterDigitApostrophe, textLength );
+      // kept consistent with setAsWhitespace, otherwise an all-digit last token leaves nextNonDigit at -1
+      nextNonDigit = setUnknownToIndex( nextNonDigit, textLength );
+      nextNonTelephoneOrPostalChar = setUnknownToIndex( nextNonTelephoneOrPostalChar, textLength );
+      nextNonNumericChar = setUnknownToIndex( nextNonNumericChar, textLength );
+   }
+
+   /**
+    * @param c character to test
+    * @return true if in 0123456789-
+    */
+   static private boolean isTelephoneNumberChar( final char c ) {
+      return isDigit( c ) || c == '-';
+   }
+
+   /**
+    * @param c character to test
+    * @return true if in ,.0123456789
+    */
+   static private boolean isNumericChar( final char c ) {
+      return isDigit( c ) || c == ',' || c == '.';
+   }
+
+   /**
+    * @param currentValue index value held so far, negative while still unknown
+    * @param index        index to use if the current value is unknown
+    * @return the current value if it is already known (non-negative), otherwise the given index
+    */
+   static private int setUnknownToIndex( final int currentValue, final int index ) {
+      if ( currentValue >= 0 ) {
+         return currentValue;
+      }
+      return index;
+   }
+}

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java?rev=1703438&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java Wed Sep 16 17:43:48 2015
@@ -0,0 +1,64 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.logging.Logger;
+
+/**
+ * Writes a bar-separated-value file with one line per combination of Cui, Tui and text.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/16/2015
+ */
+final public class CuiTuiTextsMapWriter {
+
+   static private final Logger LOGGER = Logger.getLogger( "CuiTuiTextsMapWriter" );
+
+
+   private CuiTuiTextsMapWriter() {
+   }
+
+   /**
+    * For every Cui code in {@code cuiTexts} one line is written per Tui of that Cui and per text of that Cui.
+    * Cuis without any Tui in {@code validCuisAndTuis} are logged and skipped.
+    * An {@link IOException} is logged, not rethrown; the writer is closed in every case.
+    *
+    * @param bsvFilePath      path of the file to write
+    * @param validCuisAndTuis Tui codes for each Cui code
+    * @param cuiTexts         texts for each Cui code
+    */
+   static public void writeCuiTuiTexts( final String bsvFilePath,
+                                        final HashSetMap<Long, Integer> validCuisAndTuis,
+                                        final HashSetMap<Long, String> cuiTexts ) {
+      System.out.println( "Writing map of Cuis and Tuis and Texts to " + bsvFilePath );
+      long lineCount = 0;
+      // try-with-resources so that the writer is closed even when a write fails part way through
+      try ( BufferedWriter writer = FileUtil.createWriter( bsvFilePath ) ) {
+         for ( Map.Entry<Long, Set<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
+            final Long code = cuiTextsEntry.getKey();
+            final String cui = CuiTuiUtil.getAsCui( code );
+            final Collection<Integer> tuiCodes = validCuisAndTuis.get( code );
+            if ( tuiCodes == null ) {
+               LOGGER.severe( "No Tuis for " + code );
+               continue;
+            }
+            for ( Integer tuiCode : tuiCodes ) {
+               final String tui = CuiTuiUtil.getAsTui( tuiCode );
+               for ( String text : cuiTextsEntry.getValue() ) {
+                  lineCount++;
+                  writer.write( TokenUtil.createBsvLine( cui, tui, text ) );
+                  writer.newLine();
+                  if ( lineCount % 100000 == 0 ) {
+                     LOGGER.info( "File Line " + lineCount );
+                  }
+               }
+            }
+         }
+      } catch ( IOException ioE ) {
+         // pass the exception along so that its cause and stack trace are not lost
+         LOGGER.log( java.util.logging.Level.SEVERE,
+               "Error writing Term on line " + lineCount + " in file " + bsvFilePath, ioE );
+      }
+      LOGGER.info( "Wrote " + lineCount + " terms to " + bsvFilePath );
+   }
+
+
+
+}