You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/11/13 16:26:30 UTC

svn commit: r1541553 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java

Author: tmill
Date: Wed Nov 13 15:26:29 2013
New Revision: 1541553

URL: http://svn.apache.org/r1541553
Log:
Fixes CTAKES-266. Checks that the word token preceding a contraction is not zero-length before creating it.

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java?rev=1541553&r1=1541552&r2=1541553&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java Wed Nov 13 15:26:29 2013
@@ -343,12 +343,13 @@ public class TokenizerPTB {
 			        	char c = lowerCasedText.charAt(currentPosition+len);
 			        	if (c=='n' || c==APOSTROPHE) { // if a "n't" contraction or a contraction where contraction token starts with '
 			        	    if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
-			        	    // First create the WordToken (no apostrophe)
-			        	    bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
-			        	    //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
-			        	    tokens.add(bta);
-			        	    currentPosition+=tokenLen; // currentPosition
-
+			        	    // First create the WordToken (no apostrophe)
+			        	    if(tokenLen > 0){
+			        	      bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
+			        	      //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
+			        	      tokens.add(bta);
+			        	      currentPosition+=tokenLen; // currentPosition
+			        	    }
 			        	    // Set up to create the second token, for other contractions, the next token will start with an 
 			        	    // apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
 			        	    // so just go ahead and handle it here instead of having to keep track of previous