You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/11/13 16:26:30 UTC
svn commit: r1541553 -
/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
Author: tmill
Date: Wed Nov 13 15:26:29 2013
New Revision: 1541553
URL: http://svn.apache.org/r1541553
Log:
Fixes CTAKES-266. Checks for a zero-length word token before creating the token that precedes a contraction.
Modified:
ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java?rev=1541553&r1=1541552&r2=1541553&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java Wed Nov 13 15:26:29 2013
@@ -343,12 +343,13 @@ public class TokenizerPTB {
char c = lowerCasedText.charAt(currentPosition+len);
if (c=='n' || c==APOSTROPHE) { // if a "n't" contraction or a contraction where contraction token starts with '
if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
- // First create the WordToken (no apostrophe)
- bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
- //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
- tokens.add(bta);
- currentPosition+=tokenLen; // currentPosition
-
+ // First create the WordToken (no apostrophe)
+ if(tokenLen > 0){
+ bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen, offsetAdjustment);
+ //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
+ tokens.add(bta);
+ currentPosition+=tokenLen; // currentPosition
+ }
// Set up to create the second token, for other contractions, the next token will start with an
// apostrophe and be handled above... but for "n't" contractions, next token won't start with apostrophe
// so just go ahead and handle it here instead of having to keep track of previous