You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/03/31 20:06:12 UTC

svn commit: r760554 - in /incubator/pdfbox/trunk: src/main/java/org/apache/pdfbox/util/ test/input/

Author: carrier
Date: Tue Mar 31 18:06:11 2009
New Revision: 760554

URL: http://svn.apache.org/viewvc?rev=760554&view=rev
Log:
Patch for PDFBOX-444 by Justin LeFebvre and me to fix errors when merging diacritics during text extraction. Also includes updated regression test files and some code formatting changes.

Modified:
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
    incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt
    incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
    incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt
    incubator/pdfbox/trunk/test/input/cweb.pdf.txt

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Mar 31 18:06:11 2009
@@ -102,7 +102,7 @@
      * and to correct the direction of right to left text, such as Arabic and Hebrew.
      */
     private TextNormalize normalize = null;
-    
+
     /**
      * Instantiate a new PDFTextStripper object.  This object will load properties from
      * Resources/PDFTextStripper.properties.
@@ -231,8 +231,8 @@
         }
 
         if( startBookmarkPageNumber == -1 && startBookmark != null &&
-            endBookmarkPageNumber == -1 && endBookmark != null &&
-            startBookmark.getCOSObject() == endBookmark.getCOSObject() )
+                endBookmarkPageNumber == -1 && endBookmark != null &&
+                startBookmark.getCOSObject() == endBookmark.getCOSObject() )
         {
             //this is a special case where both the start and end bookmark
             //are the same but point to nothing.  In this case
@@ -302,8 +302,8 @@
     protected void processPage( PDPage page, COSStream content ) throws IOException
     {
         if( currentPageNo >= startPage && currentPageNo <= endPage &&
-            (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
-            (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
+                (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
+                (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
         {
             startPage( page );
             pageArticles = page.getThreadBeads();
@@ -347,7 +347,7 @@
     {
         startArticle(true);
     }
-    
+
     /**
      * Start a new article, which is typically defined as a column
      * on a single page (also referred to as a bead).  
@@ -417,11 +417,11 @@
         float lastWordSpacing = -1;
         float maxHeightForLine = -1;
         TextPosition lastPosition = null;
-        
+
         if (normalize == null) {
             normalize = new TextNormalize();
         }
-        
+
         for( int i = 0; i < charactersByArticle.size(); i++)
         {
             List textList = (List)charactersByArticle.get( i );
@@ -432,7 +432,7 @@
             }
 
             Iterator textIter = textList.iterator();
-                        
+
             /* Before we can display the text, we need to do some normalizing.
              * Arabic and Hebrew text is right to left and is typically stored
              * in its logical format, which means that the rightmost character is 
@@ -477,9 +477,9 @@
             if (rtlCnt > ltrCnt) {
                 isRtlDominant = true;
             }
-            
+
             startArticle(!isRtlDominant);
-            
+
             // we will later use this to skip reordering
             boolean hasRtl = false;
             if (rtlCnt > 0)
@@ -489,7 +489,7 @@
              * We queue up a line at a time before we print so that we can convert
              * the line from presentation form to logical form (if needed). */
             String lineStr = "";
-            
+
             textIter = textList.iterator();    // start from the beginning again
             while( textIter.hasNext() )
             {
@@ -504,16 +504,16 @@
                 /* If we are sorting, then we need to use the text direction
                  * adjusted coordinates, because they were used in the sorting. */
                 if (sortByPosition) {
-                	positionX = position.getXDirAdj();
-                	positionY = position.getYDirAdj();
-                	positionWidth = position.getWidthDirAdj();
-                	positionHeight = position.getHeightDir();
+                    positionX = position.getXDirAdj();
+                    positionY = position.getYDirAdj();
+                    positionWidth = position.getWidthDirAdj();
+                    positionHeight = position.getHeightDir();
                 }
                 else {
-                	positionX = position.getX();
-                	positionY = position.getY();
-                	positionWidth = position.getWidth();
-                	positionHeight = position.getHeight();
+                    positionX = position.getX();
+                    positionY = position.getY();
+                    positionWidth = position.getWidth();
+                    positionHeight = position.getHeight();
                 }
 
                 //try to get width of a space character
@@ -579,13 +579,13 @@
                             lastPosition.getCharacter() != null &&
                             !lastPosition.getCharacter().endsWith( " " ) )
                     {
-	                    lineStr += getWordSeparator();
-	                }
+                        lineStr += getWordSeparator();
+                    }
 
                 }
 
                 if (positionY >= maxYForLine) {
-                	maxYForLine = positionY;
+                    maxYForLine = positionY;
                 }
 
                 // RDD - endX is what PDF considers to be the x coordinate of the
@@ -602,7 +602,7 @@
                 lastPosition = position;
                 lastWordSpacing = wordSpacing;
             }
-            
+
             // print the final line
             if (lineStr.length() > 0) {
                 if (hasRtl)
@@ -613,7 +613,7 @@
 
                 writeString(lineStr);
             }
-            
+
             endArticle();
         }
 
@@ -623,7 +623,7 @@
     private boolean overlap( float y1, float height1, float y2, float height2 )
     {
         return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
-               (y1 <= y2 && y1 >= y2-height2);
+        (y1 <= y2 && y1 >= y2-height2);
     }
 
     /**
@@ -638,7 +638,7 @@
         output.write(getPageSeparator());
         output.flush();
     }
-    
+
     /**
      * Write the line separator value to the output stream
      * @throws IOException
@@ -647,7 +647,7 @@
     {
         output.write(getLineSeparator());
     }
-    
+
 
     /**
      * Write the word separator value to the output stream
@@ -668,7 +668,7 @@
     {
         output.write( text.getCharacter() );
     }
-    
+
     /**
      * Write a Java string to the output stream.
      *
@@ -691,8 +691,8 @@
     {
         return second > first - variance && second < first + variance;
     }
-    
-    
+
+
     /**
      * This will process a TextPosition object and add the
      * text to the list of characters on a page.  It takes care of
@@ -738,11 +738,11 @@
                 //only want to suppress
 
                 if( charCharacter != null &&
-                    //charCharacter.equals( textCharacter ) &&
-                    within( charX, textX, tolerance ) &&
-                    within( charY,
-                    		textY,
-                            tolerance ) )
+                        //charCharacter.equals( textCharacter ) &&
+                        within( charX, textX, tolerance ) &&
+                        within( charY,
+                                textY,
+                                tolerance ) )
                 {
                     suppressCharacter = true;
                 }
@@ -777,8 +777,8 @@
                             foundArticleDivisionIndex = i*2+1;
                         }
                         else if( (x < rect.getLowerLeftX() ||
-                                  y < rect.getUpperRightY()) &&
-                            notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
+                                y < rect.getUpperRightY()) &&
+                                notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
                         {
                             notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
                         }
@@ -824,38 +824,38 @@
             {
                 articleDivisionIndex = charactersByArticle.size()-1;
             }
-            
+
             List textList = (List) charactersByArticle.get( articleDivisionIndex );
-            
-            
-            
-            /* In the wild, some PDF encoded documents put diacritics (accents on 
-             * top of characters) into a separate Tj element.  When displaying them 
-             * graphically, the two chunks get overlayed.  With text output though, 
-             * we need to do the overlay. This code recombines the diacritic with 
-             * its associated character.
+
+            /* In the wild, some PDF encoded documents put diacritics (accents on
+             * top of characters) into a separate Tj element.  When displaying them
+             * graphically, the two chunks get overlaid.  With text output though,
+             * we need to do the overlay. This code recombines the diacritic with
+             * its associated character if the two are consecutive.
              */ 
-            // First, do we even care. The assumption is we do IFF
-            // we have a single diacritic.
-            boolean wasMerged = false;
-            String cText = text.getCharacter();
-            if (cText.length() == 1 &&  Character.getType(cText.charAt(0)) == Character.NON_SPACING_MARK) {
+            if(textList.isEmpty()){
+                textList.add(text);
+            }
+            else{
                 /* test if we overlap the previous entry.  
                  * Note that we are making an assumption that we need to only look back
                  * one TextPosition to find what we are overlapping.  
                  * This may not always be true. */
-                if(!textList.isEmpty()){
-                    TextPosition previous = (TextPosition)textList.get(textList.size()-1);
-                    if ((previous != null) && previous.contains(text)) {
-                        previous.mergeDiacritic(text);
-                        wasMerged = true;
-                    }
+                TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
+                if(text.isDiacritic() && previousTextPosition.contains(text)){
+                    previousTextPosition.mergeDiacritic(text);
+                }
+                /* If the previous TextPosition was the diacritic, merge it into this
+                 * one and remove it from the list. */
+                else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition)){
+                    text.mergeDiacritic(previousTextPosition);
+                    textList.remove(textList.size()-1);
+                    textList.add(text);
+                }
+                else{
+                    textList.add(text);
                 }
             }
-            
-            // if we could not merge with the previous entry, add it to the list
-            if (wasMerged == false)
-                textList.add(text);
         }
     }
 

Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java Tue Mar 31 18:06:11 2009
@@ -27,24 +27,24 @@
  */
 public class TextPosition
 {
-	/* TextMatrix for the start of the text object.  Coordinates
-	 * are in display units and have not been adjusted. */
-	private Matrix textPos;
-	
-	// ending X and Y coordinates in display units
-	private float endX;
-	private float endY;
-	
-	private float maxTextHeight; // maximum height of text, in display units
-	private int rot; // 0, 90, 180, 270 degrees of page rotation
-	private float pageHeight;
-	private float pageWidth;
+    /* TextMatrix for the start of the text object.  Coordinates
+     * are in display units and have not been adjusted. */
+    private Matrix textPos;
+
+    // ending X and Y coordinates in display units
+    private float endX;
+    private float endY;
+
+    private float maxTextHeight; // maximum height of text, in display units
+    private int rot; // 0, 90, 180, 270 degrees of page rotation
+    private float pageHeight;
+    private float pageWidth;
     private float[] widths;
     private float widthOfSpace; // width of a space, in display units
     private String str; 
     private PDFont font;
     private float fontSize;
-    private float wordSpacing;	// word spacing value, in display units
+    private float wordSpacing;  // word spacing value, in display units
 
     protected TextPosition()
     {
@@ -66,32 +66,32 @@
      * @param ws The word spacing parameter (in display units)
      */
     public TextPosition(
-    		PDPage page,
-    		Matrix textPositionSt,
-    		Matrix textPositionEnd,
-    		float maxFontH,
-    		float[] individualWidths,
-    		float spaceWidth,
-    		String string,
-    		PDFont currentFont,
-    		float fontSizeValue,
-    		float ws
+            PDPage page,
+            Matrix textPositionSt,
+            Matrix textPositionEnd,
+            float maxFontH,
+            float[] individualWidths,
+            float spaceWidth,
+            String string,
+            PDFont currentFont,
+            float fontSizeValue,
+            float ws
     )
     {
-    	this.textPos = textPositionSt;
-    	
-    	this.endX = textPositionEnd.getXPosition();
-    	this.endY = textPositionEnd.getYPosition();
-    	
-    	this.rot = page.findRotation();
-    	// make sure it is 0 to 270 and no negative numbers
-    	if(this.rot < 0)
-    		rot += 360;
-    	
-    	this.maxTextHeight = maxFontH;
-    	this.pageHeight = page.findMediaBox().getHeight();
-    	this.pageWidth = page.findMediaBox().getWidth();
-    	        
+        this.textPos = textPositionSt;
+
+        this.endX = textPositionEnd.getXPosition();
+        this.endY = textPositionEnd.getYPosition();
+
+        this.rot = page.findRotation();
+        // make sure it is 0 to 270 and no negative numbers
+        if(this.rot < 0)
+            rot += 360;
+
+        this.maxTextHeight = maxFontH;
+        this.pageHeight = page.findMediaBox().getHeight();
+        this.pageWidth = page.findMediaBox().getWidth();
+
         this.widths = individualWidths;
         this.widthOfSpace = spaceWidth;
         this.str = string;
@@ -116,7 +116,7 @@
      * @return The Matrix containing all infos of the starting textposition
      */
     public Matrix getTextPos() {
-    	return textPos;
+        return textPos;
     }
 
     /**
@@ -125,31 +125,31 @@
      * @return The direction of the text (0, 90, 180, or 270)
      */
     public float getDir() {
-    	float a = textPos.getValue(0,0);
-    	float b = textPos.getValue(0,1);
-    	float c = textPos.getValue(1,0);
-    	float d = textPos.getValue(1,1);
-    	
-    	// 12 0   left to right
-    	// 0 12 
-    	if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
-    		return 0;
-    	// -12 0   right to left (upside down)
-    	// 0 -12
-    	else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
-    		return 180;
-    	// 0  12	up
-    	// -12 0 
-    	else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
-    		return 90;
-    	// 0  -12	down
-    	// 12 0 
-    	else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
-    		return 270;
- 
-    	return 0;
+        float a = textPos.getValue(0,0);
+        float b = textPos.getValue(0,1);
+        float c = textPos.getValue(1,0);
+        float d = textPos.getValue(1,1);
+
+        // 12 0   left to right
+        // 0 12 
+        if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
+            return 0;
+        // -12 0   right to left (upside down)
+        // 0 -12
+        else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
+            return 180;
+        // 0  12    up
+        // -12 0 
+        else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
+            return 90;
+        // 0  -12   down
+        // 12 0 
+        else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
+            return 270;
+
+        return 0;
     }
-    
+
     /**
      * Return the X starting coordinate of the text, adjusted by 
      * the given rotation amount.  The rotation adjusts where the 0,0
@@ -160,18 +160,18 @@
      */
     private float getX_rot(float a_rot)
     {
-    	if (a_rot == 0)
-    		return textPos.getValue(2,0);
-    	else if (a_rot == 90)
-    		return textPos.getValue(2,1);
-    	else if (a_rot == 180)
-    		return pageWidth - textPos.getValue(2,0);
-    	else if (a_rot == 270)
-    		return pageHeight - textPos.getValue(2,1);
-    	else 
-    		return 0;
+        if (a_rot == 0)
+            return textPos.getValue(2,0);
+        else if (a_rot == 90)
+            return textPos.getValue(2,1);
+        else if (a_rot == 180)
+            return pageWidth - textPos.getValue(2,0);
+        else if (a_rot == 270)
+            return pageHeight - textPos.getValue(2,1);
+        else 
+            return 0;
     }
-    
+
     /**
      * This will get the page rotation adjusted x position of the character.
      * This is adjusted based on page rotation so that the upper 
@@ -181,9 +181,9 @@
      */
     public float getX()
     {
-    	return getX_rot(rot);
+        return getX_rot(rot);
     }
-    
+
     /**
      * This will get the text direction adjusted x position of the character.
      * This is adjusted based on text direction so that the first character
@@ -192,7 +192,7 @@
      * @return The x coordinate of the text.
      */
     public float getXDirAdj() {
-    	return getX_rot(getDir());	
+        return getX_rot(getDir());  
     }
 
     /** 
@@ -204,18 +204,18 @@
      */
     private float getY_ll_rot(float a_rot)
     {
-    	if (a_rot == 0)
-    		return textPos.getValue(2,1);
-    	else if (a_rot == 90)
-    		return pageWidth - textPos.getValue(2,0);
-    	else if (a_rot == 180)
-    		return pageHeight - textPos.getValue(2,1);
-    	else if (a_rot == 270)
-    		return textPos.getValue(2,0);
-    	else 
-    		return 0;
+        if (a_rot == 0)
+            return textPos.getValue(2,1);
+        else if (a_rot == 90)
+            return pageWidth - textPos.getValue(2,0);
+        else if (a_rot == 180)
+            return pageHeight - textPos.getValue(2,1);
+        else if (a_rot == 270)
+            return textPos.getValue(2,0);
+        else 
+            return 0;
     }
-    
+
     /**
      * This will get the y position of the text, adjusted so that 0,0 is upper left and 
      * it is adjusted based on the page rotation. 
@@ -224,12 +224,12 @@
      */
     public float getY()
     {
-    	if ((rot == 0) || (rot == 180))
-    		return pageHeight - getY_ll_rot(rot);
-    	else
-    		return pageWidth - getY_ll_rot(rot);
+        if ((rot == 0) || (rot == 180))
+            return pageHeight - getY_ll_rot(rot);
+        else
+            return pageWidth - getY_ll_rot(rot);
     }
-    
+
     /**
      * This will get the y position of the text, adjusted so that 0,0 is upper left and 
      * it is adjusted based on the text direction. 
@@ -238,16 +238,16 @@
      */
     public float getYDirAdj()
     {
-    	float dir = getDir();
-    	// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
-    	if ((dir == 0) || (dir == 180))
-    		return pageHeight - getY_ll_rot(dir);
-    	else
-    		return pageWidth - getY_ll_rot(dir);
+        float dir = getDir();
+        // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
+        if ((dir == 0) || (dir == 180))
+            return pageHeight - getY_ll_rot(dir);
+        else
+            return pageWidth - getY_ll_rot(dir);
     }
 
 
-    
+
     /**
      * Get the length or width of the text, based on a given rotation. 
      * 
@@ -256,30 +256,30 @@
      */
     private float getWidth_rot(float a_rot)
     {
-    	if ((a_rot == 90) || (a_rot == 270)) {
-    		return Math.abs(endY - textPos.getYPosition());
-    	}
-    	else {
-    		return Math.abs(endX - textPos.getXPosition());
-    	}
+        if ((a_rot == 90) || (a_rot == 270)) {
+            return Math.abs(endY - textPos.getYPosition());
+        }
+        else {
+            return Math.abs(endX - textPos.getXPosition());
+        }
     }
-    
+
     /**
      * This will get the width of the string when page rotation adjusted coordinates are used.
      *
      * @return The width of the text in display units.
      */
     public float getWidth() {
-    	return getWidth_rot(rot);
+        return getWidth_rot(rot);
     }
-    
+
     /**
      * This will get the width of the string when text direction adjusted coordinates are used.
      *
      * @return The width of the text in display units.
      */
     public float getWidthDirAdj() {
-    	return getWidth_rot(getDir());
+        return getWidth_rot(getDir());
     }
 
     /**
@@ -288,17 +288,17 @@
      * @return The maximum height of all characters in this string.
      */
     public float getHeight() {
-    	return maxTextHeight;
+        return maxTextHeight;
     }
-    
+
     /**
      * This will get the maximum height of all characters in this string.
      *
      * @return The maximum height of all characters in this string.
      */
     public float getHeightDir() {
-    	// this is not really a rotation-dependent calculation, but this is defined for symmetry.
-    	return maxTextHeight;
+        // this is not really a rotation-dependent calculation, but this is defined for symmetry.
+        return maxTextHeight;
     }
 
     /**
@@ -390,24 +390,44 @@
      */
     public boolean contains( TextPosition tp2)
     {
-        // get the center of the rectangle being tested
-        double xcenter = tp2.getXDirAdj() + tp2.getWidthDirAdj()/2.0;
-        double ydelta = tp2.getHeightDir()/2.0;
-        double ycenter = tp2.getYDirAdj() + ydelta;
-        
-        // If the x-coordinate of tp2's center is within this obj's x-coordinates
-        // and the y-coordinate of tp2's center is in this obj's rectangle expanded
-        // by ydelta, then at least 50% (with respect to the x-direction) of tp2
-        // is within this obj 
-        if ( (xcenter > getXDirAdj()) && 
-                (xcenter < getXDirAdj() + getWidthDirAdj()) &&
-                (ycenter > getYDirAdj() - ydelta) &&
-                (ycenter < getYDirAdj() + getHeightDir() + ydelta))  
-            return true;
-        else 
+        double thisXstart = getXDirAdj();
+        double thisXend = getXDirAdj() + getWidthDirAdj();
+
+        double tp2Xstart = tp2.getXDirAdj();
+        double tp2Xend = tp2.getXDirAdj() + tp2.getWidthDirAdj();
+
+        /*
+         * No X overlap at all so return as soon as possible. 
+         */
+        if(tp2Xend <= thisXstart || tp2Xstart >= thisXend){
             return false;
+        }   
+        /*
+         * No Y overlap at all so return as soon as possible. 
+         * Note: 0.0 is in the upper left and y-coordinate is 
+         * top of TextPosition
+         */
+        if((tp2.getYDirAdj() + tp2.getHeightDir() < getYDirAdj()) ||
+                (tp2.getYDirAdj() > getYDirAdj() + getHeightDir())){
+            return false; 
+        }          
+        /* We're going to calculate the percentage of overlap. If it's less
+         * than a 15% x-coordinate overlap then we'll return false because it's negligible.
+         * .15 was determined by trial and error in the regression test files.
+         */
+        else if((tp2Xstart > thisXstart) && (tp2Xend > thisXend)){
+            double overlap = thisXend - tp2Xstart;
+            double overlapPercent = overlap/getWidthDirAdj();
+            return (overlapPercent > .15);
+        }
+        else if((tp2Xstart < thisXstart) && (tp2Xend < thisXend)){
+            double overlap = tp2Xend - thisXstart;
+            double overlapPercent = overlap/getWidthDirAdj();
+            return (overlapPercent > .15);
+        }
+        return true; 
     }
-    
+
     /**
      * Merge a single character TextPosition into the current object.
      * This is to be used only for cases where we have a diacritic that
@@ -421,53 +441,122 @@
     {
         if (diacritic.getCharacter().length() > 1)
             return;
-        
-        float xdiac = diacritic.getXDirAdj() + diacritic.getWidthDirAdj()/2;
-        float xcurr = getXDirAdj();
-        
-        int lastChIx = str.length();
-        for (int i = 0; i < lastChIx; i++) {
-            
-            // The diacritic modifies this character.
-            if (xdiac >= xcurr && xdiac <= (xcurr + widths[i])) {
-                StringBuffer buf = new StringBuffer();
-                
-                buf.append(str.substring(0,i));
-                
-                float[] widths2 = new float[widths.length+1];
-                System.arraycopy(widths, 0, widths2, 0, i);
-               
-                /* we add the diacritic to the right or left of the character
-                 * depending on the direction of the character.  Note that this
-                 * is only required because the text is currently stored in 
-                 * presentation order and not in logical order. 
-                 */
-                int dir = Character.getDirectionality(str.charAt(i));
-                if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
-                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
-                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
-                        || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
-                    buf.append(diacritic.getCharacter());
-                    widths2[i] = 0;
-                    buf.append(str.charAt(i));
-                    widths2[i+1] = widths[i];
-                } 
-                else {
-                    buf.append(str.charAt(i));
-                    widths2[i] = widths[i];
-                    buf.append(diacritic.getCharacter());
-                    widths2[i+1] = 0;
+
+        float diacXStart = diacritic.getXDirAdj();
+        float diacXEnd = diacXStart + diacritic.widths[0];
+
+        float currCharXStart = getXDirAdj();
+
+        int strLen = str.length();
+        boolean wasAdded = false;
+
+        for (int i = 0; i < strLen && wasAdded == false; i++) {
+
+            float currCharXEnd = currCharXStart + widths[i];
+
+            /*
+             * This is the case where there is an overlap of the diacritic character with
+             * the current character and the previous character. If no previous character,
+             * just append the diacritic after the current one. 
+             */
+            if(diacXStart < currCharXStart && diacXEnd <= currCharXEnd){               
+                if(i == 0){
+                    insertDiacritic(i, diacritic);
+                }
+                else{    
+                    float distanceOverlapping1 = diacXEnd - currCharXStart;
+                    float percentage1 = distanceOverlapping1/widths[i];
+
+                    float distanceOverlapping2 = currCharXStart - diacXStart;
+                    float percentage2 = distanceOverlapping2/widths[i-1];
+
+                    if(percentage1 >= percentage2){
+                        insertDiacritic(i, diacritic);
+                    }
+                    else{
+                        insertDiacritic(i-1, diacritic);
+                    }
                 }
-                
-                // Get the rest of the string
-                buf.append(str.substring(i+1, lastChIx));
-                System.arraycopy(widths, i+1, widths2, i+2, widths.length-i-1);
-                
-                str = buf.toString();
-                widths = widths2;
-                break;
+                wasAdded = true;
+            }
+            //diacritic completely covers this character and therefore we assume that
+            //this is the character the diacritic belongs to
+            else if(diacXStart < currCharXStart && diacXEnd > currCharXEnd){
+                insertDiacritic(i, diacritic);
+                wasAdded = true;
+            }
+            //Otherwise, the diacritic modifies this character because it's completely
+            //contained by the character width
+            else if(diacXStart >= currCharXStart && diacXEnd <= currCharXEnd) {
+                insertDiacritic(i, diacritic);
+                wasAdded = true;   
             }
-            xcurr += widths[i];
+            /*
+             * Last character in the TextPosition so we add diacritic to the end
+             */
+            else if(diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1)){
+                insertDiacritic(i, diacritic);
+                wasAdded = true;
+            }        
+            /*
+             * Couldn't find anything useful so we go to the next character in the 
+             * TextPosition 
+             */
+            currCharXStart += widths[i];
         }
     }
+    /**
+     * Inserts the diacritic TextPosition to the str of this TextPosition
+     * and updates the widths array to include the extra character width.
+     * @param i current character
+     * @param diacritic The diacritic TextPosition
+     */
+    private void insertDiacritic(int i, TextPosition diacritic){
+        /* we add the diacritic to the right or left of the character
+         * depending on the direction of the character.  Note that this
+         * is only required because the text is currently stored in 
+         * presentation order and not in logical order. 
+         */
+        int dir = Character.getDirectionality(str.charAt(i));
+        StringBuffer buf = new StringBuffer();
+
+        buf.append(str.substring(0,i));
+
+        float[] widths2 = new float[widths.length+1];
+        System.arraycopy(widths, 0, widths2, 0, i);
+
+        if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
+                || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
+                || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
+                || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
+            buf.append(diacritic.getCharacter());
+            widths2[i] = 0;
+            buf.append(str.charAt(i));
+            widths2[i+1] = widths[i];
+        } 
+        else {
+            buf.append(str.charAt(i));
+            widths2[i] = widths[i];
+            buf.append(diacritic.getCharacter());
+            widths2[i+1] = 0;
+        }
+
+        // Get the rest of the string
+        buf.append(str.substring(i+1, str.length()));
+        System.arraycopy(widths, i+1, widths2, i+2, widths.length-i-1);
+
+        str = buf.toString();
+        widths = widths2;
+    }
+
+    /**
+     * Checks for a lone non-spacing mark, modifier symbol, or modifier letter.
+     * @return True if the current character is a diacritic char.
+     */
+    public boolean isDiacritic() {
+        String cText = this.getCharacter();
+        return (cText.length() == 1 &&  (Character.getType(cText.charAt(0)) == Character.NON_SPACING_MARK
+                || Character.getType(cText.charAt(0)) == Character.MODIFIER_SYMBOL
+                || Character.getType(cText.charAt(0)) == Character.MODIFIER_LETTER));
+    }
 }

Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/pdfbox/trunk/test/input/cweb.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.