You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ca...@apache.org on 2009/03/31 20:06:12 UTC
svn commit: r760554 - in /incubator/pdfbox/trunk:
src/main/java/org/apache/pdfbox/util/ test/input/
Author: carrier
Date: Tue Mar 31 18:06:11 2009
New Revision: 760554
URL: http://svn.apache.org/viewvc?rev=760554&view=rev
Log:
Patch for PDFBOX-444 by Justin LeFebvre and myself to fix errors merging in diacritics during text extraction. Also includes updated regression test files and some code formatting changes
Modified:
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt
incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt
incubator/pdfbox/trunk/test/input/cweb.pdf.txt
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Tue Mar 31 18:06:11 2009
@@ -102,7 +102,7 @@
* and to correct the direction of right to left text, such as Arabic and Hebrew.
*/
private TextNormalize normalize = null;
-
+
/**
* Instantiate a new PDFTextStripper object. This object will load properties from
* Resources/PDFTextStripper.properties.
@@ -231,8 +231,8 @@
}
if( startBookmarkPageNumber == -1 && startBookmark != null &&
- endBookmarkPageNumber == -1 && endBookmark != null &&
- startBookmark.getCOSObject() == endBookmark.getCOSObject() )
+ endBookmarkPageNumber == -1 && endBookmark != null &&
+ startBookmark.getCOSObject() == endBookmark.getCOSObject() )
{
//this is a special case where both the start and end bookmark
//are the same but point to nothing. In this case
@@ -302,8 +302,8 @@
protected void processPage( PDPage page, COSStream content ) throws IOException
{
if( currentPageNo >= startPage && currentPageNo <= endPage &&
- (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
- (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
+ (startBookmarkPageNumber == -1 || currentPageNo >= startBookmarkPageNumber ) &&
+ (endBookmarkPageNumber == -1 || currentPageNo <= endBookmarkPageNumber ))
{
startPage( page );
pageArticles = page.getThreadBeads();
@@ -347,7 +347,7 @@
{
startArticle(true);
}
-
+
/**
* Start a new article, which is typically defined as a column
* on a single page (also referred to as a bead).
@@ -417,11 +417,11 @@
float lastWordSpacing = -1;
float maxHeightForLine = -1;
TextPosition lastPosition = null;
-
+
if (normalize == null) {
normalize = new TextNormalize();
}
-
+
for( int i = 0; i < charactersByArticle.size(); i++)
{
List textList = (List)charactersByArticle.get( i );
@@ -432,7 +432,7 @@
}
Iterator textIter = textList.iterator();
-
+
/* Before we can display the text, we need to do some normalizing.
* Arabic and Hebrew text is right to left and is typically stored
* in its logical format, which means that the rightmost character is
@@ -477,9 +477,9 @@
if (rtlCnt > ltrCnt) {
isRtlDominant = true;
}
-
+
startArticle(!isRtlDominant);
-
+
// we will later use this to skip reordering
boolean hasRtl = false;
if (rtlCnt > 0)
@@ -489,7 +489,7 @@
* We queue up a line at a time before we print so that we can convert
* the line from presentation form to logical form (if needed). */
String lineStr = "";
-
+
textIter = textList.iterator(); // start from the beginning again
while( textIter.hasNext() )
{
@@ -504,16 +504,16 @@
/* If we are sorting, then we need to use the text direction
* adjusted coordinates, because they were used in the sorting. */
if (sortByPosition) {
- positionX = position.getXDirAdj();
- positionY = position.getYDirAdj();
- positionWidth = position.getWidthDirAdj();
- positionHeight = position.getHeightDir();
+ positionX = position.getXDirAdj();
+ positionY = position.getYDirAdj();
+ positionWidth = position.getWidthDirAdj();
+ positionHeight = position.getHeightDir();
}
else {
- positionX = position.getX();
- positionY = position.getY();
- positionWidth = position.getWidth();
- positionHeight = position.getHeight();
+ positionX = position.getX();
+ positionY = position.getY();
+ positionWidth = position.getWidth();
+ positionHeight = position.getHeight();
}
//try to get width of a space character
@@ -579,13 +579,13 @@
lastPosition.getCharacter() != null &&
!lastPosition.getCharacter().endsWith( " " ) )
{
- lineStr += getWordSeparator();
- }
+ lineStr += getWordSeparator();
+ }
}
if (positionY >= maxYForLine) {
- maxYForLine = positionY;
+ maxYForLine = positionY;
}
// RDD - endX is what PDF considers to be the x coordinate of the
@@ -602,7 +602,7 @@
lastPosition = position;
lastWordSpacing = wordSpacing;
}
-
+
// print the final line
if (lineStr.length() > 0) {
if (hasRtl)
@@ -613,7 +613,7 @@
writeString(lineStr);
}
-
+
endArticle();
}
@@ -623,7 +623,7 @@
private boolean overlap( float y1, float height1, float y2, float height2 )
{
return within( y1, y2, .1f) || (y2 <= y1 && y2 >= y1-height1) ||
- (y1 <= y2 && y1 >= y2-height2);
+ (y1 <= y2 && y1 >= y2-height2);
}
/**
@@ -638,7 +638,7 @@
output.write(getPageSeparator());
output.flush();
}
-
+
/**
* Write the line separator value to the output stream
* @throws IOException
@@ -647,7 +647,7 @@
{
output.write(getLineSeparator());
}
-
+
/**
* Write the word separator value to the output stream
@@ -668,7 +668,7 @@
{
output.write( text.getCharacter() );
}
-
+
/**
* Write a Java string to the output stream.
*
@@ -691,8 +691,8 @@
{
return second > first - variance && second < first + variance;
}
-
-
+
+
/**
* This will process a TextPosition object and add the
* text to the list of characters on a page. It takes care of
@@ -738,11 +738,11 @@
//only want to suppress
if( charCharacter != null &&
- //charCharacter.equals( textCharacter ) &&
- within( charX, textX, tolerance ) &&
- within( charY,
- textY,
- tolerance ) )
+ //charCharacter.equals( textCharacter ) &&
+ within( charX, textX, tolerance ) &&
+ within( charY,
+ textY,
+ tolerance ) )
{
suppressCharacter = true;
}
@@ -777,8 +777,8 @@
foundArticleDivisionIndex = i*2+1;
}
else if( (x < rect.getLowerLeftX() ||
- y < rect.getUpperRightY()) &&
- notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
+ y < rect.getUpperRightY()) &&
+ notFoundButFirstLeftAndAboveArticleDivisionIndex == -1)
{
notFoundButFirstLeftAndAboveArticleDivisionIndex = i*2;
}
@@ -824,38 +824,38 @@
{
articleDivisionIndex = charactersByArticle.size()-1;
}
-
+
List textList = (List) charactersByArticle.get( articleDivisionIndex );
-
-
-
- /* In the wild, some PDF encoded documents put diacritics (accents on
- * top of characters) into a separate Tj element. When displaying them
- * graphically, the two chunks get overlayed. With text output though,
- * we need to do the overlay. This code recombines the diacritic with
- * its associated character.
+
+ /* In the wild, some PDF encoded documents put diacritics (accents on
+ * top of characters) into a separate Tj element. When displaying them
+ * graphically, the two chunks get overlaid. With text output though,
+ * we need to do the overlay. This code recombines the diacritic with
+ * its associated character if the two are consecutive.
*/
- // First, do we even care. The assumption is we do IFF
- // we have a single diacritic.
- boolean wasMerged = false;
- String cText = text.getCharacter();
- if (cText.length() == 1 && Character.getType(cText.charAt(0)) == Character.NON_SPACING_MARK) {
+ if(textList.isEmpty()){
+ textList.add(text);
+ }
+ else{
/* test if we overlap the previous entry.
* Note that we are making an assumption that we need to only look back
* one TextPosition to find what we are overlapping.
* This may not always be true. */
- if(!textList.isEmpty()){
- TextPosition previous = (TextPosition)textList.get(textList.size()-1);
- if ((previous != null) && previous.contains(text)) {
- previous.mergeDiacritic(text);
- wasMerged = true;
- }
+ TextPosition previousTextPosition = (TextPosition)textList.get(textList.size()-1);
+ if(text.isDiacritic() && previousTextPosition.contains(text)){
+ previousTextPosition.mergeDiacritic(text);
+ }
+ /* If the previous TextPosition was the diacritic, merge it into this
+ * one and remove it from the list. */
+ else if(previousTextPosition.isDiacritic() && text.contains(previousTextPosition)){
+ text.mergeDiacritic(previousTextPosition);
+ textList.remove(textList.size()-1);
+ textList.add(text);
+ }
+ else{
+ textList.add(text);
}
}
-
- // if we could not merge with the previous entry, add it to the list
- if (wasMerged == false)
- textList.add(text);
}
}
Modified: incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
--- incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java (original)
+++ incubator/pdfbox/trunk/src/main/java/org/apache/pdfbox/util/TextPosition.java Tue Mar 31 18:06:11 2009
@@ -27,24 +27,24 @@
*/
public class TextPosition
{
- /* TextMatrix for the start of the text object. Coordinates
- * are in display units and have not been adjusted. */
- private Matrix textPos;
-
- // ending X and Y coordinates in display units
- private float endX;
- private float endY;
-
- private float maxTextHeight; // maximum height of text, in display units
- private int rot; // 0, 90, 180, 270 degrees of page rotation
- private float pageHeight;
- private float pageWidth;
+ /* TextMatrix for the start of the text object. Coordinates
+ * are in display units and have not been adjusted. */
+ private Matrix textPos;
+
+ // ending X and Y coordinates in display units
+ private float endX;
+ private float endY;
+
+ private float maxTextHeight; // maximum height of text, in display units
+ private int rot; // 0, 90, 180, 270 degrees of page rotation
+ private float pageHeight;
+ private float pageWidth;
private float[] widths;
private float widthOfSpace; // width of a space, in display units
private String str;
private PDFont font;
private float fontSize;
- private float wordSpacing; // word spacing value, in display units
+ private float wordSpacing; // word spacing value, in display units
protected TextPosition()
{
@@ -66,32 +66,32 @@
* @param ws The word spacing parameter (in display units)
*/
public TextPosition(
- PDPage page,
- Matrix textPositionSt,
- Matrix textPositionEnd,
- float maxFontH,
- float[] individualWidths,
- float spaceWidth,
- String string,
- PDFont currentFont,
- float fontSizeValue,
- float ws
+ PDPage page,
+ Matrix textPositionSt,
+ Matrix textPositionEnd,
+ float maxFontH,
+ float[] individualWidths,
+ float spaceWidth,
+ String string,
+ PDFont currentFont,
+ float fontSizeValue,
+ float ws
)
{
- this.textPos = textPositionSt;
-
- this.endX = textPositionEnd.getXPosition();
- this.endY = textPositionEnd.getYPosition();
-
- this.rot = page.findRotation();
- // make sure it is 0 to 270 and no negative numbers
- if(this.rot < 0)
- rot += 360;
-
- this.maxTextHeight = maxFontH;
- this.pageHeight = page.findMediaBox().getHeight();
- this.pageWidth = page.findMediaBox().getWidth();
-
+ this.textPos = textPositionSt;
+
+ this.endX = textPositionEnd.getXPosition();
+ this.endY = textPositionEnd.getYPosition();
+
+ this.rot = page.findRotation();
+ // make sure it is 0 to 270 and no negative numbers
+ if(this.rot < 0)
+ rot += 360;
+
+ this.maxTextHeight = maxFontH;
+ this.pageHeight = page.findMediaBox().getHeight();
+ this.pageWidth = page.findMediaBox().getWidth();
+
this.widths = individualWidths;
this.widthOfSpace = spaceWidth;
this.str = string;
@@ -116,7 +116,7 @@
* @return The Matrix containing all infos of the starting textposition
*/
public Matrix getTextPos() {
- return textPos;
+ return textPos;
}
/**
@@ -125,31 +125,31 @@
* @return The direction of the text (0, 90, 180, or 270)
*/
public float getDir() {
- float a = textPos.getValue(0,0);
- float b = textPos.getValue(0,1);
- float c = textPos.getValue(1,0);
- float d = textPos.getValue(1,1);
-
- // 12 0 left to right
- // 0 12
- if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
- return 0;
- // -12 0 right to left (upside down)
- // 0 -12
- else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
- return 180;
- // 0 12 up
- // -12 0
- else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
- return 90;
- // 0 -12 down
- // 12 0
- else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
- return 270;
-
- return 0;
+ float a = textPos.getValue(0,0);
+ float b = textPos.getValue(0,1);
+ float c = textPos.getValue(1,0);
+ float d = textPos.getValue(1,1);
+
+ // 12 0 left to right
+ // 0 12
+ if ((a > 0) && (Math.abs(b) < d) && (Math.abs(c) < a) && (d > 0))
+ return 0;
+ // -12 0 right to left (upside down)
+ // 0 -12
+ else if ((a < 0) && (Math.abs(b) < Math.abs(d)) && (Math.abs(c) < Math.abs(a)) && (d < 0))
+ return 180;
+ // 0 12 up
+ // -12 0
+ else if ((Math.abs(a) < Math.abs(c)) && (b > 0) && (c < 0) && (Math.abs(d) < b))
+ return 90;
+ // 0 -12 down
+ // 12 0
+ else if ((Math.abs(a) < c) && (b < 0) && (c > 0) && (Math.abs(d) < Math.abs(b)))
+ return 270;
+
+ return 0;
}
-
+
/**
* Return the X starting coordinate of the text, adjusted by
* the given rotation amount. The rotation adjusts where the 0,0
@@ -160,18 +160,18 @@
*/
private float getX_rot(float a_rot)
{
- if (a_rot == 0)
- return textPos.getValue(2,0);
- else if (a_rot == 90)
- return textPos.getValue(2,1);
- else if (a_rot == 180)
- return pageWidth - textPos.getValue(2,0);
- else if (a_rot == 270)
- return pageHeight - textPos.getValue(2,1);
- else
- return 0;
+ if (a_rot == 0)
+ return textPos.getValue(2,0);
+ else if (a_rot == 90)
+ return textPos.getValue(2,1);
+ else if (a_rot == 180)
+ return pageWidth - textPos.getValue(2,0);
+ else if (a_rot == 270)
+ return pageHeight - textPos.getValue(2,1);
+ else
+ return 0;
}
-
+
/**
* This will get the page rotation adjusted x position of the character.
* This is adjusted based on page rotation so that the upper
@@ -181,9 +181,9 @@
*/
public float getX()
{
- return getX_rot(rot);
+ return getX_rot(rot);
}
-
+
/**
* This will get the text direction adjusted x position of the character.
* This is adjusted based on text direction so that the first character
@@ -192,7 +192,7 @@
* @return The x coordinate of the text.
*/
public float getXDirAdj() {
- return getX_rot(getDir());
+ return getX_rot(getDir());
}
/**
@@ -204,18 +204,18 @@
*/
private float getY_ll_rot(float a_rot)
{
- if (a_rot == 0)
- return textPos.getValue(2,1);
- else if (a_rot == 90)
- return pageWidth - textPos.getValue(2,0);
- else if (a_rot == 180)
- return pageHeight - textPos.getValue(2,1);
- else if (a_rot == 270)
- return textPos.getValue(2,0);
- else
- return 0;
+ if (a_rot == 0)
+ return textPos.getValue(2,1);
+ else if (a_rot == 90)
+ return pageWidth - textPos.getValue(2,0);
+ else if (a_rot == 180)
+ return pageHeight - textPos.getValue(2,1);
+ else if (a_rot == 270)
+ return textPos.getValue(2,0);
+ else
+ return 0;
}
-
+
/**
* This will get the y position of the text, adjusted so that 0,0 is upper left and
* it is adjusted based on the page rotation.
@@ -224,12 +224,12 @@
*/
public float getY()
{
- if ((rot == 0) || (rot == 180))
- return pageHeight - getY_ll_rot(rot);
- else
- return pageWidth - getY_ll_rot(rot);
+ if ((rot == 0) || (rot == 180))
+ return pageHeight - getY_ll_rot(rot);
+ else
+ return pageWidth - getY_ll_rot(rot);
}
-
+
/**
* This will get the y position of the text, adjusted so that 0,0 is upper left and
* it is adjusted based on the text direction.
@@ -238,16 +238,16 @@
*/
public float getYDirAdj()
{
- float dir = getDir();
- // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
- if ((dir == 0) || (dir == 180))
- return pageHeight - getY_ll_rot(dir);
- else
- return pageWidth - getY_ll_rot(dir);
+ float dir = getDir();
+ // some PDFBox code assumes that the 0,0 point is in upper left, not lower left
+ if ((dir == 0) || (dir == 180))
+ return pageHeight - getY_ll_rot(dir);
+ else
+ return pageWidth - getY_ll_rot(dir);
}
-
+
/**
* Get the length or width of the text, based on a given rotation.
*
@@ -256,30 +256,30 @@
*/
private float getWidth_rot(float a_rot)
{
- if ((a_rot == 90) || (a_rot == 270)) {
- return Math.abs(endY - textPos.getYPosition());
- }
- else {
- return Math.abs(endX - textPos.getXPosition());
- }
+ if ((a_rot == 90) || (a_rot == 270)) {
+ return Math.abs(endY - textPos.getYPosition());
+ }
+ else {
+ return Math.abs(endX - textPos.getXPosition());
+ }
}
-
+
/**
* This will get the width of the string when page rotation adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidth() {
- return getWidth_rot(rot);
+ return getWidth_rot(rot);
}
-
+
/**
* This will get the width of the string when text direction adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidthDirAdj() {
- return getWidth_rot(getDir());
+ return getWidth_rot(getDir());
}
/**
@@ -288,17 +288,17 @@
* @return The maximum height of all characters in this string.
*/
public float getHeight() {
- return maxTextHeight;
+ return maxTextHeight;
}
-
+
/**
* This will get the maximum height of all characters in this string.
*
* @return The maximum height of all characters in this string.
*/
public float getHeightDir() {
- // this is not really a rotation-dependent calculation, but this is defined for symmetry.
- return maxTextHeight;
+ // this is not really a rotation-dependent calculation, but this is defined for symmetry.
+ return maxTextHeight;
}
/**
@@ -390,24 +390,44 @@
*/
public boolean contains( TextPosition tp2)
{
- // get the center of the rectangle being tested
- double xcenter = tp2.getXDirAdj() + tp2.getWidthDirAdj()/2.0;
- double ydelta = tp2.getHeightDir()/2.0;
- double ycenter = tp2.getYDirAdj() + ydelta;
-
- // If the x-coordinate of tp2's center is within this obj's x-coordinates
- // and the y-coordinate of tp2's center is in this obj's rectangle expanded
- // by ydelta, then at least 50% (with respect to the x-direction) of tp2
- // is within this obj
- if ( (xcenter > getXDirAdj()) &&
- (xcenter < getXDirAdj() + getWidthDirAdj()) &&
- (ycenter > getYDirAdj() - ydelta) &&
- (ycenter < getYDirAdj() + getHeightDir() + ydelta))
- return true;
- else
+ double thisXstart = getXDirAdj();
+ double thisXend = getXDirAdj() + getWidthDirAdj();
+
+ double tp2Xstart = tp2.getXDirAdj();
+ double tp2Xend = tp2.getXDirAdj() + tp2.getWidthDirAdj();
+
+ /*
+ * No X overlap at all so return as soon as possible.
+ */
+ if(tp2Xend <= thisXstart || tp2Xstart >= thisXend){
return false;
+ }
+ /*
+ * No Y overlap at all so return as soon as possible.
+ * Note: 0,0 is in the upper left and y-coordinate is
+ * top of TextPosition
+ */
+ if((tp2.getYDirAdj() + tp2.getHeightDir() < getYDirAdj()) ||
+ (tp2.getYDirAdj() > getYDirAdj() + getHeightDir())){
+ return false;
+ }
+ /* We're going to calculate the percentage of overlap. If it's less
+ * than a 15% x-coordinate overlap then we'll return false because it's negligible.
+ * .15 was determined by trial and error in the regression test files.
+ */
+ else if((tp2Xstart > thisXstart) && (tp2Xend > thisXend)){
+ double overlap = thisXend - tp2Xstart;
+ double overlapPercent = overlap/getWidthDirAdj();
+ return (overlapPercent > .15);
+ }
+ else if((tp2Xstart < thisXstart) && (tp2Xend < thisXend)){
+ double overlap = tp2Xend - thisXstart;
+ double overlapPercent = overlap/getWidthDirAdj();
+ return (overlapPercent > .15);
+ }
+ return true;
}
-
+
/**
* Merge a single character TextPosition into the current object.
* This is to be used only for cases where we have a diacritic that
@@ -421,53 +441,122 @@
{
if (diacritic.getCharacter().length() > 1)
return;
-
- float xdiac = diacritic.getXDirAdj() + diacritic.getWidthDirAdj()/2;
- float xcurr = getXDirAdj();
-
- int lastChIx = str.length();
- for (int i = 0; i < lastChIx; i++) {
-
- // The diacritic modifies this character.
- if (xdiac >= xcurr && xdiac <= (xcurr + widths[i])) {
- StringBuffer buf = new StringBuffer();
-
- buf.append(str.substring(0,i));
-
- float[] widths2 = new float[widths.length+1];
- System.arraycopy(widths, 0, widths2, 0, i);
-
- /* we add the diacritic to the right or left of the character
- * depending on the direction of the character. Note that this
- * is only required because the text is currently stored in
- * presentation order and not in logical order.
- */
- int dir = Character.getDirectionality(str.charAt(i));
- if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
- || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
- || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
- || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
- buf.append(diacritic.getCharacter());
- widths2[i] = 0;
- buf.append(str.charAt(i));
- widths2[i+1] = widths[i];
- }
- else {
- buf.append(str.charAt(i));
- widths2[i] = widths[i];
- buf.append(diacritic.getCharacter());
- widths2[i+1] = 0;
+
+ float diacXStart = diacritic.getXDirAdj();
+ float diacXEnd = diacXStart + diacritic.widths[0];
+
+ float currCharXStart = getXDirAdj();
+
+ int strLen = str.length();
+ boolean wasAdded = false;
+
+ for (int i = 0; i < strLen && wasAdded == false; i++) {
+
+ float currCharXEnd = currCharXStart + widths[i];
+
+ /*
+ * This is the case where there is an overlap of the diacritic character with
+ * the current character and the previous character. If no previous character,
+ * just append the diacritic after the current one.
+ */
+ if(diacXStart < currCharXStart && diacXEnd <= currCharXEnd){
+ if(i == 0){
+ insertDiacritic(i, diacritic);
+ }
+ else{
+ float distanceOverlapping1 = diacXEnd - currCharXStart;
+ float percentage1 = distanceOverlapping1/widths[i];
+
+ float distanceOverlapping2 = currCharXStart - diacXStart;
+ float percentage2 = distanceOverlapping2/widths[i-1];
+
+ if(percentage1 >= percentage2){
+ insertDiacritic(i, diacritic);
+ }
+ else{
+ insertDiacritic(i-1, diacritic);
+ }
}
-
- // Get the rest of the string
- buf.append(str.substring(i+1, lastChIx));
- System.arraycopy(widths, i+1, widths2, i+2, widths.length-i-1);
-
- str = buf.toString();
- widths = widths2;
- break;
+ wasAdded = true;
+ }
+ //diacritic completely covers this character and therefore we assume that
+ //this is the character the diacritic belongs to
+ else if(diacXStart < currCharXStart && diacXEnd > currCharXEnd){
+ insertDiacritic(i, diacritic);
+ wasAdded = true;
+ }
+ //Otherwise, the diacritic modifies this character because it's completely
+ //contained by the character width
+ else if(diacXStart >= currCharXStart && diacXEnd <= currCharXEnd) {
+ insertDiacritic(i, diacritic);
+ wasAdded = true;
}
- xcurr += widths[i];
+ /*
+ * Last character in the TextPosition so we add diacritic to the end
+ */
+ else if(diacXStart >= currCharXStart && diacXEnd > currCharXEnd && i == (strLen - 1)){
+ insertDiacritic(i, diacritic);
+ wasAdded = true;
+ }
+ /*
+ * Couldn't find anything useful so we go to the next character in the
+ * TextPosition
+ */
+ currCharXStart += widths[i];
}
}
+ /**
+ * Inserts the diacritic TextPosition to the str of this TextPosition
+ * and updates the widths array to include the extra character width.
+ * @param i current character
+ * @param diacritic The diacritic TextPosition
+ */
+ private void insertDiacritic(int i, TextPosition diacritic){
+ /* we add the diacritic to the right or left of the character
+ * depending on the direction of the character. Note that this
+ * is only required because the text is currently stored in
+ * presentation order and not in logical order.
+ */
+ int dir = Character.getDirectionality(str.charAt(i));
+ StringBuffer buf = new StringBuffer();
+
+ buf.append(str.substring(0,i));
+
+ float[] widths2 = new float[widths.length+1];
+ System.arraycopy(widths, 0, widths2, 0, i);
+
+ if ((dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT)
+ || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC)
+ || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING)
+ || (dir == Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE)) {
+ buf.append(diacritic.getCharacter());
+ widths2[i] = 0;
+ buf.append(str.charAt(i));
+ widths2[i+1] = widths[i];
+ }
+ else {
+ buf.append(str.charAt(i));
+ widths2[i] = widths[i];
+ buf.append(diacritic.getCharacter());
+ widths2[i+1] = 0;
+ }
+
+ // Get the rest of the string
+ buf.append(str.substring(i+1, str.length()));
+ System.arraycopy(widths, i+1, widths2, i+2, widths.length-i-1);
+
+ str = buf.toString();
+ widths = widths2;
+ }
+
+ /**
+ *
+ * @return True if the current character is a diacritic char.
+ */
+ public boolean isDiacritic() {
+ String cText = this.getCharacter();
+ return (cText.length() == 1 && (Character.getType(cText.charAt(0)) == Character.NON_SPACING_MARK
+ || Character.getType(cText.charAt(0)) == Character.MODIFIER_SYMBOL
+ || Character.getType(cText.charAt(0)) == Character.MODIFIER_LETTER));
+ }
}
Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf-sorted.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/Garcia2004_thesis.pdf.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf-sorted.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.
Modified: incubator/pdfbox/trunk/test/input/cweb.pdf.txt
URL: http://svn.apache.org/viewvc/incubator/pdfbox/trunk/test/input/cweb.pdf.txt?rev=760554&r1=760553&r2=760554&view=diff
==============================================================================
Binary files - no diff available.