You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/12/02 18:27:45 UTC
svn commit: r1041477 -
/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Author: lehmi
Date: Thu Dec 2 17:27:45 2010
New Revision: 1041477
URL: http://svn.apache.org/viewvc?rev=1041477&view=rev
Log:
PDFBOX-521: improved max height calculation as proposed by Mel Martinez
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1041477&r1=1041476&r2=1041477&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Thu Dec 2 17:27:45 2010
@@ -42,6 +42,7 @@ import org.apache.pdfbox.pdmodel.common.
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
+import org.apache.pdfbox.util.TextPosition;
/**
@@ -727,7 +728,7 @@ public class PDFTextStripper extends PDF
writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
line.clear();
- lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition);
+ lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
@@ -1561,13 +1562,14 @@ public class PDFTextStripper extends PDF
* @param lastPosition the previous text position
* @param lastLineStartPosition the last text position that followed a line
* separator.
+ * @param maxHeightForLine max height for positions since lastLineStartPosition
* @throws IOException
*/
protected PositionWrapper handleLineSeparation(PositionWrapper current,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
throws IOException {
current.setLineStart();
- isParagraphSeparation(current, lastPosition, lastLineStartPosition);
+ isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
lastLineStartPosition = current;
if (current.isParagraphStart()) {
if(lastPosition.isArticleStart()) {
@@ -1605,9 +1607,10 @@ public class PDFTextStripper extends PDF
* @param lastPosition the previous text position (should not be null).
* @param lastLineStartPosition the last text position that followed a line
* separator. May be null.
+ * @param maxHeightForLine max height for text positions since lastLineStartPosition.
*/
protected void isParagraphSeparation(PositionWrapper position,
- PositionWrapper lastPosition, PositionWrapper lastLineStartPosition){
+ PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine){
boolean result = false;
if(lastLineStartPosition == null) {
result = true;
@@ -1616,7 +1619,7 @@ public class PDFTextStripper extends PDF
lastPosition.getTextPosition().getYDirAdj());
float xGap = (position.getTextPosition().getXDirAdj()-
lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
- if(yGap > (getDropThreshold()*position.getTextPosition().getHeightDir())){
+ if(yGap > (getDropThreshold()*maxHeightForLine)){
result = true;
}else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace())){
//text is indented, but try to screen for hanging indent