You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2010/12/02 18:27:45 UTC

svn commit: r1041477 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Author: lehmi
Date: Thu Dec  2 17:27:45 2010
New Revision: 1041477

URL: http://svn.apache.org/viewvc?rev=1041477&view=rev
Log:
PDFBOX-521: improved max height calculation as proposed by Mel Martinez

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1041477&r1=1041476&r2=1041477&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Thu Dec  2 17:27:45 2010
@@ -42,6 +42,7 @@ import org.apache.pdfbox.pdmodel.common.
 import org.apache.pdfbox.pdmodel.common.PDStream;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
+import org.apache.pdfbox.util.TextPosition;
 
 
 /**
@@ -727,7 +728,7 @@ public class PDFTextStripper extends PDF
                         writeLine(normalize(line,isRtlDominant,hasRtl),isRtlDominant);
                         line.clear();
 
-                        lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition);
+                        lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
 
                         endOfLastTextX = ENDOFLASTTEXTX_RESET_VALUE;
                         expectedStartOfNextWordX = EXPECTEDSTARTOFNEXTWORDX_RESET_VALUE;
@@ -1561,13 +1562,14 @@ public class PDFTextStripper extends PDF
      * @param lastPosition the previous text position
      * @param lastLineStartPosition the last text position that followed a line
      *        separator.
+     * @param maxHeightForLine max height for positions since lastLineStartPosition
      * @throws IOException
      */
     protected PositionWrapper handleLineSeparation(PositionWrapper current,
-            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition)
+            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine)
             throws IOException {
         current.setLineStart();
-        isParagraphSeparation(current, lastPosition, lastLineStartPosition);
+        isParagraphSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
         lastLineStartPosition = current;
         if (current.isParagraphStart())  {
             if(lastPosition.isArticleStart()) {
@@ -1605,9 +1607,10 @@ public class PDFTextStripper extends PDF
      * @param lastPosition the previous text position (should not be null).
      * @param lastLineStartPosition the last text position that followed a line
      *            separator. May be null.
+     * @param maxHeightForLine max height for text positions since lastLineStartPosition.
      */
     protected void isParagraphSeparation(PositionWrapper position,  
-            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition){
+            PositionWrapper lastPosition, PositionWrapper lastLineStartPosition, float maxHeightForLine){
         boolean result = false;
         if(lastLineStartPosition == null) {
             result = true;
@@ -1616,7 +1619,7 @@ public class PDFTextStripper extends PDF
                     lastPosition.getTextPosition().getYDirAdj());
             float xGap = (position.getTextPosition().getXDirAdj()-
                     lastLineStartPosition.getTextPosition().getXDirAdj());//do we need to flip this for rtl?
-            if(yGap > (getDropThreshold()*position.getTextPosition().getHeightDir())){
+            if(yGap > (getDropThreshold()*maxHeightForLine)){
                         result = true;
             }else if(xGap > (getIndentThreshold()*position.getTextPosition().getWidthOfSpace())){
                 //text is indented, but try to screen for hanging indent