You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2020/11/07 13:25:59 UTC

svn commit: r1883174 - in /pdfbox/branches/2.0/pdfbox/src: main/java/org/apache/pdfbox/text/PDFTextStripper.java test/resources/input/eu-001.pdf-sorted.txt test/resources/input/eu-001.pdf.txt

Author: tilman
Date: Sat Nov  7 13:25:58 2020
New Revision: 1883174

URL: http://svn.apache.org/viewvc?rev=1883174&view=rev
Log:
PDFBOX-5002: avoid merged text when a big font is followed by at least two lines of text in a smaller font, as suggested by Thierry Guérin

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
    pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf-sorted.txt
    pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf.txt

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java?rev=1883174&r1=1883173&r2=1883174&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFTextStripper.java Sat Nov  7 13:25:58 2020
@@ -686,6 +686,16 @@ public class PDFTextStripper extends Leg
                     {
                         line.add(LineItem.getWordSeparator());
                     }
+                    // if there is at least the equivalent of one space
+                    // between the last character and the current one,
+                    // reset the max line height as the font size may have completely changed
+                    if (Math.abs(position.getX()
+                            - lastPosition.getTextPosition().getX()) > (wordSpacing + deltaSpace))
+                    {
+                        maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
+                        maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
+                        minYTopForLine = MIN_Y_TOP_FOR_LINE_RESET_VALUE;
+                    }
                 }
                 if (positionY >= maxYForLine)
                 {

Modified: pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf-sorted.txt
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf-sorted.txt?rev=1883174&r1=1883173&r2=1883174&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf-sorted.txt (original)
+++ pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf-sorted.txt Sat Nov  7 13:25:58 2020
@@ -124,10 +124,12 @@ Other organic substances
  to air to water to land 
 kg/year kg/year kg/year 
 Anthracene 50 1 1 
-Benzene 1 000 200 (as 200 (as BTEX) BTEX) 
+Benzene 1 000 200 (as 200 (as 
+BTEX) BTEX) 
 Benzo(g,h,i)perylene - 1 - 
 Di-(2-ethyl hexyl) phthalate (DEHP) 10 1 1 
-Ethyl benzene - 200 (as 200 (as BTEX) BTEX) 
+Ethyl benzene - 200 (as 200 (as 
+BTEX) BTEX) 
 Ethylene oxide 1 000 10 10 
 Fluoranthene - 1 - 
 Naphthalene 100 10 10 
@@ -137,10 +139,12 @@ Octylphenols and octylphenol ethoxylates
 Organotin compounds (as total Sn) - 50 50 
 Phenols (as total C) - 20 20 
 Polycyclic Aromatic hydrocarbons (PAHs) 50 5 5 
-Toluene - 200 (as 200 (as BTEX) BTEX) 
+Toluene - 200 (as 200 (as 
+BTEX) BTEX) 
 Total Organic Carbon (TOC) (as total C or 
 COD/3) - 50 000 - 
-Xylenes - 200 (as 200 (as BTEX) BTEX) 
+Xylenes - 200 (as 200 (as 
+BTEX) BTEX) 
  
  
 Inorganic substances 

Modified: pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf.txt
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf.txt?rev=1883174&r1=1883173&r2=1883174&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf.txt (original)
+++ pdfbox/branches/2.0/pdfbox/src/test/resources/input/eu-001.pdf.txt Sat Nov  7 13:25:58 2020
@@ -148,12 +148,14 @@ kg/year
 to land 
 kg/year 
 Anthracene 50 1 1 
-Benzene 1 000 200 (as BTEX) 
+Benzene 1 000 200 (as 
+BTEX) 
 200 (as 
 BTEX) 
 Benzo(g,h,i)perylene - 1 - 
 Di-(2-ethyl hexyl) phthalate (DEHP) 10 1 1 
-Ethyl benzene - 200 (as BTEX) 
+Ethyl benzene - 200 (as 
+BTEX) 
 200 (as 
 BTEX) 
 Ethylene oxide 1 000 10 10 
@@ -165,12 +167,14 @@ Octylphenols and octylphenol ethoxylates
 Organotin compounds (as total Sn) - 50 50 
 Phenols (as total C) - 20 20 
 Polycyclic Aromatic hydrocarbons (PAHs) 50 5 5 
-Toluene - 200 (as BTEX) 
+Toluene - 200 (as 
+BTEX) 
 200 (as 
 BTEX) 
 Total Organic Carbon (TOC) (as total C or 
 COD/3) - 50 000 - 
-Xylenes - 200 (as BTEX) 
+Xylenes - 200 (as 
+BTEX) 
 200 (as 
 BTEX)