You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 15:23:16 UTC

[tika] 02/02: TIKA-3091 prevent npe in PDFParserConfig by initializing three parameters with default values.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit f51ae0aef10a052308d6830fcefad77347bc5ebd
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 11:09:52 2020 -0400

    TIKA-3091 prevent npe in PDFParserConfig by initializing
    three parameters with default values.
    
    # Conflicts:
    #	tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 11 +++++++++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 28 ++++++++++++++++++++--
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 10 ++++++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 2e637e0..6d8b5b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -524,6 +524,17 @@ public class PDFParser extends AbstractParser implements Initializable {
     }
 
     @Field
+    void setAverageCharTolerance(float averageCharTolerance) {
+        defaultConfig.setAverageCharTolerance(averageCharTolerance);
+    }
+
+    @Field
+    void setSpacingTolerance(float spacingTolerance) {
+        defaultConfig.setSpacingTolerance(spacingTolerance);
+    }
+
+
+    @Field
     void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
         defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b5d6824..da8b309 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,10 +114,16 @@ public class PDFParserConfig implements Serializable {
     private boolean extractMarkedContent = false;
 
     //The character width-based tolerance value used to estimate where spaces in text should be added
-    private Float averageCharTolerance;
+    //Default taken from PDFBox.
+    private Float averageCharTolerance = 0.5f;
 
     //The space width-based tolerance value used to estimate where spaces in text should be added
-    private Float spacingTolerance;
+    //Default taken from PDFBox.
+    private Float spacingTolerance = 0.3f;
+
+    // The multiplication factor for line height to decide when a new paragraph starts.
+    //Default taken from PDFBox.
+    private Float dropThreshold = 2.5f;
 
     //If the PDF has an XFA element, process only that and skip extracting
     //content from elsewhere in the document.
@@ -238,6 +244,10 @@ public class PDFParserConfig implements Serializable {
 
         setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
 
+        setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
+        setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
+        setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
+
         boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
         boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
 
@@ -287,6 +297,9 @@ public class PDFParserConfig implements Serializable {
         if (getSpacingTolerance() != null) {
             pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
         }
+        if (getDropThreshold() != null) {
+            pdf2XHTML.setDropThreshold(dropThreshold);
+        }
         pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
     }
 
@@ -513,6 +526,14 @@ public class PDFParserConfig implements Serializable {
         this.spacingTolerance = spacingTolerance;
     }
 
+    public Float getDropThreshold() {
+        return dropThreshold;
+    }
+
+    public void setDropThreshold(float dropThreshold) {
+        this.dropThreshold = dropThreshold;
+    }
+
     public AccessChecker getAccessChecker() {
         return accessChecker;
     }
@@ -824,6 +845,7 @@ public class PDFParserConfig implements Serializable {
         if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false;
         if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
         if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+        if (!getDropThreshold().equals(config.getDropThreshold())) return false;
         if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
         if (getOcrImageType() != config.getOcrImageType()) return false;
         if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
@@ -844,6 +866,7 @@ public class PDFParserConfig implements Serializable {
         result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
         result = 31 * result + getAverageCharTolerance().hashCode();
         result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + getDropThreshold().hashCode();
         result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
         result = 31 * result + ocrStrategy.hashCode();
         result = 31 * result + getOcrDPI();
@@ -869,6 +892,7 @@ public class PDFParserConfig implements Serializable {
                 ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
                 ", averageCharTolerance=" + averageCharTolerance +
                 ", spacingTolerance=" + spacingTolerance +
+                ", dropThreshold=" + dropThreshold +
                 ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
                 ", ocrStrategy=" + ocrStrategy +
                 ", ocrDPI=" + ocrDPI +
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index edcd513..4e2e3c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1528,6 +1528,16 @@ public class PDFParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testNPEInPDFParserConfig() {
+        //TIKA-3091
+        PDFParserConfig config = new PDFParserConfig();
+        //don't care about values; want to make sure no NPE is thrown
+        String txt = config.toString();
+        config.hashCode();
+        config.equals(new PDFParserConfig());
+    }
+
     @Test //TIKA-3041
     @Ignore("turn back on if we add file from PDFBOX-52")
     public void testPDFBox52() throws Exception {