You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 15:10:08 UTC

[tika] branch master updated: TIKA-3091 prevent NPE in PDFParserConfig by initializing three parameters with default values.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d811de9  TIKA-3091 prevent NPE in PDFParserConfig by initializing three parameters with default values.
d811de9 is described below

commit d811de93fe1d1f356e58708a06b300017a7a2f51
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 11:09:52 2020 -0400

    TIKA-3091 prevent NPE in PDFParserConfig by initializing
    three parameters with default values.
---
 .../src/main/java/org/apache/tika/parser/pdf/PDFParser.java | 11 +++++++++++
 .../java/org/apache/tika/parser/pdf/PDFParserConfig.java    | 13 ++++++++++---
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index ac57724..ec80dc4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -504,6 +504,17 @@ public class PDFParser extends AbstractParser implements Initializable {
     }
 
     @Field
+    void setAverageCharTolerance(float averageCharTolerance) {
+        defaultConfig.setAverageCharTolerance(averageCharTolerance);
+    }
+
+    @Field
+    void setSpacingTolerance(float spacingTolerance) {
+        defaultConfig.setSpacingTolerance(spacingTolerance);
+    }
+
+
+    @Field
     void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
         defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 178a5f8..fc8bea6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,13 +114,16 @@ public class PDFParserConfig implements Serializable {
     private boolean extractMarkedContent = false;
 
     //The character width-based tolerance value used to estimate where spaces in text should be added
-    private Float averageCharTolerance;
+    //Default taken from PDFBox.
+    private Float averageCharTolerance = 0.5f;
 
     //The space width-based tolerance value used to estimate where spaces in text should be added
-    private Float spacingTolerance;
+    //Default taken from PDFBox.
+    private Float spacingTolerance = 0.3f;
 
     // The multiplication factor for line height to decide when a new paragraph starts.
-    private float dropThreshold;
+    //Default taken from PDFBox.
+    private float dropThreshold = 2.5f;
 
     //If the PDF has an XFA element, process only that and skip extracting
     //content from elsewhere in the document.
@@ -235,6 +238,10 @@ public class PDFParserConfig implements Serializable {
 
         setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
 
+        setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
+        setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
+        setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
+
         boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
         boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index e771ff7..08ef5f7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1512,6 +1512,16 @@ public class PDFParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testNPEInPDFParserConfig() {
+        //TIKA-3091
+        PDFParserConfig config = new PDFParserConfig();
+        //don't care about values; want to make sure no NPE is thrown
+        String txt = config.toString();
+        config.hashCode();
+        config.equals(new PDFParserConfig());
+    }
+
     @Test //TIKA-3041
     @Ignore("turn back on if we add file from PDFBOX-52")
     public void testPDFBox52() throws Exception {