You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 15:10:08 UTC
[tika] branch master updated: TIKA-3091 prevent npe in
PDFParserConfig by initializing three parameters with default values.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d811de9 TIKA-3091 prevent npe in PDFParserConfig by initializing three parameters with default values.
d811de9 is described below
commit d811de93fe1d1f356e58708a06b300017a7a2f51
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 11:09:52 2020 -0400
TIKA-3091 prevent npe in PDFParserConfig by initializing
three parameters with default values.
---
.../src/main/java/org/apache/tika/parser/pdf/PDFParser.java | 11 +++++++++++
.../java/org/apache/tika/parser/pdf/PDFParserConfig.java | 13 ++++++++++---
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++++++++++
3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index ac57724..ec80dc4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -504,6 +504,17 @@ public class PDFParser extends AbstractParser implements Initializable {
}
@Field
+ void setAverageCharTolerance(float averageCharTolerance) {
+ defaultConfig.setAverageCharTolerance(averageCharTolerance);
+ }
+
+ @Field
+ void setSpacingTolerance(float spacingTolerance) {
+ defaultConfig.setSpacingTolerance(spacingTolerance);
+ }
+
+
+ @Field
void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 178a5f8..fc8bea6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,13 +114,16 @@ public class PDFParserConfig implements Serializable {
private boolean extractMarkedContent = false;
//The character width-based tolerance value used to estimate where spaces in text should be added
- private Float averageCharTolerance;
+ //Default taken from PDFBox.
+ private Float averageCharTolerance = 0.5f;
//The space width-based tolerance value used to estimate where spaces in text should be added
- private Float spacingTolerance;
+ //Default taken from PDFBox.
+ private Float spacingTolerance = 0.3f;
// The multiplication factor for line height to decide when a new paragraph starts.
- private float dropThreshold;
+ //Default taken from PDFBox.
+ private float dropThreshold = 2.5f;
//If the PDF has an XFA element, process only that and skip extracting
//content from elsewhere in the document.
@@ -235,6 +238,10 @@ public class PDFParserConfig implements Serializable {
setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
+ setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
+ setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
+ setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
+
boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index e771ff7..08ef5f7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1512,6 +1512,16 @@ public class PDFParserTest extends TikaTest {
}
+ @Test
+ public void testNPEInPDFParserConfig() {
+ //TIKA-3091
+ PDFParserConfig config = new PDFParserConfig();
+ //don't care about values; want to make sure no NPE is thrown
+ String txt = config.toString();
+ config.hashCode();
+ config.equals(new PDFParserConfig());
+ }
+
@Test //TIKA-3041
@Ignore("turn back on if we add file from PDFBOX-52")
public void testPDFBox52() throws Exception {