You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 15:23:16 UTC
[tika] 02/02: TIKA-3091 prevent NPE in PDFParserConfig by
initializing three parameters with default values.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f51ae0aef10a052308d6830fcefad77347bc5ebd
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 11:09:52 2020 -0400
TIKA-3091 prevent NPE in PDFParserConfig by initializing
three parameters with default values.
# Conflicts:
# tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 11 +++++++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 28 ++++++++++++++++++++--
.../org/apache/tika/parser/pdf/PDFParserTest.java | 10 ++++++++
3 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 2e637e0..6d8b5b1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -524,6 +524,17 @@ public class PDFParser extends AbstractParser implements Initializable {
}
@Field
+ void setAverageCharTolerance(float averageCharTolerance) {
+ defaultConfig.setAverageCharTolerance(averageCharTolerance);
+ }
+
+ @Field
+ void setSpacingTolerance(float spacingTolerance) {
+ defaultConfig.setSpacingTolerance(spacingTolerance);
+ }
+
+
+ @Field
void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b5d6824..da8b309 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -114,10 +114,16 @@ public class PDFParserConfig implements Serializable {
private boolean extractMarkedContent = false;
//The character width-based tolerance value used to estimate where spaces in text should be added
- private Float averageCharTolerance;
+ //Default taken from PDFBox.
+ private Float averageCharTolerance = 0.5f;
//The space width-based tolerance value used to estimate where spaces in text should be added
- private Float spacingTolerance;
+ //Default taken from PDFBox.
+ private Float spacingTolerance = 0.3f;
+
+ // The multiplication factor for line height to decide when a new paragraph starts.
+ //Default taken from PDFBox.
+ private Float dropThreshold = 2.5f;
//If the PDF has an XFA element, process only that and skip extracting
//content from elsewhere in the document.
@@ -238,6 +244,10 @@ public class PDFParserConfig implements Serializable {
setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
+ setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
+ setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
+ setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
+
boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
@@ -287,6 +297,9 @@ public class PDFParserConfig implements Serializable {
if (getSpacingTolerance() != null) {
pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
}
+ if (getDropThreshold() != null) {
+ pdf2XHTML.setDropThreshold(dropThreshold);
+ }
pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}
@@ -513,6 +526,14 @@ public class PDFParserConfig implements Serializable {
this.spacingTolerance = spacingTolerance;
}
+ public Float getDropThreshold() {
+ return dropThreshold;
+ }
+
+ public void setDropThreshold(float dropThreshold) {
+ this.dropThreshold = dropThreshold;
+ }
+
public AccessChecker getAccessChecker() {
return accessChecker;
}
@@ -824,6 +845,7 @@ public class PDFParserConfig implements Serializable {
if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false;
if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
+ if (!getDropThreshold().equals(config.getDropThreshold())) return false;
if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
if (getOcrImageType() != config.getOcrImageType()) return false;
if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
@@ -844,6 +866,7 @@ public class PDFParserConfig implements Serializable {
result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
result = 31 * result + getAverageCharTolerance().hashCode();
result = 31 * result + getSpacingTolerance().hashCode();
+ result = 31 * result + getDropThreshold().hashCode();
result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
result = 31 * result + ocrStrategy.hashCode();
result = 31 * result + getOcrDPI();
@@ -869,6 +892,7 @@ public class PDFParserConfig implements Serializable {
", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
", averageCharTolerance=" + averageCharTolerance +
", spacingTolerance=" + spacingTolerance +
+ ", dropThreshold=" + dropThreshold +
", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
", ocrStrategy=" + ocrStrategy +
", ocrDPI=" + ocrDPI +
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index edcd513..4e2e3c5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1528,6 +1528,16 @@ public class PDFParserTest extends TikaTest {
}
+ @Test
+ public void testNPEInPDFParserConfig() {
+ //TIKA-3091
+ PDFParserConfig config = new PDFParserConfig();
+ //don't care about values; want to make sure no NPE is thrown
+ String txt = config.toString();
+ config.hashCode();
+ config.equals(new PDFParserConfig());
+ }
+
@Test //TIKA-3041
@Ignore("turn back on if we add file from PDFBOX-52")
public void testPDFBox52() throws Exception {