You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/15 19:08:08 UTC

[tika] branch main updated: TIKA-3131 -- swap default values of averageCharTolerance and spacingTolerance to match PDFBox defaults (#325)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 5f747ac  TIKA-3131 -- swap default values of averageCharTolerance and spacingTolerance to match PDFBox defaults (#325)
5f747ac is described below

commit 5f747ac3c7d19224cd9d9086346251096c1109fc
Author: Clark Perkins <cl...@users.noreply.github.com>
AuthorDate: Wed Jul 15 14:08:01 2020 -0500

    TIKA-3131 -- swap default values of averageCharTolerance and spacingTolerance to match PDFBox defaults (#325)
---
 .../src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 81d7e0f..bb588df 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -119,11 +119,11 @@ public class PDFParserConfig implements Serializable {
 
     //The character width-based tolerance value used to estimate where spaces in text should be added
     //Default taken from PDFBox.
-    private Float averageCharTolerance = 0.5f;
+    private Float averageCharTolerance = 0.3f;
 
     //The space width-based tolerance value used to estimate where spaces in text should be added
     //Default taken from PDFBox.
-    private Float spacingTolerance = 0.3f;
+    private Float spacingTolerance = 0.5f;
 
     // The multiplication factor for line height to decide when a new paragraph starts.
     //Default taken from PDFBox.