You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/09 22:46:46 UTC

[tika] branch main updated: TIKA-3297 -- add pdf parser config

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new fdd019c  TIKA-3297 -- add pdf parser config
fdd019c is described below

commit fdd019ca9685f649324fdc80749f65ff3d9d13ba
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 9 17:45:18 2021 -0500

    TIKA-3297 -- add pdf parser config
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java |   6 +-
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 235 ++++++---------------
 .../apache/tika/parser/pdf/PDFParser.properties    |  43 ----
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  26 +++
 .../tika/server/classic/TikaResourceTest.java      |  26 ++-
 .../tika/server/classic/UnpackerResourceTest.java  |   2 +-
 6 files changed, 115 insertions(+), 223 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 89ed18b..805f9c0 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -124,7 +124,11 @@ public class PDFParser extends AbstractParser implements Initializable {
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
-        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+        PDFParserConfig localConfig = defaultConfig;
+        PDFParserConfig userConfig = context.get(PDFParserConfig.class);
+        if (userConfig != null) {
+            localConfig = defaultConfig.cloneAndUpdate(userConfig);
+        }
         if (localConfig.isSetKCMS()) {
             System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
         }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d52f91a..386d42d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -16,14 +16,16 @@
  */
 package org.apache.tika.parser.pdf;
 
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
 import java.util.Locale;
-import java.util.Properties;
+import java.util.Set;
 
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.exception.TikaException;
 
 /**
  * Config for PDFParser.
@@ -84,7 +86,7 @@ public class PDFParserConfig implements Serializable {
     private boolean enableAutoSpace = true;
 
     // True if we let PDFBox remove duplicate overlapping text:
-    private boolean suppressDuplicateOverlappingText;
+    private boolean suppressDuplicateOverlappingText = false;
 
     // True if we extract annotation text ourselves
     // (workaround for PDFBOX-1143):
@@ -138,7 +140,7 @@ public class PDFParserConfig implements Serializable {
     private String ocrImageFormatName = "png";
     private float ocrImageQuality = 1.0f;
 
-    private AccessChecker accessChecker;
+    private AccessChecker accessChecker = new AccessChecker();
 
     //The PDFParser can throw IOExceptions if there is a problem
     //with a streams.  If this is set to true, Tika's
@@ -156,112 +158,7 @@ public class PDFParserConfig implements Serializable {
 
     private boolean detectAngles = false;
 
-    public PDFParserConfig() {
-        init(this.getClass().getResourceAsStream("PDFParser.properties"));
-    }
-
-    /**
-     * Loads properties from InputStream and then tries to close InputStream.
-     * If there is an IOException, this silently swallows the exception
-     * and goes back to the default.
-     *
-     * @param is
-     */
-    public PDFParserConfig(InputStream is) {
-        init(is);
-    }
-
-    //initializes object and then tries to close inputstream
-    private void init(InputStream is) {
-
-        if (is == null) {
-            return;
-        }
-        Properties props = new Properties();
-        try {
-            props.load(is);
-        } catch (IOException e) {
-        } finally {
-            if (is != null) {
-                try {
-                    is.close();
-                } catch (IOException e) {
-                    //swallow
-                }
-            }
-        }
-        setEnableAutoSpace(
-                getBooleanProp(props.getProperty("enableAutoSpace"), isEnableAutoSpace()));
-        setSuppressDuplicateOverlappingText(
-                getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
-                        isSuppressDuplicateOverlappingText()));
-        setExtractAnnotationText(
-                getBooleanProp(props.getProperty("extractAnnotationText"),
-                        isExtractAnnotationText()));
-        setSortByPosition(
-                getBooleanProp(props.getProperty("sortByPosition"),
-                        isSortByPosition()));
-        setExtractAcroFormContent(
-                getBooleanProp(props.getProperty("extractAcroFormContent"),
-                        isExtractAcroFormContent()));
-		setExtractBookmarksText(
-				getBooleanProp(props.getProperty("extractBookmarksText"),
-						isExtractBookmarksText()));
-        setExtractInlineImages(
-                getBooleanProp(props.getProperty("extractInlineImages"),
-                        isExtractInlineImages()));
-        setExtractUniqueInlineImagesOnly(
-                getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
-                        isExtractUniqueInlineImagesOnly()));
-        setExtractInlineImageMetadataOnly(
-                getBooleanProp(props.getProperty("extractInlineImageMetadataOnly"),
-                        isExtractInlineImageMetadataOnly())
-        );
-        setExtractFontNames(
-                getBooleanProp(props.getProperty("extractFontNames"),
-                        isExtractFontNames()));
-
-
-        setIfXFAExtractOnlyXFA(
-            getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
-                isIfXFAExtractOnlyXFA()));
-
-        setCatchIntermediateIOExceptions(
-                getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
-                isCatchIntermediateIOExceptions()));
-
-        setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
-
-        setOcrDPI(getIntProp(props.getProperty("ocrDPI"), getOcrDPI()));
-
-        setOcrImageFormatName(props.getProperty("ocrImageFormatName"));
-
-        setOcrImageType(parseImageType(props.getProperty("ocrImageType")));
-
-        setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
-
-        setExtractMarkedContent(getBooleanProp(props.getProperty("extractMarkedContent"), false));
-
-        setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
-
-        setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
-        setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
-        setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
-
-        boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
-        boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
-
-        if (checkExtractAccessPermission == false) {
-            //silently ignore the crazy configuration of checkExtractAccessPermission = false,
-            //but allowExtractionForAccessibility=false
-            accessChecker = new AccessChecker();
-        } else {
-            accessChecker = new AccessChecker(allowExtractionForAccessibility);
-        }
-
-        maxMainMemoryBytes = getLongProp(props.getProperty("maxMainMemoryBytes"), -1);
-        detectAngles = getBooleanProp(props.getProperty("detectAngles"), false);
-    }
+    private final Set<String> userConfigured = new HashSet<>();
 
     /**
      * Use this when you want to know how many images of what formats are in a PDF
@@ -276,6 +173,7 @@ public class PDFParserConfig implements Serializable {
      */
     void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) {
         this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly;
+        userConfigured.add("extractInlineImageMetadataOnly");
     }
 
     /**
@@ -296,6 +194,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractMarkedContent(boolean extractMarkedContent) {
         this.extractMarkedContent = extractMarkedContent;
+        userConfigured.add("extractMarkedContent");
     }
 
     public boolean isExtractMarkedContent() {
@@ -342,7 +241,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractAcroFormContent(boolean extractAcroFormContent) {
         this.extractAcroFormContent = extractAcroFormContent;
-
+        userConfigured.add("extractAcroFormContent");
     }
 
     /**
@@ -362,6 +261,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
         this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
+        userConfigured.add("ifXFAExtractOnlyXFA");
     }
 
 	/**
@@ -379,6 +279,7 @@ public class PDFParserConfig implements Serializable {
 	 */
 	public void setExtractBookmarksText(boolean extractBookmarksText) {
 		this.extractBookmarksText = extractBookmarksText;
+		userConfigured.add("extractBookmarksText");
 	}
 
     /**
@@ -387,6 +288,7 @@ public class PDFParserConfig implements Serializable {
      */
 	public void setExtractFontNames(boolean extractFontNames) {
 	    this.extractFontNames = extractFontNames;
+	    userConfigured.add("extractFontNames");
     }
 
     public boolean isExtractFontNames() {
@@ -415,6 +317,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractInlineImages(boolean extractInlineImages) {
         this.extractInlineImages = extractInlineImages;
+        userConfigured.add("extractInlineImages");
     }
 
     /**
@@ -447,6 +350,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
         this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
+        userConfigured.add("extractUniqueInlineImagesOnly");
     }
 
     /**
@@ -464,6 +368,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setEnableAutoSpace(boolean enableAutoSpace) {
         this.enableAutoSpace = enableAutoSpace;
+        userConfigured.add("enableAutoSpace");
     }
 
     /**
@@ -485,6 +390,7 @@ public class PDFParserConfig implements Serializable {
     public void setSuppressDuplicateOverlappingText(
             boolean suppressDuplicateOverlappingText) {
         this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
+        userConfigured.add("suppressDuplicateOverlappingText");
     }
 
     /**
@@ -500,6 +406,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractAnnotationText(boolean extractAnnotationText) {
         this.extractAnnotationText = extractAnnotationText;
+        userConfigured.add("extractAnnotationText");
     }
 
     /**
@@ -519,6 +426,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setSortByPosition(boolean sortByPosition) {
         this.sortByPosition = sortByPosition;
+        userConfigured.add("sortByPosition");
     }
 
     /**
@@ -533,6 +441,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setAverageCharTolerance(Float averageCharTolerance) {
         this.averageCharTolerance = averageCharTolerance;
+        userConfigured.add("averageCharTolerance");
     }
 
     /**
@@ -547,6 +456,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setSpacingTolerance(Float spacingTolerance) {
         this.spacingTolerance = spacingTolerance;
+        userConfigured.add("spacingTolerance");
     }
 
     /**
@@ -561,6 +471,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setDropThreshold(Float dropThreshold) {
         this.dropThreshold = dropThreshold;
+        userConfigured.add("dropThreshold");
     }
 
     public AccessChecker getAccessChecker() {
@@ -569,6 +480,7 @@ public class PDFParserConfig implements Serializable {
 
     public void setAccessChecker(AccessChecker accessChecker) {
         this.accessChecker = accessChecker;
+        userConfigured.add("accessChecker");
     }
 
     /**
@@ -589,6 +501,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
         this.catchIntermediateIOExceptions = catchIntermediateIOExceptions;
+        userConfigured.add("catchIntermediateIOExceptions");
     }
 
     /**
@@ -597,14 +510,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setOcrStrategy(OCR_STRATEGY ocrStrategy) {
         this.ocrStrategy = ocrStrategy;
-    }
-
-    /**
-     * Which strategy to use for OCR
-     * @param ocrStrategyString
-     */
-    public void setOcrStrategy(String ocrStrategyString) {
-        this.ocrStrategy = OCR_STRATEGY.parse(ocrStrategyString);
+        userConfigured.add("ocrStrategy");
     }
     /**
      *
@@ -614,47 +520,14 @@ public class PDFParserConfig implements Serializable {
         return ocrStrategy;
     }
 
-    private boolean getBooleanProp(String p, boolean defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-        if (p.toLowerCase(Locale.ROOT).equals("true")) {
-            return true;
-        } else if (p.toLowerCase(Locale.ROOT).equals("false")) {
-            return false;
-        } else {
-            return defaultMissing;
-        }
-    }
-    //throws NumberFormatException if there's a non-null unparseable
-    //string passed in
-    private int getIntProp(String p, int defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-
-        return Integer.parseInt(p);
-    }
-
-    //throws NumberFormatException if there's a non-null unparseable
-    //string passed in
-    private long getLongProp(String p, long defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-
-        return Long.parseLong(p);
+    /**
+     * Which strategy to use for OCR
+     * @param ocrStrategyString
+     */
+    public void setOcrStrategy(String ocrStrategyString) {
+        setOcrStrategy(OCR_STRATEGY.parse(ocrStrategyString));
     }
 
-    //throws NumberFormatException if there's a non-null unparseable
-    //string passed in
-    private static float getFloatProp(String p, float defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-
-        return Float.parseFloat(p);
-    }
     /**
      * String representation of the image format used to render
      * the page image for OCR (examples: png, tiff, jpeg)
@@ -679,6 +552,7 @@ public class PDFParserConfig implements Serializable {
                     "I'm sorry, but I don't recognize: "+ocrImageFormatName);
         }
         this.ocrImageFormatName = ocrImageFormatName;
+        userConfigured.add("ocrImageFormatName");
     }
 
     /**
@@ -696,6 +570,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setOcrImageType(ImageType ocrImageType) {
         this.ocrImageType = ocrImageType;
+        userConfigured.add("ocrImageType");
     }
 
     /**
@@ -722,6 +597,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setOcrDPI(int ocrDPI) {
         this.ocrDPI = ocrDPI;
+        userConfigured.add("ocrDPI");
     }
 
     /**
@@ -739,6 +615,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setOcrImageQuality(float ocrImageQuality) {
         this.ocrImageQuality = ocrImageQuality;
+        userConfigured.add("ocrImageQuality");
     }
 
     /**
@@ -750,6 +627,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setExtractActions(boolean v) {
         extractActions = v;
+        userConfigured.add("extractActions");
     }
 
     /**
@@ -770,17 +648,9 @@ public class PDFParserConfig implements Serializable {
         return maxMainMemoryBytes;
     }
 
-    /**
-     * @deprecated use {@link #setMaxMainMemoryBytes(long)}
-     * @param maxMainMemoryBytes
-     */
-    @Deprecated
-    public void setMaxMainMemoryBytes(int maxMainMemoryBytes) {
-        this.maxMainMemoryBytes = maxMainMemoryBytes;
-    }
-
     public void setMaxMainMemoryBytes(long maxMainMemoryBytes) {
         this.maxMainMemoryBytes = maxMainMemoryBytes;
+        userConfigured.add("maxMainMemoryBytes");
     }
 
     /**
@@ -801,6 +671,7 @@ public class PDFParserConfig implements Serializable {
      */
     public void setSetKCMS(boolean setKCMS) {
         this.setKCMS = setKCMS;
+        userConfigured.add("setKCMS");
     }
 
     public boolean isSetKCMS() {
@@ -829,12 +700,40 @@ public class PDFParserConfig implements Serializable {
 
     public void setDetectAngles(boolean detectAngles) {
         this.detectAngles = detectAngles;
+        userConfigured.add("detectAngles");
     }
 
     public boolean isDetectAngles() {
         return detectAngles;
     }
 
+    public PDFParserConfig cloneAndUpdate(PDFParserConfig updates) throws TikaException { // builds a merged copy; neither this config nor 'updates' is modified
+        PDFParserConfig updated = new PDFParserConfig();
+        for (Field field : this.getClass().getDeclaredFields()) {
+            if (Modifier.isFinal(field.getModifiers())) { // final fields (including userConfigured) are never copied
+                continue;
+            } else if (Modifier.isStatic(field.getModifiers())) {
+                continue;
+            }
+            if ("userConfigured".equals(field.getName())) { // explicit guard by name; the field is declared final, so the check above already skips it
+                continue;
+            }
+            if (updates.userConfigured.contains(field.getName())) { // a setter recorded this field name on 'updates', so its value wins
+                try {
+                    field.set(updated, field.get(updates)); // reflective copy: no setter runs, so updated.userConfigured is not populated
+                } catch (IllegalAccessException e) {
+                    throw new TikaException("can't update " + field.getName(), e);
+                }
+            } else { // not explicitly set on 'updates': carry over this config's value
+                try {
+                    field.set(updated, field.get(this)); // same reference is shared for object-typed fields (no deep copy)
+                } catch (IllegalAccessException e) {
+                    throw new TikaException("can't update " + field.getName(), e);
+                }
+            }
+        }
+        return updated;
+    }
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
deleted file mode 100644
index e2c0c32..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ /dev/null
@@ -1,43 +0,0 @@
-#  Licensed to the Apache Software Foundation (ASF) under one or more
-#  contributor license agreements.  See the NOTICE file distributed with
-#  this work for additional information regarding copyright ownership.
-#  The ASF licenses this file to You under the Apache License, Version 2.0
-#  (the "License"); you may not use this file except in compliance with
-#  the License.  You may obtain a copy of the License at
-#
-#       http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-enableAutoSpace true
-extractAnnotationText true
-sortByPosition	false
-suppressDuplicateOverlappingText	false
-extractAcroFormContent	true
-extractBookmarksText true
-extractInlineImages false
-extractUniqueInlineImagesOnly true
-checkExtractAccessPermission false
-allowExtractionForAccessibility true
-ifXFAExtractOnlyXFA false
-catchIntermediateIOExceptions true
-#options: no_ocr, ocr_only, ocr_and_text_extraction, auto
-ocrStrategy auto
-#dots per inch for the ocr rendering of the page image
-ocrDPI 300
-#if you request tif, make sure you have imageio jars on your classpath!
-#and make sure to specify tiff
-ocrImageFormatName png
-#options: argb, binary, gray, rgb
-ocrImageType gray
-# Use up to 500MB when loading a pdf into a PDDocument
-maxMainMemoryBytes 524288000
-#whether or not to set KCMS for faster (but legacy/unsupported) image rendering
-setKCMS false
-#whether or not to add processing to detect angles and extract
-#text accordingly PDFBOX-4371
-detectAngles false
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 202f4e2..ba4191e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1074,11 +1074,37 @@ public class PDFParserTest extends TikaTest {
             assertNotNull(is);
             TikaConfig tikaConfig = new TikaConfig(is);
             Parser p = new AutoDetectParser(tikaConfig);
+
             String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
             text = text.replaceAll("\\s+", " ");
 
             // Column text is now interleaved:
             assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+            //test overriding underlying settings with PDFParserConfig
+            ParseContext pc = new ParseContext();
+            PDFParserConfig config = new PDFParserConfig();
+            config.setSortByPosition(false);
+            pc.set(PDFParserConfig.class, config);
+            text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
+            text = text.replaceAll("\\s+", " ");
+            // Column text is not interleaved:
+            assertContains("Left column line 1 Left column line 2 ", text);
+
+            //test a new PDFParserConfig and setting another value
+            //this tests that the underlying "sortByPosition" as set
+            //in the config file is still operative
+            config = new PDFParserConfig();
+            config.setOcrDPI(10000);
+            config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+            pc.set(PDFParserConfig.class, config);
+            text = getText("testPDFTwoTextBoxes.pdf",
+                    p, new Metadata(), pc);
+            text = text.replaceAll("\\s+", " ");
+
+            // Column text is now interleaved:
+            assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2"
+                    , text);
         }
     }
 
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index 292306b..4cbe8c9 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -47,6 +47,7 @@ import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 public class TikaResourceTest extends CXFTestBase {
     public static final String TEST_DOC = "test-documents/test.doc";
@@ -288,7 +289,7 @@ public class TikaResourceTest extends CXFTestBase {
     //TIKA-2638 and TIKA-2816
     @Test
     public void testOCRLanguageConfig() throws Exception {
-        if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+        if (! new TesseractOCRParser().hasTesseract()) {
             return;
         }
 
@@ -307,7 +308,7 @@ public class TikaResourceTest extends CXFTestBase {
     //TIKA-2290
     @Test
     public void testPDFOCRConfig() throws Exception {
-        if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+        if (! new TesseractOCRParser().hasTesseract()) {
             return;
         }
 
@@ -318,6 +319,7 @@ public class TikaResourceTest extends CXFTestBase {
                 .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
         String responseMsg = getStringFromInputStream((InputStream) response
                 .getEntity());
+
         assertTrue(responseMsg.trim().equals(""));
 
         response = WebClient.create(endPoint + TIKA_PATH)
@@ -415,14 +417,18 @@ public class TikaResourceTest extends CXFTestBase {
             //started after the upgrade to 3.2.7
         }
 
-        response = WebClient.create(endPoint + TIKA_PATH)
-                .type("application/pdf")
-                .accept("text/plain")
-                .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX +
-                                "tesseractPath",
-                        "bogus path")
-                .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
-        assertEquals(200, response.getStatus());
+        try {
+            response = WebClient.create(endPoint + TIKA_PATH)
+                    .type("application/pdf")
+                    .accept("text/plain")
+                    .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX +
+                                    "tesseractPath",
+                            "bogus path")
+                    .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+            assertEquals(400, response.getStatus());
+        } catch (ProcessingException e) {
+
+        }
     }
 
     @Test
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
index 4c5fae9..dc9d638 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
@@ -220,7 +220,7 @@ public class UnpackerResourceTest extends CXFTestBase {
 
     @Test
     public void testPDFRenderOCR() throws Exception {
-        Assume.assumeTrue( new TesseractOCRParser().hasTesseract(new TesseractOCRConfig()));
+        Assume.assumeTrue( new TesseractOCRParser().hasTesseract());
 
         Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
                 .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX+"ocrStrategy", "ocr_only")