You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/09 22:46:46 UTC
[tika] branch main updated: TIKA-3297 -- add pdf parser config
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fdd019c TIKA-3297 -- add pdf parser config
fdd019c is described below
commit fdd019ca9685f649324fdc80749f65ff3d9d13ba
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 9 17:45:18 2021 -0500
TIKA-3297 -- add pdf parser config
---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 6 +-
.../apache/tika/parser/pdf/PDFParserConfig.java | 235 ++++++---------------
.../apache/tika/parser/pdf/PDFParser.properties | 43 ----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 26 +++
.../tika/server/classic/TikaResourceTest.java | 26 ++-
.../tika/server/classic/UnpackerResourceTest.java | 2 +-
6 files changed, 115 insertions(+), 223 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 89ed18b..805f9c0 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -124,7 +124,11 @@ public class PDFParser extends AbstractParser implements Initializable {
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+ PDFParserConfig localConfig = defaultConfig;
+ PDFParserConfig userConfig = context.get(PDFParserConfig.class);
+ if (userConfig != null) {
+ localConfig = defaultConfig.cloneAndUpdate(userConfig);
+ }
if (localConfig.isSetKCMS()) {
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d52f91a..386d42d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -16,14 +16,16 @@
*/
package org.apache.tika.parser.pdf;
-import java.io.IOException;
-import java.io.InputStream;
import java.io.Serializable;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
import java.util.Locale;
-import java.util.Properties;
+import java.util.Set;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.tika.exception.TikaException;
/**
* Config for PDFParser.
@@ -84,7 +86,7 @@ public class PDFParserConfig implements Serializable {
private boolean enableAutoSpace = true;
// True if we let PDFBox remove duplicate overlapping text:
- private boolean suppressDuplicateOverlappingText;
+ private boolean suppressDuplicateOverlappingText = false;
// True if we extract annotation text ourselves
// (workaround for PDFBOX-1143):
@@ -138,7 +140,7 @@ public class PDFParserConfig implements Serializable {
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
- private AccessChecker accessChecker;
+ private AccessChecker accessChecker = new AccessChecker();
//The PDFParser can throw IOExceptions if there is a problem
//with a streams. If this is set to true, Tika's
@@ -156,112 +158,7 @@ public class PDFParserConfig implements Serializable {
private boolean detectAngles = false;
- public PDFParserConfig() {
- init(this.getClass().getResourceAsStream("PDFParser.properties"));
- }
-
- /**
- * Loads properties from InputStream and then tries to close InputStream.
- * If there is an IOException, this silently swallows the exception
- * and goes back to the default.
- *
- * @param is
- */
- public PDFParserConfig(InputStream is) {
- init(is);
- }
-
- //initializes object and then tries to close inputstream
- private void init(InputStream is) {
-
- if (is == null) {
- return;
- }
- Properties props = new Properties();
- try {
- props.load(is);
- } catch (IOException e) {
- } finally {
- if (is != null) {
- try {
- is.close();
- } catch (IOException e) {
- //swallow
- }
- }
- }
- setEnableAutoSpace(
- getBooleanProp(props.getProperty("enableAutoSpace"), isEnableAutoSpace()));
- setSuppressDuplicateOverlappingText(
- getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
- isSuppressDuplicateOverlappingText()));
- setExtractAnnotationText(
- getBooleanProp(props.getProperty("extractAnnotationText"),
- isExtractAnnotationText()));
- setSortByPosition(
- getBooleanProp(props.getProperty("sortByPosition"),
- isSortByPosition()));
- setExtractAcroFormContent(
- getBooleanProp(props.getProperty("extractAcroFormContent"),
- isExtractAcroFormContent()));
- setExtractBookmarksText(
- getBooleanProp(props.getProperty("extractBookmarksText"),
- isExtractBookmarksText()));
- setExtractInlineImages(
- getBooleanProp(props.getProperty("extractInlineImages"),
- isExtractInlineImages()));
- setExtractUniqueInlineImagesOnly(
- getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
- isExtractUniqueInlineImagesOnly()));
- setExtractInlineImageMetadataOnly(
- getBooleanProp(props.getProperty("extractInlineImageMetadataOnly"),
- isExtractInlineImageMetadataOnly())
- );
- setExtractFontNames(
- getBooleanProp(props.getProperty("extractFontNames"),
- isExtractFontNames()));
-
-
- setIfXFAExtractOnlyXFA(
- getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
- isIfXFAExtractOnlyXFA()));
-
- setCatchIntermediateIOExceptions(
- getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
- isCatchIntermediateIOExceptions()));
-
- setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
-
- setOcrDPI(getIntProp(props.getProperty("ocrDPI"), getOcrDPI()));
-
- setOcrImageFormatName(props.getProperty("ocrImageFormatName"));
-
- setOcrImageType(parseImageType(props.getProperty("ocrImageType")));
-
- setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
-
- setExtractMarkedContent(getBooleanProp(props.getProperty("extractMarkedContent"), false));
-
- setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));
-
- setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
- setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
- setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));
-
- boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
- boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
-
- if (checkExtractAccessPermission == false) {
- //silently ignore the crazy configuration of checkExtractAccessPermission = false,
- //but allowExtractionForAccessibility=false
- accessChecker = new AccessChecker();
- } else {
- accessChecker = new AccessChecker(allowExtractionForAccessibility);
- }
-
- maxMainMemoryBytes = getLongProp(props.getProperty("maxMainMemoryBytes"), -1);
- detectAngles = getBooleanProp(props.getProperty("detectAngles"), false);
- }
+ private final Set<String> userConfigured = new HashSet<>();
/**
* Use this when you want to know how many images of what formats are in a PDF
@@ -276,6 +173,7 @@ public class PDFParserConfig implements Serializable {
*/
void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) {
this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly;
+ userConfigured.add("extractInlineImageMetadataOnly");
}
/**
@@ -296,6 +194,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractMarkedContent(boolean extractMarkedContent) {
this.extractMarkedContent = extractMarkedContent;
+ userConfigured.add("extractMarkedContent");
}
public boolean isExtractMarkedContent() {
@@ -342,7 +241,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractAcroFormContent(boolean extractAcroFormContent) {
this.extractAcroFormContent = extractAcroFormContent;
-
+ userConfigured.add("extractAcroFormContent");
}
/**
@@ -362,6 +261,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
+ userConfigured.add("ifXFAExtractOnlyXFA");
}
/**
@@ -379,6 +279,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractBookmarksText(boolean extractBookmarksText) {
this.extractBookmarksText = extractBookmarksText;
+ userConfigured.add("extractBookmarksText");
}
/**
@@ -387,6 +288,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractFontNames(boolean extractFontNames) {
this.extractFontNames = extractFontNames;
+ userConfigured.add("extractFontNames");
}
public boolean isExtractFontNames() {
@@ -415,6 +317,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractInlineImages(boolean extractInlineImages) {
this.extractInlineImages = extractInlineImages;
+ userConfigured.add("extractInlineImages");
}
/**
@@ -447,6 +350,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
+ userConfigured.add("extractUniqueInlineImagesOnly");
}
/**
@@ -464,6 +368,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setEnableAutoSpace(boolean enableAutoSpace) {
this.enableAutoSpace = enableAutoSpace;
+ userConfigured.add("enableAutoSpace");
}
/**
@@ -485,6 +390,7 @@ public class PDFParserConfig implements Serializable {
public void setSuppressDuplicateOverlappingText(
boolean suppressDuplicateOverlappingText) {
this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
+ userConfigured.add("suppressDuplicateOverlappingText");
}
/**
@@ -500,6 +406,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractAnnotationText(boolean extractAnnotationText) {
this.extractAnnotationText = extractAnnotationText;
+ userConfigured.add("extractAnnotationText");
}
/**
@@ -519,6 +426,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setSortByPosition(boolean sortByPosition) {
this.sortByPosition = sortByPosition;
+ userConfigured.add("sortByPosition");
}
/**
@@ -533,6 +441,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setAverageCharTolerance(Float averageCharTolerance) {
this.averageCharTolerance = averageCharTolerance;
+ userConfigured.add("averageCharTolerance");
}
/**
@@ -547,6 +456,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setSpacingTolerance(Float spacingTolerance) {
this.spacingTolerance = spacingTolerance;
+ userConfigured.add("spacingTolerance");
}
/**
@@ -561,6 +471,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setDropThreshold(Float dropThreshold) {
this.dropThreshold = dropThreshold;
+ userConfigured.add("dropThreshold");
}
public AccessChecker getAccessChecker() {
@@ -569,6 +480,7 @@ public class PDFParserConfig implements Serializable {
public void setAccessChecker(AccessChecker accessChecker) {
this.accessChecker = accessChecker;
+ userConfigured.add("accessChecker");
}
/**
@@ -589,6 +501,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
this.catchIntermediateIOExceptions = catchIntermediateIOExceptions;
+ userConfigured.add("catchIntermediateIOExceptions");
}
/**
@@ -597,14 +510,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setOcrStrategy(OCR_STRATEGY ocrStrategy) {
this.ocrStrategy = ocrStrategy;
- }
-
- /**
- * Which strategy to use for OCR
- * @param ocrStrategyString
- */
- public void setOcrStrategy(String ocrStrategyString) {
- this.ocrStrategy = OCR_STRATEGY.parse(ocrStrategyString);
+ userConfigured.add("ocrStrategy");
}
/**
*
@@ -614,47 +520,14 @@ public class PDFParserConfig implements Serializable {
return ocrStrategy;
}
- private boolean getBooleanProp(String p, boolean defaultMissing) {
- if (p == null) {
- return defaultMissing;
- }
- if (p.toLowerCase(Locale.ROOT).equals("true")) {
- return true;
- } else if (p.toLowerCase(Locale.ROOT).equals("false")) {
- return false;
- } else {
- return defaultMissing;
- }
- }
- //throws NumberFormatException if there's a non-null unparseable
- //string passed in
- private int getIntProp(String p, int defaultMissing) {
- if (p == null) {
- return defaultMissing;
- }
-
- return Integer.parseInt(p);
- }
-
- //throws NumberFormatException if there's a non-null unparseable
- //string passed in
- private long getLongProp(String p, long defaultMissing) {
- if (p == null) {
- return defaultMissing;
- }
-
- return Long.parseLong(p);
+ /**
+ * Which strategy to use for OCR
+ * @param ocrStrategyString
+ */
+ public void setOcrStrategy(String ocrStrategyString) {
+ setOcrStrategy(OCR_STRATEGY.parse(ocrStrategyString));
}
- //throws NumberFormatException if there's a non-null unparseable
- //string passed in
- private static float getFloatProp(String p, float defaultMissing) {
- if (p == null) {
- return defaultMissing;
- }
-
- return Float.parseFloat(p);
- }
/**
* String representation of the image format used to render
* the page image for OCR (examples: png, tiff, jpeg)
@@ -679,6 +552,7 @@ public class PDFParserConfig implements Serializable {
"I'm sorry, but I don't recognize: "+ocrImageFormatName);
}
this.ocrImageFormatName = ocrImageFormatName;
+ userConfigured.add("ocrImageFormatName");
}
/**
@@ -696,6 +570,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setOcrImageType(ImageType ocrImageType) {
this.ocrImageType = ocrImageType;
+ userConfigured.add("ocrImageType");
}
/**
@@ -722,6 +597,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setOcrDPI(int ocrDPI) {
this.ocrDPI = ocrDPI;
+ userConfigured.add("ocrDPI");
}
/**
@@ -739,6 +615,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setOcrImageQuality(float ocrImageQuality) {
this.ocrImageQuality = ocrImageQuality;
+ userConfigured.add("ocrImageQuality");
}
/**
@@ -750,6 +627,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setExtractActions(boolean v) {
extractActions = v;
+ userConfigured.add("extractActions");
}
/**
@@ -770,17 +648,9 @@ public class PDFParserConfig implements Serializable {
return maxMainMemoryBytes;
}
- /**
- * @deprecated use {@link #setMaxMainMemoryBytes(long)}
- * @param maxMainMemoryBytes
- */
- @Deprecated
- public void setMaxMainMemoryBytes(int maxMainMemoryBytes) {
- this.maxMainMemoryBytes = maxMainMemoryBytes;
- }
-
public void setMaxMainMemoryBytes(long maxMainMemoryBytes) {
this.maxMainMemoryBytes = maxMainMemoryBytes;
+ userConfigured.add("maxMainMemoryBytes");
}
/**
@@ -801,6 +671,7 @@ public class PDFParserConfig implements Serializable {
*/
public void setSetKCMS(boolean setKCMS) {
this.setKCMS = setKCMS;
+ userConfigured.add("setKCMS");
}
public boolean isSetKCMS() {
@@ -829,12 +700,40 @@ public class PDFParserConfig implements Serializable {
public void setDetectAngles(boolean detectAngles) {
this.detectAngles = detectAngles;
+ userConfigured.add("detectAngles");
}
public boolean isDetectAngles() {
return detectAngles;
}
+ public PDFParserConfig cloneAndUpdate(PDFParserConfig updates) throws TikaException {
+ PDFParserConfig updated = new PDFParserConfig();
+ for (Field field : this.getClass().getDeclaredFields()) {
+ if (Modifier.isFinal(field.getModifiers())) {
+ continue;
+ } else if (Modifier.isStatic(field.getModifiers())) {
+ continue;
+ }
+ if ("userConfigured".equals(field.getName())) {
+ continue;
+ }
+ if (updates.userConfigured.contains(field.getName())) {
+ try {
+ field.set(updated, field.get(updates));
+ } catch (IllegalAccessException e) {
+ throw new TikaException("can't update " + field.getName(), e);
+ }
+ } else {
+ try {
+ field.set(updated, field.get(this));
+ } catch (IllegalAccessException e) {
+ throw new TikaException("can't update " + field.getName(), e);
+ }
+ }
+ }
+ return updated;
+ }
@Override
public boolean equals(Object o) {
if (this == o) return true;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
deleted file mode 100644
index e2c0c32..0000000
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
+++ /dev/null
@@ -1,43 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-enableAutoSpace true
-extractAnnotationText true
-sortByPosition false
-suppressDuplicateOverlappingText false
-extractAcroFormContent true
-extractBookmarksText true
-extractInlineImages false
-extractUniqueInlineImagesOnly true
-checkExtractAccessPermission false
-allowExtractionForAccessibility true
-ifXFAExtractOnlyXFA false
-catchIntermediateIOExceptions true
-#options: no_ocr, ocr_only, ocr_and_text_extraction, auto
-ocrStrategy auto
-#dots per inch for the ocr rendering of the page image
-ocrDPI 300
-#if you request tif, make sure you have imageio jars on your classpath!
-#and make sure to specify tiff
-ocrImageFormatName png
-#options: argb, binary, gray, rgb
-ocrImageType gray
-# Use up to 500MB when loading a pdf into a PDDocument
-maxMainMemoryBytes 524288000
-#whether or not to set KCMS for faster (but legacy/unsupported) image rendering
-setKCMS false
-#whether or not to add processing to detect angles and extract
-#text accordingly PDFBOX-4371
-detectAngles false
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 202f4e2..ba4191e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1074,11 +1074,37 @@ public class PDFParserTest extends TikaTest {
assertNotNull(is);
TikaConfig tikaConfig = new TikaConfig(is);
Parser p = new AutoDetectParser(tikaConfig);
+
String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p);
text = text.replaceAll("\\s+", " ");
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+ //test overriding underlying settings with PDFParserConfig
+ ParseContext pc = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setSortByPosition(false);
+ pc.set(PDFParserConfig.class, config);
+ text = getText("testPDFTwoTextBoxes.pdf", p, new Metadata(), pc);
+ text = text.replaceAll("\\s+", " ");
+ // Column text is not interleaved:
+ assertContains("Left column line 1 Left column line 2 ", text);
+
+ //test a new PDFParserConfig and setting another value
+ //this tests that the underlying "sortByPosition" as set
+ //in the config file is still operative
+ config = new PDFParserConfig();
+ config.setOcrDPI(10000);
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ pc.set(PDFParserConfig.class, config);
+ text = getText("testPDFTwoTextBoxes.pdf",
+ p, new Metadata(), pc);
+ text = text.replaceAll("\\s+", " ");
+
+ // Column text is now interleaved:
+ assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2"
+ , text);
}
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index 292306b..4cbe8c9 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -47,6 +47,7 @@ import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
public class TikaResourceTest extends CXFTestBase {
public static final String TEST_DOC = "test-documents/test.doc";
@@ -288,7 +289,7 @@ public class TikaResourceTest extends CXFTestBase {
//TIKA-2638 and TIKA-2816
@Test
public void testOCRLanguageConfig() throws Exception {
- if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+ if (! new TesseractOCRParser().hasTesseract()) {
return;
}
@@ -307,7 +308,7 @@ public class TikaResourceTest extends CXFTestBase {
//TIKA-2290
@Test
public void testPDFOCRConfig() throws Exception {
- if (! new TesseractOCRParser().hasTesseract(new TesseractOCRConfig())) {
+ if (! new TesseractOCRParser().hasTesseract()) {
return;
}
@@ -318,6 +319,7 @@ public class TikaResourceTest extends CXFTestBase {
.put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
+
assertTrue(responseMsg.trim().equals(""));
response = WebClient.create(endPoint + TIKA_PATH)
@@ -415,14 +417,18 @@ public class TikaResourceTest extends CXFTestBase {
//started after the upgrade to 3.2.7
}
- response = WebClient.create(endPoint + TIKA_PATH)
- .type("application/pdf")
- .accept("text/plain")
- .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX +
- "tesseractPath",
- "bogus path")
- .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
- assertEquals(200, response.getStatus());
+ try {
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TesseractServerConfig.X_TIKA_OCR_HEADER_PREFIX +
+ "tesseractPath",
+ "bogus path")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testOCR.pdf"));
+ assertEquals(400, response.getStatus());
+ } catch (ProcessingException e) {
+
+ }
}
@Test
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
index 4c5fae9..dc9d638 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/UnpackerResourceTest.java
@@ -220,7 +220,7 @@ public class UnpackerResourceTest extends CXFTestBase {
@Test
public void testPDFRenderOCR() throws Exception {
- Assume.assumeTrue( new TesseractOCRParser().hasTesseract(new TesseractOCRConfig()));
+ Assume.assumeTrue( new TesseractOCRParser().hasTesseract());
Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
.header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX+"ocrStrategy", "ocr_only")