You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/02 19:19:17 UTC
[tika] branch branch_1x updated: TIKA-2669 -- pdf and tesseract
config set in a tika-config.xml file on server start up are always
overwritten to DefaultConfig in tika-server
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 6933efd TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
6933efd is described below
commit 6933efd56ab28113b81d2a37abae00fef64bd878
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jul 2 15:17:58 2018 -0400
TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
---
.../apache/tika/server/resource/TikaResource.java | 17 +++--
.../java/org/apache/tika/server/CXFTestBase.java | 4 +-
.../org/apache/tika/server/TikaParsersTest.java | 69 +++++++++++----------
.../org/apache/tika/server/TikaResourceTest.java | 38 ++++++++++++
.../tika/server/tika-config-for-server-tests.xml | 29 +++++++++
.../src/test/resources/testPDFTwoTextBoxes.pdf | Bin 0 -> 57100 bytes
6 files changed, 119 insertions(+), 38 deletions(-)
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b7f857e..0060738 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -161,17 +161,26 @@ public class TikaResource {
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders,
Parser embeddedParser) {
- TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
- PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ //lazily initialize configs: create one only when a matching header is present
+ //if a header is submitted, a new config is created for that parser, and any params
+ //set in --tika-config tika-config.xml upon server startup will be ignored.
+ TesseractOCRConfig ocrConfig = null;
+ PDFParserConfig pdfParserConfig = null;
for (String key : httpHeaders.keySet()) {
if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
} else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
}
}
- parseContext.set(TesseractOCRConfig.class, ocrConfig);
- parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ if (ocrConfig != null) {
+ parseContext.set(TesseractOCRConfig.class, ocrConfig);
+ }
+ if (pdfParserConfig != null) {
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ }
if (embeddedParser != null) {
parseContext.set(Parser.class, embeddedParser);
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 7b35fec..f851e97 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -80,8 +80,8 @@ public abstract class CXFTestBase {
}
@Before
- public void setUp() {
- this.tika = TikaConfig.getDefaultConfig();
+ public void setUp() throws Exception {
+ this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
TikaResource.init(tika,
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory());
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
index e4e60a5..eadacfa 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
@@ -96,12 +96,12 @@ public class TikaParsersTest extends CXFTestBase {
.get();
String text = getStringFromInputStream((InputStream) response.getEntity());
- assertContains("<h2>DefaultParser</h2>", text);
+ assertContains("<h3>DefaultParser</h3>", text);
assertContains("Composite", text);
- assertContains("<h3>OpusParser", text);
- assertContains("<h3>PackageParser", text);
- assertContains("<h3>OOXMLParser", text);
+ assertContains("<h4>OpusParser", text);
+ assertContains("<h4>PackageParser", text);
+ assertContains("<h4>OOXMLParser", text);
assertContains(OpusParser.class.getName(), text);
assertContains(PackageParser.class.getName(), text);
@@ -138,46 +138,51 @@ public class TikaParsersTest extends CXFTestBase {
assertEquals(true, json.containsKey("name"));
assertEquals(true, json.containsKey("composite"));
assertEquals(true, json.containsKey("children"));
- assertEquals("org.apache.tika.parser.DefaultParser", json.get("name"));
+ assertEquals("org.apache.tika.parser.CompositeParser", json.get("name"));
assertEquals(Boolean.TRUE, json.get("composite"));
// At least 20 child parsers which aren't composite, except for CompositeExternalParser
Object[] children = (Object[]) (Object) json.get("children");
- assertTrue(children.length >= 20);
- boolean hasOpus = false, hasOOXML = false, hasPDF = false, hasZip = false;
+ assertTrue(children.length >= 2);
+ boolean hasOpus = false, hasOOXML = false, hasZip = false;
int nonComposite = 0;
int composite = 0;
for (Object o : children) {
- Map<String, Object> d = (Map<String, Object>) o;
- assertEquals(true, d.containsKey("name"));
- assertEquals(true, d.containsKey("composite"));
-
- if (d.get("composite") == Boolean.FALSE)
- nonComposite++;
- else
- composite++;
-
- // Will only have mime types if requested
- if (d.get("composite") == Boolean.FALSE)
- assertEquals(details, d.containsKey("supportedTypes"));
-
- String name = (String) d.get("name");
- if (OpusParser.class.getName().equals(name)) {
- hasOpus = true;
- }
- if (OOXMLParser.class.getName().equals(name)) {
- hasOOXML = true;
- }
- if (PDFParser.class.getName().equals(name)) {
- hasPDF = true;
+ Map<String, Object> child = (Map<String, Object>) o;
+ assertEquals(true, child.containsKey("name"));
+ assertEquals(true, child.containsKey("composite"));
+
+ Object[] grandChildrenArr = (Object[]) child.get("children");
+ if (grandChildrenArr == null) {
+ continue;
}
- if (PackageParser.class.getName().equals(name)) {
- hasZip = true;
+ assertTrue(grandChildrenArr.length > 50);
+ for (Object grandChildO : grandChildrenArr) {
+ Map<String, Object> grandChildren = (Map<String, Object>) grandChildO;
+
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ nonComposite++;
+ else
+ composite++;
+
+ // Will only have mime types if requested
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ assertEquals(details, grandChildren.containsKey("supportedTypes"));
+
+ String name = (String) grandChildren.get("name");
+ if (OpusParser.class.getName().equals(name)) {
+ hasOpus = true;
+ }
+ if (OOXMLParser.class.getName().equals(name)) {
+ hasOOXML = true;
+ }
+ if (PackageParser.class.getName().equals(name)) {
+ hasZip = true;
+ }
}
}
assertEquals(true, hasOpus);
assertEquals(true, hasOOXML);
- assertEquals(true, hasPDF);
assertEquals(true, hasZip);
assertTrue(nonComposite > 20);
assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 295ce74..b519170 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -258,6 +258,44 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals(500, response.getStatus());
}
+ //TIKA-2669
+ @Test
+ public void testPDFConfig() throws Exception {
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"sortByPosition", "false")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);
+
+ //make sure that default reverts to initial config option
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ }
+
+
@Test
public void testExtractTextAcceptPlainText() throws Exception {
//TIKA-2384
diff --git a/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
new file mode 100644
index 0000000..8867655
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
new file mode 100644
index 0000000..f24e9e7
Binary files /dev/null and b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf differ