You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/07/02 19:19:17 UTC

[tika] branch branch_1x updated: TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 6933efd  TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
6933efd is described below

commit 6933efd56ab28113b81d2a37abae00fef64bd878
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jul 2 15:17:58 2018 -0400

    TIKA-2669 -- pdf and tesseract config set in a tika-config.xml file on server start up are always overwritten to DefaultConfig in tika-server
---
 .../apache/tika/server/resource/TikaResource.java  |  17 +++--
 .../java/org/apache/tika/server/CXFTestBase.java   |   4 +-
 .../org/apache/tika/server/TikaParsersTest.java    |  69 +++++++++++----------
 .../org/apache/tika/server/TikaResourceTest.java   |  38 ++++++++++++
 .../tika/server/tika-config-for-server-tests.xml   |  29 +++++++++
 .../src/test/resources/testPDFTwoTextBoxes.pdf     | Bin 0 -> 57100 bytes
 6 files changed, 119 insertions(+), 38 deletions(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index b7f857e..0060738 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -161,17 +161,26 @@ public class TikaResource {
 
     public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders,
                                         Parser embeddedParser) {
-        TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
-        PDFParserConfig pdfParserConfig = new PDFParserConfig();
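+        //lazily initialize configs so that, when no config headers are sent, nothing is overridden;
+        //if an OCR or PDF header is submitted, params for that config set via
+        //--tika-config tika-config.xml upon server startup will be ignored for that request.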
+        TesseractOCRConfig ocrConfig = null;
+        PDFParserConfig pdfParserConfig = null;
         for (String key : httpHeaders.keySet()) {
             if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+                ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
                 processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
             } else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+                pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
                 processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
             }
         }
-        parseContext.set(TesseractOCRConfig.class, ocrConfig);
-        parseContext.set(PDFParserConfig.class, pdfParserConfig);
+        if (ocrConfig != null) {
+            parseContext.set(TesseractOCRConfig.class, ocrConfig);
+        }
+        if (pdfParserConfig != null) {
+            parseContext.set(PDFParserConfig.class, pdfParserConfig);
+        }
         if (embeddedParser != null) {
             parseContext.set(Parser.class, embeddedParser);
         }
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 7b35fec..f851e97 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -80,8 +80,8 @@ public abstract class CXFTestBase {
     }
 
     @Before
-    public void setUp() {
-        this.tika = TikaConfig.getDefaultConfig();
+    public void setUp() throws Exception {
+        this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
         TikaResource.init(tika,
                 new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
                 new DefaultInputStreamFactory());
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
index e4e60a5..eadacfa 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
@@ -96,12 +96,12 @@ public class TikaParsersTest extends CXFTestBase {
                     .get();
 
             String text = getStringFromInputStream((InputStream) response.getEntity());
-            assertContains("<h2>DefaultParser</h2>", text);
+            assertContains("<h3>DefaultParser</h3>", text);
             assertContains("Composite", text);
 
-            assertContains("<h3>OpusParser", text);
-            assertContains("<h3>PackageParser", text);
-            assertContains("<h3>OOXMLParser", text);
+            assertContains("<h4>OpusParser", text);
+            assertContains("<h4>PackageParser", text);
+            assertContains("<h4>OOXMLParser", text);
 
             assertContains(OpusParser.class.getName(), text);
             assertContains(PackageParser.class.getName(), text);
@@ -138,46 +138,51 @@ public class TikaParsersTest extends CXFTestBase {
             assertEquals(true, json.containsKey("name"));
             assertEquals(true, json.containsKey("composite"));
             assertEquals(true, json.containsKey("children"));
-            assertEquals("org.apache.tika.parser.DefaultParser", json.get("name"));
+            assertEquals("org.apache.tika.parser.CompositeParser", json.get("name"));
             assertEquals(Boolean.TRUE, json.get("composite"));
 
             // At least 20 child parsers which aren't composite, except for CompositeExternalParser
             Object[] children = (Object[]) (Object) json.get("children");
-            assertTrue(children.length >= 20);
-            boolean hasOpus = false, hasOOXML = false, hasPDF = false, hasZip = false;
+            assertTrue(children.length >= 2);
+            boolean hasOpus = false, hasOOXML = false, hasZip = false;
             int nonComposite = 0;
             int composite = 0;
             for (Object o : children) {
-                Map<String, Object> d = (Map<String, Object>) o;
-                assertEquals(true, d.containsKey("name"));
-                assertEquals(true, d.containsKey("composite"));
-
-                if (d.get("composite") == Boolean.FALSE)
-                	nonComposite++;
-                else
-                	composite++;
-                
-                // Will only have mime types if requested
-                if (d.get("composite") == Boolean.FALSE)
-                	assertEquals(details, d.containsKey("supportedTypes"));
-
-                String name = (String) d.get("name");
-                if (OpusParser.class.getName().equals(name)) {
-                    hasOpus = true;
-                }
-                if (OOXMLParser.class.getName().equals(name)) {
-                    hasOOXML = true;
-                }
-                if (PDFParser.class.getName().equals(name)) {
-                    hasPDF = true;
+                Map<String, Object> child = (Map<String, Object>) o;
+                assertEquals(true, child.containsKey("name"));
+                assertEquals(true, child.containsKey("composite"));
+
+                Object[] grandChildrenArr = (Object[]) child.get("children");
+                if (grandChildrenArr == null) {
+                    continue;
                 }
-                if (PackageParser.class.getName().equals(name)) {
-                    hasZip = true;
+                assertTrue(grandChildrenArr.length > 50);
+                for (Object grandChildO : grandChildrenArr) {
+                    Map<String, Object> grandChildren = (Map<String, Object>) grandChildO;
+
+                    if (grandChildren.get("composite") == Boolean.FALSE)
+                        nonComposite++;
+                    else
+                        composite++;
+
+                    // Will only have mime types if requested
+                    if (grandChildren.get("composite") == Boolean.FALSE)
+                        assertEquals(details, grandChildren.containsKey("supportedTypes"));
+
+                    String name = (String) grandChildren.get("name");
+                    if (OpusParser.class.getName().equals(name)) {
+                        hasOpus = true;
+                    }
+                    if (OOXMLParser.class.getName().equals(name)) {
+                        hasOOXML = true;
+                    }
+                    if (PackageParser.class.getName().equals(name)) {
+                        hasZip = true;
+                    }
                 }
             }
             assertEquals(true, hasOpus);
             assertEquals(true, hasOOXML);
-            assertEquals(true, hasPDF);
             assertEquals(true, hasZip);
             assertTrue(nonComposite > 20);
             assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 295ce74..b519170 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -258,6 +258,44 @@ public class TikaResourceTest extends CXFTestBase {
         assertEquals(500, response.getStatus());
     }
 
+    //TIKA-2669
+    @Test
+    public void testPDFConfig() throws Exception {
+
+        Response response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+        String responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+        assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+                responseMsg);
+
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"sortByPosition", "false")
+                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+        assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);
+
+        //make sure that default reverts to initial config option
+        response = WebClient.create(endPoint + TIKA_PATH)
+                .type("application/pdf")
+                .accept("text/plain")
+                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+        responseMsg = getStringFromInputStream((InputStream) response
+                .getEntity());
+        responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+        assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+                responseMsg);
+
+    }
+
+
     @Test
     public void testExtractTextAcceptPlainText() throws Exception {
         //TIKA-2384
diff --git a/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
new file mode 100644
index 0000000..8867655
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
new file mode 100644
index 0000000..f24e9e7
Binary files /dev/null and b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf differ