You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/14 10:12:18 UTC
[tika] branch main updated: TIKA-3352 -- add a json option for the /tika endpoint
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7a896cd TIKA-3352 -- add a json option for the /tika endpoint
7a896cd is described below
commit 7a896cd5068e6135d116d03dada39b56a9c52f5a
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 14 06:11:59 2021 -0400
TIKA-3352 -- add a json option for the /tika endpoint
---
CHANGES.txt | 2 +
.../tika/server/classic/TikaResourceTest.java | 39 +++++++++
.../apache/tika/server/core/TikaServerProcess.java | 2 +-
.../core/resource/RecursiveMetadataResource.java | 4 +-
.../tika/server/core/resource/TikaResource.java | 92 +++++++++++++++++++++-
.../org/apache/tika/server/core/CXFTestBase.java | 15 +++-
.../server/core/RecursiveMetadataResourceTest.java | 77 ++++++++++++++++++
.../apache/tika/server/core/StackTraceOffTest.java | 12 ++-
.../apache/tika/server/core/StackTraceTest.java | 12 ++-
.../core/TikaResourceMetadataFilterTest.java | 82 +++++++++++++++++++
...ourceTest.java => TikaResourceNoStackTest.java} | 82 +++++++------------
.../apache/tika/server/core/TikaResourceTest.java | 87 ++++++++++++++++++++
.../resources/configs/metadata-filter-include.xml | 30 +++++++
.../test-documents/mock/hello_world_long.xml | 30 +++++++
14 files changed, 501 insertions(+), 65 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 17e2cc5..0524ecf 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -44,7 +44,9 @@ Release 2.0.0-ALPHA - 01/13/2021
This output is not available in tika-server-core.
+Release 1.27 - ??
+ * Add json output for /tika endpoint in tika-server (TIKA-3352).
* Tika's OpenNLPDetector now covers 148 languages and language-script pairs (TIKA-3340).
Release 1.26 - 03/24/2021
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index 54a5909..e0855cb 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -23,6 +23,7 @@ import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -42,6 +43,10 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.server.classic.config.PDFServerConfig;
import org.apache.tika.server.classic.config.TesseractServerConfig;
@@ -50,6 +55,7 @@ import org.apache.tika.server.core.TikaServerParseExceptionMapper;
import org.apache.tika.server.core.config.DocumentSelectorConfig;
import org.apache.tika.server.core.config.PasswordProviderConfig;
import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
public class TikaResourceTest extends CXFTestBase {
public static final String TEST_DOC = "test-documents/test.doc";
@@ -74,6 +80,7 @@ public class TikaResourceTest extends CXFTestBase {
protected void setUpProviders(JAXRSServerFactoryBean sf) {
List<Object> providers = new ArrayList<>();
providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
sf.setProviders(providers);
}
@@ -562,4 +569,36 @@ public class TikaResourceTest extends CXFTestBase {
return new MultipartBody(att);
}
+ @Test
+ public void testJson() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/text")
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+ assertContains("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("General Congress", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotFound("<p", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ }
+
+ @Test
+ public void testJsonWriteLimitEmbedded() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/text")
+ .accept("application/json")
+ .header("writeLimit", "500")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+ assertContains("embed2a.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("When in the Course", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotFound("declare the causes", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
+ "org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"));
+ assertNotFound("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+
+ }
}
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
index 9dd6787..3b29bfd 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/TikaServerProcess.java
@@ -232,7 +232,7 @@ public class TikaServerProcess {
serverThread.start();
}
- TikaResource.init(tika, digester, inputStreamFactory, serverStatus);
+ TikaResource.init(tika, tikaServerConfig, digester, inputStreamFactory, serverStatus);
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
List<ResourceProvider> resourceProviders = new ArrayList<>();
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 09b333a..ae55b52 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -51,8 +51,8 @@ import org.apache.tika.server.core.TikaServerParseException;
@Path("/rmeta")
public class RecursiveMetadataResource {
- private static final String HANDLER_TYPE_PARAM = "handler";
- private static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
+ protected static final String HANDLER_TYPE_PARAM = "handler";
+ protected static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
BasicContentHandlerFactory.HANDLER_TYPE.XML;
private static final Logger LOG = LoggerFactory.getLogger(RecursiveMetadataResource.class);
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index e129da4..e7dcf83 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -18,6 +18,8 @@
package org.apache.tika.server.core.resource;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.server.core.resource.RecursiveMetadataResource.DEFAULT_HANDLER_TYPE;
+import static org.apache.tika.server.core.resource.RecursiveMetadataResource.HANDLER_TYPE_PARAM;
import java.io.IOException;
import java.io.InputStream;
@@ -36,6 +38,7 @@ import javax.ws.rs.GET;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
@@ -62,12 +65,14 @@ import org.xml.sax.SAXException;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ExpandedTitleContentHandler;
import org.apache.tika.sax.RichTextContentHandler;
@@ -76,7 +81,9 @@ import org.apache.tika.server.core.CompositeParseContextConfig;
import org.apache.tika.server.core.InputStreamFactory;
import org.apache.tika.server.core.ParseContextConfig;
import org.apache.tika.server.core.ServerStatus;
+import org.apache.tika.server.core.TikaServerConfig;
import org.apache.tika.server.core.TikaServerParseException;
+import org.apache.tika.utils.ExceptionUtils;
@Path("/tika")
public class TikaResource {
@@ -87,6 +94,7 @@ public class TikaResource {
private static final Logger LOG = LoggerFactory.getLogger(TikaResource.class);
private static Pattern ALLOWABLE_HEADER_CHARS = Pattern.compile("(?i)^[-/_+\\.A-Z0-9 ]+$");
private static TikaConfig tikaConfig;
+ private static TikaServerConfig tikaServerConfig;
private static DigestingParser.Digester digester = null;
private static InputStreamFactory inputStreamFactory = null;
private static ServerStatus SERVER_STATUS = null;
@@ -94,9 +102,11 @@ public class TikaResource {
private static ParseContextConfig PARSE_CONTEXT_CONFIG = new CompositeParseContextConfig();
- public static void init(TikaConfig config, DigestingParser.Digester digestr,
+ public static void init(TikaConfig config, TikaServerConfig tikaServerConfg,
+ DigestingParser.Digester digestr,
InputStreamFactory iSF, ServerStatus serverStatus) {
tikaConfig = config;
+ tikaServerConfig = tikaServerConfg;
digester = digestr;
inputStreamFactory = iSF;
SERVER_STATUS = serverStatus;
@@ -508,6 +518,86 @@ public class TikaResource {
httpHeaders.getRequestHeaders(), info, "xml");
}
+ @POST
+ @Consumes("multipart/form-data")
+ @Produces("application/json")
+ @Path("form{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
+ public Metadata getJsonFromMultipart(Attachment att,
+ @Context HttpHeaders httpHeaders,
+ @Context final UriInfo info,
+ @PathParam(HANDLER_TYPE_PARAM)
+ String handlerTypeName)
+ throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ parseToMetadata(getInputStream(att.getObject(InputStream.class), metadata, httpHeaders),
+ metadata, preparePostHeaderMap(att, httpHeaders), info, handlerTypeName);
+ TikaResource.getConfig().getMetadataFilter().filter(metadata);
+ return metadata;
+ }
+
+ @PUT
+ @Consumes("*/*")
+ @Produces("application/json")
+ @Path("{" + HANDLER_TYPE_PARAM + " : (\\w+)?}")
+ public Metadata getJson(final InputStream is, @Context
+ HttpHeaders httpHeaders,
+ @Context final UriInfo info, @PathParam(HANDLER_TYPE_PARAM)
+ String handlerTypeName)
+ throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ parseToMetadata(getInputStream(is, metadata, httpHeaders), metadata,
+ httpHeaders.getRequestHeaders(), info, handlerTypeName);
+ TikaResource.getConfig().getMetadataFilter().filter(metadata);
+ return metadata;
+ }
+
+ private void parseToMetadata(InputStream inputStream,
+ Metadata metadata,
+ MultivaluedMap<String, String> httpHeaders,
+ UriInfo info, String handlerTypeName) throws IOException {
+ final Parser parser = createParser();
+ final ParseContext context = new ParseContext();
+
+ fillMetadata(parser, metadata, httpHeaders);
+ fillParseContext(httpHeaders, metadata, context);
+
+ logRequest(LOG, "/tika", metadata);
+ int writeLimit = -1;
+ if (httpHeaders.containsKey("writeLimit")) {
+ writeLimit = Integer.parseInt(httpHeaders.getFirst("writeLimit"));
+ }
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
+ BasicContentHandlerFactory fact = new BasicContentHandlerFactory(type, writeLimit);
+ ContentHandler contentHandler = fact.getNewContentHandler();
+
+ try {
+ parse(parser, LOG, info.getPath(), inputStream, contentHandler, metadata, context);
+ } catch (TikaServerParseException e) {
+ if (tikaServerConfig.isReturnStackTrace()) {
+ Throwable cause = e.getCause();
+ if (cause != null) {
+ metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(cause));
+ } else {
+ metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(e));
+ }
+ } else {
+ throw e;
+ }
+ } catch (OutOfMemoryError e) {
+ if (tikaServerConfig.isReturnStackTrace()) {
+ metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
+ ExceptionUtils.getStackTrace(e));
+ } else {
+ throw e;
+ }
+ } finally {
+ metadata.add(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
+ }
+ }
+
private StreamingOutput produceOutput(final InputStream is, Metadata metadata,
final MultivaluedMap<String, String> httpHeaders,
final UriInfo info, final String format) {
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index a2c34c4..ed8b0f1 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -57,9 +57,9 @@ import org.apache.tika.server.core.resource.UnpackerResource;
public abstract class CXFTestBase {
protected static final String endPoint = "http://localhost:" + TikaServerConfig.DEFAULT_PORT;
- private final static int DIGESTER_READ_LIMIT = 20 * 1024 * 1024;
+ protected final static int DIGESTER_READ_LIMIT = 20 * 1024 * 1024;
protected Server server;
- private TikaConfig tika;
+ protected TikaConfig tika;
public static void assertContains(String needle, String haystack) {
assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
@@ -101,7 +101,10 @@ public abstract class CXFTestBase {
public void setUp() throws Exception {
this.tika = new TikaConfig(getTikaConfigInputStream());
- TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
+ TikaServerConfig tikaServerConfig = getTikaServerConfig();
+ TikaResource.init(tika, tikaServerConfig,
+ new CommonsDigester(DIGESTER_READ_LIMIT, "md5," +
+ "sha1:32"),
getInputStreamFactory(tika), new ServerStatus("", 0, true));
JAXRSServerFactoryBean sf = new JAXRSServerFactoryBean();
//set compression interceptors
@@ -121,6 +124,12 @@ public abstract class CXFTestBase {
server = sf.create();
}
+ protected TikaServerConfig getTikaServerConfig() {
+ TikaServerConfig tikaServerConfig = new TikaServerConfig();
+ tikaServerConfig.setReturnStackTrace(true);
+ return tikaServerConfig;
+ }
+
protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
return new DefaultInputStreamFactory();
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
new file mode 100644
index 0000000..971e0a5
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.core;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+
+public class RecursiveMetadataResourceTest extends CXFTestBase {
+
+ private static final String META_PATH = "/rmeta";
+
+ public static final String TEST_NULL_POINTER = "test-documents/mock/null_pointer.xml";
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ sf.setResourceClasses(RecursiveMetadataResource.class);
+ sf.setResourceProvider(RecursiveMetadataResource.class,
+ new SingletonResourceProvider(new RecursiveMetadataResource()));
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<>();
+ providers.add(new MetadataListMessageBodyWriter());
+ sf.setProviders(providers);
+ }
+
+ @Test
+ public void testNPE() throws Exception {
+ Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ Metadata metadata = metadataList.get(0);
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("some content", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("null pointer message",
+ metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
+
+ }
+
+}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
index e706f58..db9384f 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceOffTest.java
@@ -87,7 +87,11 @@ public class StackTraceOffTest extends CXFTestBase {
if ("/rmeta".equals(path)) {
continue;
}
- Response response = WebClient.create(endPoint + path).accept("*/*")
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
+ Response response = WebClient.create(endPoint + path).accept(accept)
.header("Content-Disposition",
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
.put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
@@ -104,7 +108,11 @@ public class StackTraceOffTest extends CXFTestBase {
if ("/rmeta".equals(path)) {
continue;
}
- Response response = WebClient.create(endPoint + path).accept("*/*")
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
+ Response response = WebClient.create(endPoint + path).accept(accept)
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
index eb52854..eb3ad3c 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/StackTraceTest.java
@@ -83,7 +83,11 @@ public class StackTraceTest extends CXFTestBase {
if ("/rmeta".equals(path)) {
continue;
}
- Response response = WebClient.create(endPoint + path).accept("*/*")
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
+ Response response = WebClient.create(endPoint + path).accept(accept)
.header("Content-Disposition",
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
.put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
@@ -100,7 +104,11 @@ public class StackTraceTest extends CXFTestBase {
if ("/rmeta".equals(path)) {
continue;
}
- Response response = WebClient.create(endPoint + path).accept("*/*")
+ String accept = "*/*";
+ if ("/tika".equals(path)) {
+ accept = "text/plain";
+ }
+ Response response = WebClient.create(endPoint + path).accept(accept)
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceMetadataFilterTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceMetadataFilterTest.java
new file mode 100644
index 0000000..699762f
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceMetadataFilterTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.core;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+
+public class TikaResourceMetadataFilterTest extends CXFTestBase {
+
+ public static final String TEST_HELLO_WORLD = "test-documents/mock/hello_world.xml";
+
+ private static final String TIKA_PATH = "/tika";
+
+ @Override
+ protected InputStream getTikaConfigInputStream() {
+ return getClass().getResourceAsStream("/configs/metadata-filter-include.xml");
+ }
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ sf.setResourceClasses(TikaResource.class);
+ sf.setResourceProvider(TikaResource.class,
+ new SingletonResourceProvider(new TikaResource()));
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<Object>();
+ providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
+ sf.setProviders(providers);
+ }
+
+
+ @Test
+ public void testBasic() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+ assertEquals(2, metadata.names().length);
+ assertNull(metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
+}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
similarity index 50%
copy from tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
copy to tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
index f3dced5..43a3620 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
@@ -18,25 +18,25 @@
package org.apache.tika.server.core;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
-import javax.ws.rs.core.MultivaluedHashMap;
-import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
-import org.apache.cxf.attachment.AttachmentUtil;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+
+public class TikaResourceNoStackTest extends CXFTestBase {
-public class TikaResourceTest extends CXFTestBase {
public static final String TEST_HELLO_WORLD = "test-documents/mock/hello_world.xml";
+ public static final String TEST_HELLO_WORLD_LONG = "test-documents/mock/hello_world_long.xml";
+ public static final String TEST_NULL_POINTER = "test-documents/mock/null_pointer.xml";
public static final String TEST_OOM = "test-documents/mock/fake_oom.xml";
private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";
@@ -45,6 +45,13 @@ public class TikaResourceTest extends CXFTestBase {
private static final int UNPROCESSEABLE = 422;
@Override
+ public TikaServerConfig getTikaServerConfig() {
+ TikaServerConfig tikaServerConfig = new TikaServerConfig();
+ tikaServerConfig.setReturnStackTrace(false);
+ return tikaServerConfig;
+ }
+
+ @Override
protected void setUpResources(JAXRSServerFactoryBean sf) {
sf.setResourceClasses(TikaResource.class);
sf.setResourceProvider(TikaResource.class,
@@ -55,62 +62,29 @@ public class TikaResourceTest extends CXFTestBase {
protected void setUpProviders(JAXRSServerFactoryBean sf) {
List<Object> providers = new ArrayList<Object>();
providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
sf.setProviders(providers);
}
@Test
- public void testHelloWorld() throws Exception {
- Response response =
- WebClient.create(endPoint + TIKA_PATH).type("text/plain").accept("text/plain")
- .get();
- assertEquals(TikaResource.GREETING,
- getStringFromInputStream((InputStream) response.getEntity()));
+ public void testJsonNPE() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+ assertEquals(422, response.getStatus());
+ String content = getStringFromInputStream((InputStream) response.getEntity());
+ assertEquals(0, content.length());
}
@Test
- public void testHeaders() throws Exception {
- MultivaluedMap<String, String> map = new MultivaluedHashMap<>();
- map.addAll("meta_mymeta", "first", "second", "third");
- Response response = WebClient.create(endPoint + TIKA_PATH).headers(map).accept("text/xml")
- .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
- String xml = getStringFromInputStream((InputStream) response.getEntity());
- //can't figure out why these values are comma-delimited, rather
- //than a true list...is this really the expected behavior?
- //this at least tests that the pass-through, basically works...
- //except for multi-values... :D
- assertContains("<meta name=\"mymeta\" content=\"first,second,third\"/>", xml);
+ public void testJsonWriteLimit() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .header("writeLimit", "100")
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ assertEquals(500, response.getStatus());
+ String content = getStringFromInputStream((InputStream) response.getEntity());
+ assertEquals(0, content.length());
}
- @Test
- public void testJAXBAndActivationDependency() {
- //TIKA-2778
- AttachmentUtil.getCommandMap();
- }
-
- @Test
- public void testOOMInLegacyMode() throws Exception {
-
- Response response = null;
- try {
- response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream(TEST_OOM));
- } catch (Exception e) {
- //oom may or may not cause an exception depending
- //on the timing
- }
-
- response = WebClient.create(endPoint + TIKA_PATH).accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
- String responseMsg = getStringFromInputStream((InputStream) response.getEntity());
-
- assertContains("hello world", responseMsg);
- }
-
- @Test
- public void testApplicationWadl() throws Exception {
- Response response =
- WebClient.create(endPoint + TIKA_PATH + "?_wadl").accept("text/plain").get();
- String resp = getStringFromInputStream((InputStream) response.getEntity());
- assertTrue(resp.startsWith("<application"));
- }
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index f3dced5..b058db2 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.ws.rs.core.MultivaluedHashMap;
@@ -33,10 +35,17 @@ import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
public class TikaResourceTest extends CXFTestBase {
+
public static final String TEST_HELLO_WORLD = "test-documents/mock/hello_world.xml";
+ public static final String TEST_HELLO_WORLD_LONG = "test-documents/mock/hello_world_long.xml";
+ public static final String TEST_NULL_POINTER = "test-documents/mock/null_pointer.xml";
public static final String TEST_OOM = "test-documents/mock/fake_oom.xml";
private static final String STREAM_CLOSED_FAULT = "java.io.IOException: Stream Closed";
@@ -55,6 +64,7 @@ public class TikaResourceTest extends CXFTestBase {
protected void setUpProviders(JAXRSServerFactoryBean sf) {
List<Object> providers = new ArrayList<Object>();
providers.add(new TikaServerParseExceptionMapper(false));
+ providers.add(new JSONMessageBodyWriter());
sf.setProviders(providers);
}
@@ -113,4 +123,81 @@ public class TikaResourceTest extends CXFTestBase {
String resp = getStringFromInputStream((InputStream) response.getEntity());
assertTrue(resp.startsWith("<application"));
}
+
+ @Test
+ public void testJson() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testJsonNPE() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept(
+ "application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("some content", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("null pointer message",
+ metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
+ }
+
+ @Test
+ public void testJsonWriteLimit() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .header("writeLimit", "100")
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotFound("dissolve", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
+ "org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"
+ ));
+ }
+
+ @Test
+ public void testJsonHandlerType() throws Exception {
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ //default is xhtml
+ assertContains("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+
+ response = WebClient.create(endPoint + TIKA_PATH + "/text")
+ .accept("application/json")
+ .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
+ metadata =
+ JsonMetadata.fromJson(new InputStreamReader(
+ ((InputStream)response.getEntity()), StandardCharsets.UTF_8));
+
+ assertEquals("Nikolai Lobachevsky", metadata.get("author"));
+ assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotFound("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ }
}
diff --git a/tika-server/tika-server-core/src/test/resources/configs/metadata-filter-include.xml b/tika-server/tika-server-core/src/test/resources/configs/metadata-filter-include.xml
new file mode 100644
index 0000000..3a7a7c1
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/resources/configs/metadata-filter-include.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="list">
+ <string>X-TIKA:content</string>
+ <string>extended-properties:Application</string>
+ <string>Content-Type</string>
+ </param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-server/tika-server-core/src/test/resources/test-documents/mock/hello_world_long.xml b/tika-server/tika-server-core/src/test/resources/test-documents/mock/hello_world_long.xml
new file mode 100644
index 0000000..bf06ad2
--- /dev/null
+++ b/tika-server/tika-server-core/src/test/resources/test-documents/mock/hello_world_long.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<mock>
+ <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
+ <metadata action="add" name="title">你好,世界</metadata>
+ <metadata action="add" name="my-key">parsers-value</metadata>
+ <write element="p">Hello world...</write>
+ <write element="p">When in the Course of human events, it becomes necessary for one people to dissolve the
+ political bands which have connected them with another, and to assume among the powers of the earth, the
+ separate and equal station to which the Laws of Nature and of Nature’s God entitle them, a decent respect
+ to the opinions of mankind requires that they should declare the causes which impel them to the separation.</write>
+</mock>
\ No newline at end of file