You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by se...@apache.org on 2013/12/05 17:28:25 UTC
svn commit: r1548195 - in /tika/trunk/tika-server/src: main/java/org/apache/tika/server/ test/java/org/apache/tika/server/
Author: sergeyb
Date: Thu Dec 5 16:28:25 2013
New Revision: 1548195
URL: http://svn.apache.org/r1548195
Log:
[TIKA-1198] Support for multipart payloads
Modified:
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java Thu Dec 5 16:28:25 2013
@@ -17,20 +17,24 @@
package org.apache.tika.server;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.xml.sax.helpers.DefaultHandler;
+import java.io.InputStream;
import javax.ws.rs.POST;
import javax.ws.rs.Path;
import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
-import javax.ws.rs.core.*;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
import javax.ws.rs.core.Response.Status;
+import javax.ws.rs.core.UriInfo;
-import java.io.InputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
/**
* This JAX-RS endpoint provides access to the metadata contained within a
@@ -50,7 +54,7 @@ public class MetadataEP {
public MetadataEP(@Context HttpHeaders httpHeaders, @Context UriInfo info) {
parser = TikaResource.createParser();
- TikaResource.fillMetadata(parser, metadata, httpHeaders);
+ TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
TikaResource.logRequest(logger, info, metadata);
}
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java Thu Dec 5 16:28:25 2013
@@ -17,35 +17,51 @@
package org.apache.tika.server;
-import au.com.bytecode.opencsv.CSVWriter;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.xml.sax.helpers.DefaultHandler;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import javax.ws.rs.Consumes;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import au.com.bytecode.opencsv.CSVWriter;
@Path("/meta{id:(/.*)?}")
public class MetadataResource {
private static final Log logger = LogFactory.getLog(MetadataResource.class);
@PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/csv")
+ public StreamingOutput getMetadataFromMultipart(Attachment att, @Context UriInfo info) throws Exception {
+ return produceMetadata(att.getObject(InputStream.class), att.getHeaders(), info);
+ }
+
+ @PUT
@Produces("text/csv")
public StreamingOutput getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
+ return produceMetadata(is, httpHeaders.getRequestHeaders(), info);
+ }
+
+ private StreamingOutput produceMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws Exception {
final Metadata metadata = new Metadata();
AutoDetectParser parser = TikaResource.createParser();
TikaResource.fillMetadata(parser, metadata, httpHeaders);
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java Thu Dec 5 16:28:25 2013
@@ -22,7 +22,6 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
-import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -36,6 +35,7 @@ import javax.ws.rs.Produces;
import javax.ws.rs.WebApplicationException;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
import javax.ws.rs.core.Response;
import javax.ws.rs.core.StreamingOutput;
import javax.ws.rs.core.UriInfo;
@@ -47,6 +47,7 @@ import javax.xml.transform.stream.Stream
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.tika.detect.Detector;
@@ -101,12 +102,12 @@ public class TikaResource {
return parser;
}
- public static String detectFilename(HttpHeaders httpHeaders) {
+ public static String detectFilename(MultivaluedMap<String, String> httpHeaders) {
- List<String> disposition = httpHeaders.getRequestHeader("Content-Disposition");
- if (disposition != null && !disposition.isEmpty()) {
+ String disposition = httpHeaders.getFirst("Content-Disposition");
+ if (disposition != null) {
try {
- ContentDisposition c = new ContentDisposition(disposition.get(0));
+ ContentDisposition c = new ContentDisposition(disposition);
// only support "attachment" dispositions
if ("attachment".equals(c.getDisposition())) {
@@ -121,21 +122,19 @@ public class TikaResource {
}
// this really should not be used, since it's not an official field
- List<String> fileName = httpHeaders.getRequestHeader("File-Name");
- if (fileName != null && !fileName.isEmpty()) {
- return fileName.get(0);
- }
- return null;
+ return httpHeaders.getFirst("File-Name");
}
@SuppressWarnings("serial")
-public static void fillMetadata(AutoDetectParser parser, Metadata metadata, HttpHeaders httpHeaders) {
+public static void fillMetadata(AutoDetectParser parser, Metadata metadata, MultivaluedMap<String, String> httpHeaders) {
String fileName = detectFilename(httpHeaders);
if (fileName != null) {
metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
}
- javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
+ String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
+ javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null
+ : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
if (mediaType!=null && "xml".equals(mediaType.getSubtype()) ) {
mediaType = null;
}
@@ -164,9 +163,19 @@ public static void fillMetadata(AutoDete
}
@PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/plain")
+ public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) {
+ return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
+ }
+
+ @PUT
@Consumes("*/*")
@Produces("text/plain")
public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ return produceText(is, httpHeaders.getRequestHeaders(), info);
+ }
+ public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) {
final AutoDetectParser parser = createParser();
final Metadata metadata = new Metadata();
@@ -183,9 +192,7 @@ public static void fillMetadata(AutoDete
TikaInputStream tis = TikaInputStream.get(is);
try {
- tis.getFile();
-
- parser.parse(tis, body, metadata);
+ parser.parse(tis, body, metadata);
} catch (SAXException e) {
throw new WebApplicationException(e);
} catch (EncryptedDocumentException e) {
@@ -221,81 +228,36 @@ public static void fillMetadata(AutoDete
};
}
+ @PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/html")
+ public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) {
+ return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "html");
+ }
@PUT
@Consumes("*/*")
@Produces("text/html")
public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
- final AutoDetectParser parser = createParser();
- final Metadata metadata = new Metadata();
-
- fillMetadata(parser, metadata, httpHeaders);
-
- logRequest(logger, info, metadata);
-
- return new StreamingOutput() {
- public void write(OutputStream outputStream)
- throws IOException, WebApplicationException {
- Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
- ContentHandler content;
-
- try {
- SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
- TransformerHandler handler = factory.newTransformerHandler( );
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
- handler.setResult(new StreamResult(writer));
- content = new ExpandedTitleContentHandler( handler );
- }
- catch ( TransformerConfigurationException e ) {
- throw new WebApplicationException( e );
- }
-
- TikaInputStream tis = TikaInputStream.get(is);
-
- try {
- tis.getFile();
- parser.parse(tis, content, metadata);
- }
- catch (SAXException e) {
- throw new WebApplicationException(e);
- }
- catch (EncryptedDocumentException e) {
- logger.warn(String.format(
- "%s: Encrypted document",
- info.getPath()
- ), e);
- throw new WebApplicationException(e, Response.status(422).build());
- }
- catch (TikaException e) {
- logger.warn(String.format(
- "%s: Text extraction failed",
- info.getPath()
- ), e);
-
- if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
- throw (WebApplicationException) e.getCause();
-
- if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
- throw new WebApplicationException(Response.status(422).build());
-
- if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
- throw new WebApplicationException(Response.status(422).build());
-
- throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
- }
- finally {
- tis.close();
- }
- }
- };
+ return produceOutput(is, httpHeaders.getRequestHeaders(), info, "html");
}
@PUT
+ @Consumes("multipart/form-data")
+ @Produces("text/xml")
+ public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) {
+ return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "xml");
+ }
+
+ @PUT
@Consumes("*/*")
@Produces("text/xml")
public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+ return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
+ }
+
+ private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders,
+ final UriInfo info, final String format) {
final AutoDetectParser parser = createParser();
final Metadata metadata = new Metadata();
@@ -312,7 +274,7 @@ public static void fillMetadata(AutoDete
try {
SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
TransformerHandler handler = factory.newTransformerHandler( );
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
handler.setResult(new StreamResult(writer));
@@ -325,7 +287,6 @@ public static void fillMetadata(AutoDete
TikaInputStream tis = TikaInputStream.get(is);
try {
- tis.getFile();
parser.parse(tis, content, metadata);
}
catch (SAXException e) {
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java Thu Dec 5 16:28:25 2013
@@ -17,10 +17,33 @@
package org.apache.tika.server;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
@@ -37,18 +60,6 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import javax.ws.rs.PUT;
-import javax.ws.rs.Path;
-import javax.ws.rs.Produces;
-import javax.ws.rs.WebApplicationException;
-import javax.ws.rs.core.Context;
-import javax.ws.rs.core.HttpHeaders;
-import javax.ws.rs.core.Response;
-import javax.ws.rs.core.UriInfo;
-import java.io.*;
-import java.util.HashMap;
-import java.util.Map;
-
@Path("/")
public class UnpackerResource {
private static final Log logger = LogFactory.getLog(UnpackerResource.class);
@@ -93,7 +104,7 @@ public class UnpackerResource {
AutoDetectParser parser = TikaResource.createParser();
- TikaResource.fillMetadata(parser, metadata, httpHeaders);
+ TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
TikaResource.logRequest(logger, info, metadata);
ContentHandler ch;
Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (original)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Thu Dec 5 16:28:25 2013
@@ -26,6 +26,7 @@ import org.apache.cxf.endpoint.Server;
import org.apache.cxf.jaxrs.JAXRSBindingFactory;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
@@ -151,4 +152,20 @@ public class TikaResourceTest extends CX
assertEquals(UNPROCESSEABLE, response.getStatus());
}
+
+ @Test
+ public void testSimpleWordMultipartXML() throws Exception {
+ ClassLoader.getSystemResourceAsStream(TEST_DOC);
+ Attachment attachmentPart =
+ new Attachment("myworddoc", "application/msword", ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ WebClient webClient = WebClient.create(endPoint + TIKA_PATH);
+ WebClient.getConfig(webClient).getHttpConduit().getClient().setReceiveTimeout(1000000L);
+ Response response = webClient.type("multipart/form-data")
+ .accept("text/xml")
+ .put(attachmentPart);
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ assertTrue(responseMsg.contains("test"));
+ }
+
}