You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by se...@apache.org on 2013/12/05 17:28:25 UTC

svn commit: r1548195 - in /tika/trunk/tika-server/src: main/java/org/apache/tika/server/ test/java/org/apache/tika/server/

Author: sergeyb
Date: Thu Dec  5 16:28:25 2013
New Revision: 1548195

URL: http://svn.apache.org/r1548195
Log:
[TIKA-1198] Support for multipart payloads

Modified:
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataEP.java Thu Dec  5 16:28:25 2013
@@ -17,20 +17,24 @@
 
 package org.apache.tika.server;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.xml.sax.helpers.DefaultHandler;
+import java.io.InputStream;
 
 import javax.ws.rs.POST;
 import javax.ws.rs.Path;
 import javax.ws.rs.PathParam;
 import javax.ws.rs.Produces;
-import javax.ws.rs.core.*;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
 import javax.ws.rs.core.Response.Status;
+import javax.ws.rs.core.UriInfo;
 
-import java.io.InputStream;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * This JAX-RS endpoint provides access to the metadata contained within a
@@ -50,7 +54,7 @@ public class MetadataEP {
 
   public MetadataEP(@Context HttpHeaders httpHeaders, @Context UriInfo info) {
     parser = TikaResource.createParser();
-    TikaResource.fillMetadata(parser, metadata, httpHeaders);
+    TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
     TikaResource.logRequest(logger, info, metadata);
   }
 

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/MetadataResource.java Thu Dec  5 16:28:25 2013
@@ -17,35 +17,51 @@
 
 package org.apache.tika.server;
 
-import au.com.bytecode.opencsv.CSVWriter;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.xml.sax.helpers.DefaultHandler;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
 
+import javax.ws.rs.Consumes;
 import javax.ws.rs.PUT;
 import javax.ws.rs.Path;
 import javax.ws.rs.Produces;
 import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.StreamingOutput;
 import javax.ws.rs.core.UriInfo;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import au.com.bytecode.opencsv.CSVWriter;
 
 @Path("/meta{id:(/.*)?}")
 public class MetadataResource {
   private static final Log logger = LogFactory.getLog(MetadataResource.class);
 
   @PUT
+  @Consumes("multipart/form-data")
+  @Produces("text/csv")
+  public StreamingOutput getMetadataFromMultipart(Attachment att, @Context UriInfo info) throws Exception {
+	  return produceMetadata(att.getObject(InputStream.class), att.getHeaders(), info);
+  }
+  
+  @PUT
   @Produces("text/csv")
   public StreamingOutput getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
+	  return produceMetadata(is, httpHeaders.getRequestHeaders(), info);
+  }
+  
+  private StreamingOutput produceMetadata(InputStream is, MultivaluedMap<String, String> httpHeaders, UriInfo info) throws Exception {
     final Metadata metadata = new Metadata();
     AutoDetectParser parser = TikaResource.createParser();
     TikaResource.fillMetadata(parser, metadata, httpHeaders);

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/TikaResource.java Thu Dec  5 16:28:25 2013
@@ -22,7 +22,6 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -36,6 +35,7 @@ import javax.ws.rs.Produces;
 import javax.ws.rs.WebApplicationException;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.MultivaluedMap;
 import javax.ws.rs.core.Response;
 import javax.ws.rs.core.StreamingOutput;
 import javax.ws.rs.core.UriInfo;
@@ -47,6 +47,7 @@ import javax.xml.transform.stream.Stream
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.poi.extractor.ExtractorFactory;
 import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.tika.detect.Detector;
@@ -101,12 +102,12 @@ public class TikaResource {
     return parser;
   }
 
-  public static String detectFilename(HttpHeaders httpHeaders) {
+  public static String detectFilename(MultivaluedMap<String, String> httpHeaders) {
 
-    List<String> disposition = httpHeaders.getRequestHeader("Content-Disposition");
-    if (disposition != null && !disposition.isEmpty()) {
+    String disposition = httpHeaders.getFirst("Content-Disposition");
+    if (disposition != null) {
       try {
-        ContentDisposition c = new ContentDisposition(disposition.get(0));
+        ContentDisposition c = new ContentDisposition(disposition);
 
         // only support "attachment" dispositions
         if ("attachment".equals(c.getDisposition())) {
@@ -121,21 +122,19 @@ public class TikaResource {
     }
 
     // this really should not be used, since it's not an official field
-    List<String> fileName = httpHeaders.getRequestHeader("File-Name");
-    if (fileName != null && !fileName.isEmpty()) {
-      return fileName.get(0);
-    }
-    return null;
+    return httpHeaders.getFirst("File-Name");
   }
 
   @SuppressWarnings("serial")
-public static void fillMetadata(AutoDetectParser parser, Metadata metadata, HttpHeaders httpHeaders) {
+public static void fillMetadata(AutoDetectParser parser, Metadata metadata, MultivaluedMap<String, String> httpHeaders) {
     String fileName = detectFilename(httpHeaders);
     if (fileName != null) {
       metadata.set(TikaMetadataKeys.RESOURCE_NAME_KEY, fileName);
     }
 
-    javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
+    String contentTypeHeader = httpHeaders.getFirst(HttpHeaders.CONTENT_TYPE);
+    javax.ws.rs.core.MediaType mediaType = contentTypeHeader == null ? null 
+        : javax.ws.rs.core.MediaType.valueOf(contentTypeHeader);
     if (mediaType!=null && "xml".equals(mediaType.getSubtype()) ) {
       mediaType = null;
     }
@@ -164,9 +163,19 @@ public static void fillMetadata(AutoDete
   }
 
   @PUT
+  @Consumes("multipart/form-data")
+  @Produces("text/plain")
+  public StreamingOutput getTextFromMultipart(Attachment att, @Context final UriInfo info) {
+	  return produceText(att.getObject(InputStream.class), att.getHeaders(), info);
+  }
+  
+  @PUT
   @Consumes("*/*")
   @Produces("text/plain")
   public StreamingOutput getText(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+	  return produceText(is, httpHeaders.getRequestHeaders(), info);
+  }
+  public StreamingOutput produceText(final InputStream is, MultivaluedMap<String, String> httpHeaders, final UriInfo info) {	  
     final AutoDetectParser parser = createParser();
     final Metadata metadata = new Metadata();
 
@@ -183,9 +192,7 @@ public static void fillMetadata(AutoDete
         TikaInputStream tis = TikaInputStream.get(is);
 
         try {
-          tis.getFile();
-
-          parser.parse(tis, body, metadata);
+            parser.parse(tis, body, metadata);
         } catch (SAXException e) {
           throw new WebApplicationException(e);
         } catch (EncryptedDocumentException e) {
@@ -221,81 +228,36 @@ public static void fillMetadata(AutoDete
     };
   }
 
+  @PUT
+  @Consumes("multipart/form-data")
+  @Produces("text/html")
+  public StreamingOutput getHTMLFromMultipart(Attachment att, @Context final UriInfo info) {
+	  return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "html");
+  }
 
   @PUT
   @Consumes("*/*")
   @Produces("text/html")
   public StreamingOutput getHTML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
-      final AutoDetectParser parser = createParser();
-      final Metadata metadata = new Metadata();
-
-      fillMetadata(parser, metadata, httpHeaders);
-
-      logRequest(logger, info, metadata);
-
-      return new StreamingOutput() {
-          public void write(OutputStream outputStream)
-          throws IOException, WebApplicationException {
-              Writer writer = new OutputStreamWriter(outputStream, "UTF-8");
-              ContentHandler content;
-
-              try {
-                  SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
-                  TransformerHandler handler = factory.newTransformerHandler( );
-                  handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
-                  handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-                  handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
-                  handler.setResult(new StreamResult(writer));
-                  content = new ExpandedTitleContentHandler( handler );
-              }
-              catch ( TransformerConfigurationException e ) {
-                  throw new WebApplicationException( e );
-              }
-
-              TikaInputStream tis = TikaInputStream.get(is);
-
-              try {
-                  tis.getFile();
-                  parser.parse(tis, content, metadata);
-              }
-              catch (SAXException e) {
-                  throw new WebApplicationException(e);
-              }
-              catch (EncryptedDocumentException e) {
-                  logger.warn(String.format(
-                          "%s: Encrypted document",
-                          info.getPath()
-                  ), e);
-                  throw new WebApplicationException(e, Response.status(422).build());
-              }
-              catch (TikaException e) {
-                  logger.warn(String.format(
-                          "%s: Text extraction failed",
-                          info.getPath()
-                  ), e);
-
-                  if (e.getCause()!=null && e.getCause() instanceof WebApplicationException)
-                      throw (WebApplicationException) e.getCause();
-
-                  if (e.getCause()!=null && e.getCause() instanceof IllegalStateException)
-                      throw new WebApplicationException(Response.status(422).build());
-
-                  if (e.getCause()!=null && e.getCause() instanceof OldWordFileFormatException)
-                      throw new WebApplicationException(Response.status(422).build());
-
-                  throw new WebApplicationException(Response.Status.INTERNAL_SERVER_ERROR);
-              }
-              finally {
-                  tis.close();
-              }
-          }
-      };
+	  return produceOutput(is, httpHeaders.getRequestHeaders(), info, "html");
   }
 
   @PUT
+  @Consumes("multipart/form-data")
+  @Produces("text/xml")
+  public StreamingOutput getXMLFromMultipart(Attachment att, @Context final UriInfo info) {
+	  return produceOutput(att.getObject(InputStream.class), att.getHeaders(), info, "xml");
+  }
+  
+  @PUT
   @Consumes("*/*")
   @Produces("text/xml")
   public StreamingOutput getXML(final InputStream is, @Context HttpHeaders httpHeaders, @Context final UriInfo info) {
+    return produceOutput(is, httpHeaders.getRequestHeaders(), info, "xml");
+  }
+  
+  private StreamingOutput produceOutput(final InputStream is, final MultivaluedMap<String, String> httpHeaders, 
+        final UriInfo info, final String format) {
     final AutoDetectParser parser = createParser();
     final Metadata metadata = new Metadata();
 
@@ -312,7 +274,7 @@ public static void fillMetadata(AutoDete
         try {
           SAXTransformerFactory factory = (SAXTransformerFactory)SAXTransformerFactory.newInstance( );
           TransformerHandler handler = factory.newTransformerHandler( );
-          handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+          handler.getTransformer().setOutputProperty(OutputKeys.METHOD, format);
           handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
           handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
           handler.setResult(new StreamResult(writer));
@@ -325,7 +287,6 @@ public static void fillMetadata(AutoDete
         TikaInputStream tis = TikaInputStream.get(is);
 
         try {
-          tis.getFile();
           parser.parse(tis, content, metadata);
         }
         catch (SAXException e) {

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/UnpackerResource.java Thu Dec  5 16:28:25 2013
@@ -17,10 +17,33 @@
 
 package org.apache.tika.server;
 
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+import javax.ws.rs.PUT;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.Context;
+import javax.ws.rs.core.HttpHeaders;
+import javax.ws.rs.core.Response;
+import javax.ws.rs.core.UriInfo;
+
 import org.apache.commons.lang.mutable.MutableInt;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -37,18 +60,6 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import javax.ws.rs.PUT;
-import javax.ws.rs.Path;
-import javax.ws.rs.Produces;
-import javax.ws.rs.WebApplicationException;
-import javax.ws.rs.core.Context;
-import javax.ws.rs.core.HttpHeaders;
-import javax.ws.rs.core.Response;
-import javax.ws.rs.core.UriInfo;
-import java.io.*;
-import java.util.HashMap;
-import java.util.Map;
-
 @Path("/")
 public class UnpackerResource {
   private static final Log logger = LogFactory.getLog(UnpackerResource.class);
@@ -93,7 +104,7 @@ public class UnpackerResource {
 
     AutoDetectParser parser = TikaResource.createParser();
 
-    TikaResource.fillMetadata(parser, metadata, httpHeaders);
+    TikaResource.fillMetadata(parser, metadata, httpHeaders.getRequestHeaders());
     TikaResource.logRequest(logger, info, metadata);
 
     ContentHandler ch;

Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java?rev=1548195&r1=1548194&r2=1548195&view=diff
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java (original)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java Thu Dec  5 16:28:25 2013
@@ -26,6 +26,7 @@ import org.apache.cxf.endpoint.Server;
 import org.apache.cxf.jaxrs.JAXRSBindingFactory;
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.Test;
 
@@ -151,4 +152,20 @@ public class TikaResourceTest extends CX
 
     assertEquals(UNPROCESSEABLE, response.getStatus());
   }
+  
+  @Test
+  public void testSimpleWordMultipartXML() throws Exception {
+    ClassLoader.getSystemResourceAsStream(TEST_DOC);  
+	Attachment attachmentPart = 
+        new Attachment("myworddoc", "application/msword", ClassLoader.getSystemResourceAsStream(TEST_DOC));
+	WebClient webClient = WebClient.create(endPoint + TIKA_PATH);
+	WebClient.getConfig(webClient).getHttpConduit().getClient().setReceiveTimeout(1000000L);
+    Response response = webClient.type("multipart/form-data")
+      .accept("text/xml")
+      .put(attachmentPart);
+    String responseMsg = getStringFromInputStream((InputStream) response
+      .getEntity());
+    assertTrue(responseMsg.contains("test"));
+  }
+  
 }