You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/08/28 14:59:25 UTC

svn commit: r1698329 - in /tika/trunk: ./ tika-batch/src/main/java/org/apache/tika/batch/builders/ tika-core/src/main/java/org/apache/tika/sax/ tika-server/src/main/java/org/apache/tika/server/resource/ tika-server/src/test/java/org/apache/tika/server/

Author: tallison
Date: Fri Aug 28 12:59:24 2015
New Revision: 1698329

URL: http://svn.apache.org/r1698329
Log:
TIKA-1716: change the default /rmeta content handler to xml and allow users to specify, via a path parameter, which content handler to use for content

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
    tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
    tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Aug 28 12:59:24 2015
@@ -1,5 +1,9 @@
 Release 1.11 - Current Development
 
+  * Changed default content handler type for "/rmeta" in tika-server
+    to "xml" to align with "-J" option in tika-app.  
+    Clients can now specify handler types via PathParam (TIKA-1716).
+
   * The fantastic GROBID (or Grobid) GeneRation Of BIbliographic Data
     for machine learning from PDF files is now integrated as a 
     Tika parser (TIKA-1699, TIKA-1712).
@@ -9,7 +13,7 @@ Release 1.11 - Current Development
 
   * Upgraded to ASM 5.0.4 (TIKA-1705).
 
-  * Corrected Tika Config XML detector defintion explicit loading 
+  * Corrected Tika Config XML detector definition explicit loading 
     of MimeTypes (TIKA-1708)
 
   * In Tika Parsers, Batch, Server, App and Examples, use Apache

Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java Fri Aug 28 12:59:24 2015
@@ -17,7 +17,6 @@ package org.apache.tika.batch.builders;
  * limitations under the License.
  */
 
-import java.util.Locale;
 import java.util.Map;
 
 import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -40,25 +39,8 @@ public class DefaultContentHandlerFactor
         Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
         BasicContentHandlerFactory.HANDLER_TYPE type = null;
         String handlerTypeString = attributes.get("basicHandlerType");
-        if (handlerTypeString == null) {
-            handlerTypeString = "text";
-        }
-        handlerTypeString = handlerTypeString.toLowerCase(Locale.ROOT);
-        if (handlerTypeString.equals("xml")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.XML;
-        } else if (handlerTypeString.equals("text")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
-        } else if (handlerTypeString.equals("txt")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
-        } else if (handlerTypeString.equals("html")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
-        } else if (handlerTypeString.equals("body")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
-        } else if (handlerTypeString.equals("ignore")) {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
-        } else {
-            type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
-        }
+        type = BasicContentHandlerFactory.parseHandlerType(handlerTypeString,
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
         int writeLimit = -1;
         String writeLimitString = attributes.get("writeLimit");
         if (writeLimitString != null) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Fri Aug 28 12:59:24 2015
@@ -19,6 +19,7 @@ package org.apache.tika.sax;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.UnsupportedEncodingException;
+import java.util.Locale;
 
 import org.xml.sax.ContentHandler;
 import org.xml.sax.helpers.DefaultHandler;
@@ -29,6 +30,33 @@ import org.xml.sax.helpers.DefaultHandle
 public class BasicContentHandlerFactory implements ContentHandlerFactory {
 
     /**
+     * Tries to parse string into handler type.  Returns default if string is null or
+     * parse fails.
+     * <p>
+     * Options: xml, html, text, body, ignore (no content)
+     *
+     * @param handlerTypeName string to parse
+     * @param defaultType type to return if parse fails
+     * @return handler type
+     */
+    public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) {
+        if (handlerTypeName == null) {
+            return defaultType;
+        }
+
+        String lcHandlerTypeName = handlerTypeName.toLowerCase(Locale.ROOT);
+        switch (lcHandlerTypeName) {
+            case "xml" : return HANDLER_TYPE.XML;
+            case "text" : return HANDLER_TYPE.TEXT;
+            case "txt" : return HANDLER_TYPE.TEXT;
+            case "html" : return HANDLER_TYPE.HTML;
+            case "body" : return HANDLER_TYPE.BODY;
+            case "ignore" : return HANDLER_TYPE.IGNORE;
+            default : return defaultType;
+        }
+    }
+
+    /**
      * Common handler types for content.
      */
     public enum HANDLER_TYPE {
@@ -37,7 +65,7 @@ public class BasicContentHandlerFactory
         TEXT,
         HTML,
         XML
-    };
+    }
 
     private final HANDLER_TYPE type;
     private final int writeLimit;

Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java Fri Aug 28 12:59:24 2015
@@ -21,6 +21,7 @@ import javax.ws.rs.Consumes;
 import javax.ws.rs.POST;
 import javax.ws.rs.PUT;
 import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
 import javax.ws.rs.Produces;
 import javax.ws.rs.core.Context;
 import javax.ws.rs.core.HttpHeaders;
@@ -42,32 +43,92 @@ import org.apache.tika.server.MetadataLi
 
 @Path("/rmeta")
 public class RecursiveMetadataResource {
+
+    private static final String HANDLER_TYPE_PARAM = "handler";
+    private static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
+            BasicContentHandlerFactory.HANDLER_TYPE.XML;
     private static final Log logger = LogFactory.getLog(RecursiveMetadataResource.class);
 
+    /**
+     * Returns an InputStream that can be deserialized as a list of
+     * {@link Metadata} objects.
+     * The first in the list represents the main document, and the
+     * rest represent metadata for the embedded objects.  This works
+     * recursively through all descendants of the main document, not
+     * just the immediate children.
+     * <p>
+     * The extracted text content is stored with the key
+     * {@link RecursiveParserWrapper#TIKA_CONTENT}.
+     * <p>
+     * Specify the handler for the content (xml, html, text, body, ignore)
+     * in the path:<br/>
+     * /rmeta/form (default: xml)<br/>
+     * /rmeta/form/xml    (store the content as xml)<br/>
+     * /rmeta/form/text   (store the content as text)<br/>
+     * /rmeta/form/ignore (don't record any content)<br/>
+     *
+     * @param att attachment
+     * @param info uri info
+     * @param handlerTypeName which type of handler to use
+     * @return InputStream that can be deserialized as a list of {@link Metadata} objects
+     * @throws Exception
+     */
     @POST
     @Consumes("multipart/form-data")
-    @Produces({"text/csv", "application/json"})
-    @Path("form")
-    public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info) throws Exception {
+    @Produces({"application/json"})
+    @Path("form{"+HANDLER_TYPE_PARAM+" : (\\w+)?}")
+    public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info,
+                                             @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName)
+            throws Exception {
         return Response.ok(
-                parseMetadata(att.getObject(InputStream.class), att.getHeaders(), info)).build();
+                parseMetadata(att.getObject(InputStream.class), att.getHeaders(), info, handlerTypeName)).build();
     }
 
+    /**
+     * Returns an InputStream that can be deserialized as a list of
+     * {@link Metadata} objects.
+     * The first in the list represents the main document, and the
+     * rest represent metadata for the embedded objects.  This works
+     * recursively through all descendants of the main document, not
+     * just the immediate children.
+     * <p>
+     * The extracted text content is stored with the key
+     * {@link RecursiveParserWrapper#TIKA_CONTENT}.
+     * <p>
+     * Specify the handler for the content (xml, html, text, body, ignore)
+     * in the path:<br/>
+     * /rmeta (default: xml)<br/>
+     * /rmeta/xml    (store the content as xml)<br/>
+     * /rmeta/text   (store the content as text)<br/>
+     * /rmeta/ignore (don't record any content)<br/>
+     *
+     * @param info uri info
+     * @param handlerTypeName which type of handler to use
+     * @return InputStream that can be deserialized as a list of {@link Metadata} objects
+     * @throws Exception
+     */
+
     @PUT
     @Produces("application/json")
-    public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
+    @Path("{"+HANDLER_TYPE_PARAM+" : (\\w+)?}")
+    public Response getMetadata(InputStream is,
+                                @Context HttpHeaders httpHeaders,
+                                @Context UriInfo info,
+                                @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName
+                                ) throws Exception {
         return Response.ok(
-                parseMetadata(is, httpHeaders.getRequestHeaders(), info)).build();
+                parseMetadata(is, httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
     }
 
 	private MetadataList parseMetadata(InputStream is,
-			MultivaluedMap<String, String> httpHeaders, UriInfo info)
+			MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName)
 			throws Exception {
 		final Metadata metadata = new Metadata();
 		final ParseContext context = new ParseContext();
 		Parser parser = TikaResource.createParser();
-		// TODO: parameterize choice of handler and max chars?
-		BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+		// TODO: parameterize choice of max chars/max embedded attachments
+		BasicContentHandlerFactory.HANDLER_TYPE type =
+                BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
 		RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
 				new BasicContentHandlerFactory(type, -1));
 		TikaResource.fillMetadata(parser, metadata, context, httpHeaders);

Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java (original)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java Fri Aug 28 12:59:24 2015
@@ -20,6 +20,8 @@ package org.apache.tika.server;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 
 import javax.ws.rs.core.Response;
 
@@ -31,15 +33,25 @@ import java.util.List;
 
 import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
 import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.server.resource.RecursiveMetadataResource;
 import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
 import org.junit.Test;
 
 public class RecursiveMetadataResourceTest extends CXFTestBase {
+
+    private static final String FORM_PATH = "/form";
     private static final String META_PATH = "/rmeta";
+    private static final String TEXT_PATH = "/text";
+    private static final String IGNORE_PATH = "/ignore";
+    private static final String XML_PATH = "/xml";
+    private static final String UNPARSEABLE_PATH = "/somethingOrOther";
+    private static final String SLASH = "/";
+
     private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
 
     @Override
@@ -51,7 +63,7 @@ public class RecursiveMetadataResourceTe
 
     @Override
     protected void setUpProviders(JAXRSServerFactoryBean sf) {
-        List<Object> providers = new ArrayList<Object>();
+        List<Object> providers = new ArrayList();
         providers.add(new MetadataListMessageBodyWriter());
         sf.setProviders(providers);
     }
@@ -103,4 +115,163 @@ public class RecursiveMetadataResourceTe
         assertNotNull(metadataList.get(0).get("Author"));
         assertEquals("pavel", metadataList.get(0).get("Author"));
     }
+
+    @Test
+    public void testHandlerType() throws Exception {
+        //default unspecified
+        Response response = WebClient
+                .create(endPoint+META_PATH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //extra slash
+        response = WebClient
+                .create(endPoint + META_PATH + SLASH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //unparseable
+        response = WebClient
+                .create(endPoint + META_PATH + UNPARSEABLE_PATH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //xml
+        response = WebClient
+                .create(endPoint + META_PATH + XML_PATH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //text
+        response = WebClient
+                .create(endPoint + META_PATH + TEXT_PATH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("embed_3"));
+
+        //ignore
+        response = WebClient
+                .create(endPoint + META_PATH + IGNORE_PATH)
+                .accept("application/json")
+                .put(ClassLoader
+                        .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+    }
+
+    @Test
+    public void testHandlerTypeInMultipartXML() throws Exception {
+        //default unspecified
+        Attachment attachmentPart =
+                new Attachment("myworddocx",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                        ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        WebClient webClient = WebClient.create(endPoint + META_PATH + FORM_PATH);
+
+        Response response = webClient.type("multipart/form-data")
+                .accept("application/json")
+                .post(attachmentPart);
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //unparseable
+        attachmentPart =
+                new Attachment("myworddocx",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                        ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + UNPARSEABLE_PATH);
+
+        response = webClient.type("multipart/form-data")
+                .accept("application/json")
+                .post(attachmentPart);
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //xml
+        attachmentPart =
+                new Attachment("myworddocx",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                        ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + XML_PATH);
+
+        response = webClient.type("multipart/form-data")
+                .accept("application/json")
+                .post(attachmentPart);
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+        //text
+        attachmentPart =
+                new Attachment("myworddocx",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                        ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        webClient = WebClient.create(endPoint + META_PATH + FORM_PATH+TEXT_PATH);
+
+        response = webClient.type("multipart/form-data")
+                .accept("application/json")
+                .post(attachmentPart);
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+        assertTrue(content.startsWith("embed_3"));
+
+        //ignore -- no content
+        attachmentPart =
+                new Attachment("myworddocx",
+                        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                        ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+        webClient = WebClient.create(endPoint + META_PATH +FORM_PATH+IGNORE_PATH);
+
+        response = webClient.type("multipart/form-data")
+                .accept("application/json").query("handler", "ignore")
+                .post(attachmentPart);
+        reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(12, metadataList.size());
+        assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
+
 }