You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/08/28 14:59:25 UTC
svn commit: r1698329 - in /tika/trunk: ./
tika-batch/src/main/java/org/apache/tika/batch/builders/
tika-core/src/main/java/org/apache/tika/sax/
tika-server/src/main/java/org/apache/tika/server/resource/
tika-server/src/test/java/org/apache/tika/server/
Author: tallison
Date: Fri Aug 28 12:59:24 2015
New Revision: 1698329
URL: http://svn.apache.org/r1698329
Log:
TIKA-1716 change default /rmeta content handler to xml and allow users to specify which content handler to use for content
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Aug 28 12:59:24 2015
@@ -1,5 +1,9 @@
Release 1.11 - Current Development
+ * Changed default content handler type for "/rmeta" in tika-server
+ to "xml" to align with "-J" option in tika-app.
+ Clients can now specify handler types via PathParam. (TIKA-1716).
+
* The fantastic GROBID (or Grobid) GeneRation Of BIbliographic Data
for machine learning from PDF files is now integrated as a
Tika parser (TIKA-1699, TIKA-1712).
@@ -9,7 +13,7 @@ Release 1.11 - Current Development
* Upgraded to ASM 5.0.4 (TIKA-1705).
- * Corrected Tika Config XML detector defintion explicit loading
+ * Corrected Tika Config XML detector definition explicit loading
of MimeTypes (TIKA-1708)
* In Tika Parsers, Batch, Server, App and Examples, use Apache
Modified: tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java (original)
+++ tika/trunk/tika-batch/src/main/java/org/apache/tika/batch/builders/DefaultContentHandlerFactoryBuilder.java Fri Aug 28 12:59:24 2015
@@ -17,7 +17,6 @@ package org.apache.tika.batch.builders;
* limitations under the License.
*/
-import java.util.Locale;
import java.util.Map;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -40,25 +39,8 @@ public class DefaultContentHandlerFactor
Map<String, String> attributes = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
BasicContentHandlerFactory.HANDLER_TYPE type = null;
String handlerTypeString = attributes.get("basicHandlerType");
- if (handlerTypeString == null) {
- handlerTypeString = "text";
- }
- handlerTypeString = handlerTypeString.toLowerCase(Locale.ROOT);
- if (handlerTypeString.equals("xml")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.XML;
- } else if (handlerTypeString.equals("text")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- } else if (handlerTypeString.equals("txt")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- } else if (handlerTypeString.equals("html")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.HTML;
- } else if (handlerTypeString.equals("body")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.BODY;
- } else if (handlerTypeString.equals("ignore")) {
- type = BasicContentHandlerFactory.HANDLER_TYPE.IGNORE;
- } else {
- type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
- }
+ type = BasicContentHandlerFactory.parseHandlerType(handlerTypeString,
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
int writeLimit = -1;
String writeLimitString = attributes.get("writeLimit");
if (writeLimitString != null) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Fri Aug 28 12:59:24 2015
@@ -19,6 +19,7 @@ package org.apache.tika.sax;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
+import java.util.Locale;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
@@ -29,6 +30,33 @@ import org.xml.sax.helpers.DefaultHandle
public class BasicContentHandlerFactory implements ContentHandlerFactory {
/**
+ * Tries to parse string into handler type. Returns default if string is null or
+ * parse fails.
+ * <p>
+ * Options: xml, html, text, body, ignore (no content)
+ *
+ * @param handlerTypeName string to parse
+ * @param defaultType type to return if parse fails
+ * @return handler type
+ */
+ public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) {
+ if (handlerTypeName == null) {
+ return defaultType;
+ }
+
+ String lcHandlerTypeName = handlerTypeName.toLowerCase(Locale.ROOT);
+ switch (lcHandlerTypeName) {
+ case "xml" : return HANDLER_TYPE.XML;
+ case "text" : return HANDLER_TYPE.TEXT;
+ case "txt" : return HANDLER_TYPE.TEXT;
+ case "html" : return HANDLER_TYPE.HTML;
+ case "body" : return HANDLER_TYPE.BODY;
+ case "ignore" : return HANDLER_TYPE.IGNORE;
+ default : return defaultType;
+ }
+ }
+
+ /**
* Common handler types for content.
*/
public enum HANDLER_TYPE {
@@ -37,7 +65,7 @@ public class BasicContentHandlerFactory
TEXT,
HTML,
XML
- };
+ }
private final HANDLER_TYPE type;
private final int writeLimit;
Modified: tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java (original)
+++ tika/trunk/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java Fri Aug 28 12:59:24 2015
@@ -21,6 +21,7 @@ import javax.ws.rs.Consumes;
import javax.ws.rs.POST;
import javax.ws.rs.PUT;
import javax.ws.rs.Path;
+import javax.ws.rs.PathParam;
import javax.ws.rs.Produces;
import javax.ws.rs.core.Context;
import javax.ws.rs.core.HttpHeaders;
@@ -42,32 +43,92 @@ import org.apache.tika.server.MetadataLi
@Path("/rmeta")
public class RecursiveMetadataResource {
+
+ private static final String HANDLER_TYPE_PARAM = "handler";
+ private static final BasicContentHandlerFactory.HANDLER_TYPE DEFAULT_HANDLER_TYPE =
+ BasicContentHandlerFactory.HANDLER_TYPE.XML;
private static final Log logger = LogFactory.getLog(RecursiveMetadataResource.class);
+ /**
+ * Returns an InputStream that can be deserialized as a list of
+ * {@link Metadata} objects.
+ * The first in the list represents the main document, and the
+ * rest represent metadata for the embedded objects. This works
+ * recursively through all descendants of the main document, not
+ * just the immediate children.
+ * <p>
+ * The extracted text content is stored with the key
+ * {@link RecursiveParserWrapper#TIKA_CONTENT}.
+ * <p>
+ * Specify the handler for the content (xml, html, text, body, ignore)
+ * in the path:<br/>
+ * /rmeta/form (default: xml)<br/>
+ * /rmeta/form/xml (store the content as xml)<br/>
+ * /rmeta/form/text (store the content as text)<br/>
+ * /rmeta/form/ignore (don't record any content)<br/>
+ *
+ * @param att attachment
+ * @param info uri info
+ * @param handlerTypeName which type of handler to use
+ * @return InputStream that can be deserialized as a list of {@link Metadata} objects
+ * @throws Exception
+ */
@POST
@Consumes("multipart/form-data")
- @Produces({"text/csv", "application/json"})
- @Path("form")
- public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info) throws Exception {
+ @Produces({"application/json"})
+ @Path("form{"+HANDLER_TYPE_PARAM+" : (\\w+)?}")
+ public Response getMetadataFromMultipart(Attachment att, @Context UriInfo info,
+ @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName)
+ throws Exception {
return Response.ok(
- parseMetadata(att.getObject(InputStream.class), att.getHeaders(), info)).build();
+ parseMetadata(att.getObject(InputStream.class), att.getHeaders(), info, handlerTypeName)).build();
}
+ /**
+ * Returns an InputStream that can be deserialized as a list of
+ * {@link Metadata} objects.
+ * The first in the list represents the main document, and the
+ * rest represent metadata for the embedded objects. This works
+ * recursively through all descendants of the main document, not
+ * just the immediate children.
+ * <p>
+ * The extracted text content is stored with the key
+ * {@link RecursiveParserWrapper#TIKA_CONTENT}.
+ * <p>
+ * Specify the handler for the content (xml, html, text, body, ignore)
+ * in the path:<br/>
+ * /rmeta (default: xml)<br/>
+ * /rmeta/xml (store the content as xml)<br/>
+ * /rmeta/text (store the content as text)<br/>
+ * /rmeta/ignore (don't record any content)<br/>
+ *
+ * @param info uri info
+ * @param handlerTypeName which type of handler to use
+ * @return InputStream that can be deserialized as a list of {@link Metadata} objects
+ * @throws Exception
+ */
+
@PUT
@Produces("application/json")
- public Response getMetadata(InputStream is, @Context HttpHeaders httpHeaders, @Context UriInfo info) throws Exception {
+ @Path("{"+HANDLER_TYPE_PARAM+" : (\\w+)?}")
+ public Response getMetadata(InputStream is,
+ @Context HttpHeaders httpHeaders,
+ @Context UriInfo info,
+ @PathParam(HANDLER_TYPE_PARAM) String handlerTypeName
+ ) throws Exception {
return Response.ok(
- parseMetadata(is, httpHeaders.getRequestHeaders(), info)).build();
+ parseMetadata(is, httpHeaders.getRequestHeaders(), info, handlerTypeName)).build();
}
private MetadataList parseMetadata(InputStream is,
- MultivaluedMap<String, String> httpHeaders, UriInfo info)
+ MultivaluedMap<String, String> httpHeaders, UriInfo info, String handlerTypeName)
throws Exception {
final Metadata metadata = new Metadata();
final ParseContext context = new ParseContext();
Parser parser = TikaResource.createParser();
- // TODO: parameterize choice of handler and max chars?
- BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.HANDLER_TYPE.TEXT;
+ // TODO: parameterize choice of max chars/max embedded attachments
+ BasicContentHandlerFactory.HANDLER_TYPE type =
+ BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser,
new BasicContentHandlerFactory(type, -1));
TikaResource.fillMetadata(parser, metadata, context, httpHeaders);
Modified: tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java?rev=1698329&r1=1698328&r2=1698329&view=diff
==============================================================================
--- tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java (original)
+++ tika/trunk/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java Fri Aug 28 12:59:24 2015
@@ -20,6 +20,8 @@ package org.apache.tika.server;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
import javax.ws.rs.core.Response;
@@ -31,15 +33,25 @@ import java.util.List;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.server.resource.RecursiveMetadataResource;
import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
import org.junit.Test;
public class RecursiveMetadataResourceTest extends CXFTestBase {
+
+ private static final String FORM_PATH = "/form";
private static final String META_PATH = "/rmeta";
+ private static final String TEXT_PATH = "/text";
+ private static final String IGNORE_PATH = "/ignore";
+ private static final String XML_PATH = "/xml";
+ private static final String UNPARSEABLE_PATH = "/somethingOrOther";
+ private static final String SLASH = "/";
+
private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
@Override
@@ -51,7 +63,7 @@ public class RecursiveMetadataResourceTe
@Override
protected void setUpProviders(JAXRSServerFactoryBean sf) {
- List<Object> providers = new ArrayList<Object>();
+ List<Object> providers = new ArrayList();
providers.add(new MetadataListMessageBodyWriter());
sf.setProviders(providers);
}
@@ -103,4 +115,163 @@ public class RecursiveMetadataResourceTe
assertNotNull(metadataList.get(0).get("Author"));
assertEquals("pavel", metadataList.get(0).get("Author"));
}
+
+ @Test
+ public void testHandlerType() throws Exception {
+ //default unspecified
+ Response response = WebClient
+ .create(endPoint+META_PATH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //extra slash
+ response = WebClient
+ .create(endPoint + META_PATH + SLASH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //unparseable
+ response = WebClient
+ .create(endPoint + META_PATH + UNPARSEABLE_PATH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //xml
+ response = WebClient
+ .create(endPoint + META_PATH + XML_PATH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //text
+ response = WebClient
+ .create(endPoint + META_PATH + TEXT_PATH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("embed_3"));
+
+ //ignore
+ response = WebClient
+ .create(endPoint + META_PATH + IGNORE_PATH)
+ .accept("application/json")
+ .put(ClassLoader
+ .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+ }
+
+ @Test
+ public void testHandlerTypeInMultipartXML() throws Exception {
+ //default unspecified
+ Attachment attachmentPart =
+ new Attachment("myworddocx",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ WebClient webClient = WebClient.create(endPoint + META_PATH + FORM_PATH);
+
+ Response response = webClient.type("multipart/form-data")
+ .accept("application/json")
+ .post(attachmentPart);
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ String content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //unparseable
+ attachmentPart =
+ new Attachment("myworddocx",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + UNPARSEABLE_PATH);
+
+ response = webClient.type("multipart/form-data")
+ .accept("application/json")
+ .post(attachmentPart);
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //xml
+ attachmentPart =
+ new Attachment("myworddocx",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + XML_PATH);
+
+ response = webClient.type("multipart/form-data")
+ .accept("application/json")
+ .post(attachmentPart);
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("<html xmlns=\"http://www.w3.org/1999/xhtml\">"));
+
+ //text
+ attachmentPart =
+ new Attachment("myworddocx",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ webClient = WebClient.create(endPoint + META_PATH + FORM_PATH+TEXT_PATH);
+
+ response = webClient.type("multipart/form-data")
+ .accept("application/json")
+ .post(attachmentPart);
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ content = metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT).trim();
+ assertTrue(content.startsWith("embed_3"));
+
+ //ignore -- no content
+ attachmentPart =
+ new Attachment("myworddocx",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ webClient = WebClient.create(endPoint + META_PATH +FORM_PATH+IGNORE_PATH);
+
+ response = webClient.type("multipart/form-data")
+ .accept("application/json").query("handler", "ignore")
+ .post(attachmentPart);
+ reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ metadataList = JsonMetadataList.fromJson(reader);
+ assertEquals(12, metadataList.size());
+ assertNull(metadataList.get(6).get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
}