You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/19 21:18:08 UTC
svn commit: r1626300 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/parser/
tika-core/src/main/java/org/apache/tika/sax/
tika-parsers/src/test/java/org/apache/tika/
tika-parsers/src/test/java/org/apache/tika/parser/
tika-parsers/src/test/jav...
Author: tallison
Date: Fri Sep 19 19:18:08 2014
New Revision: 1626300
URL: http://svn.apache.org/r1626300
Log:
TIKA-1329 add RecursiveParserWrapper
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx (with props)
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java?rev=1626300&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java Fri Sep 19 19:18:08 2014
@@ -0,0 +1,326 @@
+package org.apache.tika.parser;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * This is a helper class that wraps a parser in a recursive handler.
+ * It takes care of setting the embedded parser in the ParseContext
+ * and handling the embedded path calculations.
+ * <p>
+ * After parsing a document, call getMetadata() to retrieve a list of
+ * Metadata objects, one for each embedded resource. The first item
+ * in the list will contain the Metadata for the outer container file.
+ * <p>
+ * Content can also be extracted and stored in the {@link #TIKA_CONTENT} field
+ * of a Metadata object. Select the type of content to be stored
+ * at initialization.
+ * <p>
+ * If a WriteLimitReachedException is encountered, the wrapper will stop
+ * processing the current resource, and it will not process
+ * any of the child resources for the given resource. However, it will try to
+ * parse as much as it can. If a WLRE is reached in the parent document,
+ * no child resources will be parsed.
+ * <p>
+ * The implementation is based on Jukka's RecursiveMetadataParser
+ * and Nick's additions. See:
+ * <a href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>.
+ * <p>
+ * Note that this wrapper holds all data in memory and is not appropriate
+ * for files with content too large to be held in memory.
+ * <p>
+ * Note, too, that this wrapper is not thread safe because it stores state.
+ * The client must initialize a new wrapper for each thread, and the client
+ * is responsible for calling {@link #reset()} after each parse.
+ * <p>
+ * The unit tests for this class are in the tika-parsers module.
+ * </p>
+ */
+public class RecursiveParserWrapper implements Parser {
+
+ /**
+ * Generated serial version
+ */
+ private static final long serialVersionUID = 9086536568120690938L;
+
+
+
+ public final static String TIKA_PREFIX = "tika:";
+ public final static String TIKA_EXCEPTION_PREFIX = "tika_ex:";
+
+ //move this to TikaCoreProperties?
+ public final static Property TIKA_CONTENT = Property.internalText(TIKA_PREFIX+"content");
+ public final static Property PARSE_TIME_MILLIS = Property.internalText(TIKA_PREFIX+"parse_time_millis");
+ public final static Property WRITE_LIMIT_REACHED =
+ Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"write_limit_reached");
+ public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
+ Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"embedded_resource_limit_reached");
+
+ public final static Property PARSE_EXCEPTION =
+ Property.internalBoolean(TIKA_EXCEPTION_PREFIX+"parse_exception");
+
+ //move this to TikaCoreProperties?
+ public final static Property EMBEDDED_RESOURCE_PATH =
+ Property.internalText(TIKA_PREFIX+"embedded_resource_path");
+
+ private final Parser wrappedParser;
+ private final ContentHandlerFactory contentHandlerFactory;
+ private final List<Metadata> metadatas = new LinkedList<Metadata>();
+
+ //used in naming embedded resources that don't have a name.
+ private int unknownCount = 0;
+ private int maxEmbeddedResources = -1;
+ private boolean hitMaxEmbeddedResources = false;
+
+ /**
+ * Creates a wrapper around the given parser.
+ *
+ * @param wrappedParser parser used for the container document and,
+ * through the embedded decorator, for every embedded resource
+ * @param contentHandlerFactory factory asked for a fresh ContentHandler
+ * for the container document and for each embedded resource
+ */
+ public RecursiveParserWrapper(Parser wrappedParser, ContentHandlerFactory contentHandlerFactory) {
+ this.contentHandlerFactory = contentHandlerFactory;
+ this.wrappedParser = wrappedParser;
+ }
+
+ /**
+ * Delegates to the wrapped parser.
+ *
+ * @param context parse context handed through to the wrapped parser
+ * @return the media types reported by the wrapped parser
+ */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ Set<MediaType> supported = wrappedParser.getSupportedTypes(context);
+ return supported;
+ }
+
+ /**
+ * Parses the container document with the wrapped parser. The ContentHandler
+ * argument is ignored (a fresh handler is taken from the ContentHandlerFactory
+ * instead), and the Parser registered in the ParseContext is overwritten with
+ * this wrapper's embedded-document decorator.
+ * <p>
+ * To retrieve the results of the parse, use {@link #getMetadata()}; a copy of
+ * the container's Metadata is inserted at the head of that list once the
+ * wrapped parser has finished.
+ * <p>
+ * A SAXException recognised as a write-limit signal is not propagated;
+ * it is recorded as {@link #WRITE_LIMIT_REACHED}. Any other SAXException
+ * is rethrown and nothing is stored for the container.
+ * <p>
+ * Make sure to call {@link #reset()} after each parse.
+ */
+ @Override
+ public void parse(InputStream stream, ContentHandler ignore,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ //route embedded documents back through this wrapper
+ context.set(Parser.class, new EmbeddedParserDecorator(getResourceName(metadata)));
+ ContentHandler containerHandler = contentHandlerFactory.getNewContentHandler();
+ long startMillis = System.currentTimeMillis();
+ try {
+ wrappedParser.parse(stream, containerHandler, metadata, context);
+ } catch (SAXException e) {
+ if (!isWriteLimitReached(e)) {
+ throw e;
+ }
+ metadata.set(WRITE_LIMIT_REACHED, "true");
+ }
+ metadata.set(PARSE_TIME_MILLIS, Long.toString(System.currentTimeMillis() - startMillis));
+ addContent(containerHandler, metadata);
+
+ if (hitMaxEmbeddedResources) {
+ metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
+ }
+ //the container always goes first in the result list
+ metadatas.add(0, deepCopy(metadata));
+ }
+
+ /**
+ * Returns the Metadata objects gathered so far.
+ * <p>
+ * The first element in the returned list represents the
+ * data from the outer container file. There is no guarantee
+ * about the ordering of the list after that.
+ * <p>
+ * The list returned is the wrapper's own internal list, not a copy:
+ * it is emptied by {@link #reset()} and keeps growing during later parses.
+ *
+ * @return list of Metadata objects that were gathered during the parse
+ */
+ public List<Metadata> getMetadata() {
+ return metadatas;
+ }
+
+ /**
+ * Set the maximum number of embedded resources to store.
+ * If the max is hit during parsing, the {@link #EMBEDDED_RESOURCE_LIMIT_REACHED}
+ * property will be added to the container document's Metadata.
+ * <p>
+ * The container document itself is not counted against this limit, and
+ * a value of 0 means that no embedded resources are stored.
+ * <p>
+ * If this value is &lt; 0 (the default), the wrapper will store all Metadata.
+ *
+ * @param max maximum number of embedded resources to store
+ */
+ public void setMaxEmbeddedResources(int max) {
+ maxEmbeddedResources = max;
+ }
+
+
+ /**
+ * This clears the metadata list and resets {@link #unknownCount} and
+ * {@link #hitMaxEmbeddedResources}.
+ * <p>
+ * The limit set through {@link #setMaxEmbeddedResources(int)} is left unchanged.
+ */
+ public void reset() {
+ metadatas.clear();
+ unknownCount = 0;
+ hitMaxEmbeddedResources = false;
+ }
+
+ /**
+ * Tests whether a throwable, or any throwable in its cause chain, signals
+ * that a handler's write limit was reached.
+ * <p>
+ * Copied/modified from WriteOutContentHandler. Couldn't make that
+ * static, and we need to have something that will work
+ * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler.
+ * Detection relies on the message prefix, so a throwable without a
+ * message is skipped instead of being dereferenced.
+ *
+ * @param t throwable to inspect; may be null
+ * @return true if t or one of its causes has a message starting with
+ * "Your document contained more than"
+ */
+ private boolean isWriteLimitReached(Throwable t) {
+ //walk the cause chain; getMessage() is allowed to return null
+ for (Throwable current = t; current != null; current = current.getCause()) {
+ String message = current.getMessage();
+ if (message != null && message.startsWith("Your document contained more than")) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Defensive copy: builds a new Metadata holding every name and value of
+ * the argument, so that later changes to the caller's object do not
+ * affect what has been stored.
+ *
+ * @param m metadata to copy
+ * @return a new Metadata with the same names and values as m
+ */
+ private Metadata deepCopy(Metadata m) {
+ Metadata copy = new Metadata();
+ for (String name : m.names()) {
+ if (m.isMultiValued(name)) {
+ for (String value : m.getValues(name)) {
+ copy.add(name, value);
+ }
+ } else {
+ copy.set(name, m.get(name));
+ }
+ }
+ return copy;
+ }
+
+ /**
+ * Picks a name for the resource described by the metadata: the resource
+ * name if present, else the embedded relationship id, else a generated
+ * "embedded-N" name (which advances the unknown-resource counter).
+ *
+ * @param metadata metadata of the resource to name
+ * @return the chosen name after it has been passed through FilenameUtils.getName
+ */
+ private String getResourceName(Metadata metadata) {
+ String candidate = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
+ if (candidate == null) {
+ candidate = metadata.get(TikaMetadataKeys.EMBEDDED_RELATIONSHIP_ID);
+ }
+ if (candidate == null) {
+ candidate = "embedded-" + (++unknownCount);
+ }
+ //make sure that there isn't any path info in the name:
+ //some parsers can return paths, not just file names
+ return FilenameUtils.getName(candidate);
+ }
+
+ /**
+ * Adds the handler's string form to the metadata under {@link #TIKA_CONTENT}
+ * when it is not blank.
+ * <p>
+ * A plain DefaultHandler is skipped outright: we can't rely on just testing
+ * for empty content because DefaultHandler's toString() returns
+ * e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd".
+ *
+ * @param handler handler that was used for the parse
+ * @param metadata metadata to which the content is added
+ */
+ private void addContent(ContentHandler handler, Metadata metadata) {
+ if (DefaultHandler.class.equals(handler.getClass())) {
+ return;
+ }
+ String text = handler.toString();
+ if (text == null || text.trim().length() == 0) {
+ return;
+ }
+ metadata.add(TIKA_CONTENT, text);
+ }
+
+ /**
+ * Parser placed in the ParseContext so that every embedded resource is
+ * routed back through this wrapper. Each instance remembers the path of the
+ * document that contains the resources it parses; a new instance with a
+ * longer path is registered for each level of nesting.
+ */
+ private class EmbeddedParserDecorator extends ParserDecorator {
+
+ private static final long serialVersionUID = 207648200464263337L;
+
+ //path of the containing document; always ends with "/"
+ private final String location;
+
+ /**
+ * @param location path of the containing document; a trailing "/" is
+ * appended when missing
+ */
+ private EmbeddedParserDecorator(String location) {
+ super(wrappedParser);
+ this.location = location.endsWith("/") ? location : location + "/";
+ }
+
+ /**
+ * Parses one embedded resource with a fresh handler from the factory
+ * (the handler passed in is ignored) and, unless the embedded-resource
+ * limit has been reached, stores a copy of its metadata.
+ * <p>
+ * A SAXException recognised as a write-limit signal is recorded in the
+ * metadata and not propagated; any other SAXException is rethrown.
+ */
+ @Override
+ public void parse(InputStream stream, ContentHandler ignore,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ //Test to see if we should avoid parsing
+ if (embeddedLimitReached()) {
+ return;
+ }
+ // Work out what this thing is
+ String objectName = getResourceName(metadata);
+ String objectLocation = this.location + objectName;
+
+ metadata.add(EMBEDDED_RESOURCE_PATH, objectLocation);
+
+ //ignore the content handler that is passed in
+ //and get a fresh handler
+ ContentHandler localHandler = contentHandlerFactory.getNewContentHandler();
+
+ //children of this resource need a decorator that knows the longer path;
+ //the previous parser is restored once this resource is done
+ Parser preContextParser = context.get(Parser.class);
+ context.set(Parser.class, new EmbeddedParserDecorator(objectLocation));
+
+ try {
+ super.parse(stream, localHandler, metadata, context);
+ } catch (SAXException e) {
+ if (!isWriteLimitReached(e)) {
+ throw e;
+ }
+ //single-valued flag: use set, as the container parse does
+ metadata.set(WRITE_LIMIT_REACHED, "true");
+ } finally {
+ context.set(Parser.class, preContextParser);
+ }
+
+ //Because of recursion, we need
+ //to re-test to make sure that we limit the
+ //number of stored resources
+ if (embeddedLimitReached()) {
+ return;
+ }
+ addContent(localHandler, metadata);
+ metadatas.add(deepCopy(metadata));
+ }
+
+ /**
+ * Checks the stored-resource count against the configured maximum and
+ * raises the wrapper's limit flag when the maximum has been reached.
+ *
+ * @return true if a limit is set and no further resources may be stored
+ */
+ private boolean embeddedLimitReached() {
+ if (maxEmbeddedResources > -1 &&
+ metadatas.size() >= maxEmbeddedResources) {
+ hitMaxEmbeddedResources = true;
+ return true;
+ }
+ return false;
+ }
+ }
+
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java?rev=1626300&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java Fri Sep 19 19:18:08 2014
@@ -0,0 +1,126 @@
+package org.apache.tika.sax;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Basic factory for creating common types of ContentHandlers,
+ * optionally capped by a write limit.
+ */
+public class BasicContentHandlerFactory implements ContentHandlerFactory {
+
+ /**
+ * Common handler types for content.
+ */
+ public enum HANDLER_TYPE {
+ BODY,
+ IGNORE, //don't store content
+ TEXT,
+ HTML,
+ XML
+ }
+
+ private final HANDLER_TYPE type;
+ private final int writeLimit;
+
+ /**
+ *
+ * @param type basic type of handler
+ * @param writeLimit max number of characters to store; if &lt; 0, the handler will store all characters
+ */
+ public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
+ this.type = type;
+ this.writeLimit = writeLimit;
+ }
+
+ /**
+ * Creates a handler of the configured type that keeps its content in memory.
+ * IGNORE yields a plain DefaultHandler; an unrecognised type is treated as TEXT.
+ *
+ * @return a new handler, write-limited when a limit of 0 or more was configured
+ */
+ @Override
+ public ContentHandler getNewContentHandler() {
+ switch (type) {
+ case BODY:
+ //BodyContentHandler has its own write-limit constructor
+ if (writeLimit > -1) {
+ return new BodyContentHandler(writeLimit);
+ }
+ return new BodyContentHandler();
+ case IGNORE:
+ return new DefaultHandler();
+ case HTML:
+ return applyWriteLimit(new ToHTMLContentHandler());
+ case XML:
+ return applyWriteLimit(new ToXMLContentHandler());
+ case TEXT:
+ default:
+ return applyWriteLimit(new ToTextContentHandler());
+ }
+ }
+
+ /**
+ * Creates a handler of the configured type, built on the given stream and encoding.
+ * IGNORE yields a plain DefaultHandler; an unrecognised type is treated as TEXT.
+ *
+ * @param os output stream passed to the underlying handler
+ * @param encoding encoding name passed to the underlying handler
+ * @return a new handler, write-limited when a limit of 0 or more was configured
+ * @throws UnsupportedEncodingException propagated from the underlying handler's constructor
+ */
+ @Override
+ public ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException {
+ switch (type) {
+ case BODY:
+ return applyWriteLimit(new BodyContentHandler(new ToTextContentHandler(os, encoding)));
+ case IGNORE:
+ return new DefaultHandler();
+ case HTML:
+ return applyWriteLimit(new ToHTMLContentHandler(os, encoding));
+ case XML:
+ return applyWriteLimit(new ToXMLContentHandler(os, encoding));
+ case TEXT:
+ default:
+ return applyWriteLimit(new ToTextContentHandler(os, encoding));
+ }
+ }
+
+ /**
+ * Wraps the handler in a WriteOutContentHandler when a write limit is set.
+ *
+ * @param handler handler to wrap
+ * @return the wrapped handler, or the handler itself when writeLimit is negative
+ */
+ private ContentHandler applyWriteLimit(ContentHandler handler) {
+ if (writeLimit > -1) {
+ return new WriteOutContentHandler(handler, writeLimit);
+ }
+ return handler;
+ }
+
+}
Added: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java?rev=1626300&view=auto
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java (added)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java Fri Sep 19 19:18:08 2014
@@ -0,0 +1,32 @@
+package org.apache.tika.sax;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.xml.sax.ContentHandler;
+
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+
+/**
+ * Interface to allow easier injection of code for getting a new ContentHandler
+ */
+public interface ContentHandlerFactory {
+
+ /**
+ * @return a new ContentHandler
+ */
+ ContentHandler getNewContentHandler();
+
+ /**
+ * @param os output stream made available to the new handler
+ * @param encoding name of the character encoding made available to the new handler
+ * @return a new ContentHandler
+ * @throws UnsupportedEncodingException declared so that implementations can
+ * reject an encoding they cannot use
+ */
+ ContentHandler getNewContentHandler(OutputStream os, String encoding) throws UnsupportedEncodingException;
+
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Sep 19 19:18:08 2014
@@ -16,9 +16,17 @@
*/
package org.apache.tika;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
import java.io.ByteArrayOutputStream;
import java.io.File;
@@ -31,21 +39,9 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
/**
* Parent class of Tika tests
@@ -200,56 +196,4 @@ public abstract class TikaTest {
}
}
}
-
- /**
- * Stores metadata and (optionally) content.
- * Many thanks to Jukka's example:
- * http://wiki.apache.org/tika/RecursiveMetadata
- * This ignores the incoming handler and applies a
- * new BodyContentHandler(-1) for each file.
- */
- public static class RecursiveMetadataParser extends ParserDecorator {
- /** Key for content string if stored */
- public static final String TIKA_CONTENT = "tika:content";
-
- private static final long serialVersionUID = 1L;
-
- private List<Metadata> metadatas = new ArrayList<Metadata>();
- private final boolean storeContent;
-
- public RecursiveMetadataParser(Parser parser,
- boolean storeContent) {
- super(parser);
- this.storeContent = storeContent;
- }
-
- @Override
- public void parse(
- InputStream stream, ContentHandler ignoredHandler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- ContentHandler contentHandler = null;
- if (storeContent) {
- contentHandler = new BodyContentHandler(-1);
- } else {
- contentHandler = new DefaultHandler();
- }
- super.parse(stream, contentHandler, metadata, context);
-
- if (storeContent) {
- metadata.add(TIKA_CONTENT, contentHandler.toString());
- }
- metadatas.add(metadata);
- }
-
- public List<Metadata> getAllMetadata() {
- return metadatas;
- }
-
- public void clear() {
- metadatas.clear();
- }
- }
-
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java?rev=1626300&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java Fri Sep 19 19:18:08 2014
@@ -0,0 +1,202 @@
+package org.apache.tika.parser;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+public class RecursiveParserWrapperTest {
+
+ /**
+ * The container content produced with the XML handler type must hold
+ * the empty header paragraph as a self-closing tag.
+ */
+ @Test
+ public void testBasicXML() throws Exception {
+ ContentHandlerFactory xmlFactory =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
+ Metadata container = getMetadata(new Metadata(), xmlFactory).get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.contains("<p class=\"header\" />"));
+ }
+
+ /**
+ * The container content produced with the HTML handler type must hold
+ * the empty header paragraph with an explicit closing tag.
+ */
+ @Test
+ public void testBasicHTML() throws Exception {
+ ContentHandlerFactory htmlFactory =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
+ Metadata container = getMetadata(new Metadata(), htmlFactory).get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.contains("<p class=\"header\"></p>"));
+ }
+
+ /**
+ * The container content produced with the TEXT handler type must carry
+ * no paragraph markup but must still include the "embed_0" text.
+ */
+ @Test
+ public void testBasicText() throws Exception {
+ ContentHandlerFactory textFactory =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
+ Metadata container = getMetadata(new Metadata(), textFactory).get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(!content.contains("<p "));
+ assertTrue(content.contains("embed_0"));
+ }
+
+ /**
+ * With the IGNORE handler type no content may be stored for the container.
+ */
+ @Test
+ public void testIgnoreContent() throws Exception {
+ ContentHandlerFactory ignoreFactory =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
+ Metadata container = getMetadata(new Metadata(), ignoreFactory).get(0);
+ assertNull(container.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
+
+ /**
+ * Parses the test document with a 60-character write limit and checks
+ * how many Metadata objects are stored and how many of them carry the
+ * write-limit flag.
+ */
+ @Test
+ public void testCharLimit() throws Exception {
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ try {
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ //release the resource stream even when the parse fails
+ if (stream != null) {
+ stream.close();
+ }
+ }
+ List<Metadata> list = wrapper.getMetadata();
+
+ assertEquals(5, list.size());
+
+ int wlr = 0;
+ for (Metadata m : list) {
+ if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
+ wlr++;
+ }
+ }
+ assertEquals(1, wlr);
+
+ }
+ @Test
+ public void testMaxEmbedded() throws Exception {
+ int maxEmbedded = 4;
+ int totalNoLimit = 12;//including outer container file
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+ String limitReached = null;
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ List<Metadata> list = wrapper.getMetadata();
+ //test default
+ assertEquals(totalNoLimit, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.setMaxEmbeddedResources(maxEmbedded);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ list = wrapper.getMetadata();
+
+ //add 1 for outer container file
+ assertEquals(maxEmbedded+1, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertEquals("true", limitReached);
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value < 0
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+
+ wrapper.setMaxEmbeddedResources(-2);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ assertEquals(totalNoLimit, list.size());
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+ }
+
+ @Test
+ public void testEmbeddedResourcePath() throws Exception {
+
+ Set<String> targets = new HashSet<String>();
+ targets.add("test_recursive_embedded.docx/embed1.zip");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed2a.txt");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed2.zip/embed2b.txt");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed1b.txt");
+ targets.add("test_recursive_embedded.docx/embed1.zip/embed1a.txt");
+ targets.add("test_recursive_embedded.docx/image1.emf");
+
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(content.indexOf("<p class=\"header\" />") > -1);
+
+ Set<String> seen = new HashSet<String>();
+ for (Metadata m : list) {
+ String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (path != null) {
+ seen.add(path);
+ }
+ }
+ assertEquals(targets, seen);
+ }
+
+ /**
+ * Parses the recursive test document with a new wrapper around an
+ * AutoDetectParser and returns everything the wrapper gathered.
+ *
+ * @param metadata metadata object handed to the wrapper for the container
+ * @param contentHandlerFactory factory the wrapper uses for its handlers
+ * @return the wrapper's metadata list after the parse
+ */
+ private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
+ throws Exception{
+ ParseContext context = new ParseContext();
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, contentHandlerFactory);
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ try {
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ //release the resource stream even when the parse fails
+ if (stream != null) {
+ stream.close();
+ }
+ }
+ return wrapper.getMetadata();
+ }
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Sep 19 19:18:08 2014
@@ -16,21 +16,6 @@
*/
package org.apache.tika.parser.pdf;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
@@ -44,10 +29,27 @@ import org.apache.tika.parser.AutoDetect
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
/**
* Test case for parsing pdf files.
*/
@@ -667,7 +669,8 @@ public class PDFParserTest extends TikaT
//"regressiveness" exists only in Unit10.doc not in the container pdf document
assertTrue(xml.contains("regressiveness"));
- RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false);
+ RecursiveParserWrapper p = new RecursiveParserWrapper(new AutoDetectParser(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
TikaInputStream tis = null;
ParseContext context = new ParseContext();
PDFParserConfig config = new PDFParserConfig();
@@ -686,16 +689,17 @@ public class PDFParserTest extends TikaT
}
}
- List<Metadata> metadatas = p.getAllMetadata();
+ List<Metadata> metadatas = p.getMetadata();
+
assertEquals(5, metadatas.size());
assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
assertNull(metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
- assertEquals("Press Quality(1).joboptions", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
- assertEquals("Unit10.doc", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
- assertEquals(MediaType.image("jpeg").toString(), metadatas.get(0).get(Metadata.CONTENT_TYPE));
- assertEquals(MediaType.image("tiff").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
- assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(2).get(Metadata.CONTENT_TYPE));
- assertEquals(TYPE_DOC.toString(), metadatas.get(3).get(Metadata.CONTENT_TYPE));
+ assertEquals("Press Quality(1).joboptions", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals("Unit10.doc", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals(MediaType.image("jpeg").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals(MediaType.image("tiff").toString(), metadatas.get(2).get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(3).get(Metadata.CONTENT_TYPE));
+ assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE));
}
@@ -849,7 +853,8 @@ public class PDFParserTest extends TikaT
Parser defaultParser = new AutoDetectParser();
- RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false);
+ RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
ParseContext context = new ParseContext();
context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config);
context.set(org.apache.tika.parser.Parser.class, p);
@@ -860,7 +865,7 @@ public class PDFParserTest extends TikaT
p.parse(stream, handler, metadata, context);
- List<Metadata> metadatas = p.getAllMetadata();
+ List<Metadata> metadatas = p.getMetadata();
int inline = 0;
int attach = 0;
for (Metadata m : metadatas) {
@@ -877,7 +882,7 @@ public class PDFParserTest extends TikaT
assertEquals(2, attach);
stream.close();
- p.clear();
+ p.reset();
//now try turning off inline
stream = TikaInputStream.get(this.getClass().getResource(path));
@@ -889,7 +894,7 @@ public class PDFParserTest extends TikaT
metadata = new Metadata();
p.parse(stream, handler, metadata, context);
- metadatas = p.getAllMetadata();
+ metadatas = p.getMetadata();
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
@@ -910,7 +915,8 @@ public class PDFParserTest extends TikaT
public void testInlineConfig() throws Exception {
Parser defaultParser = new AutoDetectParser();
- RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false);
+ RecursiveParserWrapper p = new RecursiveParserWrapper(defaultParser,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
ParseContext context = new ParseContext();
context.set(org.apache.tika.parser.Parser.class, p);
Metadata metadata = new Metadata();
@@ -920,7 +926,7 @@ public class PDFParserTest extends TikaT
p.parse(stream, handler, metadata, context);
- List<Metadata> metadatas = p.getAllMetadata();
+ List<Metadata> metadatas = p.getMetadata();
int inline = 0;
int attach = 0;
for (Metadata m : metadatas) {
@@ -937,7 +943,7 @@ public class PDFParserTest extends TikaT
assertEquals(2, attach);
stream.close();
- p.clear();
+ p.reset();
//now try turning off inline
stream = TikaInputStream.get(this.getClass().getResource(path));
@@ -952,7 +958,7 @@ public class PDFParserTest extends TikaT
metadata = new Metadata();
p.parse(stream, handler, metadata, context);
- metadatas = p.getAllMetadata();
+ metadatas = p.getMetadata();
for (Metadata m : metadatas) {
String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
if (v != null) {
@@ -971,16 +977,18 @@ public class PDFParserTest extends TikaT
public void testEmbeddedFileNameExtraction() throws Exception {
InputStream is = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF_multiFormatEmbFiles.pdf");
- RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false);
+ RecursiveParserWrapper p = new RecursiveParserWrapper(
+ new AutoDetectParser(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
Metadata m = new Metadata();
ParseContext c = new ParseContext();
c.set(org.apache.tika.parser.Parser.class, p);
ContentHandler h = new BodyContentHandler();
p.parse(is, h, m, c);
is.close();
- List<Metadata> metadatas = p.getAllMetadata();
+ List<Metadata> metadatas = p.getMetadata();
assertEquals("metadata size", 5, metadatas.size());
- Metadata firstAttachment = metadatas.get(0);
+ Metadata firstAttachment = metadatas.get(1);
assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
}
@@ -988,24 +996,26 @@ public class PDFParserTest extends TikaT
public void testOSSpecificEmbeddedFileExtraction() throws Exception {
InputStream is = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF_multiFormatEmbFiles.pdf");
- RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), true);
+ RecursiveParserWrapper p = new RecursiveParserWrapper(
+ new AutoDetectParser(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
Metadata m = new Metadata();
ParseContext c = new ParseContext();
c.set(org.apache.tika.parser.Parser.class, p);
ContentHandler h = new BodyContentHandler();
p.parse(is, h, m, c);
is.close();
- List<Metadata> metadatas = p.getAllMetadata();
+ List<Metadata> metadatas = p.getMetadata();
assertEquals("metadata size", 5, metadatas.size());
- assertEquals("file name", "Test.txt", metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
- assertContains("os specific", metadatas.get(0).get(RecursiveMetadataParser.TIKA_CONTENT));
- assertEquals("file name", "TestMac.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
- assertContains("mac embedded", metadatas.get(1).get(RecursiveMetadataParser.TIKA_CONTENT));
- assertEquals("file name", "TestDos.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
- assertContains("dos embedded", metadatas.get(2).get(RecursiveMetadataParser.TIKA_CONTENT));
- assertEquals("file name", "TestUnix.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
- assertContains("unix embedded", metadatas.get(3).get(RecursiveMetadataParser.TIKA_CONTENT));
+ assertEquals("file name", "Test.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("os specific", metadatas.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("file name", "TestMac.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("mac embedded", metadatas.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("file name", "TestDos.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("dos embedded", metadatas.get(3).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("file name", "TestUnix.txt", metadatas.get(4).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("unix embedded", metadatas.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1626300&r1=1626299&r2=1626300&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Fri Sep 19 19:18:08 2014
@@ -16,23 +16,6 @@
*/
package org.apache.tika.parser.rtf;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertNotNull;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
@@ -48,11 +31,29 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
/**
* Junit test class for the Tika {@link RTFParser}
*/
@@ -516,7 +517,8 @@ public class RTFParserTest extends TikaT
public void testRegularImages() throws Exception {
Parser base = new AutoDetectParser();
ParseContext ctx = new ParseContext();
- RecursiveMetadataParser parser = new RecursiveMetadataParser(base, false);
+ RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
ctx.set(org.apache.tika.parser.Parser.class, parser);
TikaInputStream tis = null;
ContentHandler handler = new BodyContentHandler();
@@ -528,7 +530,7 @@ public class RTFParserTest extends TikaT
} finally {
tis.close();
}
- List<Metadata> metadatas = parser.getAllMetadata();
+ List<Metadata> metadatas = parser.getMetadata();
Metadata meta_jpg_exif = metadatas.get(0);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
Metadata meta_jpg = metadatas.get(2);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx?rev=1626300&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream