You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/04/11 03:48:48 UTC
svn commit: r1586529 - in
/tika/trunk/tika-parsers/src/test/java/org/apache/tika: ./
parser/microsoft/ parser/microsoft/ooxml/ parser/pdf/ parser/xml/
Author: tallison
Date: Fri Apr 11 01:48:48 2014
New Revision: 1586529
URL: http://svn.apache.org/r1586529
Log:
TIKA-1271: trivial refactoring of classes useful for testing embedded document handling
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Apr 11 01:48:48 2014
@@ -19,18 +19,31 @@ package org.apache.tika;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import java.io.ByteArrayOutputStream;
import java.io.File;
+import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
-
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Parent class of Tika tests
@@ -128,5 +141,98 @@ public abstract class TikaTest {
return getText(is, parser, new ParseContext(), new Metadata());
}
+ /**
+ * Keeps track of media types and file names recursively.
+ * <p>
+ * Each accepted call to {@code handle} appends the media type to
+ * {@code mediaTypes} and the file name to {@code filenames}, so entries
+ * at the same index of the two lists describe the same embedded resource.
+ * A resource whose media type is contained in the skip set is ignored and
+ * recorded in neither list. The stream passed to {@code handle} is never read.
+ * <p>
+ * The no-argument constructor starts with an empty skip set. The
+ * one-argument constructor stores the caller's set by reference (no copy),
+ * so later changes to that set change what is skipped, and passing
+ * {@code null} makes {@code handle} fail with a NullPointerException.
+ * <p>
+ * NOTE(review): "recursively" presumably relies on the extractor calling
+ * this handler for nested documents; the handler itself does not recurse.
+ */
+ public static class TrackingHandler implements EmbeddedResourceHandler {
+ public List<String> filenames = new ArrayList<String>();
+ public List<MediaType> mediaTypes = new ArrayList<MediaType>();
+
+ private final Set<MediaType> skipTypes;
+
+ public TrackingHandler() {
+ skipTypes = new HashSet<MediaType>();
+ }
+
+ public TrackingHandler(Set<MediaType> skipTypes) {
+ this.skipTypes = skipTypes; // stored by reference, not copied
+ }
+
+ @Override
+ public void handle(String filename, MediaType mediaType,
+ InputStream stream) {
+ if (skipTypes.contains(mediaType)) {
+ return; // skipped type: record neither the name nor the type
+ }
+ mediaTypes.add(mediaType);
+ filenames.add(filename);
+ }
+ }
+
+ /**
+ * Copies byte[] of embedded documents into a List.
+ * <p>
+ * For every call to {@code handle} the remaining content of the stream is
+ * copied into a fresh byte array that is appended to {@code bytes}; the
+ * file name and media type arguments are not used.
+ * <p>
+ * The stream is marked before the copy and reset afterwards. A stream
+ * that does not report mark support is first wrapped with
+ * {@code TikaInputStream.get}; in that case mark/reset act on the wrapper.
+ * <p>
+ * An IOException is swallowed: if the copy fails nothing is added to
+ * {@code bytes} for that resource, and if only the reset fails the bytes
+ * have already been added and the stream is left at its current position.
+ */
+ public static class ByteCopyingHandler implements EmbeddedResourceHandler {
+
+ public List<byte[]> bytes = new ArrayList<byte[]>();
+
+ @Override
+ public void handle(String filename, MediaType mediaType,
+ InputStream stream) {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ if (! stream.markSupported()) {
+ stream = TikaInputStream.get(stream); // presumably yields a mark-capable wrapper; verify
+ }
+ stream.mark(0); // NOTE(review): read limit 0; per the InputStream.mark contract the mark may be invalid once bytes are read, so reset() may fail; confirm
+ try {
+ IOUtils.copy(stream, os);
+ bytes.add(os.toByteArray());
+ stream.reset();
+ } catch (IOException e) {
+ // Swallowed on purpose: a failed copy or reset is silently ignored by this test helper
+ }
+ }
+ }
+
+ /**
+ * Stores metadata and (optionally) content.
+ * Many thanks to Jukka's example:
+ * http://wiki.apache.org/tika/RecursiveMetadata
+ * <p>
+ * Each {@code parse} call is first handed to {@code ParserDecorator.parse}
+ * (presumably forwarding to the parser given to the constructor; verify).
+ * After that call returns, the {@code Metadata} object that was passed in
+ * is appended to an internal list. When {@code storeContent} is true,
+ * {@code contentHandler.toString()} is first added to that metadata under
+ * {@code TIKA_CONTENT}; this assumes the handler's {@code toString}
+ * returns the collected text (TODO confirm for the handlers in use).
+ * <p>
+ * If the delegated parse throws, nothing is recorded for that call.
+ * {@code getAllMetadata} returns the live internal list, not a copy.
+ * <p>
+ * NOTE(review): presumably meant to be registered in the ParseContext so
+ * that embedded documents are routed through it as well; confirm in callers.
+ */
+ public static class RecursiveMetadataParser extends ParserDecorator {
+ /** Metadata key under which the content string is stored when storeContent is true */
+ public static final String TIKA_CONTENT = "tika:content";
+
+ private static final long serialVersionUID = 1L;
+
+ private List<Metadata> metadatas = new ArrayList<Metadata>();
+ private final boolean storeContent;
+
+ public RecursiveMetadataParser(Parser parser,
+ boolean storeContent) {
+ super(parser);
+ this.storeContent = storeContent;
+ }
+
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler contentHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ super.parse(stream, contentHandler, metadata, context); // delegate first; an exception here skips the recording below
+
+ if (storeContent) {
+ metadata.add(TIKA_CONTENT, contentHandler.toString());
+ }
+ metadatas.add(metadata); // same Metadata instance the caller passed in, in completion order
+ }
+
+ public List<Metadata> getAllMetadata() {
+ return metadatas; // live list, not a defensive copy
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -19,13 +19,10 @@ package org.apache.tika.parser.microsoft
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import java.io.InputStream;
import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
+import org.apache.tika.TikaTest.TrackingHandler;
import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
@@ -78,15 +75,4 @@ public abstract class AbstractPOIContain
return TikaInputStream.get(input);
}
-
- public static class TrackingHandler implements EmbeddedResourceHandler {
- public List<String> filenames = new ArrayList<String>();
- public List<MediaType> mediaTypes = new ArrayList<MediaType>();
-
- public void handle(String filename, MediaType mediaType,
- InputStream stream) {
- filenames.add(filename);
- mediaTypes.add(mediaType);
- }
- }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEqu
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import org.apache.tika.TikaTest.TrackingHandler;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.mime.MediaType;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Fri Apr 11 01:48:48 2014
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft
import static org.junit.Assert.assertEquals;
+import org.apache.tika.TikaTest.TrackingHandler;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.extractor.ContainerExtractor;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEqu
import static org.junit.Assert.assertNull;
import org.apache.tika.Tika;
+import org.apache.tika.TikaTest.TrackingHandler;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Apr 11 01:48:48 2014
@@ -43,7 +43,6 @@ import org.apache.tika.parser.AutoDetect
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.TrackingHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -494,7 +493,6 @@ public class PDFParserTest extends TikaT
assertTrue(needle > -1);
assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
- //plagiarized from POIContainerExtractionTest. Thank you!
TrackingHandler tracker = new TrackingHandler();
TikaInputStream tis;
ContainerExtractor ex = new ParserContainerExtractor();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Fri Apr 11 01:48:48 2014
@@ -19,11 +19,11 @@ package org.apache.tika.parser.xml;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import org.apache.tika.TikaTest.TrackingHandler;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -57,7 +57,7 @@ public class FictionBookParserTest {
assertEquals(true, extractor.isSupported(stream));
// Process it
- AbstractPOIContainerExtractionTest.TrackingHandler handler = new AbstractPOIContainerExtractionTest.TrackingHandler();
+ TrackingHandler handler = new TrackingHandler();
extractor.extract(stream, null, handler);
assertEquals(2, handler.filenames.size());