You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/04/11 03:48:48 UTC

svn commit: r1586529 - in /tika/trunk/tika-parsers/src/test/java/org/apache/tika: ./ parser/microsoft/ parser/microsoft/ooxml/ parser/pdf/ parser/xml/

Author: tallison
Date: Fri Apr 11 01:48:48 2014
New Revision: 1586529

URL: http://svn.apache.org/r1586529
Log:
TIKA-1271: trivial refactoring of classes useful for testing embedded document handling

Modified:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Apr 11 01:48:48 2014
@@ -19,18 +19,31 @@ package org.apache.tika;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.ByteArrayOutputStream;
 import java.io.File;
+import java.io.IOException;
 import java.io.InputStream;
 import java.net.URISyntaxException;
 import java.net.URL;
-
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ToXMLContentHandler;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 /**
  * Parent class of Tika tests
@@ -128,5 +141,98 @@ public abstract class TikaTest {
         return getText(is, parser, new ParseContext(), new Metadata());
     }
 
+    /**
+     * Keeps track of media types and file names recursively.
+     *
+     */
+    public static class TrackingHandler implements EmbeddedResourceHandler {
+        public List<String> filenames = new ArrayList<String>();
+        public List<MediaType> mediaTypes = new ArrayList<MediaType>();
+        
+        private final Set<MediaType> skipTypes;
+        
+        public TrackingHandler() {
+            skipTypes = new HashSet<MediaType>();
+        }
+     
+        public TrackingHandler(Set<MediaType> skipTypes) {
+            this.skipTypes = skipTypes;
+        }
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                InputStream stream) {
+            if (skipTypes.contains(mediaType)) {
+                return;
+            }
+            mediaTypes.add(mediaType);
+            filenames.add(filename);
+        }
+    }
+    
+    /**
+     * Copies byte[] of embedded documents into a List.
+     */
+    public static class ByteCopyingHandler implements EmbeddedResourceHandler {
+
+        public List<byte[]> bytes = new ArrayList<byte[]>();
+
+        @Override
+        public void handle(String filename, MediaType mediaType,
+                InputStream stream) {
+            ByteArrayOutputStream os = new ByteArrayOutputStream();
+            if (! stream.markSupported()) {
+                stream = TikaInputStream.get(stream);
+            }
+            stream.mark(0);
+            try {
+                IOUtils.copy(stream, os);
+                bytes.add(os.toByteArray());
+                stream.reset();
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+    }
+    
+    /**
+     * Stores metadata and (optionally) content.
+     * Many thanks to Jukka's example:
+     * http://wiki.apache.org/tika/RecursiveMetadata
+     *
+     */
+    public static class RecursiveMetadataParser extends ParserDecorator {
+        /** Key for content string if stored */
+        public static final String TIKA_CONTENT = "tika:content";
+
+        private static final long serialVersionUID = 1L;
+        
+        private List<Metadata> metadatas = new ArrayList<Metadata>();
+        private final boolean storeContent;
+        
+        public RecursiveMetadataParser(Parser parser, 
+                boolean storeContent) {
+            super(parser);
+            this.storeContent = storeContent;
+        }
+
+        @Override
+        public void parse(
+                InputStream stream, ContentHandler contentHandler,
+                Metadata metadata, ParseContext context)
+                        throws IOException, SAXException, TikaException {
+
+            super.parse(stream, contentHandler, metadata, context);
+            
+            if (storeContent) {
+                metadata.add(TIKA_CONTENT, contentHandler.toString());
+            }
+            metadatas.add(metadata);
+        }
+
+        public List<Metadata> getAllMetadata() {
+            return metadatas;
+        }        
+    }
 
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/AbstractPOIContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -19,13 +19,10 @@ package org.apache.tika.parser.microsoft
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 
-import java.io.InputStream;
 import java.net.URL;
-import java.util.ArrayList;
-import java.util.List;
 
+import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.EmbeddedResourceHandler;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MediaType;
 
@@ -78,15 +75,4 @@ public abstract class AbstractPOIContain
 
         return TikaInputStream.get(input);
     }
-    
-    public static class TrackingHandler implements EmbeddedResourceHandler {
-       public List<String> filenames = new ArrayList<String>();
-       public List<MediaType> mediaTypes = new ArrayList<MediaType>();
-       
-       public void handle(String filename, MediaType mediaType,
-            InputStream stream) {
-          filenames.add(filename);
-          mediaTypes.add(mediaType);
-      }
-    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEqu
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.mime.MediaType;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java Fri Apr 11 01:48:48 2014
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft
 
 import static org.junit.Assert.assertEquals;
 
+import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.extractor.ContainerExtractor;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Apr 11 01:48:48 2014
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEqu
 import static org.junit.Assert.assertNull;
 
 import org.apache.tika.Tika;
+import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Apr 11 01:48:48 2014
@@ -43,7 +43,6 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.TrackingHandler;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
@@ -494,7 +493,6 @@ public class PDFParserTest extends TikaT
        assertTrue(needle > -1);
        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
        
-       //plagiarized from POIContainerExtractionTest.  Thank you!
        TrackingHandler tracker = new TrackingHandler();
        TikaInputStream tis;
        ContainerExtractor ex = new ParserContainerExtractor();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1586529&r1=1586528&r2=1586529&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Fri Apr 11 01:48:48 2014
@@ -19,11 +19,11 @@ package org.apache.tika.parser.xml;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
 import org.apache.tika.sax.BodyContentHandler;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
@@ -57,7 +57,7 @@ public class FictionBookParserTest {
             assertEquals(true, extractor.isSupported(stream));
 
             // Process it
-            AbstractPOIContainerExtractionTest.TrackingHandler handler = new AbstractPOIContainerExtractionTest.TrackingHandler();
+            TrackingHandler handler = new TrackingHandler();
             extractor.extract(stream, null, handler);
 
             assertEquals(2, handler.filenames.size());