You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/13 15:39:59 UTC

svn commit: r996526 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ test/java/org/apache/tika/parser/pkg/

Author: nick
Date: Mon Sep 13 13:39:59 2010
New Revision: 996526

URL: http://svn.apache.org/viewvc?rev=996526&view=rev
Log:
Container extraction tests for package based parsers (TIKA-509)

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Mon Sep 13 13:39:59 2010
@@ -33,11 +33,8 @@ import org.apache.tika.extractor.Embedde
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Sep 13 13:39:59 2010
@@ -36,7 +36,6 @@ import org.apache.poi.hssf.eventusermode
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.AbstractEscherHolderRecord;
 import org.apache.poi.hssf.record.BOFRecord;
 import org.apache.poi.hssf.record.BoundSheetRecord;
 import org.apache.poi.hssf.record.CellValueRecordInterface;

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=996526&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Mon Sep 13 13:39:59 2010
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/** Parent class for all package-based test cases; sets up the shared parse contexts. */
+public abstract class AbstractPkgTest extends TestCase {
+   protected ParseContext trackingContext;
+   protected ParseContext recursingContext;
+   protected Parser autoDetectParser;
+   protected EmbeddedTrackingParser tracker;
+
+   /** Builds a fresh tracking parser, auto-detect parser and their contexts. */
+   protected void setUp() throws Exception {
+      super.setUp();
+      tracker = new EmbeddedTrackingParser();
+      trackingContext = new ParseContext();
+      trackingContext.set(Parser.class, tracker);
+      autoDetectParser = new AutoDetectParser();
+      recursingContext = new ParseContext();
+      recursingContext.set(Parser.class, autoDetectParser);
+   }
+
+   /** Parser that records the name, type and leading bytes of what it is given. */
+   @SuppressWarnings("serial")
+   protected static class EmbeddedTrackingParser implements Parser {
+      protected List<String> filenames = new ArrayList<String>();
+      protected List<String> mediatypes = new ArrayList<String>();
+      protected byte[] lastSeenStart;
+
+      /** Forgets everything recorded so far, including the last bytes seen. */
+      public void reset() {
+         filenames.clear();
+         mediatypes.clear();
+         lastSeenStart = null;
+      }
+
+      public Set<MediaType> getSupportedTypes(ParseContext context) {
+         // Cheat! Report whatever a fresh AutoDetectParser reports
+         return (new AutoDetectParser()).getSupportedTypes(context);
+      }
+
+      /** Records the resource name, content type and up to the first 32 bytes. */
+      public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+            ParseContext context) throws IOException, SAXException, TikaException {
+         filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
+         mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+         lastSeenStart = new byte[32];
+         // A single read() may return short, so keep reading until full or EOF
+         for (int off = 0, n = 1; n > 0 && off < lastSeenStart.length; off += Math.max(n, 0)) {
+            n = stream.read(lastSeenStart, off, lastSeenStart.length - off);
+         }
+      }
+
+      public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+         parse(stream, handler, metadata, new ParseContext());
+      }
+   }
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
 /**
  * Test case for parsing bzip2 files.
  */
-public class Bzip2ParserTest extends TestCase {
+public class Bzip2ParserTest extends AbstractPkgTest {
 
     public void testBzip2Parsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class Bzip2ParserTest extends Tes
         InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
                 "/test-documents/test-documents.tbz2");
         try {
-            parser.parse(stream, handler, metadata);
+            parser.parse(stream, handler, metadata, recursingContext);
         } finally {
             stream.close();
         }
@@ -66,4 +64,32 @@ public class Bzip2ParserTest extends Tes
         assertTrue(content.contains("Rida Benjelloun"));
     }
 
+
+    /**
+     * Tests that the ParseContext parser is fired exactly once,
+     *  for the single tar file wrapped by the bzip2 stream.
+     */
+    public void testEmbedded() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+               "/test-documents/test-documents.tbz2");
+       try {
+           parser.parse(stream, handler, metadata, trackingContext);
+       } finally {
+           stream.close();
+       }
+
+       // Should find a single entry, for the tar file inside the bzip2 stream
+       assertEquals(1, tracker.filenames.size());
+       assertEquals(1, tracker.mediatypes.size());
+       // That entry is expected to carry neither a name nor a content type
+       assertNull(tracker.filenames.get(0));
+       assertNull(tracker.mediatypes.get(0));
+
+       // Tar file starts with the directory name
+       assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
 /**
  * Test case for parsing gzip files.
  */
-public class GzipParserTest extends TestCase {
+public class GzipParserTest extends AbstractPkgTest {
 
     public void testGzipParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class GzipParserTest extends Test
         InputStream stream = GzipParserTest.class.getResourceAsStream(
                 "/test-documents/test-documents.tgz");
         try {
-            parser.parse(stream, handler, metadata);
+            parser.parse(stream, handler, metadata, recursingContext);
         } finally {
             stream.close();
         }
@@ -66,6 +64,34 @@ public class GzipParserTest extends Test
         assertTrue(content.contains("Rida Benjelloun"));
     }
 
+    /**
+     * Tests that the ParseContext parser is fired exactly once,
+     *  for the single tar file wrapped by the gzip stream.
+     */
+    public void testEmbedded() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = GzipParserTest.class.getResourceAsStream(
+               "/test-documents/test-documents.tgz");
+       try {
+           parser.parse(stream, handler, metadata, trackingContext);
+       } finally {
+           stream.close();
+       }
+
+       // Should find a single entry, for the tar file inside the gzip stream
+       assertEquals(1, tracker.filenames.size());
+       assertEquals(1, tracker.mediatypes.size());
+       // That entry is expected to carry neither a name nor a content type
+       assertNull(tracker.filenames.get(0));
+       assertNull(tracker.mediatypes.get(0));
+
+       // Tar file starts with the directory name
+       assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII"));
+    }
+    
     public void testSvgzParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
         ContentHandler handler = new BodyContentHandler();
@@ -74,7 +100,7 @@ public class GzipParserTest extends Test
         InputStream stream = GzipParserTest.class.getResourceAsStream(
                 "/test-documents/testSVG.svgz");
         try {
-            parser.parse(stream, handler, metadata);
+            parser.parse(stream, handler, metadata, recursingContext);
         } finally {
             stream.close();
         }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
 /**
  * Test case for parsing tar files.
  */
-public class TarParserTest extends TestCase {
+public class TarParserTest extends AbstractPkgTest {
 
     public void testTarParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class TarParserTest extends TestC
         InputStream stream = TarParserTest.class.getResourceAsStream(
                 "/test-documents/test-documents.tar");
         try {
-            parser.parse(stream, handler, metadata);
+            parser.parse(stream, handler, metadata, recursingContext);
         } finally {
             stream.close();
         }
@@ -66,4 +64,41 @@ public class TarParserTest extends TestC
         assertTrue(content.contains("Rida Benjelloun"));
     }
 
+    /**
+     * Tests that the ParseContext parser is correctly fired
+     *  once for each of the documents held in the tar file.
+     */
+    public void testEmbedded() throws Exception {
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+       ContentHandler handler = new BodyContentHandler();
+       Metadata metadata = new Metadata();
+
+       InputStream stream = TarParserTest.class.getResourceAsStream(
+               "/test-documents/test-documents.tar");
+       try {
+           parser.parse(stream, handler, metadata, trackingContext);
+       } finally {
+           stream.close();
+       }
+
+       // Should have found all 9 documents, but not the directory
+       assertEquals(9, tracker.filenames.size());
+       assertEquals(9, tracker.mediatypes.size());
+
+       // Should have names (prefixed with the directory) but not
+       //  content types, as tar doesn't store the content types
+       assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+       assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+       assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+       assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+       assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+       assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+       assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+       assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+       assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+       for (String type : tracker.mediatypes) {
+          assertNull(type);
+       }
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
 
 import java.io.InputStream;
 
-import junit.framework.TestCase;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
 /**
  * Test case for parsing zip files.
  */
-public class ZipParserTest extends TestCase {
+public class ZipParserTest extends AbstractPkgTest {
 
     public void testZipParsing() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class ZipParserTest extends TestC
         InputStream stream = ZipParserTest.class.getResourceAsStream(
                 "/test-documents/test-documents.zip");
         try {
-            parser.parse(stream, handler, metadata);
+            parser.parse(stream, handler, metadata, recursingContext);
         } finally {
             stream.close();
         }
@@ -66,4 +64,41 @@ public class ZipParserTest extends TestC
         assertTrue(content.contains("Rida Benjelloun"));
     }
 
+    /**
+     * Checks that the parser supplied through the ParseContext
+     *  is invoked once for every entry held in the zip file.
+     */
+    public void testEmbedded() throws Exception {
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+       Parser parser = new AutoDetectParser(); // Should auto-detect!
+
+       InputStream stream = ZipParserTest.class.getResourceAsStream(
+               "/test-documents/test-documents.zip");
+       try {
+           parser.parse(stream, handler, metadata, trackingContext);
+       } finally {
+           stream.close();
+       }
+
+       // Entry names, in the order they are expected to be reported
+       String[] expected = {
+             "testEXCEL.xls", "testHTML.html", "testOpenOffice2.odt",
+             "testPDF.pdf", "testPPT.ppt", "testRTF.rtf",
+             "testTXT.txt", "testWORD.doc", "testXML.xml"
+       };
+
+       // Every one of the 9 documents should have been seen
+       assertEquals(expected.length, tracker.filenames.size());
+       assertEquals(expected.length, tracker.mediatypes.size());
+
+       // Names are available for each entry...
+       for (int i = 0; i < expected.length; i++) {
+          assertEquals(expected[i], tracker.filenames.get(i));
+       }
+       // ...but content types are not, as zip doesn't store them
+       for (String mediatype : tracker.mediatypes) {
+          assertNull(mediatype);
+       }
+    }
 }