You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/13 15:39:59 UTC
svn commit: r996526 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/parser/pkg/
Author: nick
Date: Mon Sep 13 13:39:59 2010
New Revision: 996526
URL: http://svn.apache.org/viewvc?rev=996526&view=rev
Log:
Container extraction tests for package based parsers (TIKA-509)
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Mon Sep 13 13:39:59 2010
@@ -33,11 +33,8 @@ import org.apache.tika.extractor.Embedde
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
-import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Sep 13 13:39:59 2010
@@ -36,7 +36,6 @@ import org.apache.poi.hssf.eventusermode
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
-import org.apache.poi.hssf.record.AbstractEscherHolderRecord;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=996526&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Mon Sep 13 13:39:59 2010
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parent class for all package-based test cases; setUp() prepares a tracking and a recursing ParseContext.
+ */
+public abstract class AbstractPkgTest extends TestCase {
+ protected ParseContext trackingContext; // context whose Parser.class entry is the tracker
+ protected ParseContext recursingContext; // context whose Parser.class entry is autoDetectParser
+
+ protected Parser autoDetectParser;
+ protected EmbeddedTrackingParser tracker;
+
+ protected void setUp() throws Exception {
+ super.setUp();
+
+ tracker = new EmbeddedTrackingParser();
+ trackingContext = new ParseContext();
+ trackingContext.set(Parser.class, tracker);
+
+ autoDetectParser = new AutoDetectParser();
+ recursingContext = new ParseContext();
+ recursingContext.set(Parser.class, autoDetectParser);
+ }
+
+
+ @SuppressWarnings("serial")
+ protected static class EmbeddedTrackingParser implements Parser {
+ protected List<String> filenames = new ArrayList<String>(); // one entry per parse() call, may contain null
+ protected List<String> mediatypes = new ArrayList<String>(); // one entry per parse() call, may contain null
+ protected byte[] lastSeenStart; // 32-byte buffer filled from the most recently parsed stream; not cleared by reset()
+
+ public void reset() { // clears the recorded names and types only
+ filenames.clear();
+ mediatypes.clear();
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // Cheat! Report whatever a throwaway AutoDetectParser says it supports
+ return (new AutoDetectParser()).getSupportedTypes(context);
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY)); // recorded as-is; nothing is written to the handler
+ mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+
+ lastSeenStart = new byte[32];
+ stream.read(lastSeenStart); // NOTE(review): return value ignored; a single read() may fill fewer than 32 bytes
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata) throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext()); // delegate with a fresh, empty context
+ }
+ }
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
import java.io.InputStream;
-import junit.framework.TestCase;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
/**
* Test case for parsing bzip2 files.
*/
-public class Bzip2ParserTest extends TestCase {
+public class Bzip2ParserTest extends AbstractPkgTest {
public void testBzip2Parsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class Bzip2ParserTest extends Tes
InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
"/test-documents/test-documents.tbz2");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, recursingContext);
} finally {
stream.close();
}
@@ -66,4 +64,32 @@ public class Bzip2ParserTest extends Tes
assertTrue(content.contains("Rida Benjelloun"));
}
+
+ /**
+ * Tests that the parser set in the ParseContext is invoked
+ * for the single embedded entry (the decompressed tar stream).
+ */
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = ZipParserTest.class.getResourceAsStream( // NOTE(review): ZipParserTest.class looks like copy-paste; the path is absolute, so lookup is presumably unaffected
+ "/test-documents/test-documents.tbz2");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should find a single entry, for the (compressed) tar file
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(1, tracker.mediatypes.size());
+
+ assertEquals(null, tracker.filenames.get(0)); // no resource name is expected for the inner stream
+ assertEquals(null, tracker.mediatypes.get(0)); // nor a content type
+
+ // Tar file starts with the directory name
+ assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII")); // 15 == "test-documents/".length()
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
import java.io.InputStream;
-import junit.framework.TestCase;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
/**
* Test case for parsing gzip files.
*/
-public class GzipParserTest extends TestCase {
+public class GzipParserTest extends AbstractPkgTest {
public void testGzipParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class GzipParserTest extends Test
InputStream stream = GzipParserTest.class.getResourceAsStream(
"/test-documents/test-documents.tgz");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, recursingContext);
} finally {
stream.close();
}
@@ -66,6 +64,34 @@ public class GzipParserTest extends Test
assertTrue(content.contains("Rida Benjelloun"));
}
+ /**
+ * Tests that the parser set in the ParseContext is invoked
+ * for the single embedded entry (the decompressed tar stream).
+ */
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = ZipParserTest.class.getResourceAsStream( // NOTE(review): ZipParserTest.class looks like copy-paste; the path is absolute, so lookup is presumably unaffected
+ "/test-documents/test-documents.tgz");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should find a single entry, for the (compressed) tar file
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(1, tracker.mediatypes.size());
+
+ assertEquals(null, tracker.filenames.get(0)); // no resource name is expected for the inner stream
+ assertEquals(null, tracker.mediatypes.get(0)); // nor a content type
+
+ // Tar file starts with the directory name
+ assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, "ASCII")); // 15 == "test-documents/".length()
+ }
+
public void testSvgzParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
@@ -74,7 +100,7 @@ public class GzipParserTest extends Test
InputStream stream = GzipParserTest.class.getResourceAsStream(
"/test-documents/testSVG.svgz");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, recursingContext);
} finally {
stream.close();
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
import java.io.InputStream;
-import junit.framework.TestCase;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
/**
* Test case for parsing tar files.
*/
-public class TarParserTest extends TestCase {
+public class TarParserTest extends AbstractPkgTest {
public void testTarParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class TarParserTest extends TestC
InputStream stream = TarParserTest.class.getResourceAsStream(
"/test-documents/test-documents.tar");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, recursingContext);
} finally {
stream.close();
}
@@ -66,4 +64,41 @@ public class TarParserTest extends TestC
assertTrue(content.contains("Rida Benjelloun"));
}
+ /**
+ * Tests that the parser set in the ParseContext is invoked
+ * once for each of the 9 embedded files in the tar archive.
+ */
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = ZipParserTest.class.getResourceAsStream( // NOTE(review): ZipParserTest.class looks like copy-paste; the path is absolute, so lookup is presumably unaffected
+ "/test-documents/test-documents.tar");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+
+ // Should have names but not content types, as tar doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0)); // names keep the directory prefix
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type); // every recorded content type must be absent
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=996526&r1=996525&r2=996526&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Mon Sep 13 13:39:59 2010
@@ -18,8 +18,6 @@ package org.apache.tika.parser.pkg;
import java.io.InputStream;
-import junit.framework.TestCase;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
@@ -29,7 +27,7 @@ import org.xml.sax.ContentHandler;
/**
* Test case for parsing zip files.
*/
-public class ZipParserTest extends TestCase {
+public class ZipParserTest extends AbstractPkgTest {
public void testZipParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
@@ -39,7 +37,7 @@ public class ZipParserTest extends TestC
InputStream stream = ZipParserTest.class.getResourceAsStream(
"/test-documents/test-documents.zip");
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, handler, metadata, recursingContext);
} finally {
stream.close();
}
@@ -66,4 +64,41 @@ public class ZipParserTest extends TestC
assertTrue(content.contains("Rida Benjelloun"));
}
+ /**
+ * Tests that the parser set in the ParseContext is invoked
+ * once for each of the 9 embedded files in the zip archive.
+ */
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should have found all 9 documents
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+
+ // Should have names but not content types, as zip doesn't
+ // store the content types
+ assertEquals("testEXCEL.xls", tracker.filenames.get(0)); // names are expected without a directory prefix
+ assertEquals("testHTML.html", tracker.filenames.get(1));
+ assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("testTXT.txt", tracker.filenames.get(6));
+ assertEquals("testWORD.doc", tracker.filenames.get(7));
+ assertEquals("testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type); // every recorded content type must be absent
+ }
+ }
}