You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [20/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,466 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests if the IWork parser parses the content and metadata properly of the supported formats.
+ */
+public class IWorkParserTest {
+
+ private IWorkPackageParser iWorkParser;
+ private ParseContext parseContext;
+
+ @Before
+ public void setUp() {
+ iWorkParser = new IWorkPackageParser();
+ parseContext = new ParseContext();
+ parseContext.set(Parser.class, new AutoDetectParser());
+ }
+
+    /**
+     * Check the given InputStream is not closed by the Parser (TIKA-1117).
+     *
+     * @throws Exception if parsing fails, or if reading from the stream after
+     *         parsing fails because the parser closed it
+     */
+    @Test
+    public void testStreamNotClosed() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // The test itself owns the stream: release it once the check has run.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+            input.read(); // Will throw an Exception if the stream was already closed.
+        }
+    }
+
+    /**
+     * Parses the sample Keynote presentation and checks the extracted
+     * metadata keys and values, the slide text and the table cell text.
+     */
+    @Test
+    public void testParseKeynote() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testKeynote.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 6);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.SLIDE_COUNT.getName()));
+//        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Office.SLIDE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
+        // Check the metadata values
+        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
+        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("A sample presentation", content);
+        assertContains("For the Apache Tika project", content);
+        assertContains("Slide 1", content);
+        assertContains("Some random text for the sake of testability.", content);
+        assertContains("A nice comment", content);
+        assertContains("A nice note", content);
+
+        // test table data
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell 5", content);
+        assertContains("Cell six", content);
+        assertContains("7", content);
+        assertContains("Cell eight", content);
+        assertContains("5/5/1985", content);
+    }
+
+    /**
+     * TIKA-910: the text of the Keynote text boxes must appear in the output
+     * as "text1 text2 text3" once whitespace runs are collapsed.
+     */
+    @Test
+    public void testKeynoteTextBoxes() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTextBoxes.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        assertTrue(content.replaceAll("\\s+", " ").contains("text1 text2 text3"));
+    }
+
+    /**
+     * TIKA-910: the three Keynote bullet points must appear consecutively in
+     * the output once whitespace runs are collapsed.
+     */
+    @Test
+    public void testKeynoteBulletPoints() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testBulletPoints.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        assertTrue(content.replaceAll("\\s+", " ").contains("bullet point 1 bullet point 2 bullet point 3"));
+    }
+
+    /**
+     * TIKA-923: the rows of a Keynote table must appear consecutively in the
+     * output once whitespace runs are collapsed.
+     */
+    @Test
+    public void testKeynoteTables() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testTables.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        assertContains("row 1 row 2 row 3", content);
+    }
+
+    /**
+     * TIKA-923: each "master row" text of the master-slide table document
+     * must be present in the whitespace-normalised output.
+     */
+    @Test
+    public void testKeynoteMasterSlideTable() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testMasterSlideTable.key")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        content = content.replaceAll("\\s+", " ");
+        assertContains("master row 1", content);
+        assertContains("master row 2", content);
+        assertContains("master row 3", content);
+    }
+
+    /**
+     * Parses the sample Pages document and checks the extracted metadata
+     * keys and values as well as text expected on both pages.
+     */
+    @Test
+    public void testParsePages() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPages.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 50);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LAST_MODIFIED.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.LANGUAGE));
+
+        // Check the metadata values
+        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Tika user", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Apache tika", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+
+        String content = handler.toString();
+
+        // text on page 1
+        assertContains("Sample pages document", content);
+        assertContains("Some plain text to parse.", content);
+        assertContains("Cell one", content);
+        assertContains("Cell two", content);
+        assertContains("Cell three", content);
+        assertContains("Cell four", content);
+        assertContains("Cell five", content);
+        assertContains("Cell six", content);
+        assertContains("Cell seven", content);
+        assertContains("Cell eight", content);
+        assertContains("Cell nine", content);
+        assertContains("Both Pages 1.x and Keynote 2.x", content); // ...
+
+        // text on page 2
+        assertContains("A second page....", content);
+        assertContains("Extensible Markup Language", content); // ...
+    }
+
+    /**
+     * TIKA-904: text placed in text boxes and shapes of a Pages document
+     * created in layout mode must be extracted.
+     */
+    @Test
+    public void testPagesLayoutMode() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        assertContains("text box 1 - here is some text", content);
+        assertContains("created in a text box in layout mode", content);
+        assertContains("text box 2 - more text!@!$@#", content);
+        assertContains("this is text inside of a green box", content);
+        assertContains("text inside of a green circle", content);
+    }
+
+    /**
+     * Parses the sample Numbers spreadsheet and checks the extracted
+     * metadata keys and values as well as a selection of cell text.
+     */
+    @Test
+    public void testParseNumbers() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        // Make sure enough keys came through
+        // (Exact numbers will vary based on composites)
+        assertTrue("Insufficient metadata found " + metadata.size(), metadata.size() >= 8);
+        List<String> metadataKeys = Arrays.asList(metadata.names());
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.CONTENT_TYPE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.PAGE_COUNT.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.CREATOR.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.COMMENTS.getName()));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(Metadata.TITLE));
+        assertTrue("Metadata not found in " + metadataKeys, metadataKeys.contains(TikaCoreProperties.TITLE.getName()));
+
+        // Check the metadata values
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+        assertEquals("Tika User", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Account checking", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("a comment", metadata.get(TikaCoreProperties.COMMENTS));
+
+        String content = handler.toString();
+        assertContains("Category", content);
+        assertContains("Home", content);
+        assertContains("-226", content);
+        assertContains("-137.5", content);
+        assertContains("Checking Account: 300545668", content);
+        assertContains("4650", content);
+        assertContains("Credit Card", content);
+        assertContains("Groceries", content);
+        assertContains("-210", content);
+        assertContains("Food", content);
+        assertContains("Try adding your own account transactions to this table.", content);
+    }
+
+    /**
+     * TIKA-924: the name of a Numbers table must be part of the extracted text.
+     */
+    @Test
+    public void testParseNumbersTableNames() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableNames.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String content = handler.toString();
+        assertContains("This is the main table", content);
+    }
+
+    /**
+     * Checks that the texts "header1".."header5" and "row1".."row3" of a
+     * Numbers table are all present in the extracted text.
+     */
+    @Test
+    public void testParseNumbersTableHeaders() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/tableHeaders.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        String content = handler.toString();
+        for (int header = 1; header <= 5; header++) {
+            assertContains("header" + header, content);
+        }
+        for (int row = 1; row <= 3; row++) {
+            assertContains("row" + row, content);
+        }
+    }
+
+    /**
+     * We don't currently support password protected Pages files, as
+     * we don't know how the encryption works (it's not regular Zip
+     * Encryption). See TIKA-903 for details.
+     * <p>
+     * The test therefore expects empty content and the dedicated
+     * "protected" content type.
+     */
+    @Test
+    public void testParsePagesPasswordProtected() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Document password is "tika", but we can't use that yet...
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+
+        // Content will be empty
+        String content = handler.toString();
+        assertEquals("", content);
+
+        // Will have been identified as encrypted
+        assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Check we get headers, footers and footnotes from Pages, in addition
+     * to the regular body text.
+     */
+    @Test
+    public void testParsePagesHeadersFootersFootnotes() throws Exception {
+        String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\t1";
+        String footer2 = "THIS IS SOME FOOTER TEXT\t2";
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check regular text
+        assertContains("Both Pages 1.x", contents); // P1
+        assertContains("understanding the Pages document", contents); // P1
+        assertContains("should be page 2", contents); // P2
+
+        // Check for headers, footers and footnotes
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+        assertContains(footnote, contents);
+    }
+
+    /**
+     * Check we get upper-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanUpper() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\tI";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tII";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanUpper.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both numbered footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get lower-case Roman numerals within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersFootersRomanLower() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\ti";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tii";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersRomanLower.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both numbered footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get upper-case alphabetic page numbers for AutoPageNumber.
+     * In this document the expected header text carries a page letter too,
+     * not only the footers.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaUpper() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT\tA";
+        String footer = "THIS IS SOME FOOTER TEXT\tA";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tB";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaUpper.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both lettered footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get lower-case alphabetic page numbers within the footer for AutoPageNumber.
+     */
+    @Test
+    public void testParsePagesHeadersAlphaLower() throws Exception {
+        String header = "THIS IS SOME HEADER TEXT";
+        String footer = "THIS IS SOME FOOTER TEXT\ta";
+        String footer2 = "THIS IS SOME FOOTER TEXT\tb";
+
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersAlphaLower.pages")) {
+            iWorkParser.parse(input, handler, new Metadata(), parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check for the header and both lettered footers
+        assertContains(header, contents);
+        assertContains(footer, contents);
+        assertContains(footer2, contents);
+    }
+
+    /**
+     * Check we get annotations (eg comments) from Pages, in addition to the
+     * regular body text.
+     */
+    @Test
+    public void testParsePagesAnnotations() throws Exception {
+        String commentA = "comment about the APXL file";
+        String commentB = "comment about UIMA";
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+
+        // Check regular text
+        assertContains("Both Pages 1.x", contents); // P1
+        assertContains("understanding the Pages document", contents); // P1
+        assertContains("should be page 2", contents); // P2
+
+        // Check for comments
+        assertContains(commentA, contents);
+        assertContains(commentB, contents);
+    }
+
+    /**
+     * TIKA-918: chart names of a Numbers document must be part of the
+     * extracted text.
+     */
+    @Test
+    public void testNumbersExtractChartNames() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        // Close the resource stream once parsing is done instead of leaking it.
+        try (InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbersCharts.numbers")) {
+            iWorkParser.parse(input, handler, metadata, parseContext);
+        }
+        String contents = handler.toString();
+        assertContains("Expenditure by Category", contents);
+        assertContains("Currency Chart name", contents);
+        assertContains("Chart 2", contents);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/AbstractPkgTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Before;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parent class for all Package based Test cases
+ */
+public abstract class AbstractPkgTest extends TikaTest {
+ protected ParseContext trackingContext;
+ protected ParseContext recursingContext;
+
+ protected Parser autoDetectParser;
+ protected EmbeddedTrackingParser tracker;
+
+ @Before
+ public void setUp() throws Exception {
+ tracker = new EmbeddedTrackingParser();
+ trackingContext = new ParseContext();
+ trackingContext.set(Parser.class, tracker);
+
+ autoDetectParser = new AutoDetectParser();
+ recursingContext = new ParseContext();
+ recursingContext.set(Parser.class, autoDetectParser);
+ }
+
+
+ @SuppressWarnings("serial")
+ protected static class EmbeddedTrackingParser extends AbstractParser {
+ protected List<String> filenames = new ArrayList<String>();
+ protected List<String> mediatypes = new ArrayList<String>();
+ protected List<String> createdAts = new ArrayList<String>();
+ protected List<String> modifiedAts = new ArrayList<String>();
+ protected byte[] lastSeenStart;
+
+ public void reset() {
+ filenames.clear();
+ mediatypes.clear();
+ createdAts.clear();
+ modifiedAts.clear();
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // Cheat!
+ return (new AutoDetectParser()).getSupportedTypes(context);
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ filenames.add(metadata.get(Metadata.RESOURCE_NAME_KEY));
+ mediatypes.add(metadata.get(Metadata.CONTENT_TYPE));
+ createdAts.add(metadata.get(TikaCoreProperties.CREATED));
+ modifiedAts.add(metadata.get(TikaCoreProperties.MODIFIED));
+
+ lastSeenStart = new byte[32];
+ stream.read(lastSeenStart);
+ }
+
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ArParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing ar archive files.
+ */
+public class ArParserTest extends AbstractPkgTest {
+    /**
+     * Parses two ar archives with the auto-detecting parser and checks that
+     * each one is reported as {@code application/x-archive}.
+     */
+    @Test
+    public void testArParsing() throws Exception {
+        Parser parser = new AutoDetectParser();
+
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ArParserTest.class.getResourceAsStream(
+                "/test-documents/testARofText.ar")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-archive",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        // Start from clean state so the assertion below cannot be satisfied
+        // by values left over from the first archive.
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+
+        try (InputStream stream = ArParserTest.class.getResourceAsStream(
+                "/test-documents/testARofSND.ar")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-archive",
+                metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly fired for all the
+     * embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ArParserTest.class.getResourceAsStream(
+                "/test-documents/testARofText.ar")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        assertSingleTrackedEntry("testTXT.txt");
+
+        // Clean tracker, handler and metadata before looking at the second archive.
+        tracker.reset();
+        handler = new BodyContentHandler();
+        metadata = new Metadata();
+
+        try (InputStream stream = ArParserTest.class.getResourceAsStream(
+                "/test-documents/testARofSND.ar")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        assertSingleTrackedEntry("testAU.au");
+    }
+
+    /**
+     * Asserts that the tracker recorded exactly one embedded entry with the
+     * given file name, a modification date starting with "201", and neither
+     * a media type nor a creation date.
+     *
+     * @param expectedName expected resource name of the single entry
+     */
+    private void assertSingleTrackedEntry(String expectedName) {
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        assertEquals(expectedName, tracker.filenames.get(0));
+
+        String modifiedAt = tracker.modifiedAts.get(0);
+        assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("201"));
+
+        for (String type : tracker.mediatypes) {
+            assertNull(type);
+        }
+        for (String crt : tracker.createdAts) {
+            assertNull(crt);
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Bzip2ParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing bzip2 files.
+ */
+public class Bzip2ParserTest extends AbstractPkgTest {
+
+    /**
+     * Parses a bzip2-compressed tar archive with the auto-detecting parser
+     * and checks the detected content type and that the names of the
+     * contained documents show up in the extracted text.
+     */
+    @Test
+    public void testBzip2Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+        String content = handler.toString();
+        assertContains("test-documents/testEXCEL.xls", content);
+        assertContains("test-documents/testHTML.html", content);
+        assertContains("test-documents/testOpenOffice2.odt", content);
+        assertContains("test-documents/testPDF.pdf", content);
+        assertContains("test-documents/testPPT.ppt", content);
+        assertContains("test-documents/testRTF.rtf", content);
+        assertContains("test-documents/testTXT.txt", content);
+        assertContains("test-documents/testWORD.doc", content);
+        assertContains("test-documents/testXML.xml", content);
+    }
+
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     * fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load the resource through this test's own class, as testBzip2Parsing does,
+        // rather than through an unrelated sibling test class.
+        try (InputStream stream = Bzip2ParserTest.class.getResourceAsStream(
+                "/test-documents/test-documents.tbz2")) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should find a single entry, for the (compressed) tar file
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.createdAts.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+
+        // Tar file starts with the directory name
+        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/CompressParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing compress (.Z) files.
+ */
+public class CompressParserTest extends AbstractPkgTest {
+ @Test
+ public void testCompressParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = TarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar.Z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar.Z");
+ try {
+ parser.parse(stream, handler, metadata, trackingContext);
+ } finally {
+ stream.close();
+ }
+
+ // Should find a single entry, for the (compressed) tar file
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(1, tracker.mediatypes.size());
+ assertEquals(1, tracker.modifiedAts.size());
+
+ assertEquals(null, tracker.filenames.get(0));
+ assertEquals(null, tracker.mediatypes.get(0));
+ assertEquals(null, tracker.modifiedAts.get(0));
+
+ // Tar file starts with the directory name
+ assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+ }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing gzip files.
+ */
+public class GzipParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testGzipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tgz")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tgz")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should find a single entry, for the (compressed) tar file
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(1, tracker.mediatypes.size());
+ assertEquals(1, tracker.modifiedAts.size());
+
+ assertEquals(null, tracker.filenames.get(0));
+ assertEquals(null, tracker.mediatypes.get(0));
+ assertEquals(null, tracker.modifiedAts.get(0));
+
+ // Tar file starts with the directory name
+ assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+ }
+
+ @Test
+ public void testSvgzParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = GzipParserTest.class.getResourceAsStream(
+ "/test-documents/testSVG.svgz")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/RarParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing rar files.
+ */
+public class RarParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testRarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = RarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = RarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names but not content types, as rar doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+
+ // Should have filenames in the content string
+ String content = handler.toString();
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/Seven7ParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import javax.crypto.Cipher;
+
+import java.io.InputStream;
+import java.security.NoSuchAlgorithmException;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing 7z files.
+ */
+public class Seven7ParserTest extends AbstractPkgTest {
+ private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+
+ @Test
+ public void test7ZParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // Ensure 7zip is a parsable format
+ assertTrue("No 7zip parser found",
+ parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+
+ // Parse
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.7z")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names but not content types, as 7z doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+
+ @Test
+ public void testPasswordProtected() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // No password, will fail with EncryptedDocumentException
+ boolean ex = false;
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ fail("Shouldn't be able to read a password protected 7z without the password");
+ } catch (EncryptedDocumentException e) {
+ // Good
+ ex = true;
+ }
+
+ assertTrue("test no password", ex);
+
+ ex = false;
+
+ // Wrong password currently silently gives no content
+ // Ideally we'd like Commons Compress to give an error, but it doesn't...
+ recursingContext.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "wrong";
+ }
+ });
+ handler = new BodyContentHandler();
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ fail("Shouldn't be able to read a password protected 7z with wrong password");
+ } catch (TikaException e) {
+ //if JCE is installed, the cause will be: Caused by: org.tukaani.xz.CorruptedInputException: Compressed data is corrupt
+ //if JCE is not installed, the message will include
+ // "(do you have the JCE Unlimited Strength Jurisdiction Policy Files installed?")
+ ex = true;
+ }
+ assertTrue("TikaException for bad password", ex);
+ // Will be empty
+ assertEquals("", handler.toString());
+
+ ex = false;
+ // Right password works fine if JCE Unlimited Strength has been installed!!!
+ if (isStrongCryptoAvailable()) {
+ recursingContext.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "Tika";
+ }
+ });
+ handler = new BodyContentHandler();
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+
+ // Should get filename
+ assertContains("text.txt", content);
+
+ // Should get contents from the text file in the 7z file
+ assertContains("TEST DATA FOR TIKA.", content);
+ assertContains("This is text inside an encrypted 7zip (7z) file.", content);
+ assertContains("It should be processed by Tika just fine!", content);
+ assertContains("TIKA-1521", content);
+ } else {
+ //if jce is not installed, test for IOException wrapped in TikaException
+ boolean ioe = false;
+ recursingContext.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "Tika";
+ }
+ });
+ handler = new BodyContentHandler();
+ try (InputStream stream = Seven7ParserTest.class.getResourceAsStream(
+ "/test-documents/test7Z_protected_passTika.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } catch (TikaException e) {
+ ioe = true;
+ }
+ assertTrue("IOException because JCE was not installed", ioe);
+ }
+ }
+
+ private static boolean isStrongCryptoAvailable() throws NoSuchAlgorithmException {
+ return Cipher.getMaxAllowedKeyLength("AES/ECB/PKCS5Padding") >= 256;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing tar files.
+ */
+public class TarParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testTarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = TarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("test-documents/testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents, but not the directory
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names but not content types, as tar doesn't
+ // store the content types
+ assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
+ assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
+ assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
+ assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.tika.Tika;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Test case for parsing zip files.
+ */
+public class ZipParserTest extends AbstractPkgTest {
+
+ @Test
+ public void testZipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testEXCEL.xls", content);
+ assertContains("testHTML.html", content);
+ assertContains("testOpenOffice2.odt", content);
+ assertContains("testPDF.pdf", content);
+ assertContains("testPPT.ppt", content);
+ assertContains("testRTF.rtf", content);
+ assertContains("testTXT.txt", content);
+ assertContains("testWORD.doc", content);
+ assertContains("testXML.xml", content);
+ }
+
+ /**
+ * Tests that the ParseContext parser is correctly
+ * fired for all the embedded entries.
+ */
+ @Test
+ public void testEmbedded() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, trackingContext);
+ }
+
+ // Should have found all 9 documents
+ assertEquals(9, tracker.filenames.size());
+ assertEquals(9, tracker.mediatypes.size());
+ assertEquals(9, tracker.modifiedAts.size());
+
+ // Should have names and modified dates, but not content types,
+ // as zip doesn't store the content types
+ assertEquals("testEXCEL.xls", tracker.filenames.get(0));
+ assertEquals("testHTML.html", tracker.filenames.get(1));
+ assertEquals("testOpenOffice2.odt", tracker.filenames.get(2));
+ assertEquals("testPDF.pdf", tracker.filenames.get(3));
+ assertEquals("testPPT.ppt", tracker.filenames.get(4));
+ assertEquals("testRTF.rtf", tracker.filenames.get(5));
+ assertEquals("testTXT.txt", tracker.filenames.get(6));
+ assertEquals("testWORD.doc", tracker.filenames.get(7));
+ assertEquals("testXML.xml", tracker.filenames.get(8));
+
+ for(String type : tracker.mediatypes) {
+ assertNull(type);
+ }
+ for(String crt : tracker.createdAts) {
+ assertNull(crt);
+ }
+ for(String mod : tracker.modifiedAts) {
+ assertNotNull(mod);
+ assertTrue("Modified at " + mod, mod.startsWith("20"));
+ }
+ }
+
+ /**
+ * Test case for the ability of the ZIP parser to extract the name of
+ * a ZIP entry even if the content of the entry is unreadable due to an
+ * unsupported compression method.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-346">TIKA-346</a>
+ */
+ @Test
+ public void testUnsupportedZipCompressionMethod() throws Exception {
+ String content = new Tika().parseToString(
+ ZipParserTest.class.getResourceAsStream(
+ "/test-documents/moby.zip"));
+ assertContains("README", content);
+ }
+
+ private class GatherRelIDsDocumentExtractor implements EmbeddedDocumentExtractor {
+ public Set<String> allRelIDs = new HashSet<String>();
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
+ if (relID != null) {
+ allRelIDs.add(relID);
+ }
+ return false;
+ }
+
+ public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) {
+ throw new UnsupportedOperationException("should never be called");
+ }
+ }
+
+ // TIKA-1036
+ @Test
+ public void testPlaceholders() throws Exception {
+ String xml = getXML("testEmbedded.zip").xml;
+ assertContains("<div class=\"embedded\" id=\"test1.txt\" />", xml);
+ assertContains("<div class=\"embedded\" id=\"test2.txt\" />", xml);
+
+ // Also make sure EMBEDDED_RELATIONSHIP_ID was
+ // passed when parsing the embedded docs:
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+ GatherRelIDsDocumentExtractor relIDs = new GatherRelIDsDocumentExtractor();
+ context.set(EmbeddedDocumentExtractor.class, relIDs);
+ try (InputStream input = getResourceAsStream("/test-documents/testEmbedded.zip")) {
+ parser.parse(input,
+ new BodyContentHandler(),
+ new Metadata(),
+ context);
+ }
+
+ assertTrue(relIDs.allRelIDs.contains("test1.txt"));
+ assertTrue(relIDs.allRelIDs.contains("test2.txt"));
+ }
+
+ @Test // TIKA-936
+ public void testCustomEncoding() throws Exception {
+ ArchiveStreamFactory factory = new ArchiveStreamFactory();
+ factory.setEntryEncoding("SJIS");
+ trackingContext.set(ArchiveStreamFactory.class, factory);
+
+ try (InputStream stream = TikaInputStream.get(Base64.decodeBase64(
+ "UEsDBBQAAAAIAI+CvUCDo3+zIgAAACgAAAAOAAAAk/qWe4zqg4GDgi50"
+ + "eHRr2tj0qulsc2pzRHN609Gm7Y1OvFxNYLHJv6ZV97yCiQEAUEsBAh"
+ + "QLFAAAAAgAj4K9QIOjf7MiAAAAKAAAAA4AAAAAAAAAAAAgAAAAAAAA"
+ + "AJP6lnuM6oOBg4IudHh0UEsFBgAAAAABAAEAPAAAAE4AAAAAAA=="))) {
+ autoDetectParser.parse(
+ stream, new DefaultHandler(),
+ new Metadata(), trackingContext);
+ }
+
+ assertEquals(1, tracker.filenames.size());
+ assertEquals(
+ "\u65E5\u672C\u8A9E\u30E1\u30E2.txt",
+ tracker.filenames.get(0));
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-package-module/src/test/java/org/apache/tika/parser/pkg/ZlibParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing zlib compressed
+ */
+/**
+ * Test case for parsing zlib compressed streams.
+ */
+public class ZlibParserTest extends AbstractPkgTest {
+    /** Classpath location of the zlib compressed test document. */
+    private static final String TEST_DOCUMENT = "/test-documents/testTXT.zlib";
+
+    /**
+     * Tests that the zlib test document is auto-detected and that the
+     * resulting content type is reported as {@code application/zlib}.
+     */
+    @Test
+    public void testZlibParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Load through this class rather than an unrelated sibling test class
+        try (InputStream stream = ZlibParserTest.class.getResourceAsStream(TEST_DOCUMENT)) {
+            parser.parse(stream, handler, metadata, recursingContext);
+        }
+
+        assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    /**
+     * Tests that the ParseContext parser is correctly
+     * fired for all the embedded entries.
+     */
+    @Test
+    public void testEmbedded() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ZlibParserTest.class.getResourceAsStream(TEST_DOCUMENT)) {
+            parser.parse(stream, handler, metadata, trackingContext);
+        }
+
+        // Should have found a single text document inside
+        assertEquals(1, tracker.filenames.size());
+        assertEquals(1, tracker.mediatypes.size());
+        // createdAts is indexed below as well, so check its size up front;
+        // a missing entry then fails as an assertion, not an index error
+        assertEquals(1, tracker.createdAts.size());
+        assertEquals(1, tracker.modifiedAts.size());
+
+        // Won't have names, dates or types, as zlib doesn't have that
+        assertEquals(null, tracker.filenames.get(0));
+        assertEquals(null, tracker.mediatypes.get(0));
+        assertEquals(null, tracker.createdAts.get(0));
+        assertEquals(null, tracker.modifiedAts.get(0));
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+  license agreements. See the NOTICE file distributed with this work for additional
+  information regarding copyright ownership. The ASF licenses this file to
+  you under the Apache License, Version 2.0 (the "License"); you may not use
+  this file except in compliance with the License. You may obtain a copy of
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+  by applicable law or agreed to in writing, software distributed under the
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+  OF ANY KIND, either express or implied. See the License for the specific
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-pdf-module</artifactId>
+  <name>Apache Tika PDF Module</name>
+  <url>http://tika.apache.org/</url>
+
+  <properties>
+    <commons.logging.version>1.1.3</commons.logging.version>
+    <!-- Single version for both Bouncy Castle artifacts below, so that
+         bcmail and bcprov cannot drift apart. -->
+    <bouncycastle.version>1.52</bouncycastle.version>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <!-- Test classes of tika-core, test scope only -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <!-- commons.io.version and pdfbox.version are not defined in this file;
+         presumably inherited from the parent POM (confirm). -->
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.pdfbox</groupId>
+      <artifactId>pdfbox</artifactId>
+      <version>${pdfbox.version}</version>
+    </dependency>
+    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+         as optional, but we prefer to have them always to avoid
+         problems with encrypted PDFs. -->
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcmail-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.bouncycastle</groupId>
+      <artifactId>bcprov-jdk15on</artifactId>
+      <version>${bouncycastle.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-logging</groupId>
+      <artifactId>commons-logging</artifactId>
+      <version>${commons.logging.version}</version>
+    </dependency>
+    <!-- No version given here; presumably managed by the parent POM (confirm). -->
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <!-- Sibling parser modules, needed by the tests only -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-office-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import java.io.Serializable;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Checks whether or not a document allows extraction generally
+ * or extraction for accessibility only.
+ */
+public class AccessChecker implements Serializable {
+
+ private static final long serialVersionUID = 6492570218190936986L;
+
+ private final boolean needToCheck;
+ private final boolean allowAccessibility;
+
+ /**
+ * This constructs an {@link AccessChecker} that
+ * will not perform any checking and will always return without
+ * throwing an exception.
+ * <p/>
+ * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior.
+ */
+ public AccessChecker() {
+ needToCheck = false;
+ allowAccessibility = true;
+ }
+
+ /**
+ * This constructs an {@link AccessChecker} that will check
+ * for whether or not content should be extracted from a document.
+ *
+ * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed
+ */
+ public AccessChecker(boolean allowExtractionForAccessibility) {
+ needToCheck = true;
+ this.allowAccessibility = allowExtractionForAccessibility;
+ }
+
+ /**
+ * Checks to see if a document's content should be extracted based
+ * on metadata values and the value of {@link #allowAccessibility} in the constructor.
+ *
+ * @param metadata
+ * @throws AccessPermissionException if access is not permitted
+ */
+ public void check(Metadata metadata) throws AccessPermissionException {
+ if (!needToCheck) {
+ return;
+ }
+ if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
+ if (allowAccessibility) {
+ if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
+ return;
+ }
+ throw new AccessPermissionException("Content extraction for accessibility is not allowed.");
+ }
+ throw new AccessPermissionException("Content extraction is not allowed.");
+ }
+ }
+}