You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [32/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Modified: tika/branches/2.x/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/pom.xml?rev=1723223&r1=1723222&r2=1723223&view=diff
==============================================================================
--- tika/branches/2.x/tika-parsers/pom.xml (original)
+++ tika/branches/2.x/tika-parsers/pom.xml Wed Jan 6 03:50:50 2016
@@ -43,8 +43,6 @@
<mime4j.version>0.7.2</mime4j.version>
<vorbis.version>0.6</vorbis.version>
<pdfbox.version>1.8.10</pdfbox.version>
- <netcdf-java.version>4.5.5</netcdf-java.version>
- <cxf.version>3.0.3</cxf.version>
</properties>
<dependencies>
@@ -77,16 +75,6 @@
<artifactId>vorbis-java-tika</artifactId>
<version>${vorbis.version}</version>
</dependency>
- <dependency>
- <groupId>com.healthmarketscience.jackcess</groupId>
- <artifactId>jackcess</artifactId>
- <version>2.1.2</version>
- </dependency>
- <dependency>
- <groupId>com.healthmarketscience.jackcess</groupId>
- <artifactId>jackcess-encrypt</artifactId>
- <version>2.1.1</version>
- </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
@@ -95,27 +83,6 @@
<scope>provided</scope>
</dependency>
- <!-- Upstream parser libraries -->
- <dependency>
- <groupId>net.sourceforge.jmatio</groupId>
- <artifactId>jmatio</artifactId>
- <version>1.0</version>
- </dependency>
- <dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-core</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.james</groupId>
- <artifactId>apache-mime4j-dom</artifactId>
- <version>${mime4j.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- <version>${commons.compress.version}</version>
- </dependency>
<dependency>
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
@@ -132,19 +99,6 @@
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
- <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
- as optional, but we prefer to have them always to avoid
- problems with encrypted PDFs. -->
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcmail-jdk15on</artifactId>
- <version>1.52</version>
- </dependency>
- <dependency>
- <groupId>org.bouncycastle</groupId>
- <artifactId>bcprov-jdk15on</artifactId>
- <version>1.52</version>
- </dependency>
<dependency>
<groupId>org.apache.poi</groupId>
@@ -171,116 +125,98 @@
</exclusion>
</exclusions>
</dependency>
+
+
+ <!-- Apache cTAKES -->
<dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
+ <groupId>org.apache.ctakes</groupId>
+ <artifactId>ctakes-core</artifactId>
+ <version>3.2.2</version>
+ <scope>provided</scope>
</dependency>
<dependency>
- <groupId>org.ow2.asm</groupId>
- <artifactId>asm</artifactId>
- <version>5.0.4</version>
+ <groupId>org.xerial</groupId>
+ <artifactId>sqlite-jdbc</artifactId>
+ <version>3.8.10.1</version>
+ <scope>provided</scope>
</dependency>
-
<dependency>
- <groupId>com.googlecode.mp4parser</groupId>
- <artifactId>isoparser</artifactId>
- <version>1.0.2</version>
+ <groupId>org.gagravarr</groupId>
+ <artifactId>vorbis-java-core</artifactId>
+ <version>${vorbis.version}</version>
</dependency>
<dependency>
- <groupId>com.drewnoakes</groupId>
- <artifactId>metadata-extractor</artifactId>
- <version>2.8.0</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-multimedia-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>de.l3s.boilerpipe</groupId>
- <artifactId>boilerpipe</artifactId>
- <version>1.1.0</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-advanced-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>rome</groupId>
- <artifactId>rome</artifactId>
- <version>1.0</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-cad-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>org.gagravarr</groupId>
- <artifactId>vorbis-java-core</artifactId>
- <version>${vorbis.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-code-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>com.googlecode.juniversalchardet</groupId>
- <artifactId>juniversalchardet</artifactId>
- <version>1.0.3</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-database-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>org.codelibs</groupId>
- <artifactId>jhighlight</artifactId>
- <version>1.0.2</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-ebook-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>com.pff</groupId>
- <artifactId>java-libpst</artifactId>
- <version>0.8.1</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-journal-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
- <groupId>com.github.junrar</groupId>
- <artifactId>junrar</artifactId>
- <version>0.7</version>
- </dependency>
- <dependency>
- <groupId>org.apache.cxf</groupId>
- <artifactId>cxf-rt-rs-client</artifactId>
- <version>${cxf.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-office-module</artifactId>
+ <version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-multimedia-module</artifactId>
+ <artifactId>tika-package-module</artifactId>
<version>${project.version}</version>
</dependency>
-
- <!-- Provided dependencies -->
<dependency>
- <groupId>org.xerial</groupId>
- <artifactId>sqlite-jdbc</artifactId>
- <version>3.8.10.1</version>
- <scope>provided</scope>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-pdf-module</artifactId>
+ <version>${project.version}</version>
</dependency>
-
<dependency>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>opennlp-tools</artifactId>
- <version>1.5.3</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-scientific-module</artifactId>
+ <version>${project.version}</version>
</dependency>
-
<dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
</dependency>
-
<dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-exec</artifactId>
- <version>1.3</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-web-module</artifactId>
+ <version>${project.version}</version>
</dependency>
-
<dependency>
- <groupId>com.googlecode.json-simple</groupId>
- <artifactId>json-simple</artifactId>
- <version>1.1.1</version>
- <exclusions>
- <exclusion>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </exclusion>
- </exclusions>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
</dependency>
- <dependency>
- <groupId>org.json</groupId>
- <artifactId>json</artifactId>
- <version>20140107</version>
- </dependency>
+
<!-- Test dependencies -->
<dependency>
@@ -305,68 +241,11 @@
<scope>test</scope>
<type>test-jar</type>
</dependency>
-
- <!-- edu.ucar dependencies -->
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>netcdf4</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>grib</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>cdm</artifactId>
- <version>${netcdf-java.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>edu.ucar</groupId>
- <artifactId>httpservices</artifactId>
- <version>${netcdf-java.version}</version>
- </dependency>
- <!-- Apache Commons CSV -->
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-csv</artifactId>
- <version>1.0</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-utility</artifactId>
- <version>0.5</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.storage</groupId>
- <artifactId>sis-netcdf</artifactId>
- <version>0.5</version>
- </dependency>
- <dependency>
- <groupId>org.apache.sis.core</groupId>
- <artifactId>sis-metadata</artifactId>
- <version>0.5</version>
- </dependency>
<dependency>
<groupId>org.opengis</groupId>
<artifactId>geoapi</artifactId>
<version>3.0.0</version>
</dependency>
- <!-- Apache cTAKES -->
- <dependency>
- <groupId>org.apache.ctakes</groupId>
- <artifactId>ctakes-core</artifactId>
- <version>3.2.2</version>
- <scope>provided</scope>
- </dependency>
</dependencies>
<build>
@@ -441,8 +320,31 @@
<artifactSet>
<includes>
<include>org.apache.tika:tika-multimedia-module</include>
+ <include>org.apache.tika:tika-advanced-module</include>
+ <include>org.apache.tika:tika-cad-module</include>
+ <include>org.apache.tika:tika-code-module</include>
+ <include>org.apache.tika:tika-database-module</include>
+ <include>org.apache.tika:tika-ebook-module</include>
+ <include>org.apache.tika:tika-journal-module</include>
+ <include>org.apache.tika:tika-office-module</include>
+ <include>org.apache.tika:tika-package-module</include>
+ <include>org.apache.tika:tika-pdf-module</include>
+ <include>org.apache.tika:tika-scientific-module</include>
+ <include>org.apache.tika:tika-text-module</include>
+ <include>org.apache.tika:tika-web-module</include>
</includes>
</artifactSet>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+ <resource>META-INF/services/org.apache.tika.detect.Detector</resource>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+ <resource>META-INF/services/org.apache.tika.detect.EncodingDetector</resource>
+ </transformer>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+ <resource>META-INF/services/org.apache.tika.parser.Parser</resource>
+ </transformer>
+ </transformers>
</configuration>
</execution>
</executions>
Added: tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java (added)
+++ tika/branches/2.x/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests the package and compression parsers end to end through
+ * {@link AutoDetectParser}: each archive is auto-detected and parsed with a
+ * recursing {@link ParseContext}, then the reported content type and the text
+ * extracted from the embedded entries are verified.
+ */
+public class PackageTest extends TikaTest {
+
+    private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+
+    /** Directory prefix the assertions expect on entry names (all archives but the zip). */
+    private static final String ENTRY_DIR = "test-documents/";
+
+    /**
+     * Entry name and expected extracted text for each document bundled in
+     * the {@code test-documents.*} archives, in assertion order.
+     */
+    private static final String[][] EXPECTED_DOCUMENTS = {
+        {"testEXCEL.xls", "Sample Excel Worksheet"},
+        {"testHTML.html", "Test Indexation Html"},
+        {"testOpenOffice2.odt", "This is a sample Open Office document"},
+        {"testPDF.pdf", "Apache Tika"},
+        {"testPPT.ppt", "Sample Powerpoint Slide"},
+        {"testRTF.rtf", "indexation Word"},
+        {"testTXT.txt", "Test d'indexation de Txt"},
+        {"testWORD.doc", "This is a sample Microsoft Word Document"},
+        {"testXML.xml", "Rida Benjelloun"}
+    };
+
+    private ParseContext recursingContext;
+    private Parser autoDetectParser;
+
+    @Before
+    public void setUp() throws Exception {
+        autoDetectParser = new AutoDetectParser();
+        // Expose the parser through the context so archive entries are parsed as well.
+        recursingContext = new ParseContext();
+        recursingContext.set(Parser.class, autoDetectParser);
+    }
+
+    @Test
+    public void testZlibParsing() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseTestDocument("/test-documents/testTXT.zlib", metadata);
+
+        assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Test d'indexation de Txt", content);
+        assertContains("http://www.apache.org", content);
+    }
+
+    @Test
+    public void testArParsing() throws Exception {
+        // Each archive gets its own Metadata (and handler, see parseTestDocument) so the
+        // second set of assertions cannot pass on state left behind by the first parse.
+        Metadata textMetadata = new Metadata();
+        String textContent = parseTestDocument("/test-documents/testARofText.ar", textMetadata);
+
+        assertEquals("application/x-archive", textMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("testTXT.txt", textContent);
+        assertContains("Test d'indexation de Txt", textContent);
+        assertContains("http://www.apache.org", textContent);
+
+        Metadata soundMetadata = new Metadata();
+        String soundContent = parseTestDocument("/test-documents/testARofSND.ar", soundMetadata);
+
+        assertEquals("application/x-archive", soundMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains("testAU.au", soundContent);
+    }
+
+    @Test
+    public void testBzip2Parsing() throws Exception {
+        assertTestDocumentsArchive("/test-documents/test-documents.tbz2",
+                "application/x-bzip2", ENTRY_DIR);
+    }
+
+    @Test
+    public void testCompressParsing() throws Exception {
+        assertTestDocumentsArchive("/test-documents/test-documents.tar.Z",
+                "application/x-compress", ENTRY_DIR);
+    }
+
+    @Test
+    public void testGzipParsing() throws Exception {
+        assertTestDocumentsArchive("/test-documents/test-documents.tgz",
+                "application/gzip", ENTRY_DIR);
+    }
+
+    @Test
+    public void testRarParsing() throws Exception {
+        assertTestDocumentsArchive("/test-documents/test-documents.rar",
+                "application/x-rar-compressed", ENTRY_DIR);
+    }
+
+    @Test
+    public void test7ZParsing() throws Exception {
+        // Ensure 7zip is a parsable format
+        assertTrue("No 7zip parser found",
+                autoDetectParser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+
+        assertTestDocumentsArchive("/test-documents/test-documents.7z",
+                TYPE_7ZIP.toString(), ENTRY_DIR);
+    }
+
+    @Test
+    public void testTarParsing() throws Exception {
+        assertTestDocumentsArchive("/test-documents/test-documents.tar",
+                "application/x-tar", ENTRY_DIR);
+    }
+
+    @Test
+    public void testZipParsing() throws Exception {
+        // The zip assertions match bare entry names, without the directory prefix.
+        assertTestDocumentsArchive("/test-documents/test-documents.zip",
+                "application/zip", "");
+    }
+
+    @Test
+    public void testSvgzParsing() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseTestDocument("/test-documents/testSVG.svgz", metadata);
+
+        assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Test SVG image", content);
+    }
+
+    /**
+     * Parses a classpath test document with the shared auto-detecting parser.
+     *
+     * @param resourceName absolute classpath name of the test document
+     * @param metadata     receives the metadata produced by the parse
+     * @return the text collected by a fresh {@link BodyContentHandler}
+     * @throws Exception if the document cannot be read or parsed
+     */
+    private String parseTestDocument(String resourceName, Metadata metadata) throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        try (InputStream stream = PackageTest.class.getResourceAsStream(resourceName)) {
+            // Fail with a readable message rather than an NPE inside the parser.
+            assertNotNull("Test document not found: " + resourceName, stream);
+            autoDetectParser.parse(stream, handler, metadata, recursingContext);
+        }
+        return handler.toString();
+    }
+
+    /**
+     * Parses one of the {@code test-documents.*} archives and verifies its
+     * content type plus the name and text of every bundled document.
+     *
+     * @param resourceName absolute classpath name of the archive
+     * @param expectedType content type the parse is expected to report
+     * @param entryPrefix  prefix expected in front of each entry name, or {@code ""}
+     * @throws Exception if the archive cannot be read or parsed
+     */
+    private void assertTestDocumentsArchive(String resourceName, String expectedType,
+            String entryPrefix) throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseTestDocument(resourceName, metadata);
+
+        assertEquals(expectedType, metadata.get(Metadata.CONTENT_TYPE));
+        for (String[] document : EXPECTED_DOCUMENTS) {
+            assertContains(entryPrefix + document[0], content);
+            assertContains(document[1], content);
+        }
+    }
+}