You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/18 20:29:24 UTC
[tika] branch main updated: TIKA-3697 : add initial warc and wacz parsers
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0be1713 TIKA-3697 : add initial warc and wacz parsers
new 1260c02 Merge remote-tracking branch 'origin/main' into main
0be1713 is described below
commit 0be171352e5ece374228f8b6e95973cd6a8b9211
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 18 16:29:04 2022 -0400
TIKA-3697 : add initial warc and wacz parsers
---
CHANGES.txt | 2 +
.../main/java/org/apache/tika/metadata/WARC.java | 31 ++++
.../org/apache/tika/mime/tika-mimetypes.xml | 2 +-
tika-parent/pom.xml | 5 +
.../tika-parsers-standard-modules/pom.xml | 1 +
.../tika-parser-webarchive-module/pom.xml | 82 +++++++++++
.../org/apache/tika/parser/wacz/WACZParser.java | 162 +++++++++++++++++++++
.../org/apache/tika/parser/warc/WARCParser.java | 148 +++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 18 +++
.../apache/tika/parser/wacz/WACZParserTest.java | 38 +++++
.../apache/tika/parser/warc/WARCParserTest.java | 43 ++++++
.../src/test/resources/test-documents/cc.warc.gz | Bin 0 -> 5392 bytes
.../resources/test-documents/gzip_extra_sl.warc.gz | Bin 0 -> 459 bytes
.../test/resources/test-documents/testWACZ.wacz | Bin 0 -> 4186 bytes
.../tika-parsers-standard-package/pom.xml | 5 +
15 files changed, 536 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index eea54b3..e577bc3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,8 @@ Release 2.4.0 - ???
* Add MetadataWriteFilter capability to improve memory profile in
Metadata objects (TIKA-3695).
+ * Add basic parsers for WARC and WACZ in tika-parsers-standard (TIKA-3697).
+
* Add detection for Frictionless Data packages and WACZ (TIKA-3696).
* Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
new file mode 100644
index 0000000..359236b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * Metadata keys used for values taken from WARC records.
+ * <p>
+ * Every key name starts with {@link #PREFIX}.
+ */
+public interface WARC {
+
+    /** Prefix prepended to every key name declared in this interface. */
+    String PREFIX = "warc" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    /** Text-bag key for warnings. */
+    Property WARC_WARNING = Property.externalTextBag(PREFIX + "warning");
+
+    /** Text key for the content type of a record. */
+    Property WARC_RECORD_CONTENT_TYPE = Property.externalText(PREFIX + "record-content-type");
+
+    /** Text key for the content type of a record's payload. */
+    Property WARC_PAYLOAD_CONTENT_TYPE = Property.externalText(PREFIX + "payload-content-type");
+
+    /** Text key for the identifier of a record. */
+    Property WARC_RECORD_ID = Property.externalText(PREFIX + "WARC-Record-ID");
+
+    //TODO: lots
+}
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ef9e5fa..9a141b1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3114,7 +3114,7 @@
<mime-type type="application/warc">
<acronym>WARC</acronym>
<_comment>WARC</_comment>
- <magic priority="50">
+ <magic priority="60">
<match value="WARC/" type="string" offset="0"/>
</magic>
<glob pattern="*.warc"/>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 0970f3c..8bb0a47 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -606,6 +606,11 @@
<version>2.0.1</version>
</dependency>
<dependency>
+ <groupId>org.netpreserve</groupId>
+ <artifactId>jwarc</artifactId>
+ <version>0.17.0</version>
+ </dependency>
+ <dependency>
<groupId>org.ops4j.base</groupId>
<artifactId>ops4j-base-lang</artifactId>
<version>${ops4j.version}</version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
index 7153748..770f4ac 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
@@ -68,6 +68,7 @@
<module>tika-parser-miscoffice-module</module>
<module>tika-parser-news-module</module>
<module>tika-parser-crypto-module</module>
+ <module>tika-parser-webarchive-module</module>
</modules>
<build>
<pluginManagement>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
new file mode 100644
index 0000000..5a21b8c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-parsers-standard-modules</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>2.3.1-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-parser-webarchive-module</artifactId>
+ <name>Apache Tika WARC parser module</name>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.netpreserve</groupId>
+ <artifactId>jwarc</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-compress</artifactId>
+ </dependency>
+ <!-- need these for detection/ungzipping and html parsing in tests -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-html-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-pkg-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifestEntries>
+ <Automatic-Module-Name>org.apache.tika.parser.warc</Automatic-Module-Name>
+ </manifestEntries>
+ </archive>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <scm>
+ <tag>2.2.1-rc2</tag>
+ </scm>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
new file mode 100644
index 0000000..7697a98
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wacz;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser for WACZ packages ({@code application/x-wacz}).
+ * <p>
+ * A WACZ package is read as a zip archive. Every file entry whose name starts with
+ * {@code archive/} is handed to the {@link EmbeddedDocumentExtractor} as an embedded document
+ * (gunzipped first when it starts with the gzip magic number). The {@code datapackage.json}
+ * entry is recognized but not processed yet; all other entries are ignored.
+ */
+public class WACZParser extends AbstractParser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<>(Arrays.asList(MediaType.application("x-wacz"))));
+
+    /** Name prefix of the zip entries that are passed on as embedded documents. */
+    private static final String ARCHIVE_PREFIX = "archive/";
+
+    /** Name of the zip entry describing the package. */
+    private static final String DATA_PACKAGE = "datapackage.json";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the package, preferring random access through a {@link ZipFile} when the stream is
+     * a {@link TikaInputStream} that offers one, and falling back to sequential reading.
+     *
+     * @param stream   the WACZ package
+     * @param handler  receives the XHTML events
+     * @param metadata metadata of the package
+     * @param context  used to look up the {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the package cannot be read
+     * @throws SAXException  if the handler fails
+     * @throws TikaException declared by the parser interface
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        ZipFile zip = getZipFile(stream);
+        if (zip != null) {
+            try {
+                processZip(zip, xhtml, metadata, embeddedDocumentExtractor);
+            } finally {
+                zip.close();
+            }
+        } else {
+            processStream(stream, xhtml, metadata, embeddedDocumentExtractor);
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Returns a {@link ZipFile} for the stream, or {@code null} if none is available and the
+     * package has to be read sequentially.
+     */
+    private ZipFile getZipFile(InputStream stream) throws IOException {
+        if (!(stream instanceof TikaInputStream)) {
+            return null;
+        }
+        TikaInputStream tis = (TikaInputStream) stream;
+        // The open container may have been set by other code and need not be a
+        // commons-compress ZipFile, so check the type instead of casting blindly.
+        Object container = tis.getOpenContainer();
+        if (container instanceof ZipFile) {
+            return (ZipFile) container;
+        }
+        if (tis.hasFile()) {
+            return new ZipFile(tis.getFile());
+        }
+        return null;
+    }
+
+    /** Reads the package sequentially, entry by entry, without closing {@code stream}. */
+    private void processStream(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
+                               EmbeddedDocumentExtractor ex) throws SAXException, IOException {
+        try (ZipArchiveInputStream zais = new ZipArchiveInputStream(
+                new CloseShieldInputStream(stream))) {
+            ZipArchiveEntry zae = zais.getNextZipEntry();
+            while (zae != null) {
+                String name = zae.getName();
+                if (zae.isDirectory()) {
+                    //nothing to parse
+                } else if (name.startsWith(ARCHIVE_PREFIX)) {
+                    // processWARC closes the stream it is given; shield the shared zip
+                    // stream so that the remaining entries can still be read.
+                    processWARC(new CloseShieldInputStream(zais), zae,
+                            name.substring(ARCHIVE_PREFIX.length()), xhtml, metadata, ex);
+                } else if (DATA_PACKAGE.equals(name)) {
+                    processDataPackage(zais, zae, xhtml, metadata);
+                }
+                //TODO -- process pages (jsonl); process indexes?!
+
+                zae = zais.getNextZipEntry();
+            }
+        }
+    }
+
+    /** Placeholder for handling {@code datapackage.json}; currently does nothing. */
+    private void processDataPackage(InputStream is, ZipArchiveEntry zae,
+                                    XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException {
+        //no-op
+    }
+
+    /**
+     * Passes a single entry to the embedded document extractor and closes {@code zais}.
+     *
+     * @param zais           content of the entry; closed by this method
+     * @param zae            the entry, used for its size
+     * @param name           resource name to report for the embedded document
+     * @param xhtml          handler of the containing document
+     * @param parentMetadata metadata of the package (currently unused)
+     * @param ex             extractor that decides on and performs the embedded parse
+     */
+    private void processWARC(InputStream zais, ZipArchiveEntry zae,
+                             String name, XHTMLContentHandler xhtml, Metadata parentMetadata,
+                             EmbeddedDocumentExtractor ex) throws IOException, SAXException {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+        long size = zae.getSize();
+        if (size >= 0) {
+            // a negative size means "unknown"; do not report it as a length
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(size));
+        }
+        // tis is a resource of its own so that it is closed even if sniffing for gzip fails
+        try (TikaInputStream tis = TikaInputStream.get(zais);
+                InputStream inputStream = getMaybeGzipInputStream(tis)) {
+            if (ex.shouldParseEmbedded(metadata)) {
+                ex.parseEmbedded(inputStream, xhtml, metadata, true);
+            }
+        }
+    }
+
+    /**
+     * Peeks at the first two bytes and wraps the stream in a gunzipping stream if they are the
+     * gzip magic number; otherwise returns the stream itself. The stream is reset after peeking
+     * and therefore has to support mark/reset.
+     */
+    private InputStream getMaybeGzipInputStream(InputStream is) throws IOException {
+        is.mark(2);
+        byte[] firstTwo = new byte[2];
+        try {
+            IOUtils.readFully(is, firstTwo);
+        } finally {
+            is.reset();
+        }
+        // the magic number is stored little-endian
+        int magic = ((firstTwo[1] & 0xff) << 8) | (firstTwo[0] & 0xff);
+        if (GZIPInputStream.GZIP_MAGIC == magic) {
+            // NOTE(review): this constructor presumably stops after the first gzip member;
+            // confirm whether multi-member warc.gz files need decompressConcatenated=true
+            return new GzipCompressorInputStream(is);
+        } else {
+            return is;
+        }
+    }
+
+    /** Reads the package through random access to its entries. */
+    private void processZip(ZipFile zip, XHTMLContentHandler xhtml, Metadata metadata,
+                            EmbeddedDocumentExtractor ex) throws IOException, SAXException {
+
+        Enumeration<ZipArchiveEntry> zaeEnum = zip.getEntries();
+        while (zaeEnum.hasMoreElements()) {
+            ZipArchiveEntry zae = zaeEnum.nextElement();
+            String name = zae.getName();
+            if (zae.isDirectory()) {
+                continue;
+            }
+            if (name.startsWith(ARCHIVE_PREFIX)) {
+                //processWARC closes the entry stream
+                processWARC(TikaInputStream.get(zip.getInputStream(zae)), zae,
+                        name.substring(ARCHIVE_PREFIX.length()), xhtml, metadata, ex);
+            } else if (DATA_PACKAGE.equals(name)) {
+                try (InputStream is = TikaInputStream.get(zip.getInputStream(zae))) {
+                    processDataPackage(is, zae, xhtml, metadata);
+                }
+            }
+        }
+    }
+
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
new file mode 100644
index 0000000..95fe9c0
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.warc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UncheckedIOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.netpreserve.jwarc.HttpResponse;
+import org.netpreserve.jwarc.WarcPayload;
+import org.netpreserve.jwarc.WarcReader;
+import org.netpreserve.jwarc.WarcRecord;
+import org.netpreserve.jwarc.WarcResponse;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.WARC;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Parser for WARC files ({@code application/warc}).
+ * <p>
+ * Records are read with a {@link WarcReader}. The payload of every {@code response} record is
+ * handed to the {@link EmbeddedDocumentExtractor} as an embedded document. {@code warcinfo}
+ * records are recognized but not processed yet; all other record types are skipped.
+ */
+public class WARCParser extends AbstractParser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<>(Arrays.asList(MediaType.application("warc"))));
+
+    private static final String RESPONSE = "response";
+    private static final String WARCINFO = "warcinfo";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Iterates over the records of the WARC file and processes each of them.
+     *
+     * @param stream   the WARC file; read but not closed
+     * @param handler  receives the XHTML events
+     * @param metadata metadata of the WARC file; exceptions from embedded records are
+     *                 recorded here
+     * @param context  used to look up the {@link EmbeddedDocumentExtractor}
+     * @throws IOException   if the WARC file cannot be read
+     * @throws SAXException  if the handler fails or the write limit has been reached
+     * @throws TikaException declared by the parser interface
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        // Closing the reader closes the stream it wraps, but the caller's stream has to
+        // stay open, so shield it.
+        try (WarcReader warcreader = new WarcReader(new CloseShieldInputStream(stream))) {
+            //TODO: record warnings in metadata: warcreader.onWarning();
+            for (WarcRecord record : warcreader) {
+                processRecord(record, xhtml, metadata, context, embeddedDocumentExtractor);
+            }
+        } catch (UncheckedIOException e) {
+            // iterating cannot throw a checked exception; surface a wrapped read failure
+            // as the IOException that this method declares
+            throw e.getCause();
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Dispatches on the record type. Failures while processing a response are recorded in
+     * {@code metadata} instead of aborting the parse, except for a reached write limit,
+     * which is rethrown.
+     */
+    private void processRecord(WarcRecord record, XHTMLContentHandler xhtml, Metadata metadata,
+                               ParseContext context,
+                               EmbeddedDocumentExtractor embeddedDocumentExtractor)
+            throws SAXException {
+        if (RESPONSE.equals(record.type())) {
+            try {
+                processResponse((WarcResponse) record, xhtml, context, embeddedDocumentExtractor);
+            } catch (IOException | TikaException e) {
+                EmbeddedDocumentUtil.recordException(e, metadata);
+            } catch (SAXException e) {
+                if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                    throw e;
+                } else {
+                    EmbeddedDocumentUtil.recordException(e, metadata);
+                }
+            }
+        } else if (WARCINFO.equals(record.type())) {
+            processWarcInfo(record, xhtml, context);
+        }
+        //TODO - other warc record types
+
+    }
+
+    /** Placeholder for handling {@code warcinfo} records; currently does nothing. */
+    private void processWarcInfo(WarcRecord record, XHTMLContentHandler xhtml,
+                                 ParseContext context) {
+        //NO-OP for now
+    }
+
+    /**
+     * Passes the payload of a response record, if it has one, to the embedded document
+     * extractor together with metadata describing the record.
+     */
+    private void processResponse(WarcResponse warcResponse, XHTMLContentHandler xhtml,
+                                 ParseContext context,
+                                 EmbeddedDocumentExtractor embeddedDocumentExtractor)
+            throws IOException, SAXException, TikaException {
+        Optional<WarcPayload> optionalPayload = warcResponse.payload();
+        if (!optionalPayload.isPresent()) {
+            //TODO handle missing payload? Report or ignore?
+            return;
+        }
+        Metadata metadata = new Metadata();
+        setNotNull(WARC.WARC_RECORD_CONTENT_TYPE, warcResponse.contentType(), metadata);
+        setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata);
+        processResponseMetadata(warcResponse.http(), metadata);
+        //TODO: process other record metadata
+
+        String id = warcResponse.id().toString();
+        metadata.set(WARC.WARC_RECORD_ID, id);
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        WarcPayload payload = optionalPayload.get();
+        // NOTE(review): this overwrites the record content type set above with the type of
+        // the payload -- confirm which key is meant to hold payload.type()
+        metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString());
+        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
+
+        if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+            try (InputStream tis = TikaInputStream.get(payload.body().stream())) {
+                embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true);
+            }
+        }
+
+    }
+
+    /** Placeholder for copying HTTP response headers into the metadata. */
+    private void processResponseMetadata(HttpResponse http, Metadata metadata) {
+        //TODO -- no-op for now
+    }
+
+    /** Sets {@code key} to the string form of {@code contentType} unless it is null. */
+    private void setNotNull(Property key, org.netpreserve.jwarc.MediaType contentType,
+                            Metadata metadata) {
+        if (contentType == null) {
+            return;
+        }
+        metadata.set(key, contentType.toString());
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..8ce5e1b
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.parser.warc.WARCParser
+org.apache.tika.parser.wacz.WACZParser
+
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java
new file mode 100644
index 0000000..e5ca159
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wacz;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+
+public class WACZParserTest extends TikaTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ //the embedded warc is of type warc-info so there's no real content to parse
+ List<Metadata> metadataList = getRecursiveMetadata("testWACZ.wacz");
+ assertEquals(2, metadataList.size());
+ assertEquals("application/x-wacz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+ assertEquals("application/warc", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
new file mode 100644
index 0000000..ac1f422
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.warc;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class WARCParserTest extends TikaTest {
+
+ // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+ // from the jwarc unit tests.
+
+ @Test
+ public void testBasic() throws Exception {
+
+ List<Metadata> metadataList = getRecursiveMetadata("cc.warc.gz");
+ assertEquals(3, metadataList.size());
+ assertContains("text/html", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ assertContains("Common Crawl on Twitter", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ assertEquals("application/warc", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz
new file mode 100644
index 0000000..223b62c
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz
new file mode 100644
index 0000000..8fad890
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz
new file mode 100644
index 0000000..2d4b751
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 454713b..5730bdc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -138,6 +138,11 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-webarchive-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-xml-module</artifactId>
<version>${project.version}</version>
</dependency>