You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/18 20:29:24 UTC
[tika] branch main updated: TIKA-3697 : add initial warc and wacz parsers

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0be1713  TIKA-3697 : add initial warc and wacz parsers
     new 1260c02  Merge remote-tracking branch 'origin/main' into main
0be1713 is described below

commit 0be171352e5ece374228f8b6e95973cd6a8b9211
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 18 16:29:04 2022 -0400

    TIKA-3697 : add initial warc and wacz parsers
---
 CHANGES.txt                                        |   2 +
 .../main/java/org/apache/tika/metadata/WARC.java   |  31 ++++
 .../org/apache/tika/mime/tika-mimetypes.xml        |   2 +-
 tika-parent/pom.xml                                |   5 +
 .../tika-parsers-standard-modules/pom.xml          |   1 +
 .../tika-parser-webarchive-module/pom.xml          |  82 +++++++++++
 .../org/apache/tika/parser/wacz/WACZParser.java    | 162 +++++++++++++++++++++
 .../org/apache/tika/parser/warc/WARCParser.java    | 148 +++++++++++++++++++
 .../services/org.apache.tika.parser.Parser         |  18 +++
 .../apache/tika/parser/wacz/WACZParserTest.java    |  38 +++++
 .../apache/tika/parser/warc/WARCParserTest.java    |  43 ++++++
 .../src/test/resources/test-documents/cc.warc.gz   | Bin 0 -> 5392 bytes
 .../resources/test-documents/gzip_extra_sl.warc.gz | Bin 0 -> 459 bytes
 .../test/resources/test-documents/testWACZ.wacz    | Bin 0 -> 4186 bytes
 .../tika-parsers-standard-package/pom.xml          |   5 +
 15 files changed, 536 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index eea54b3..e577bc3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -9,6 +9,8 @@ Release 2.4.0 - ???
    * Add MetadataWriteFilter capability to improve memory profile in
      Metadata objects (TIKA-3695).
 
+   * Add basic parsers for WARC and WACZ in tika-parsers-standard (TIKA-3697).
+
    * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
 
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
new file mode 100644
index 0000000..359236b
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+public interface WARC {
+    String PREFIX = "warc" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    // Keys for WARC metadata, all named PREFIX + suffix. WARC_WARNING is the only text bag.
+    Property WARC_WARNING = Property.externalTextBag(PREFIX + "warning");
+    // WARCParser sets this from the record's contentType(), then again from the payload type().
+    Property WARC_RECORD_CONTENT_TYPE = Property.externalText(PREFIX + "record-content-type");
+    // WARCParser sets this from the response's payloadType() when it is non-null.
+    Property WARC_PAYLOAD_CONTENT_TYPE = Property.externalText(PREFIX + "payload-content-type");
+    // NOTE(review): no visible parser code sets this; the record id goes to the resource name.
+    Property WARC_RECORD_ID = Property.externalText(PREFIX + "WARC-Record-ID");
+
+    //TODO: lots
+}
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index ef9e5fa..9a141b1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3114,7 +3114,7 @@
   <mime-type type="application/warc">
     <acronym>WARC</acronym>
     <_comment>WARC</_comment>
-    <magic priority="50">
+    <magic priority="60">
        <match value="WARC/" type="string" offset="0"/>
     </magic>
     <glob pattern="*.warc"/>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 0970f3c..8bb0a47 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -606,6 +606,11 @@
         <version>2.0.1</version>
       </dependency>
       <dependency>
+        <groupId>org.netpreserve</groupId>
+        <artifactId>jwarc</artifactId>
+        <version>0.17.0</version>
+      </dependency>
+      <dependency>
         <groupId>org.ops4j.base</groupId>
         <artifactId>ops4j-base-lang</artifactId>
         <version>${ops4j.version}</version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
index 7153748..770f4ac 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/pom.xml
@@ -68,6 +68,7 @@
     <module>tika-parser-miscoffice-module</module>
     <module>tika-parser-news-module</module>
     <module>tika-parser-crypto-module</module>
+    <module>tika-parser-webarchive-module</module>
   </modules>
   <build>
     <pluginManagement>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
new file mode 100644
index 0000000..5a21b8c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <artifactId>tika-parsers-standard-modules</artifactId>
+    <groupId>org.apache.tika</groupId>
+    <version>2.3.1-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+
+  <artifactId>tika-parser-webarchive-module</artifactId>
+  <name>Apache Tika WARC parser module</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.netpreserve</groupId>
+      <artifactId>jwarc</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-compress</artifactId>
+    </dependency>
+    <!-- need these for detection/ungzipping and html parsing in tests -->
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-html-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-pkg-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifestEntries>
+              <Automatic-Module-Name>org.apache.tika.parser.warc</Automatic-Module-Name>
+            </manifestEntries>
+          </archive>
+        </configuration>
+        <executions>
+          <execution>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+  <scm>
+    <tag>2.2.1-rc2</tag>
+  </scm>
+</project>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
new file mode 100644
index 0000000..7697a98
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wacz;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class WACZParser extends AbstractParser {
+    // Parser for WACZ packages: each zip entry under archive/ is parsed as an embedded document.
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<>(Arrays.asList(MediaType.application("x-wacz"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /** Reads via ZipFile if the stream has an open container or file; otherwise streams. */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        if (stream instanceof TikaInputStream) {
+            ZipFile zip = (ZipFile) ((TikaInputStream) stream).getOpenContainer();
+            if (zip == null && ((TikaInputStream)stream).hasFile()) {
+                zip = new ZipFile(((TikaInputStream)stream).getFile());
+            }
+            if (zip != null) {
+                try {
+                    processZip(zip, xhtml, metadata, embeddedDocumentExtractor);
+                } finally {
+                    zip.close();
+                }
+            } else {
+                processStream(stream, xhtml, metadata, embeddedDocumentExtractor);
+            }
+        } else {
+            processStream(stream, xhtml, metadata, embeddedDocumentExtractor);
+        }
+        xhtml.endDocument();
+    }
+
+    /** Walks the zip sequentially; used when no ZipFile can be opened for the input. */
+    private void processStream(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
+                               EmbeddedDocumentExtractor ex) throws SAXException, IOException {
+        try (ZipArchiveInputStream zais = new ZipArchiveInputStream(
+                new CloseShieldInputStream(stream))) {
+            ZipArchiveEntry zae = zais.getNextZipEntry();
+            while (zae != null) {
+                String name = zae.getName();
+                if (name.startsWith("archive/")) {
+                    name = name.substring(8);
+                    // processWARC closes its input; shield zais so later entries can be read
+                    processWARC(new CloseShieldInputStream(zais), zae, name, xhtml, metadata, ex);
+                } else if ("datapackage.json".equals(name)) {
+                    processDataPackage(zais, zae, xhtml, metadata);
+                }
+                //TODO -- process pages (jsonl); process indexes?!
+                zae = zais.getNextZipEntry();
+            }
+        }
+    }
+
+    private void processDataPackage(InputStream is, ZipArchiveEntry zae,
+                                    XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException {
+        //no-op
+    }
+
+    /** Parses one entry as an embedded document (gunzipping if needed) and closes the input. */
+    private void processWARC(InputStream zais, ZipArchiveEntry zae,
+                             String name, XHTMLContentHandler xhtml, Metadata parentMetadata,
+                             EmbeddedDocumentExtractor ex) throws IOException, SAXException {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(zae.getSize()));
+        try (InputStream inputStream = getMaybeGzipInputStream(TikaInputStream.get(zais))) {
+            if (ex.shouldParseEmbedded(metadata)) {
+                ex.parseEmbedded(inputStream, xhtml, metadata, true);
+            }
+        }
+    }
+
+    /** Peeks two bytes (needs mark/reset support) and wraps gzip data in a decompressor. */
+    private InputStream getMaybeGzipInputStream(InputStream is) throws IOException {
+        is.mark(2);
+        byte[] firstTwo = new byte[2];
+        try {
+            IOUtils.readFully(is, firstTwo);
+        } finally {
+            is.reset();
+        }
+        int magic = ((firstTwo[1] & 0xff) << 8) | (firstTwo[0] & 0xff);
+        if (GZIPInputStream.GZIP_MAGIC == magic) {
+            return new GzipCompressorInputStream(is);
+        } else {
+            return is;
+        }
+    }
+
+    /** Visits each entry of an opened ZipFile; entry streams are closed once handled. */
+    private void processZip(ZipFile zip, XHTMLContentHandler xhtml, Metadata metadata,
+                            EmbeddedDocumentExtractor ex) throws IOException, SAXException {
+        Enumeration<ZipArchiveEntry> zaeEnum = zip.getEntries();
+        while (zaeEnum.hasMoreElements()) {
+            ZipArchiveEntry zae = zaeEnum.nextElement();
+            String name = zae.getName();
+            if (name.startsWith("archive/")) {
+                name = name.substring(8);
+                processWARC(TikaInputStream.get(zip.getInputStream(zae)), zae, name, xhtml,
+                        metadata, ex);
+            } else if ("datapackage.json".equals(name)) {
+                try (InputStream is = TikaInputStream.get(zip.getInputStream(zae))) {
+                    processDataPackage(is, zae, xhtml, metadata);
+                }
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
new file mode 100644
index 0000000..95fe9c0
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.warc;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Optional;
+import java.util.Set;
+
+import org.netpreserve.jwarc.HttpResponse;
+import org.netpreserve.jwarc.WarcPayload;
+import org.netpreserve.jwarc.WarcReader;
+import org.netpreserve.jwarc.WarcRecord;
+import org.netpreserve.jwarc.WarcResponse;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.WARC;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class WARCParser extends AbstractParser {
+    // Reads a WARC file record by record; response payloads are parsed as embedded documents.
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+            new HashSet<>(Arrays.asList(MediaType.application("warc"))));
+    // Record types handled by processRecord; declared final so they are true constants.
+    private static final String RESPONSE = "response";
+    private static final String WARCINFO = "warcinfo";
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /** Processes every record in the stream; endDocument() is called even if reading fails. */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+        try (WarcReader warcreader = new WarcReader(stream)) {
+            //TODO: record warnings in metadata: warcreader.onWarning();
+            for (WarcRecord record : warcreader) {
+                processRecord(record, xhtml, metadata, context, embeddedDocumentExtractor);
+            }
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+    /** Handles one record; response errors go into metadata unless the write limit was hit. */
+    private void processRecord(WarcRecord record, XHTMLContentHandler xhtml, Metadata metadata,
+                               ParseContext context,
+                               EmbeddedDocumentExtractor embeddedDocumentExtractor)
+            throws SAXException {
+        if (RESPONSE.equals(record.type())) {
+            try {
+                processResponse((WarcResponse) record, xhtml, context, embeddedDocumentExtractor);
+            } catch (IOException | TikaException e) {
+                EmbeddedDocumentUtil.recordException(e, metadata);
+            } catch (SAXException e) {
+                if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                    throw e;
+                } else {
+                    EmbeddedDocumentUtil.recordException(e, metadata);
+                }
+            }
+        } else if (WARCINFO.equals(record.type())) {
+            processWarcInfo(record, xhtml, context);
+        }
+        //TODO - other warc record types
+    }
+
+    private void processWarcInfo(WarcRecord record, XHTMLContentHandler xhtml,
+                                 ParseContext context) {
+        //NO-OP for now
+    }
+
+    /** Parses the response payload, if any, as an embedded document with fresh metadata. */
+    private void processResponse(WarcResponse warcResponse, XHTMLContentHandler xhtml,
+                                 ParseContext context,
+                                 EmbeddedDocumentExtractor embeddedDocumentExtractor)
+            throws IOException, SAXException, TikaException {
+        Optional<WarcPayload> optionalPayload = warcResponse.payload();
+        if (!optionalPayload.isPresent()) {
+            //TODO handle missing payload?  Report or ignore?
+            return;
+        }
+        Metadata metadata = new Metadata();
+        setNotNull(WARC.WARC_RECORD_CONTENT_TYPE, warcResponse.contentType(), metadata);
+        setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata);
+        processResponseMetadata(warcResponse.http(), metadata);
+        //TODO: process other record metadata
+
+        String id = warcResponse.id().toString();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        WarcPayload payload = optionalPayload.get();
+        // NOTE(review): this replaces the record content type set above; confirm it is intended.
+        metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString());
+        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
+        if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+            try (InputStream tis = TikaInputStream.get(payload.body().stream())) {
+                embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true);
+            }
+        }
+    }
+
+    private void processResponseMetadata(HttpResponse http, Metadata metadata) {
+        //TODO -- no-op for now
+    }
+    /** Sets key to contentType.toString(), or does nothing when contentType is null. */
+    private void setNotNull(Property key, org.netpreserve.jwarc.MediaType contentType,
+                            Metadata metadata) {
+        if (contentType == null) {
+            return;
+        }
+        metadata.set(key, contentType.toString());
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
new file mode 100644
index 0000000..8ce5e1b
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -0,0 +1,18 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.parser.warc.WARCParser
+org.apache.tika.parser.wacz.WACZParser
+
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java
new file mode 100644
index 0000000..e5ca159
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/wacz/WACZParserTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.wacz;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+
+public class WACZParserTest extends TikaTest {
+    // Checks that the WACZ container and its embedded WARC are each reported once.
+    @Test
+    public void testBasic() throws Exception {
+        // The embedded WARC is of type warcinfo, so it has no real content to parse.
+        List<Metadata> parsed = getRecursiveMetadata("testWACZ.wacz");
+        assertEquals(2, parsed.size());
+        assertEquals("application/x-wacz", parsed.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals("application/warc", parsed.get(1).get(Metadata.CONTENT_TYPE));
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
new file mode 100644
index 0000000..ac1f422
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.warc;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+public class WARCParserTest extends TikaTest {
+
+    // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+    // from the jwarc unit tests.
+
+    @Test
+    public void testBasic() throws Exception {
+
+        List<Metadata> metadataList = getRecursiveMetadata("cc.warc.gz");
+        assertEquals(3, metadataList.size());
+        assertContains("text/html", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertContains("Common Crawl on Twitter", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("application/warc", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz
new file mode 100644
index 0000000..223b62c
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/cc.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz
new file mode 100644
index 0000000..8fad890
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/gzip_extra_sl.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz
new file mode 100644
index 0000000..2d4b751
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWACZ.wacz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
index 454713b..5730bdc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/pom.xml
@@ -138,6 +138,11 @@
     </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-webarchive-module</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-xml-module</artifactId>
       <version>${project.version}</version>
     </dependency>