You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/09 15:43:46 UTC

(tika) branch main updated: TIKA-4188 (#1587)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7d48d00ac TIKA-4188 (#1587)
7d48d00ac is described below

commit 7d48d00ac1febfb1ac70e4887268b28fb4951b78
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Feb 9 10:43:40 2024 -0500

    TIKA-4188 (#1587)
    
    * TIKA-4188 -- add parsing for arc files
---
 .../detect/gzip/GZipSpecializationDetector.java    |   4 ++
 .../org/apache/tika/parser/warc/WARCParser.java    |  14 ++++--
 .../apache/tika/parser/warc/WARCParserTest.java    |  31 ++++++++++++-
 .../test/resources/test-documents/example.arc.gz   | Bin 0 -> 1027 bytes
 .../src/test/resources/test-documents/testARC.arc  |  50 +++++++++++++++++++++
 5 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
index e3d743ad3..b87115b3b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
@@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector {
     public static MediaType GZ = MediaType.application("gzip");
     public static MediaType WARC_GZ = MediaType.application("warc+gz");
 
+    public static MediaType ARC_GZ = MediaType.application("arc+gz");
+
     @Override
     public MediaType detect(InputStream input, Metadata metadata) throws IOException {
         if (input == null) {
@@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector {
         String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8);
         if (s.startsWith("WARC/")) {
             return WARC_GZ;
+        } else if (s.startsWith("filedesc://")) {
+            return ARC_GZ;
         }
         return GZ;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 2c61cae91..ad4894b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -49,11 +49,16 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.StringUtils;
 
+/**
+ * This uses jwarc to parse WARC files and ARC files.
+ */
 public class WARCParser implements Parser {
 
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("warc"),
-                    MediaType.application("warc+gz"))));
+                    MediaType.application("warc+gz"),
+                    MediaType.application("x-internet-archive"),
+                    MediaType.application("arc+gz"))));
 
     public static String WARC_PREFIX = "warc:";
     public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
@@ -130,9 +135,10 @@ public class WARCParser implements Parser {
         setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata);
         processWarcMetadata(warcResponse, metadata);
         processHttpResponseMetadata(warcResponse.http(), metadata);
-
-        String id = warcResponse.id().toString();
-        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        if (warcResponse.warcinfoID().isPresent()) {
+            String id = warcResponse.id().toString();
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        }
         WarcPayload payload = optionalPayload.get();
         metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString());
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index c92f8ec15..56d49aa2b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 
 public class WARCParserTest extends TikaTest {
 
-    // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+    // the cc.warc.gz, gzip_extra_sl.warc.gz, and testARC.arc files come
     // from the jwarc unit tests.
 
     @Test
@@ -64,4 +64,33 @@ public class WARCParserTest extends TikaTest {
         assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
         assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testARC() throws Exception {
+        //test file comes from:
+        // https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java
+
+        List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        assertEquals(2, metadataList.size());
+        assertContains("The document has moved here",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("http://www.uq.edu.au/robots.txt",
+                metadataList.get(1).get("warc:WARC-Target-URI"));
+        assertEquals("http://www.uq.edu.au/",
+                metadataList.get(1).get("warc:http:Location"));
+    }
+
+    @Test
+    public void testArcGZ() throws Exception {
+        //test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz
+        List<Metadata> metadataList = getRecursiveMetadata("example.arc.gz",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        assertEquals(2, metadataList.size());
+        assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertContains("This domain is established",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+
+        //TODO -- we should try to find an example gz with multiple arcs
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz
new file mode 100644
index 000000000..bc959cf18
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
new file mode 100644
index 000000000..b7f099eb3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -0,0 +1,50 @@
+filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd">
+<arc:software>Heritrix 1.5.0-200506132127 http://crawler.archive.org</arc:software>
+<arc:hostname>example.org</arc:hostname>
+<arc:ip>127.0.0.1</arc:ip>
+<dcterms:isPartOf>CRAWL</dcterms:isPartOf>
+<dc:description>Example crawl</dc:description>
+<arc:operator>Example</arc:operator>
+<dc:publisher>Example</dc:publisher>
+<dcterms:audience>Example</dcterms:audience>
+<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00</ns0:date>
+<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.org/)</arc:http-header-user-agent>
+<arc:http-header-from>example@example.org</arc:http-header-from>
+<arc:robots>classic</arc:robots>
+<dc:format>ARC file version 1.1</dc:format>
+<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo>
+</arcmetadata>
+
+dns:www.law.gov.au 207.241.224.11 20050614070144 text/dns 55
+20050614070144
+www.law.gov.au.		6858	IN	A	152.91.15.12
+
+http://www.uq.edu.au/robots.txt 130.102.5.51 20050614070151 text/html 524
+HTTP/1.1 302 Found
+Date: Tue, 14 Jun 2005 07:01:49 GMT
+Server: Apache/1.3.28 (Unix) DAV/1.0.3 PHP/4.2.2 mod_perl/1.24_01 mod_ssl/2.8.15 OpenSSL/0.9.7c
+Location: http://www.uq.edu.au/
+Connection: close
+Content-Type: text/html; charset=iso-8859-1
+
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<HTML><HEAD>
+<TITLE>302 Found</TITLE>
+</HEAD><BODY>
+<H1>Found</H1>
+The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
+<HR>
+<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
+</BODY></HTML>
+
+
+
+
+
+
+
+