You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/06 18:31:34 UTC

(tika) 01/01: TIKA-4188 -- add parsing for arc files

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5bcb9c9ce5e83abed0bec138a24c2f4cd56a890f
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 6 13:30:56 2024 -0500

    TIKA-4188 -- add parsing for arc files
---
 .../org/apache/tika/parser/warc/WARCParser.java    |  5 ++-
 .../apache/tika/parser/warc/WARCParserTest.java    | 13 ++++++-
 .../src/test/resources/test-documents/testARC.arc  | 42 ++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 2c61cae91..baf8d4a8d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -49,11 +49,14 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.StringUtils;
 
+/**
+ * Parses WARC and ARC web archive files using the jwarc library.
+ */
 public class WARCParser implements Parser {
 
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("warc"),
-                    MediaType.application("warc+gz"))));
+                    MediaType.application("warc+gz"), MediaType.application("x-internet-archive"))));
 
     public static String WARC_PREFIX = "warc:";
     public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index c92f8ec15..57cc65bf4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 
 public class WARCParserTest extends TikaTest {
 
-    // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+    // the cc.warc.gz, gzip_extra_sl.warc.gz and testARC.arc files come
     // from the jwarc unit tests.
 
     @Test
@@ -64,4 +64,15 @@ public class WARCParserTest extends TikaTest {
         assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
         assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testARC() throws Exception {
+        // The test file comes from the jwarc unit tests:
+        // https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java
+        // NOTE(review): no assertions yet -- this only verifies that parsing does not throw.
+        List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        debug(metadataList);
+
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
new file mode 100644
index 000000000..d2b4970be
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -0,0 +1,42 @@
+filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd">
+<arc:software>Heritrix 1.5.0-200506132127 http://crawler.archive.org</arc:software>
+<arc:hostname>example.org</arc:hostname>
+<arc:ip>127.0.0.1</arc:ip>
+<dcterms:isPartOf>CRAWL</dcterms:isPartOf>
+<dc:description>Example crawl</dc:description>
+<arc:operator>Example</arc:operator>
+<dc:publisher>Example</dc:publisher>
+<dcterms:audience>Example</dcterms:audience>
+<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00</ns0:date>
+<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.org/)</arc:http-header-user-agent>
+<arc:http-header-from>example@example.org</arc:http-header-from>
+<arc:robots>classic</arc:robots>
+<dc:format>ARC file version 1.1</dc:format>
+<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo>
+</arcmetadata>
+
+dns:www.law.gov.au 207.241.224.11 20050614070144 text/dns 55
+20050614070144
+www.law.gov.au.		6858	IN	A	152.91.15.12
+
+http://www.uq.edu.au/robots.txt 130.102.5.51 20050614070151 text/html 524
+HTTP/1.1 302 Found
+Date: Tue, 14 Jun 2005 07:01:49 GMT
+Server: Apache/1.3.28 (Unix) DAV/1.0.3 PHP/4.2.2 mod_perl/1.24_01 mod_ssl/2.8.15 OpenSSL/0.9.7c
+Location: http://www.uq.edu.au/
+Connection: close
+Content-Type: text/html; charset=iso-8859-1
+
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<HTML><HEAD>
+<TITLE>302 Found</TITLE>
+</HEAD><BODY>
+<H1>Found</H1>
+The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
+<HR>
+<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
+</BODY></HTML>
\ No newline at end of file