You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/06 18:31:33 UTC

(tika) branch TIKA-4188 created (now 5bcb9c9ce)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 5bcb9c9ce TIKA-4188 -- add parsing for arc files

This branch includes the following new commits:

     new 5bcb9c9ce TIKA-4188 -- add parsing for arc files

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4188 -- add parsing for arc files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5bcb9c9ce5e83abed0bec138a24c2f4cd56a890f
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 6 13:30:56 2024 -0500

    TIKA-4188 -- add parsing for arc files
---
 .../org/apache/tika/parser/warc/WARCParser.java    |  5 ++-
 .../apache/tika/parser/warc/WARCParserTest.java    | 13 ++++++-
 .../src/test/resources/test-documents/testARC.arc  | 42 ++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 2c61cae91..baf8d4a8d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -49,11 +49,14 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.StringUtils;
 
+/**
+ * This uses jwarc to parse warc files and arc files
+ */
 public class WARCParser implements Parser {
 
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("warc"),
-                    MediaType.application("warc+gz"))));
+                    MediaType.application("warc+gz"), MediaType.application("x-internet-archive"))));
 
     public static String WARC_PREFIX = "warc:";
     public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index c92f8ec15..57cc65bf4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 
 public class WARCParserTest extends TikaTest {
 
-    // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+    // the cc.warc.gz and gzip_extra_sl.warc.gz and the testARC.arc files come
     // from the jwarc unit tests.
 
     @Test
@@ -64,4 +64,15 @@ public class WARCParserTest extends TikaTest {
         assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
         assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testARC() throws Exception {
+        //test file comes from:
+        // https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java
+
+        List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        debug(metadataList);
+
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
new file mode 100644
index 000000000..d2b4970be
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -0,0 +1,42 @@
+filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd">
+<arc:software>Heritrix 1.5.0-200506132127 http://crawler.archive.org</arc:software>
+<arc:hostname>example.org</arc:hostname>
+<arc:ip>127.0.0.1</arc:ip>
+<dcterms:isPartOf>CRAWL</dcterms:isPartOf>
+<dc:description>Example crawl</dc:description>
+<arc:operator>Example</arc:operator>
+<dc:publisher>Example</dc:publisher>
+<dcterms:audience>Example</dcterms:audience>
+<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00</ns0:date>
+<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.org/)</arc:http-header-user-agent>
+<arc:http-header-from>example@example.org</arc:http-header-from>
+<arc:robots>classic</arc:robots>
+<dc:format>ARC file version 1.1</dc:format>
+<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo>
+</arcmetadata>
+
+dns:www.law.gov.au 207.241.224.11 20050614070144 text/dns 55
+20050614070144
+www.law.gov.au.		6858	IN	A	152.91.15.12
+
+http://www.uq.edu.au/robots.txt 130.102.5.51 20050614070151 text/html 524
+HTTP/1.1 302 Found
+Date: Tue, 14 Jun 2005 07:01:49 GMT
+Server: Apache/1.3.28 (Unix) DAV/1.0.3 PHP/4.2.2 mod_perl/1.24_01 mod_ssl/2.8.15 OpenSSL/0.9.7c
+Location: http://www.uq.edu.au/
+Connection: close
+Content-Type: text/html; charset=iso-8859-1
+
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<HTML><HEAD>
+<TITLE>302 Found</TITLE>
+</HEAD><BODY>
+<H1>Found</H1>
+The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
+<HR>
+<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
+</BODY></HTML>
\ No newline at end of file