You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/06 18:31:34 UTC
(tika) 01/01: TIKA-4188 -- add parsing for arc files
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5bcb9c9ce5e83abed0bec138a24c2f4cd56a890f
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 6 13:30:56 2024 -0500
TIKA-4188 -- add parsing for arc files
---
.../org/apache/tika/parser/warc/WARCParser.java | 5 ++-
.../apache/tika/parser/warc/WARCParserTest.java | 13 ++++++-
.../src/test/resources/test-documents/testARC.arc | 42 ++++++++++++++++++++++
3 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 2c61cae91..baf8d4a8d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -49,11 +49,14 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
+/**
+ * Parses WARC and ARC web archive files using the jwarc library.
+ */
public class WARCParser implements Parser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("warc"),
- MediaType.application("warc+gz"))));
+ MediaType.application("warc+gz"), MediaType.application("x-internet-archive"))));
public static String WARC_PREFIX = "warc:";
public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index c92f8ec15..57cc65bf4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
public class WARCParserTest extends TikaTest {
- // the cc.warc.gz and gzip_extra_sl.warc.gz files come
+ // the cc.warc.gz, gzip_extra_sl.warc.gz and testARC.arc files come
// from the jwarc unit tests.
@Test
@@ -64,4 +64,15 @@ public class WARCParserTest extends TikaTest {
assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
}
+
+ @Test
+ public void testARC() throws Exception {
+ //test file comes from:
+ // https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java
+
+ List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+ debug(metadataList);
+
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
new file mode 100644
index 000000000..d2b4970be
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -0,0 +1,42 @@
+filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338
+1 1 InternetArchive
+URL IP-address Archive-date Content-type Archive-length
+<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd">
+<arc:software>Heritrix 1.5.0-200506132127 http://crawler.archive.org</arc:software>
+<arc:hostname>example.org</arc:hostname>
+<arc:ip>127.0.0.1</arc:ip>
+<dcterms:isPartOf>CRAWL</dcterms:isPartOf>
+<dc:description>Example crawl</dc:description>
+<arc:operator>Example</arc:operator>
+<dc:publisher>Example</dc:publisher>
+<dcterms:audience>Example</dcterms:audience>
+<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00</ns0:date>
+<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.org/)</arc:http-header-user-agent>
+<arc:http-header-from>example@example.org</arc:http-header-from>
+<arc:robots>classic</arc:robots>
+<dc:format>ARC file version 1.1</dc:format>
+<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo>
+</arcmetadata>
+
+dns:www.law.gov.au 207.241.224.11 20050614070144 text/dns 55
+20050614070144
+www.law.gov.au. 6858 IN A 152.91.15.12
+
+http://www.uq.edu.au/robots.txt 130.102.5.51 20050614070151 text/html 524
+HTTP/1.1 302 Found
+Date: Tue, 14 Jun 2005 07:01:49 GMT
+Server: Apache/1.3.28 (Unix) DAV/1.0.3 PHP/4.2.2 mod_perl/1.24_01 mod_ssl/2.8.15 OpenSSL/0.9.7c
+Location: http://www.uq.edu.au/
+Connection: close
+Content-Type: text/html; charset=iso-8859-1
+
+<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
+<HTML><HEAD>
+<TITLE>302 Found</TITLE>
+</HEAD><BODY>
+<H1>Found</H1>
+The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
+<HR>
+<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
+</BODY></HTML>
\ No newline at end of file