You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/07 16:08:02 UTC

(tika) branch TIKA-4188 updated: TIKA-4188 -- WIP -- initial steps towards parsing arc files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4188 by this push:
     new 0d584aebc TIKA-4188 -- WIP -- initial steps towards parsing arc files.
0d584aebc is described below

commit 0d584aebc4694643d2d3c62d49e8a4ccd8d4e6c9
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 7 11:07:49 2024 -0500

    TIKA-4188 -- WIP -- initial steps towards parsing arc files.
---
 .../detect/gzip/GZipSpecializationDetector.java    |   4 ++
 .../org/apache/tika/parser/warc/WARCParser.java    |  11 ++--
 .../apache/tika/parser/warc/WARCParserTest.java    |  10 ++-
 .../src/test/resources/test-documents/example.arc  |  69 +++++++++++++++++++++
 .../test/resources/test-documents/example.arc.gz   | Bin 0 -> 1027 bytes
 .../src/test/resources/test-documents/testARC.arc  |  10 ++-
 6 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
index e3d743ad3..b87115b3b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
@@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector {
     public static MediaType GZ = MediaType.application("gzip");
     public static MediaType WARC_GZ = MediaType.application("warc+gz");
 
+    public static MediaType ARC_GZ = MediaType.application("arc+gz");
+
     @Override
     public MediaType detect(InputStream input, Metadata metadata) throws IOException {
         if (input == null) {
@@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector {
         String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8);
         if (s.startsWith("WARC/")) {
             return WARC_GZ;
+        } else if (s.startsWith("filedesc://")) {
+            return ARC_GZ;
         }
         return GZ;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index baf8d4a8d..ad4894b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -56,7 +56,9 @@ public class WARCParser implements Parser {
 
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
             new HashSet<>(Arrays.asList(MediaType.application("warc"),
-                    MediaType.application("warc+gz"), MediaType.application("x-internet-archive"))));
+                    MediaType.application("warc+gz"),
+                    MediaType.application("x-internet-archive"),
+                    MediaType.application("arc+gz"))));
 
     public static String WARC_PREFIX = "warc:";
     public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
@@ -133,9 +135,10 @@ public class WARCParser implements Parser {
         setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata);
         processWarcMetadata(warcResponse, metadata);
         processHttpResponseMetadata(warcResponse.http(), metadata);
-
-        String id = warcResponse.id().toString();
-        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        if (warcResponse.warcinfoID().isPresent()) {
+            String id = warcResponse.id().toString();
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+        }
         WarcPayload payload = optionalPayload.get();
         metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString());
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index 57cc65bf4..bb7031550 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.warc;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.io.File;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -72,7 +73,12 @@ public class WARCParserTest extends TikaTest {
 
         List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
                 BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
-        debug(metadataList);
-
+        assertEquals(2, metadataList.size());
+        assertContains("The document has moved here",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("http://www.uq.edu.au/robots.txt",
+                metadataList.get(1).get("warc:WARC-Target-URI"));
+        assertEquals("http://www.uq.edu.au/",
+                metadataList.get(1).get("warc:http:Location"));
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
new file mode 100644
index 000000000..0d2af2bd2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
@@ -0,0 +1,69 @@
+filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
+1 0 LiveWeb Capture
+URL IP-address Archive-date Content-type Archive-length
+
+http://example.com/ 93.184.216.119 20140216050221 text/html 1591
+HTTP/1.1 200 OK
+Accept-Ranges: bytes
+Cache-Control: max-age=604800
+Content-Type: text/html
+Date: Sun, 16 Feb 2014 05:02:20 GMT
+Etag: "359670651"
+Expires: Sun, 23 Feb 2014 05:02:20 GMT
+Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
+Server: ECS (sjc/4FCE)
+X-Cache: HIT
+x-ec-custom-error: 1
+Content-Length: 1270
+
+<!doctype html>
+<html>
+<head>
+    <title>Example Domain</title>
+
+    <meta charset="utf-8" />
+    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <style type="text/css">
+    body {
+        background-color: #f0f0f2;
+        margin: 0;
+        padding: 0;
+        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+        
+    }
+    div {
+        width: 600px;
+        margin: 5em auto;
+        padding: 50px;
+        background-color: #fff;
+        border-radius: 1em;
+    }
+    a:link, a:visited {
+        color: #38488f;
+        text-decoration: none;
+    }
+    @media (max-width: 700px) {
+        body {
+            background-color: #fff;
+        }
+        div {
+            width: auto;
+            margin: 0 auto;
+            border-radius: 0;
+            padding: 1em;
+        }
+    }
+    </style>    
+</head>
+
+<body>
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is established to be used for illustrative examples in documents. You may use this
+    domain in examples without prior coordination or asking for permission.</p>
+    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>
+
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz
new file mode 100644
index 000000000..bc959cf18
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
index d2b4970be..b7f099eb3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -39,4 +39,12 @@ Content-Type: text/html; charset=iso-8859-1
 The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
 <HR>
 <ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
-</BODY></HTML>
\ No newline at end of file
+</BODY></HTML>
+
+
+
+
+
+
+
+