You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/07 16:08:02 UTC
(tika) branch TIKA-4188 updated: TIKA-4188 -- WIP -- initial steps towards parsing arc files.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4188 by this push:
new 0d584aebc TIKA-4188 -- WIP -- initial steps towards parsing arc files.
0d584aebc is described below
commit 0d584aebc4694643d2d3c62d49e8a4ccd8d4e6c9
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 7 11:07:49 2024 -0500
TIKA-4188 -- WIP -- initial steps towards parsing arc files.
---
.../detect/gzip/GZipSpecializationDetector.java | 4 ++
.../org/apache/tika/parser/warc/WARCParser.java | 11 ++--
.../apache/tika/parser/warc/WARCParserTest.java | 10 ++-
.../src/test/resources/test-documents/example.arc | 69 +++++++++++++++++++++
.../test/resources/test-documents/example.arc.gz | Bin 0 -> 1027 bytes
.../src/test/resources/test-documents/testARC.arc | 10 ++-
6 files changed, 97 insertions(+), 7 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
index e3d743ad3..b87115b3b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
@@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector {
public static MediaType GZ = MediaType.application("gzip");
public static MediaType WARC_GZ = MediaType.application("warc+gz");
+ public static MediaType ARC_GZ = MediaType.application("arc+gz");
+
@Override
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
if (input == null) {
@@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector {
String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8);
if (s.startsWith("WARC/")) {
return WARC_GZ;
+ } else if (s.startsWith("filedesc://")) {
+ return ARC_GZ;
}
return GZ;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index baf8d4a8d..ad4894b54 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -56,7 +56,9 @@ public class WARCParser implements Parser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(Arrays.asList(MediaType.application("warc"),
- MediaType.application("warc+gz"), MediaType.application("x-internet-archive"))));
+ MediaType.application("warc+gz"),
+ MediaType.application("x-internet-archive"),
+ MediaType.application("arc+gz"))));
public static String WARC_PREFIX = "warc:";
public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
@@ -133,9 +135,10 @@ public class WARCParser implements Parser {
setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata);
processWarcMetadata(warcResponse, metadata);
processHttpResponseMetadata(warcResponse.http(), metadata);
-
- String id = warcResponse.id().toString();
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+ if (warcResponse.warcinfoID().isPresent()) {
+ String id = warcResponse.id().toString();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id);
+ }
WarcPayload payload = optionalPayload.get();
metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString());
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index 57cc65bf4..bb7031550 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.warc;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.io.File;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -72,7 +73,12 @@ public class WARCParserTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata("testARC.arc",
BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
- debug(metadataList);
-
+ assertEquals(2, metadataList.size());
+ assertContains("The document has moved here",
+ metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ assertEquals("http://www.uq.edu.au/robots.txt",
+ metadataList.get(1).get("warc:WARC-Target-URI"));
+ assertEquals("http://www.uq.edu.au/",
+ metadataList.get(1).get("warc:http:Location"));
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
new file mode 100644
index 000000000..0d2af2bd2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
@@ -0,0 +1,69 @@
+filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
+1 0 LiveWeb Capture
+URL IP-address Archive-date Content-type Archive-length
+
+http://example.com/ 93.184.216.119 20140216050221 text/html 1591
+HTTP/1.1 200 OK
+Accept-Ranges: bytes
+Cache-Control: max-age=604800
+Content-Type: text/html
+Date: Sun, 16 Feb 2014 05:02:20 GMT
+Etag: "359670651"
+Expires: Sun, 23 Feb 2014 05:02:20 GMT
+Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
+Server: ECS (sjc/4FCE)
+X-Cache: HIT
+x-ec-custom-error: 1
+Content-Length: 1270
+
+<!doctype html>
+<html>
+<head>
+ <title>Example Domain</title>
+
+ <meta charset="utf-8" />
+ <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
+ <style type="text/css">
+ body {
+ background-color: #f0f0f2;
+ margin: 0;
+ padding: 0;
+ font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
+
+ }
+ div {
+ width: 600px;
+ margin: 5em auto;
+ padding: 50px;
+ background-color: #fff;
+ border-radius: 1em;
+ }
+ a:link, a:visited {
+ color: #38488f;
+ text-decoration: none;
+ }
+ @media (max-width: 700px) {
+ body {
+ background-color: #fff;
+ }
+ div {
+ width: auto;
+ margin: 0 auto;
+ border-radius: 0;
+ padding: 1em;
+ }
+ }
+ </style>
+</head>
+
+<body>
+<div>
+ <h1>Example Domain</h1>
+ <p>This domain is established to be used for illustrative examples in documents. You may use this
+ domain in examples without prior coordination or asking for permission.</p>
+ <p><a href="http://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>
+
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz
new file mode 100644
index 000000000..bc959cf18
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
index d2b4970be..b7f099eb3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc
@@ -39,4 +39,12 @@ Content-Type: text/html; charset=iso-8859-1
The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P>
<HR>
<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS>
-</BODY></HTML>
\ No newline at end of file
+</BODY></HTML>
+
+
+
+
+
+
+
+