You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/09 14:43:44 UTC
(tika) branch TIKA-4188 updated: TIKA-4188 -- upgrade jwarc and add unit test
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4188 by this push:
new 6ac12b6c8 TIKA-4188 -- upgrade jwarc and add unit test
6ac12b6c8 is described below
commit 6ac12b6c8bff62269e4ecb5a9f6d00f8a7495d20
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 9 09:43:28 2024 -0500
TIKA-4188 -- upgrade jwarc and add unit test
---
tika-parent/pom.xml | 2 +-
.../apache/tika/parser/warc/WARCParserTest.java | 12 +++-
.../src/test/resources/test-documents/example.arc | 69 ----------------------
3 files changed, 12 insertions(+), 71 deletions(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index fe002e538..5f76e65c7 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -378,7 +378,7 @@
<junit5.version>5.10.2</junit5.version>
<juniversalchardet.version>2.4.0</juniversalchardet.version>
<junrar.version>7.5.5</junrar.version>
- <jwarc.version>0.28.5</jwarc.version>
+ <jwarc.version>0.28.6</jwarc.version>
<kafka.version>3.6.1</kafka.version>
<libpst.version>0.9.3</libpst.version>
<log4j2.version>2.22.1</log4j2.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index bb7031550..8dc35bcf9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,7 +18,6 @@ package org.apache.tika.parser.warc;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import java.io.File;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -81,4 +80,15 @@ public class WARCParserTest extends TikaTest {
assertEquals("http://www.uq.edu.au/",
metadataList.get(1).get("warc:http:Location"));
}
+
+ @Test
+ public void testExampleARC() throws Exception {
+ //test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz
+ List<Metadata> metadataList = getRecursiveMetadata("example.arc.gz",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+ assertEquals(2, metadataList.size());
+ assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+ assertContains("This domain is established",
+ metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
deleted file mode 100644
index 0d2af2bd2..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
+++ /dev/null
@@ -1,69 +0,0 @@
-filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
-1 0 LiveWeb Capture
-URL IP-address Archive-date Content-type Archive-length
-
-http://example.com/ 93.184.216.119 20140216050221 text/html 1591
-HTTP/1.1 200 OK
-Accept-Ranges: bytes
-Cache-Control: max-age=604800
-Content-Type: text/html
-Date: Sun, 16 Feb 2014 05:02:20 GMT
-Etag: "359670651"
-Expires: Sun, 23 Feb 2014 05:02:20 GMT
-Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
-Server: ECS (sjc/4FCE)
-X-Cache: HIT
-x-ec-custom-error: 1
-Content-Length: 1270
-
-<!doctype html>
-<html>
-<head>
- <title>Example Domain</title>
-
- <meta charset="utf-8" />
- <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
- <meta name="viewport" content="width=device-width, initial-scale=1" />
- <style type="text/css">
- body {
- background-color: #f0f0f2;
- margin: 0;
- padding: 0;
- font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
-
- }
- div {
- width: 600px;
- margin: 5em auto;
- padding: 50px;
- background-color: #fff;
- border-radius: 1em;
- }
- a:link, a:visited {
- color: #38488f;
- text-decoration: none;
- }
- @media (max-width: 700px) {
- body {
- background-color: #fff;
- }
- div {
- width: auto;
- margin: 0 auto;
- border-radius: 0;
- padding: 1em;
- }
- }
- </style>
-</head>
-
-<body>
-<div>
- <h1>Example Domain</h1>
- <p>This domain is established to be used for illustrative examples in documents. You may use this
- domain in examples without prior coordination or asking for permission.</p>
- <p><a href="http://www.iana.org/domains/example">More information...</a></p>
-</div>
-</body>
-</html>
-