You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/09 14:43:44 UTC

(tika) branch TIKA-4188 updated: TIKA-4188 -- upgrade jwarc and add unit test

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4188
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4188 by this push:
     new 6ac12b6c8 TIKA-4188 -- upgrade jwarc and add unit test
6ac12b6c8 is described below

commit 6ac12b6c8bff62269e4ecb5a9f6d00f8a7495d20
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 9 09:43:28 2024 -0500

    TIKA-4188 -- upgrade jwarc and add unit test
---
 tika-parent/pom.xml                                |  2 +-
 .../apache/tika/parser/warc/WARCParserTest.java    | 12 +++-
 .../src/test/resources/test-documents/example.arc  | 69 ----------------------
 3 files changed, 12 insertions(+), 71 deletions(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index fe002e538..5f76e65c7 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -378,7 +378,7 @@
     <junit5.version>5.10.2</junit5.version>
     <juniversalchardet.version>2.4.0</juniversalchardet.version>
     <junrar.version>7.5.5</junrar.version>
-    <jwarc.version>0.28.5</jwarc.version>
+    <jwarc.version>0.28.6</jwarc.version>
     <kafka.version>3.6.1</kafka.version>
     <libpst.version>0.9.3</libpst.version>
     <log4j2.version>2.22.1</log4j2.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index bb7031550..8dc35bcf9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,7 +18,6 @@ package org.apache.tika.parser.warc;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
-import java.io.File;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -81,4 +80,15 @@ public class WARCParserTest extends TikaTest {
         assertEquals("http://www.uq.edu.au/",
                 metadataList.get(1).get("warc:http:Location"));
     }
+
+    @Test
+    public void testExampleARC() throws Exception {
+        //test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz
+        List<Metadata> metadataList = getRecursiveMetadata("example.arc.gz",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        assertEquals(2, metadataList.size());
+        assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertContains("This domain is established",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
deleted file mode 100644
index 0d2af2bd2..000000000
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc
+++ /dev/null
@@ -1,69 +0,0 @@
-filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75
-1 0 LiveWeb Capture
-URL IP-address Archive-date Content-type Archive-length
-
-http://example.com/ 93.184.216.119 20140216050221 text/html 1591
-HTTP/1.1 200 OK
-Accept-Ranges: bytes
-Cache-Control: max-age=604800
-Content-Type: text/html
-Date: Sun, 16 Feb 2014 05:02:20 GMT
-Etag: "359670651"
-Expires: Sun, 23 Feb 2014 05:02:20 GMT
-Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT
-Server: ECS (sjc/4FCE)
-X-Cache: HIT
-x-ec-custom-error: 1
-Content-Length: 1270
-
-<!doctype html>
-<html>
-<head>
-    <title>Example Domain</title>
-
-    <meta charset="utf-8" />
-    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
-    <meta name="viewport" content="width=device-width, initial-scale=1" />
-    <style type="text/css">
-    body {
-        background-color: #f0f0f2;
-        margin: 0;
-        padding: 0;
-        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
-        
-    }
-    div {
-        width: 600px;
-        margin: 5em auto;
-        padding: 50px;
-        background-color: #fff;
-        border-radius: 1em;
-    }
-    a:link, a:visited {
-        color: #38488f;
-        text-decoration: none;
-    }
-    @media (max-width: 700px) {
-        body {
-            background-color: #fff;
-        }
-        div {
-            width: auto;
-            margin: 0 auto;
-            border-radius: 0;
-            padding: 1em;
-        }
-    }
-    </style>    
-</head>
-
-<body>
-<div>
-    <h1>Example Domain</h1>
-    <p>This domain is established to be used for illustrative examples in documents. You may use this
-    domain in examples without prior coordination or asking for permission.</p>
-    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
-</div>
-</body>
-</html>
-