You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 16:10:03 UTC

[tika] branch TIKA-4055 created (now a2267afc6)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4055
in repository https://gitbox.apache.org/repos/asf/tika.git


      at a2267afc6 TIKA-4002 -- add mime type detection for pcapng

This branch includes the following new commits:

     new a2267afc6 TIKA-4002 -- add mime type detection for pcapng

The 1 revision listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4002 -- add mime type detection for pcapng

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4055
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a2267afc66d30a425a8ad6482adeb22dd4b91897
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 26 12:09:52 2023 -0400

    TIKA-4002 -- add mime type detection for pcapng
---
 CHANGES.txt                                        |  6 +++
 .../apache/tika/parser/RecursiveParserWrapper.java |  2 +
 .../tika/parser/RecursiveParserWrapperTest.java    | 61 +++++++++++++++++++---
 .../src/test/resources/log4j.properties            |  2 +-
 .../core/resource/RecursiveMetadataResource.java   |  4 +-
 .../standard/RecursiveMetadataResourceTest.java    | 14 ++---
 6 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b3ac0be3b..5526b5f86 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.8.1 - ???
+
+   * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
+
+   * Add mime detection for many files (TIKA-3992).
+
 Release 2.8.0 - 5/11/2023
 
    * Enable counting and/or parsing of incremental updates in PDFs.  This
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 483181b0a..e8f029770 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -372,6 +372,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.characters(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
@@ -389,6 +390,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.ignorableWhitespace(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 24800926a..61eeab14d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -93,14 +93,15 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        70));
         try (InputStream stream =
                     getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
             wrapper.parse(stream, handler, metadata, context);
         }
         List<Metadata> list = handler.getMetadataList();
 
-        assertEquals(5, list.size());
+        assertEquals(2, list.size());
 
         int wlr = 0;
         for (Metadata m : list) {
@@ -112,15 +113,31 @@ public class RecursiveParserWrapperTest extends TikaTest {
         assertEquals(2, wlr);
     }
 
+    @Test
+    public void testOne() throws Exception {
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        int writeLimit = 100;
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream(
+                "/test-documents/test_recursive_embedded" + ".docx")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertEquals(12, list.size());
+    }
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();
         Metadata metadata = new Metadata();
-
+        int writeLimit = 500;
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500,
-                        false, context));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
         try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" +
                 ".docx")) {
             wrapper.parse(stream, handler, metadata, context);
@@ -131,11 +148,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
-        assertNotContained("unalienable Rights",
+        assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("them to the separation",
                 list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testSpecificLimit() throws Exception {
+        int writeLimit = 60;
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertTrue(writeLimit >= getContentLength(list),
+                "writeLimit=" + writeLimit + " contentLength=" + getContentLength(list));
+    }
+
+    private int getContentLength(List<Metadata> metadataList) {
+        int sz = 0;
+        for (Metadata metadata : metadataList) {
+            String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+            if (content != null) {
+                sz += content.length();
+            }
+        }
+        return sz;
+    }
+
     @Test
     public void testMaxEmbedded() throws Exception {
         int maxEmbedded = 4;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
index 8c106427a..bd6faa7a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 #info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=error,stdout
 
 #console
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 76e24b926..ac4837110 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -69,9 +69,11 @@ public class RecursiveMetadataResource {
         fillMetadata(parser, metadata, httpHeaders);
         fillParseContext(httpHeaders, metadata, context);
         TikaResource.logRequest(LOG, "/rmeta", metadata);
+
         BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit()),
+                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
+                        handlerConfig.isThrowOnWriteLimitReached(), context),
                 handlerConfig.getMaxEmbeddedResources(),
                 TikaResource.getConfig().getMetadataFilter());
         try {
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 3de5c0e65..691554edb 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -345,8 +345,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertEquals(1, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -390,11 +390,11 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
+        assertEquals(12, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .header("throwOnWriteLimitReached", "false")
@@ -404,8 +404,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
-        assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+        assertEquals(12, metadataList.size());
+        assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
         assertContains("When in the Course of human events it becomes necessary for one people",
                 metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
         TikaTest.assertNotContained("We hold these truths",