You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/12/08 15:55:58 UTC

[tika] branch main updated: TIKA-3243 -- bump max record length and enable manual configuration of max record length

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e8fa990  TIKA-3243 -- bump max record length and enable manual configuration of max record length
e8fa990 is described below

commit e8fa990e201f9ffdef2da3b0e2237a0b93ed9353
Author: tallison <ta...@apache.org>
AuthorDate: Tue Dec 8 10:55:31 2020 -0500

    TIKA-3243 -- bump max record length and enable manual configuration of max record length
---
 .../org/apache/tika/parser/image/PSDParser.java    | 23 ++++++++++++-----
 .../apache/tika/parser/image/PSDParserTest.java    | 10 ++++++++
 .../tika/parser/image/tika-config-TIKA-3243.xml    | 29 ++++++++++++++++++++++
 3 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
index 8b3e2fd..7dc4253 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
@@ -26,6 +26,7 @@ import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.EndianUtils;
 import org.apache.tika.metadata.Metadata;
@@ -62,9 +63,11 @@ public class PSDParser extends AbstractParser {
             Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                     MediaType.image("vnd.adobe.photoshop"))));
 
-    private static final int MAX_DATA_LENGTH_BYTES = 1000000;
+    private static final int MAX_DATA_LENGTH_BYTES = 10_000_000;
     private static final int MAX_BLOCKS = 10000;
 
+    private int maxDataLengthBytes = MAX_DATA_LENGTH_BYTES;
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -127,7 +130,7 @@ public class PSDParser extends AbstractParser {
         //infinite loop by only reading 10000 blocks
         int blocks = 0;
         while (read < imageResourcesSectionSize && blocks < MAX_BLOCKS) {
-            ResourceBlock rb = new ResourceBlock(stream);
+            ResourceBlock rb = new ResourceBlock(stream, maxDataLengthBytes);
             if (rb.totalLength <= 0) {
                 //break;
             }
@@ -159,6 +162,11 @@ public class PSDParser extends AbstractParser {
         xhtml.endDocument();
     }
 
+    @Field
+    public void setMaxDataLengthBytes(int maxDataLengthBytes) {
+        this.maxDataLengthBytes = maxDataLengthBytes;
+    }
+
     private static class ResourceBlock {
         private static final long SIGNATURE = 0x3842494d; // 8BIM
         private static final int ID_CAPTION = 0x03F0;
@@ -170,12 +178,15 @@ public class PSDParser extends AbstractParser {
         private static final int ID_AUTO_SAVE_FILE_PATH = 0x043E;
         private static final int ID_THUMBNAIL_RESOURCE = 0x040C;
 
+        private final int maxDataLengthBytes;
         private int id;
         private String name;
         private byte[] data;
         private int totalLength;
         static int counter = 0;
-        private ResourceBlock(InputStream stream) throws IOException, TikaException {
+
+        private ResourceBlock(InputStream stream, int maxDataLengthBytes) throws IOException, TikaException {
+            this.maxDataLengthBytes = maxDataLengthBytes;
             counter++;
             // Verify the signature
             long sig = EndianUtils.readIntBE(stream);
@@ -224,9 +235,9 @@ public class PSDParser extends AbstractParser {
             totalLength = 4 + 2 + nameLen + 4 + dataLen;
             // Do we have use for the data segment?
             if (captureData(id)) {
-                if (dataLen > MAX_DATA_LENGTH_BYTES) {
-                    throw new TikaException("data length must be < "+MAX_DATA_LENGTH_BYTES+
-                            ": "+dataLen);
+                if (dataLen > maxDataLengthBytes) {
+                    throw new TikaException("data length must be < " +
+                            maxDataLengthBytes + ": " + dataLen);
                 }
                 data = new byte[dataLen];
                 IOUtils.readFully(stream, data);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/java/org/apache/tika/parser/image/PSDParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
index 657de5d..e2cb091 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
@@ -21,6 +21,8 @@ import static org.junit.Assert.assertEquals;
 import java.io.InputStream;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.parser.ParseContext;
@@ -70,4 +72,12 @@ public class PSDParserTest extends TikaTest {
         assertEquals("Adobe Photoshop CC 2014 (Macintosh)", metadata.get(XMPMM.HISTORY_SOFTWARE_AGENT));
         assertEquals("xmp.iid:63681182-81a0-4035-b4b2-19bea6201c05", metadata.get(XMPMM.HISTORY_EVENT_INSTANCEID));
     }
+
+    @Test (expected = TikaException.class)
+    public void testMaxLength() throws Exception {
+        TikaConfig config = new TikaConfig(getResourceAsStream("tika-config-TIKA-3243.xml"));
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/x-psd");
+        getXML("testPSD_xmp.psd", config.getParser(), metadata);
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/resources/org/apache/tika/parser/image/tika-config-TIKA-3243.xml b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/resources/org/apache/tika/parser/image/tika-config-TIKA-3243.xml
new file mode 100644
index 0000000..a3230ee
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-image-module/src/test/resources/org/apache/tika/parser/image/tika-config-TIKA-3243.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.image.PSDParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.image.PSDParser">
+            <params>
+                <param name="maxDataLengthBytes" type="int">100</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file