You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/20 20:09:13 UTC

[tika] branch master updated (941d61a -> 77d5745)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  941d61a   update CHANGES.txt in prep for release. reorder changes to most significant first...changes in default behavior  then new parsers...Completely subjective, and I'm open to reordering!
       new  a31ed0d   TIKA-2331 -- more opportunities to check the alleged length of a byte[]
       new  77d5745   TIKA-2024 -- another location where the original source path might be recorded

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/tika/parser/rtf/RTFEmbObjHandler.java   |  9 +++----
 .../apache/tika/parser/rtf/RTFObjDataParser.java   | 28 ++++++++++++++++------
 .../java/org/apache/tika/parser/rtf/RTFParser.java |  2 +-
 .../org/apache/tika/parser/rtf/RTFParserTest.java  |  4 ++--
 4 files changed, 29 insertions(+), 14 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 02/02: TIKA-2024 -- another location where the original source path might be recorded

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 77d57457b31ca0f3faa80fc293cd092dda7c9c40
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 20 15:59:43 2017 -0400

    TIKA-2024 -- another location where the original source path might be recorded
---
 .../src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java    | 2 ++
 .../src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java       | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index bac828f..26f0084 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -29,6 +29,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.xml.sax.ContentHandler;
@@ -185,6 +186,7 @@ class RTFEmbObjHandler {
             if (filePath != null && filePath.length() > 0) {
                 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
                 metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
+                metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, filePath);
             }
             metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
             extractObj(bytes, handler, metadata);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index aed6cf5..7e81329 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -479,8 +479,8 @@ public class RTFParserTest extends TikaTest {
         assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
         assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
 
-        assertEquals(48, meta_jpg.names().length);
-        assertEquals(112, meta_jpg_exif.names().length);
+        assertEquals(49, meta_jpg.names().length);
+        assertEquals(113, meta_jpg_exif.names().length);
     }
 
     @Test

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: TIKA-2331 -- more opportunities to check the alleged length of a byte[]

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a31ed0d995e16d67f510d0ffdaa1d890e1df2da1
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 20 15:56:47 2017 -0400

    TIKA-2331 -- more opportunities to check the alleged length of a byte[]
---
 .../apache/tika/parser/rtf/RTFEmbObjHandler.java   |  7 +++---
 .../apache/tika/parser/rtf/RTFObjDataParser.java   | 28 ++++++++++++++++------
 .../java/org/apache/tika/parser/rtf/RTFParser.java |  2 +-
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 42900fc..bac828f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -151,9 +151,9 @@ class RTFEmbObjHandler {
         if (len < 0) {
             throw new TikaException("Requesting I read < 0 bytes ?!");
         }
-        if (len > memoryLimitInKb) {
+        if (len > memoryLimitInKb*1024) {
             throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len +
-                    ") bytes), but maximum allowed is ("+memoryLimitInKb+")."+
+                    ") bytes), but maximum allowed is ("+(memoryLimitInKb*1024)+")."+
                     "If this is a valid RTF file, consider increasing the memory limit via TikaConfig.");
         }
 
@@ -171,10 +171,9 @@ class RTFEmbObjHandler {
      */
     protected void handleCompletedObject() throws IOException, SAXException, TikaException {
 
-
         byte[] bytes = os.toByteArray();
         if (state == EMB_STATE.OBJDATA) {
-            RTFObjDataParser objParser = new RTFObjDataParser();
+            RTFObjDataParser objParser = new RTFObjDataParser(memoryLimitInKb);
             try {
                 byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount);
                 extractObj(objBytes, handler, metadata);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index b878dd2..a43c789 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -36,6 +36,8 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.util.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.EndianUtils;
 import org.apache.tika.io.TikaInputStream;
@@ -52,7 +54,11 @@ import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 class RTFObjDataParser {
 
     private final static String WIN_ASCII = "WINDOWS-1252";
+    private final int memoryLimitInKb;
 
+    RTFObjDataParser(int memoryLimitInKb) {
+        this.memoryLimitInKb = memoryLimitInKb;
+    }
     /**
      * Parses the embedded object/pict string
      *
@@ -71,7 +77,7 @@ class RTFObjDataParser {
      * @throws IOException
      */
     protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
-            throws IOException {
+            throws IOException, TikaException {
         ByteArrayInputStream is = new ByteArrayInputStream(bytes);
         long version = readUInt(is);
         metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
@@ -192,7 +198,7 @@ class RTFObjDataParser {
      * can return null if there is a linked object
      * instead of an embedded file
      */
-    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException {
+    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException, TikaException {
         //now parse the package header
         ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
         readUShort(is);
@@ -288,7 +294,7 @@ class RTFObjDataParser {
         return sb.toString();
     }
 
-    private String readLengthPrefixedAnsiString(InputStream is) throws IOException {
+    private String readLengthPrefixedAnsiString(InputStream is) throws IOException, TikaException {
         long len = readUInt(is);
         byte[] bytes = readBytes(is, len);
         try {
@@ -300,17 +306,25 @@ class RTFObjDataParser {
     }
 
 
-    private byte[] readBytes(InputStream is, long len) throws IOException {
+    private byte[] readBytes(InputStream is, long len) throws IOException, TikaException {
         //initByteArray tests for "reading of too many bytes"
         byte[] bytes = initByteArray(len);
         IOUtils.readFully(is, bytes);
         return bytes;
     }
 
-    private byte[] initByteArray(long len) throws IOException {
-        if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
-            throw new IOException("Requested length for reading bytes is out of bounds: " + len);
+    private byte[] initByteArray(long len) throws IOException, TikaException {
+        if (len < 0) {
+            throw new IOException("Requested length for reading bytes < 0?!: " + len);
+        } else if (memoryLimitInKb > -1 && len > memoryLimitInKb*1024) {
+            throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len +
+                    ") bytes), but maximum allowed is ("+(memoryLimitInKb*1024)+")."+
+                    "If this is a valid RTF file, consider increasing the memory limit via TikaConfig.");
+        } else if (len > Integer.MAX_VALUE) {
+            throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len +
+                    ") bytes), but there is a hard limit of Integer.MAX_VALUE+");
         }
+
         return new byte[(int) len];
 
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index 567a7a8..a553dc0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -83,7 +83,7 @@ public class RTFParser extends AbstractParser {
     }
 
     @Field
-    private int memoryLimitInKb = EMB_OBJ_MAX_BYTES;
+    private int memoryLimitInKb = EMB_OBJ_MAX_BYTES/1024;
 
     public void parse(
             InputStream stream, ContentHandler handler,

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.