You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 16:02:09 UTC

(tika) 01/02: TIKA-4204 -- improve lookup of dataspace/storage items

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c94093684622a2a05d06e9225fd5cc1166814038
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:41:08 2024 -0500

    TIKA-4204 -- improve lookup of dataspace/storage items
    
    (cherry picked from commit eefe884c81a2a94c212e5ed9aa5bbb659e653782)
---
 .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++++++----
 .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java    |  2 +-
 .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java   |  2 +-
 .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java |  2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
index 4af06e446..bb0de014c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
@@ -222,7 +222,7 @@ public class ChmCommons {
      */
     public static final int indexOfResetTableBlock(byte[] text, byte[] pattern)
             throws ChmParsingException {
-        return (indexOf(text, pattern)) - 4;
+        return (indexOfDataSpaceStorageElement(text, pattern)) - 4;
     }
 
     /**
@@ -233,7 +233,7 @@ public class ChmCommons {
      * @return an index, if nothing found returns -1
      * @throws ChmParsingException
      */
-    public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
+    public static int indexOfDataSpaceStorageElement(byte[] text, byte[] pattern) throws ChmParsingException {
         int[] next = null;
         int i = 0, j = -1;
 
@@ -281,15 +281,18 @@ public class ChmCommons {
 
     /**
      * Searches for some pattern in the directory listing entry list
+     * This requires that the entry name start with "::DataSpace/Storage".
+     * See TIKA-4204
      *
      * @param list
      * @param pattern
      * @return an index, if nothing found returns -1
      */
-    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+    public static int indexOfDataSpaceStorageElement(List<DirectoryListingEntry> list, String pattern) {
         int place = 0;
         for (DirectoryListingEntry directoryListingEntry : list) {
-            if (directoryListingEntry.toString().contains(pattern)) {
+            if (directoryListingEntry.getName().startsWith("::DataSpace/Storage") &&
+                    directoryListingEntry.getName().contains(pattern)) {
                 return place;
             }
             ++place;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
index ba1738b65..7081f6bc7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
@@ -119,7 +119,7 @@ public class ChmExtractor {
             getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
 
             setIndexOfContent(ChmCommons
-                    .indexOf(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
+                    .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
             setLzxBlockOffset(
                     (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
                             .getOffset() + getChmItsfHeader().getDataOffset()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
index ccd59bbee..a57e03d5b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
@@ -73,7 +73,7 @@ public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
         ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
         ChmAssert.assertPositiveInt(count);
         this.setDataRemained(data.length);
-        index = ChmCommons.indexOf(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
+        index = ChmCommons.indexOfDataSpaceStorageElement(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
 
         if (index >= 0) {
             System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
index d9c3da81d..e95047a43 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
@@ -53,7 +53,7 @@ public class TestChmLzxState {
         ChmDirectoryListingSet chmDirListCont =
                 new ChmDirectoryListingSet(data, chmItsHeader, chmItspHeader);
         int indexOfControlData = ChmCommons
-                .indexOf(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
+                .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
 
         int indexOfResetTable =
                 ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));