You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 16:02:08 UTC

(tika) branch branch_2x updated (0ff5834e3 -> 7302ccd17)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 0ff5834e3 TIKA-4162: update aws, miredot-plugin
     new c94093684 TIKA-4204 -- improve lookup of dataspace/storage items
     new 7302ccd17 TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++++++----
 .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java    |  3 ++-
 .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java   |  2 +-
 .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java |  3 ++-
 4 files changed, 12 insertions(+), 7 deletions(-)


(tika) 02/02: TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 7302ccd17cf07506bc7f774b7aeb9cfd2408c05b
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:50:30 2024 -0500

    TIKA-4204 -- improve lookup of dataspace/storage items -- fix checkstyle
    
    Apologies for initially pushing this commit to main instead of an issue branch. :/
    
    (cherry picked from commit 1c1018950c88454ee9a91456931f9d18dde13124)
---
 .../main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java   | 3 ++-
 .../java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
index 7081f6bc7..abe44405a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
@@ -119,7 +119,8 @@ public class ChmExtractor {
             getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
 
             setIndexOfContent(ChmCommons
-                    .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
+                    .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(),
+                            ChmConstants.CONTENT));
             setLzxBlockOffset(
                     (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
                             .getOffset() + getChmItsfHeader().getDataOffset()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
index e95047a43..815820346 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
@@ -53,7 +53,8 @@ public class TestChmLzxState {
         ChmDirectoryListingSet chmDirListCont =
                 new ChmDirectoryListingSet(data, chmItsHeader, chmItspHeader);
         int indexOfControlData = ChmCommons
-                .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
+                .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(),
+                        ChmConstants.CONTROL_DATA);
 
         int indexOfResetTable =
                 ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));


(tika) 01/02: TIKA-4204 -- improve lookup of dataspace/storage items

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c94093684622a2a05d06e9225fd5cc1166814038
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:41:08 2024 -0500

    TIKA-4204 -- improve lookup of dataspace/storage items
    
    (cherry picked from commit eefe884c81a2a94c212e5ed9aa5bbb659e653782)
---
 .../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++++++----
 .../org/apache/tika/parser/microsoft/chm/ChmExtractor.java    |  2 +-
 .../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java   |  2 +-
 .../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java |  2 +-
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
index 4af06e446..bb0de014c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
@@ -222,7 +222,7 @@ public class ChmCommons {
      */
     public static final int indexOfResetTableBlock(byte[] text, byte[] pattern)
             throws ChmParsingException {
-        return (indexOf(text, pattern)) - 4;
+        return (indexOfDataSpaceStorageElement(text, pattern)) - 4;
     }
 
     /**
@@ -233,7 +233,7 @@ public class ChmCommons {
      * @return an index, if nothing found returns -1
      * @throws ChmParsingException
      */
-    public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
+    public static int indexOfDataSpaceStorageElement(byte[] text, byte[] pattern) throws ChmParsingException {
         int[] next = null;
         int i = 0, j = -1;
 
@@ -281,15 +281,18 @@ public class ChmCommons {
 
     /**
      * Searches for some pattern in the directory listing entry list
+     * This requires that the entry name start with "::DataSpace/Storage"
+     * See TIKA-4204
      *
      * @param list
      * @param pattern
      * @return an index, if nothing found returns -1
      */
-    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+    public static int indexOfDataSpaceStorageElement(List<DirectoryListingEntry> list, String pattern) {
         int place = 0;
         for (DirectoryListingEntry directoryListingEntry : list) {
-            if (directoryListingEntry.toString().contains(pattern)) {
+            if (directoryListingEntry.getName().startsWith("::DataSpace/Storage") &&
+                    directoryListingEntry.getName().contains(pattern)) {
                 return place;
             }
             ++place;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
index ba1738b65..7081f6bc7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
@@ -119,7 +119,7 @@ public class ChmExtractor {
             getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
 
             setIndexOfContent(ChmCommons
-                    .indexOf(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
+                    .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
             setLzxBlockOffset(
                     (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
                             .getOffset() + getChmItsfHeader().getDataOffset()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
index ccd59bbee..a57e03d5b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
@@ -73,7 +73,7 @@ public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
         ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
         ChmAssert.assertPositiveInt(count);
         this.setDataRemained(data.length);
-        index = ChmCommons.indexOf(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
+        index = ChmCommons.indexOfDataSpaceStorageElement(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
 
         if (index >= 0) {
             System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
index d9c3da81d..e95047a43 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
@@ -53,7 +53,7 @@ public class TestChmLzxState {
         ChmDirectoryListingSet chmDirListCont =
                 new ChmDirectoryListingSet(data, chmItsHeader, chmItspHeader);
         int indexOfControlData = ChmCommons
-                .indexOf(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
+                .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
 
         int indexOfResetTable =
                 ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));