You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 16:02:09 UTC
(tika) 01/02: TIKA-4204 -- improve lookup of dataspace/storage items
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c94093684622a2a05d06e9225fd5cc1166814038
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:41:08 2024 -0500
TIKA-4204 -- improve lookup of dataspace/storage items
(cherry picked from commit eefe884c81a2a94c212e5ed9aa5bbb659e653782)
---
.../java/org/apache/tika/parser/microsoft/chm/ChmCommons.java | 11 +++++++----
.../org/apache/tika/parser/microsoft/chm/ChmExtractor.java | 2 +-
.../org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java | 2 +-
.../org/apache/tika/parser/microsoft/chm/TestChmLzxState.java | 2 +-
4 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
index 4af06e446..bb0de014c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
@@ -222,7 +222,7 @@ public class ChmCommons {
*/
public static final int indexOfResetTableBlock(byte[] text, byte[] pattern)
throws ChmParsingException {
- return (indexOf(text, pattern)) - 4;
+ return (indexOfDataSpaceStorageElement(text, pattern)) - 4;
}
/**
@@ -233,7 +233,7 @@ public class ChmCommons {
* @return an index, if nothing found returns -1
* @throws ChmParsingException
*/
- public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException {
+ public static int indexOfDataSpaceStorageElement(byte[] text, byte[] pattern) throws ChmParsingException {
int[] next = null;
int i = 0, j = -1;
@@ -281,15 +281,18 @@ public class ChmCommons {
/**
* Searches for some pattern in the directory listing entry list
+ * This requires that the entry name start with "::DataSpace/Storage"
+ * See TIKA-4204
*
* @param list
* @param pattern
* @return an index, if nothing found returns -1
*/
- public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+ public static int indexOfDataSpaceStorageElement(List<DirectoryListingEntry> list, String pattern) {
int place = 0;
for (DirectoryListingEntry directoryListingEntry : list) {
- if (directoryListingEntry.toString().contains(pattern)) {
+ if (directoryListingEntry.getName().startsWith("::DataSpace/Storage") &&
+ directoryListingEntry.getName().contains(pattern)) {
return place;
}
++place;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
index ba1738b65..7081f6bc7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmExtractor.java
@@ -119,7 +119,7 @@ public class ChmExtractor {
getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
setIndexOfContent(ChmCommons
- .indexOf(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
+ .indexOfDataSpaceStorageElement(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT));
setLzxBlockOffset(
(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent())
.getOffset() + getChmItsfHeader().getDataOffset()));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
index ccd59bbee..a57e03d5b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmPmgiHeader.java
@@ -73,7 +73,7 @@ public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
ChmAssert.assertPositiveInt(count);
this.setDataRemained(data.length);
- index = ChmCommons.indexOf(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
+ index = ChmCommons.indexOfDataSpaceStorageElement(data, ChmConstants.CHM_PMGI_MARKER.getBytes(UTF_8));
if (index >= 0) {
System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0, count);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
index d9c3da81d..e95047a43 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/chm/TestChmLzxState.java
@@ -53,7 +53,7 @@ public class TestChmLzxState {
ChmDirectoryListingSet chmDirListCont =
new ChmDirectoryListingSet(data, chmItsHeader, chmItspHeader);
int indexOfControlData = ChmCommons
- .indexOf(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
+ .indexOfDataSpaceStorageElement(chmDirListCont.getDirectoryListingEntryList(), ChmConstants.CONTROL_DATA);
int indexOfResetTable =
ChmCommons.indexOfResetTableBlock(data, ChmConstants.LZXC.getBytes(UTF_8));