You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by nd...@apache.org on 2023/02/22 04:30:17 UTC

[tika] 01/01: fix TIKA-3970 - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set. - need to prevent File Node references to the same text from being printed out in the xml stream multiple times. - improve PropertyValue to have a toString so that debugging in IDE is easier.

This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-3970-onenote-dupe-text
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2ea6e7888d275001c7242084c9c22c83c6ab28b1
Author: Nicholas DiPiazza <ni...@gmail.com>
AuthorDate: Tue Feb 21 22:30:01 2023 -0600

    fix TIKA-3970
    - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set.
    - need to prevent File Node references to the same text from being printed out in the xml stream multiple times.
    - improve PropertyValue to have a toString so that debugging in IDE is easier.
---
 .../microsoft/onenote/OneNoteTreeWalker.java       |  59 ++++++++++++++-------
 .../parser/microsoft/onenote/PropertyValue.java    |  13 ++++-
 .../microsoft/onenote/OneNoteParserTest.java       |  14 ++++-
 .../test-documents/test-tika-3970-dupetext.one     | Bin 0 -> 2602160 bytes
 4 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index f5738bb19..90ff013a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -99,6 +99,11 @@ class OneNoteTreeWalker {
     private boolean mostRecentAuthorProp = false;
     private boolean originalAuthorProp = false;
 
+    /**
+     * Contains pairs of {Offset,Length} that we have added to the text stream already.
+     */
+    private Set<Pair<Long, Integer>> textAlreadyFetched = new HashSet<>();
+
     /**
      * Create a one tree walker.
      *
@@ -149,7 +154,7 @@ class OneNoteTreeWalker {
             throws IOException, TikaException, SAXException {
         List<Map<String, Object>> res = new ArrayList<>();
         if (options.isCrawlAllFileNodesFromRoot()) {
-            res.add(walkFileNodeList(oneNoteDocument.root));
+            res.add(walkFileNodeList(oneNoteDocument.root, null));
         } else {
             for (ExtendedGUID revisionListGuid : oneNoteDocument.revisionListOrder) {
                 Map<String, Object> structure = new HashMap<>();
@@ -221,7 +226,7 @@ class OneNoteTreeWalker {
                         child.subType.rootObjectReference.rootObjectReferenceBase.rootRole == 1) {
                     FileNodePtr childFileNodePointer =
                             oneNoteDocument.guidToObject.get(child.gosid);
-                    children.add(walkFileNodePtr(childFileNodePointer));
+                    children.add(walkFileNodePtr(childFileNodePointer, null));
                 }
             }
         }
@@ -239,14 +244,16 @@ class OneNoteTreeWalker {
      * Walk the file node pointer.
      *
      * @param fileNodePtr The file node pointer.
+     * @param parentPropertyId The PropertyId of the parent.
      * @return Returns a map of the main data.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr)
+    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr,
+                                               OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         if (fileNodePtr != null) {
             FileNode fileNode = fileNodePtr.dereference(oneNoteDocument);
-            return walkFileNode(fileNode);
+            return walkFileNode(fileNode, parentPropertyId);
         }
         return Collections.emptyMap();
     }
@@ -258,7 +265,7 @@ class OneNoteTreeWalker {
      * @return The result.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList)
+    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList, OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> structure = new HashMap<>();
         structure.put("oneNoteType", "FileNodeList");
@@ -266,7 +273,7 @@ class OneNoteTreeWalker {
         if (!fileNodeList.children.isEmpty()) {
             List<Map<String, Object>> children = new ArrayList<>();
             for (FileNode child : fileNodeList.children) {
-                children.add(walkFileNode(child));
+                children.add(walkFileNode(child, parentPropertyId));
             }
             structure.put("children", children);
         }
@@ -277,10 +284,12 @@ class OneNoteTreeWalker {
      * Walk a single file node.
      *
      * @param fileNode The file node.
+     * @param parentPropertyId The PropertyId of the parent, or null when there is no parent property.
      * @return Map which is result of the parsed file node.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNode(FileNode fileNode)
+    public Map<String, Object> walkFileNode(FileNode fileNode,
+                                            OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> structure = new HashMap<>();
         structure.put("oneNoteType", "FileNode");
@@ -293,10 +302,10 @@ class OneNoteTreeWalker {
         structure.put("idDesc", fileNode.idDesc);
         if (fileNode.childFileNodeList != null &&
                 fileNode.childFileNodeList.fileNodeListHeader != null) {
-            structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList));
+            structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList, parentPropertyId));
         }
         if (fileNode.propertySet != null) {
-            List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet);
+            List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet, parentPropertyId);
             if (!propSet.isEmpty()) {
                 structure.put("propertySet", propSet);
             }
@@ -360,14 +369,17 @@ class OneNoteTreeWalker {
 
     /**
      * @param propertySet
+     * @param parentPropertyId The PropertyId of the parent, or null when there is no parent property.
      * @return
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    private List<Map<String, Object>> processPropertySet(PropertySet propertySet)
+    private List<Map<String, Object>> processPropertySet(PropertySet propertySet,
+                                                         OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         List<Map<String, Object>> propValues = new ArrayList<>();
-        for (PropertyValue propertyValue : propertySet.rgPridsData) {
-            propValues.add(processPropertyValue(propertyValue));
+        for (int i = 0; i < propertySet.rgPridsData.size(); ++i) {
+            PropertyValue propertyValue = propertySet.rgPridsData.get(i);
+            propValues.add(processPropertyValue(propertyValue, parentPropertyId));
         }
         return propValues;
     }
@@ -391,10 +403,12 @@ class OneNoteTreeWalker {
      * engine parsing.
      *
      * @param propertyValue The property value we are parsing.
+     * @param parentPropertyId The PropertyId of the parent, or null when there is no parent property.
      * @return The map parsed by this property value.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    private Map<String, Object> processPropertyValue(PropertyValue propertyValue)
+    private Map<String, Object> processPropertyValue(PropertyValue propertyValue,
+                                                     OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> propMap = new HashMap<>();
         propMap.put("oneNoteType", "PropertyValue");
@@ -495,7 +509,11 @@ class OneNoteTreeWalker {
                 }
                 if (propertyValue.propertyId.propertyEnum ==
                         OneNotePropertyEnum.RichEditTextUnicode) {
-                    handleRichEditTextUnicode(content.size());
+                    if (!options.isOnlyLatestRevision()
+                            || (parentPropertyId != null && parentPropertyId.propertyEnum != OneNotePropertyEnum.ElementChildNodesOfVersionHistory)) {
+                        // if onlyLatestRevision is set, emit text only when the parent is non-null and is not ElementChildNodesOfVersionHistory
+                        handleRichEditTextUnicode(content.size());
+                    }
                 } else {
                     //TODO -- these seem to be somewhat broken font files and other
                     //odds and ends...what are they and how should we process them?
@@ -507,14 +525,14 @@ class OneNoteTreeWalker {
             List<Map<String, Object>> children = new ArrayList<>();
             for (CompactID compactID : propertyValue.compactIDs) {
                 FileNodePtr childFileNodePointer = oneNoteDocument.guidToObject.get(compactID.guid);
-                children.add(walkFileNodePtr(childFileNodePointer));
+                children.add(walkFileNodePtr(childFileNodePointer, propertyValue.propertyId));
             }
             if (!children.isEmpty()) {
                 propMap.put("children", children);
             }
         }
         if (propertyValue.propertySet != null && propertyValue.propertySet.rgPridsData != null) {
-            List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet);
+            List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet, parentPropertyId);
             if (!propSet.isEmpty()) {
                 propMap.put("propertySet", propSet);
             }
@@ -543,7 +561,12 @@ class OneNoteTreeWalker {
     }
 
     private void handleRichEditTextUnicode(int length)
-            throws SAXException, IOException, TikaException {
+            throws SAXException, IOException {
+        if (!textAlreadyFetched.add(Pair.of(dif.position(), length))) {
+            // do not revisit already visited text, as you may encounter references to the same file nodes
+            // while walking the tree.
+            return;
+        }
         //this is a null-ended UTF-16LE string
         ByteBuffer buf = ByteBuffer.allocate(length);
         dif.read(buf);
@@ -608,4 +631,4 @@ class OneNoteTreeWalker {
     public void setCreationTimestamp(long creationTimestamp) {
         this.creationTimestamp = creationTimestamp;
     }
-}
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
index a641cc713..6cb1ecf17 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
@@ -137,4 +137,15 @@ class PropertyValue {
         this.rawData = rawData;
         return this;
     }
-}
+
+    @Override
+    public String toString() {
+        return "PropertyValue{" +
+                "propertyId=" + propertyId +
+                ", scalar=" + scalar +
+                ", compactIDs=" + compactIDs +
+                ", propertySet=" + propertySet +
+                ", rawData=" + rawData +
+                '}';
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
index 3c04f95a8..9bb252af4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -26,6 +26,7 @@ import java.util.List;
 
 import org.junit.jupiter.api.Test;
 
+import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -275,4 +276,15 @@ public class OneNoteParserTest extends TikaTest {
         assertNotContained("\u432F", txt);
         assertNotContained("\u01E1", txt);
     }
-}
+
+    /**
+     * TIKA-3970 - test duplicate text.
+     */
+    @Test
+    public void testDupeText() throws Exception {
+        Metadata metadata = new Metadata();
+        String txt = getText("test-tika-3970-dupetext.one", metadata);
+
+        assertEquals(1, StringUtils.countMatches(txt, "Sunday morning"));
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one
new file mode 100644
index 000000000..946678313
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one differ