You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by nd...@apache.org on 2023/02/22 04:30:16 UTC

[tika] branch TIKA-3970-onenote-dupe-text created (now 2ea6e7888)

This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a change to branch TIKA-3970-onenote-dupe-text
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 2ea6e7888 fix TIKA-3970 - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set. - need to prevent File Node references to the same text from being printed out in the xml stream multiple times. - improve PropertyValue to have a toString so that debugging in IDE is easier.

This branch includes the following new commits:

     new 2ea6e7888 fix TIKA-3970 - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set. - need to prevent File Node references to the same text from being printed out in the xml stream multiple times. - improve PropertyValue to have a toString so that debugging in IDE is easier.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: fix TIKA-3970 - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set. - need to prevent File Node references to the same text from being printed out in the xml stream multiple times. - improve PropertyValue to have a toString so that debugging in IDE is easier.

Posted by nd...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

ndipiazza pushed a commit to branch TIKA-3970-onenote-dupe-text
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2ea6e7888d275001c7242084c9c22c83c6ab28b1
Author: Nicholas DiPiazza <ni...@gmail.com>
AuthorDate: Tue Feb 21 22:30:01 2023 -0600

    fix TIKA-3970
    - start correctly handling the onlyLatestRevision option by detecting parent property id of type ElementChildNodesOfVersionHistory and do not print those if that flag is set.
    - need to prevent File Node references to the same text from being printed out in the xml stream multiple times.
    - improve PropertyValue to have a toString so that debugging in IDE is easier.
---
 .../microsoft/onenote/OneNoteTreeWalker.java       |  59 ++++++++++++++-------
 .../parser/microsoft/onenote/PropertyValue.java    |  13 ++++-
 .../microsoft/onenote/OneNoteParserTest.java       |  14 ++++-
 .../test-documents/test-tika-3970-dupetext.one     | Bin 0 -> 2602160 bytes
 4 files changed, 66 insertions(+), 20 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index f5738bb19..90ff013a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -99,6 +99,11 @@ class OneNoteTreeWalker {
     private boolean mostRecentAuthorProp = false;
     private boolean originalAuthorProp = false;
 
+    /**
+     * Contains pairs of {Offset,Length} that we have added to the text stream already.
+     */
+    private Set<Pair<Long, Integer>> textAlreadyFetched = new HashSet<>();
+
     /**
      * Create a one tree walker.
      *
@@ -149,7 +154,7 @@ class OneNoteTreeWalker {
             throws IOException, TikaException, SAXException {
         List<Map<String, Object>> res = new ArrayList<>();
         if (options.isCrawlAllFileNodesFromRoot()) {
-            res.add(walkFileNodeList(oneNoteDocument.root));
+            res.add(walkFileNodeList(oneNoteDocument.root, null));
         } else {
             for (ExtendedGUID revisionListGuid : oneNoteDocument.revisionListOrder) {
                 Map<String, Object> structure = new HashMap<>();
@@ -221,7 +226,7 @@ class OneNoteTreeWalker {
                         child.subType.rootObjectReference.rootObjectReferenceBase.rootRole == 1) {
                     FileNodePtr childFileNodePointer =
                             oneNoteDocument.guidToObject.get(child.gosid);
-                    children.add(walkFileNodePtr(childFileNodePointer));
+                    children.add(walkFileNodePtr(childFileNodePointer, null));
                 }
             }
         }
@@ -239,14 +244,16 @@ class OneNoteTreeWalker {
      * Walk the file node pointer.
      *
      * @param fileNodePtr The file node pointer.
+     * @param parentPropertyId The PropertyId of the parent.
      * @return Returns a map of the main data.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr)
+    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr,
+                                               OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         if (fileNodePtr != null) {
             FileNode fileNode = fileNodePtr.dereference(oneNoteDocument);
-            return walkFileNode(fileNode);
+            return walkFileNode(fileNode, parentPropertyId);
         }
         return Collections.emptyMap();
     }
@@ -258,7 +265,7 @@ class OneNoteTreeWalker {
      * @return The result.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList)
+    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList, OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> structure = new HashMap<>();
         structure.put("oneNoteType", "FileNodeList");
@@ -266,7 +273,7 @@ class OneNoteTreeWalker {
         if (!fileNodeList.children.isEmpty()) {
             List<Map<String, Object>> children = new ArrayList<>();
             for (FileNode child : fileNodeList.children) {
-                children.add(walkFileNode(child));
+                children.add(walkFileNode(child, parentPropertyId));
             }
             structure.put("children", children);
         }
@@ -277,10 +284,12 @@ class OneNoteTreeWalker {
      * Walk a single file node.
      *
      * @param fileNode The file node.
+     * @param parentPropertyId The PropertyId of the parent; may be null.
      * @return Map which is result of the parsed file node.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    public Map<String, Object> walkFileNode(FileNode fileNode)
+    public Map<String, Object> walkFileNode(FileNode fileNode,
+                                            OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> structure = new HashMap<>();
         structure.put("oneNoteType", "FileNode");
@@ -293,10 +302,10 @@ class OneNoteTreeWalker {
         structure.put("idDesc", fileNode.idDesc);
         if (fileNode.childFileNodeList != null &&
                 fileNode.childFileNodeList.fileNodeListHeader != null) {
-            structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList));
+            structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList, parentPropertyId));
         }
         if (fileNode.propertySet != null) {
-            List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet);
+            List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet, parentPropertyId);
             if (!propSet.isEmpty()) {
                 structure.put("propertySet", propSet);
             }
@@ -360,14 +369,17 @@ class OneNoteTreeWalker {
 
     /**
      * @param propertySet
+     * @param parentPropertyId The PropertyId of the parent, passed on to each property value; may be null.
      * @return
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    private List<Map<String, Object>> processPropertySet(PropertySet propertySet)
+    private List<Map<String, Object>> processPropertySet(PropertySet propertySet,
+                                                         OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         List<Map<String, Object>> propValues = new ArrayList<>();
-        for (PropertyValue propertyValue : propertySet.rgPridsData) {
-            propValues.add(processPropertyValue(propertyValue));
+        for (int i = 0; i < propertySet.rgPridsData.size(); ++i) {
+            PropertyValue propertyValue = propertySet.rgPridsData.get(i);
+            propValues.add(processPropertyValue(propertyValue, parentPropertyId));
         }
         return propValues;
     }
@@ -391,10 +403,12 @@ class OneNoteTreeWalker {
      * engine parsing.
      *
      * @param propertyValue The property value we are parsing.
+     * @param parentPropertyId The PropertyId of the parent (may be null); checked when onlyLatestRevision is set.
      * @return The map parsed by this property value.
      * @throws IOException Can throw these when manipulating the seekable byte channel.
      */
-    private Map<String, Object> processPropertyValue(PropertyValue propertyValue)
+    private Map<String, Object> processPropertyValue(PropertyValue propertyValue,
+                                                     OneNotePropertyId parentPropertyId)
             throws IOException, TikaException, SAXException {
         Map<String, Object> propMap = new HashMap<>();
         propMap.put("oneNoteType", "PropertyValue");
@@ -495,7 +509,11 @@ class OneNoteTreeWalker {
                 }
                 if (propertyValue.propertyId.propertyEnum ==
                         OneNotePropertyEnum.RichEditTextUnicode) {
-                    handleRichEditTextUnicode(content.size());
+                    if (!options.isOnlyLatestRevision()
+                            || (parentPropertyId != null && parentPropertyId.propertyEnum != OneNotePropertyEnum.ElementChildNodesOfVersionHistory)) {
+                        // only handle text for the latest revision, unless the options have the onlyLatestRevision = false
+                        handleRichEditTextUnicode(content.size());
+                    }
                 } else {
                     //TODO -- these seem to be somewhat broken font files and other
                     //odds and ends...what are they and how should we process them?
@@ -507,14 +525,14 @@ class OneNoteTreeWalker {
             List<Map<String, Object>> children = new ArrayList<>();
             for (CompactID compactID : propertyValue.compactIDs) {
                 FileNodePtr childFileNodePointer = oneNoteDocument.guidToObject.get(compactID.guid);
-                children.add(walkFileNodePtr(childFileNodePointer));
+                children.add(walkFileNodePtr(childFileNodePointer, propertyValue.propertyId));
             }
             if (!children.isEmpty()) {
                 propMap.put("children", children);
             }
         }
         if (propertyValue.propertySet != null && propertyValue.propertySet.rgPridsData != null) {
-            List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet);
+            List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet, parentPropertyId);
             if (!propSet.isEmpty()) {
                 propMap.put("propertySet", propSet);
             }
@@ -543,7 +561,12 @@ class OneNoteTreeWalker {
     }
 
     private void handleRichEditTextUnicode(int length)
-            throws SAXException, IOException, TikaException {
+            throws SAXException, IOException {
+        if (!textAlreadyFetched.add(Pair.of(dif.position(), length))) {
+            // do not revisit already visited text, as you may encounter references to the same file nodes
+            // while walking the tree.
+            return;
+        }
         //this is a null-ended UTF-16LE string
         ByteBuffer buf = ByteBuffer.allocate(length);
         dif.read(buf);
@@ -608,4 +631,4 @@ class OneNoteTreeWalker {
     public void setCreationTimestamp(long creationTimestamp) {
         this.creationTimestamp = creationTimestamp;
     }
-}
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
index a641cc713..6cb1ecf17 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
@@ -137,4 +137,15 @@ class PropertyValue {
         this.rawData = rawData;
         return this;
     }
-}
+
+    @Override
+    public String toString() {
+        return "PropertyValue{" +
+                "propertyId=" + propertyId +
+                ", scalar=" + scalar +
+                ", compactIDs=" + compactIDs +
+                ", propertySet=" + propertySet +
+                ", rawData=" + rawData +
+                '}';
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
index 3c04f95a8..9bb252af4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -26,6 +26,7 @@ import java.util.List;
 
 import org.junit.jupiter.api.Test;
 
+import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -275,4 +276,15 @@ public class OneNoteParserTest extends TikaTest {
         assertNotContained("\u432F", txt);
         assertNotContained("\u01E1", txt);
     }
-}
+
+    /**
+     * TIKA-3970 - test duplicate text.
+     */
+    @Test
+    public void testDupeText() throws Exception {
+        Metadata metadata = new Metadata();
+        String txt = getText("test-tika-3970-dupetext.one", metadata);
+
+        assertEquals(1, StringUtils.countMatches(txt, "Sunday morning"));
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one
new file mode 100644
index 000000000..946678313
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/test-tika-3970-dupetext.one differ