You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/12/17 12:26:06 UTC
[tika] 07/07: squash commits of one note tika parser (#303)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ec0c88ce05b7852ea832fb1a7273c88fcef04261
Author: Nicholas DiPiazza <ni...@lucidworks.com>
AuthorDate: Mon Dec 16 15:56:48 2019 -0600
squash commits of one note tika parser (#303)
---
.../java/org/apache/tika/TikaDetectionTest.java | 1 +
.../microsoft/onenote/CheckedFileNodePushBack.java | 43 +
.../tika/parser/microsoft/onenote/CompactID.java | 60 +
.../tika/parser/microsoft/onenote/Error.java | 29 +
.../parser/microsoft/onenote/ExtendedGUID.java | 87 ++
.../microsoft/onenote/FileChunkReference.java | 99 ++
.../microsoft/onenote/FileDataStoreObject.java | 31 +
.../onenote/FileDataStoreObjectReference.java | 30 +
.../tika/parser/microsoft/onenote/FileNode.java | 278 +++++
.../parser/microsoft/onenote/FileNodeList.java | 43 +
.../microsoft/onenote/FileNodeListHeader.java | 92 ++
.../tika/parser/microsoft/onenote/FileNodePtr.java | 65 ++
.../microsoft/onenote/FileNodePtrBackPush.java | 30 +
.../parser/microsoft/onenote/FileNodeUnion.java | 140 +++
.../microsoft/onenote/FndStructureConstants.java | 166 +++
.../apache/tika/parser/microsoft/onenote/GUID.java | 130 +++
.../microsoft/onenote/GlobalIdTableEntry2FNDX.java | 40 +
.../microsoft/onenote/GlobalIdTableEntry3FNDX.java | 50 +
.../microsoft/onenote/GlobalIdTableEntryFNDX.java | 40 +
.../microsoft/onenote/GlobalIdTableStartFNDX.java | 30 +
.../tika/parser/microsoft/onenote/IndentUtil.java | 27 +
.../tika/parser/microsoft/onenote/Int24.java | 36 +
.../apache/tika/parser/microsoft/onenote/JCID.java | 148 +++
.../microsoft/onenote/JCIDPropertySetTypeEnum.java | 79 ++
.../onenote/ObjectDeclarationWithRefCount.java | 75 ++
.../onenote/ObjectDeclarationWithRefCountBody.java | 73 ++
.../onenote/ObjectInfoDependencyOverrideData.java | 74 ++
.../onenote/ObjectInfoDependencyOverrides.java | 30 +
.../onenote/ObjectRevisionWithRefCountFNDX.java | 70 ++
.../onenote/ObjectSpaceObjectPropSet.java | 60 +
...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java | 63 ++
.../microsoft/onenote/ObjectStreamCounters.java | 29 +
.../onenote/OneNoteDirectFileResource.java | 87 ++
.../parser/microsoft/onenote/OneNoteDocument.java | 138 +++
.../parser/microsoft/onenote/OneNoteHeader.java | 403 +++++++
.../parser/microsoft/onenote/OneNoteParser.java | 170 +++
.../microsoft/onenote/OneNotePropertyEnum.java | 210 ++++
.../microsoft/onenote/OneNotePropertyId.java | 86 ++
.../tika/parser/microsoft/onenote/OneNotePtr.java | 1158 ++++++++++++++++++++
.../microsoft/onenote/OneNoteTreeWalker.java | 579 ++++++++++
.../onenote/OneNoteTreeWalkerOptions.java | 88 ++
.../parser/microsoft/onenote/PropertyIDType.java | 26 +
.../tika/parser/microsoft/onenote/PropertySet.java | 95 ++
.../parser/microsoft/onenote/PropertyValue.java | 137 +++
.../tika/parser/microsoft/onenote/Revision.java | 72 ++
.../parser/microsoft/onenote/RevisionManifest.java | 60 +
.../onenote/RevisionManifestListStart.java | 30 +
.../microsoft/onenote/RevisionRoleDeclaration.java | 30 +
.../microsoft/onenote/RootObjectReference.java | 40 +
.../microsoft/onenote/RootObjectReferenceBase.java | 30 +
.../services/org.apache.tika.parser.Parser | 2 +
.../java/org/apache/tika/mime/TestMimeTypes.java | 30 +-
.../apache/tika/parser/image/TiffParserTest.java | 6 -
.../microsoft/onenote/OneNoteParserTest.java | 193 ++++
.../test/resources/test-documents/testOneNote1.one | Bin 0 -> 360280 bytes
.../test/resources/test-documents/testOneNote2.one | Bin 0 -> 435128 bytes
.../resources/test-documents/testOneNote2016.one | Bin 0 -> 14744 bytes
.../test/resources/test-documents/testOneNote3.one | Bin 0 -> 35344 bytes
.../test/resources/test-documents/testOneNote4.one | Bin 0 -> 43176 bytes
.../test-documents/testOneNoteEmbeddedWordDoc.one | Bin 0 -> 33096 bytes
60 files changed, 5899 insertions(+), 19 deletions(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
index a642b47..8f14a2b 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
@@ -90,6 +90,7 @@ public class TikaDetectionTest {
assertEquals("application/oebps-package+xml", tika.detect("x.opf"));
assertEquals("application/ogg", tika.detect("x.ogx"));
// Differ from httpd - We have subtypes they lack
+ //assertEquals("application/onenote", tika.detect("x.one"));
//assertEquals("application/onenote", tika.detect("x.onetoc"));
//assertEquals("application/onenote", tika.detect("x.onetoc2"));
//assertEquals("application/onenote", tika.detect("x.onetmp"));
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CheckedFileNodePushBack.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CheckedFileNodePushBack.java
new file mode 100644
index 0000000..5cf23dd
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CheckedFileNodePushBack.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Provides a way to add a new element on the fileNode list, but remove it from the list if
+ * we end up not committing it.
+ */
+class CheckedFileNodePushBack {
+ FileNodeList fileNodeList;
+ boolean committed;
+
+ public CheckedFileNodePushBack(FileNodeList fileNodeList) {
+ committed = true;
+ this.fileNodeList = fileNodeList;
+ fileNodeList.children.add(new FileNode());
+ committed = false;
+ }
+
+ public void commit() {
+ committed = true;
+ }
+
+ public void popBackIfNotCommitted() {
+ if (!committed) {
+ fileNodeList.children.remove(fileNodeList.children.size() - 1);
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CompactID.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CompactID.java
new file mode 100644
index 0000000..bc7378b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/CompactID.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+class CompactID {
+ char n;
+ long guidIndex; //only occupies 24 bits
+ ExtendedGUID guid;
+
+ public char getN() {
+ return n;
+ }
+
+ public CompactID setN(char n) {
+ this.n = n;
+ return this;
+ }
+
+ public long getGuidIndex() {
+ return guidIndex;
+ }
+
+ public CompactID setGuidIndex(long guidIndex) {
+ this.guidIndex = guidIndex;
+ return this;
+ }
+
+ public ExtendedGUID getGuid() {
+ return guid;
+ }
+
+ public CompactID setGuid(ExtendedGUID guid) {
+ this.guid = guid;
+ return this;
+ }
+
+ public String getCompactIDString() {
+ return new StringBuilder()
+ .append(guid)
+ .append(", index=")
+ .append(guidIndex)
+ .append(", n=")
+ .append((int) n)
+ .toString();
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Error.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Error.java
new file mode 100644
index 0000000..1239231
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Error.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+public enum Error { // Parser status codes. NOTE(review): inside this package the simple name shadows java.lang.Error.
+ OK, // presumably "no error" - TODO confirm against usages
+ SEGV, // presumably an out-of-range access to the file data - TODO confirm against usages
+ RESERVED_NONZERO, // presumably a reserved field that held a non-zero value - TODO confirm
+ UNKNOWN_ENUM, // presumably an enumerated value that was not recognised - TODO confirm
+ INVALID_CONSTANT, // presumably a magic/constant field with an unexpected value - TODO confirm
+ STRING_TOO_SHORT, // presumably a string shorter than the format requires - TODO confirm
+ HEX_OUT_OF_RANGE, // presumably a character that is not a valid hex digit - TODO confirm
+ COMPACT_ID_MISSING, // presumably a compact id that could not be resolved - TODO confirm
+ UNKNOWN_GUID, // presumably a GUID that was not recognised - TODO confirm
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ExtendedGUID.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ExtendedGUID.java
new file mode 100644
index 0000000..2b46de2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ExtendedGUID.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * A {@link GUID} paired with a numeric component {@code n}. Instances are compared and hashed
+ * by both parts. Setters return {@code this} so calls can be chained.
+ */
+class ExtendedGUID implements Comparable<ExtendedGUID> {
+    GUID guid;
+    long n;
+
+    /**
+     * @param guid the GUID part
+     * @param n the numeric part
+     */
+    public ExtendedGUID(GUID guid, long n) {
+        this.guid = guid;
+        this.n = n;
+    }
+
+    /**
+     * Orders by GUID first; when the GUIDs are equal, orders by {@code n}.
+     *
+     * @param other the value to compare against
+     * @return a negative number, zero or a positive number as this value is less than, equal to
+     * or greater than {@code other}
+     */
+    @Override
+    public int compareTo(ExtendedGUID other) {
+        if (other.guid.equals(guid)) {
+            // The result of the tie-break used to be computed and then thrown away, so two
+            // values with the same GUID but a different n never ordered by n.
+            return Long.compare(n, other.n);
+        }
+        return guid.compareTo(other.guid);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        ExtendedGUID that = (ExtendedGUID) o;
+        return n == that.n &&
+                Objects.equals(guid, that.guid);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(guid, n);
+    }
+
+    /**
+     * @return a new instance built from {@link GUID#nil()} and {@code n == 0}
+     */
+    public static ExtendedGUID nil() {
+        return new ExtendedGUID(GUID.nil(), 0);
+    }
+
+    @Override
+    public String toString() {
+        return String.format(Locale.US, "%s [%d]", guid, n);
+    }
+
+    /**
+     * @return the GUID part
+     */
+    public GUID getGuid() {
+        return guid;
+    }
+
+    /**
+     * @param guid the GUID part to store
+     * @return this instance, for chaining
+     */
+    public ExtendedGUID setGuid(GUID guid) {
+        this.guid = guid;
+        return this;
+    }
+
+    /**
+     * @return the GUID's string form followed by {@code " [n]"}
+     */
+    public String getExtendedGuidString() {
+        return guid.toString() + " [" + n + "]";
+    }
+
+    /**
+     * @return the numeric part
+     */
+    public long getN() {
+        return n;
+    }
+
+    /**
+     * @param n the numeric part to store
+     * @return this instance, for chaining
+     */
+    public ExtendedGUID setN(long n) {
+        this.n = n;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileChunkReference.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileChunkReference.java
new file mode 100644
index 0000000..04d1cb1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileChunkReference.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.Objects;
+
+/**
+ * Reference to a run of bytes inside the file, made up of two fields:
+ * <ul>
+ * <li>{@code stp} - stream pointer: offset in bytes from the start of the file to the
+ * referenced data;</li>
+ * <li>{@code cb} - size in bytes of the referenced data.</li>
+ * </ul>
+ * The on-disk width of each field is defined by the concrete reference structure being read.
+ * <p>
+ * Two special values exist:
+ * <ul>
+ * <li>{@code fcrNil} - every bit of {@code stp} is 1 and every bit of {@code cb} is 0;</li>
+ * <li>{@code fcrZero} - every bit of both {@code stp} and {@code cb} is 0.</li>
+ * </ul>
+ */
+class FileChunkReference {
+
+    long stp;
+    long cb;
+
+    /**
+     * Creates a reference with both fields left at zero.
+     */
+    public FileChunkReference() {
+
+    }
+
+    /**
+     * @param stp offset of the referenced data
+     * @param cb size of the referenced data
+     */
+    public FileChunkReference(long stp, long cb) {
+        this.stp = stp;
+        this.cb = cb;
+    }
+
+    /**
+     * @return a new reference with {@code stp == -1} (all bits set) and {@code cb == 0}
+     */
+    public static FileChunkReference nil() {
+        return new FileChunkReference().setStp(-1L).setCb(0L);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("FileChunkReference{");
+        text.append("stp=").append(stp);
+        text.append(", cb=").append(cb);
+        text.append('}');
+        return text.toString();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        FileChunkReference other = (FileChunkReference) o;
+        if (stp != other.stp) {
+            return false;
+        }
+        return cb == other.cb;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(stp, cb);
+    }
+
+    /**
+     * @return offset of the referenced data
+     */
+    public long getStp() {
+        return this.stp;
+    }
+
+    /**
+     * @param stp offset of the referenced data
+     * @return this instance, for chaining
+     */
+    public FileChunkReference setStp(long stp) {
+        this.stp = stp;
+        return this;
+    }
+
+    /**
+     * @return size of the referenced data
+     */
+    public long getCb() {
+        return this.cb;
+    }
+
+    /**
+     * @param cb size of the referenced data
+     * @return this instance, for chaining
+     */
+    public FileChunkReference setCb(long cb) {
+        this.cb = cb;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObject.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObject.java
new file mode 100644
index 0000000..f48019b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObject.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Holds the {@link FileChunkReference} that points at the raw bytes of a file data store object.
+ */
+class FileDataStoreObject {
+    // uint64_t cbLength;implicit in the fileData FileChunkReference
+    FileChunkReference fileData = new FileChunkReference(); //points to raw data
+
+    /**
+     * @return the reference to the raw data
+     */
+    public FileChunkReference getFileData() {
+        return this.fileData;
+    }
+
+    /**
+     * @param fileData the reference to the raw data
+     * @return this instance, for chaining
+     */
+    public FileDataStoreObject setFileData(FileChunkReference fileData) {
+        this.fileData = fileData;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObjectReference.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObjectReference.java
new file mode 100644
index 0000000..443b8c0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileDataStoreObjectReference.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Wrapper around a single {@link FileDataStoreObject}.
+ */
+class FileDataStoreObjectReference {
+    FileDataStoreObject ref;
+
+    /**
+     * @return the wrapped object, or {@code null} if none has been set
+     */
+    public FileDataStoreObject getRef() {
+        return this.ref;
+    }
+
+    /**
+     * @param ref the object to wrap
+     * @return this instance, for chaining
+     */
+    public FileDataStoreObjectReference setRef(FileDataStoreObject ref) {
+        this.ref = ref;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNode.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNode.java
new file mode 100644
index 0000000..f27e877
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNode.java
@@ -0,0 +1,278 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.Objects;
+
+/**
+ * A FileNode structure is the basic unit for holding and referencing data in the file.
+ * FileNode structures are organized into file node lists.
+ * <p>
+ * A FileNode structure is divided into header fields and a data field, fnd. The header fields
+ * specify what type of FileNode structure it is, and what format the fnd field is in.
+ * <p>
+ * The fnd field can be empty, or it can contain data directly, or it can contain a reference to
+ * another block of the file by byte position and byte count, or it can contain both data and a
+ * reference.
+ */
+class FileNode {
+    private static final Logger LOG = LoggerFactory.getLogger(FileNode.class);
+
+    /**
+     * An unsigned integer that specifies the type of this FileNode structure. The meaning of this
+     * value is specified by the fnd field.
+     */
+    long id;
+    long size;
+
+    /**
+     * An unsigned integer that specifies whether the structure specified by fnd contains a
+     * FileNodeChunkReference structure.
+     * 0 - This FileNode structure does not reference other data. The data structure specified by
+     * fnd MUST NOT contain a FileNodeChunkReference structure. The StpFormat and CbFormat fields
+     * MUST be ignored.
+     * 1 - This FileNode structure contains a reference to data. The first field in the data
+     * structure specified by an fnd field MUST be a FileNodeChunkReference structure that
+     * specifies the location and size of the referenced data. The type of the
+     * FileNodeChunkReference structure is specified by the StpFormat and CbFormat fields.
+     * 2 - This FileNode structure contains a reference to a file node list. The first field in
+     * the data structure specified by the fnd field MUST be a FileNodeChunkReference structure
+     * that specifies the location and size of a file node list. The type of the
+     * FileNodeChunkReference is specified by the StpFormat and CbFormat fields.
+     */
+    long baseType;
+
+    /**
+     * The ExtendedGUID for this FileNode.
+     * Specified for ObjectSpaceManifestRoot
+     * ObjectSpaceManifestStart
+     * ObjectSpaceManifestList
+     * RevisionManifestListStart
+     * ObjectGroupStartFND
+     * ObjectGroupID
+     * ObjectGroupListReferenceFND
+     * <p>
+     * RID for RevisionManifestStart4FND
+     * DataSignatureGroup for RevisionManifestEndFND
+     */
+    ExtendedGUID gosid;
+
+    // only present for RevisionManifestStart7FND and RevisionRoleAndContextDeclaration
+    ExtendedGUID gctxid;
+    GUID fileDataStoreReference;
+    FileChunkReference ref;
+    PropertySet propertySet;
+    boolean isFileData;
+
+    /**
+     * For ObjectGroupListReference, the children.
+     */
+    FileNodeList childFileNodeList = new FileNodeList();
+
+    FileNodeUnion subType = new FileNodeUnion();
+
+    String idDesc;
+
+    /**
+     * Two nodes are equal when every field except {@code idDesc} is equal.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        FileNode fileNode = (FileNode) o;
+        return id == fileNode.id &&
+                size == fileNode.size &&
+                baseType == fileNode.baseType &&
+                isFileData == fileNode.isFileData &&
+                Objects.equals(gosid, fileNode.gosid) &&
+                Objects.equals(gctxid, fileNode.gctxid) &&
+                Objects.equals(fileDataStoreReference, fileNode.fileDataStoreReference) &&
+                Objects.equals(ref, fileNode.ref) &&
+                Objects.equals(propertySet, fileNode.propertySet) &&
+                Objects.equals(childFileNodeList, fileNode.childFileNodeList) &&
+                Objects.equals(subType, fileNode.subType);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(id, size, baseType, gosid, gctxid, fileDataStoreReference, ref, propertySet, isFileData, childFileNodeList,
+                subType);
+    }
+
+    /**
+     * @return {@code true} when the node id is RevisionRoleAndContextDeclarationFND or
+     * RevisionManifestStart7FND
+     */
+    public boolean hasGctxid() {
+        return id == FndStructureConstants.RevisionRoleAndContextDeclarationFND
+                || id == FndStructureConstants.RevisionManifestStart7FND;
+    }
+
+    public long getId() {
+        return id;
+    }
+
+    public FileNode setId(long id) {
+        this.id = id;
+        return this;
+    }
+
+    public long getSize() {
+        return size;
+    }
+
+    public FileNode setSize(long size) {
+        this.size = size;
+        return this;
+    }
+
+    public long getBaseType() {
+        return baseType;
+    }
+
+    public FileNode setBaseType(long baseType) {
+        this.baseType = baseType;
+        return this;
+    }
+
+    public ExtendedGUID getGosid() {
+        return gosid;
+    }
+
+    public FileNode setGosid(ExtendedGUID gosid) {
+        this.gosid = gosid;
+        return this;
+    }
+
+    public ExtendedGUID getGctxid() {
+        return gctxid;
+    }
+
+    public FileNode setGctxid(ExtendedGUID gctxid) {
+        this.gctxid = gctxid;
+        return this;
+    }
+
+    public GUID getFileDataStoreReference() {
+        return fileDataStoreReference;
+    }
+
+    public FileNode setFileDataStoreReference(GUID fileDataStoreReference) {
+        this.fileDataStoreReference = fileDataStoreReference;
+        return this;
+    }
+
+    public FileChunkReference getRef() {
+        return ref;
+    }
+
+    public FileNode setRef(FileChunkReference ref) {
+        this.ref = ref;
+        return this;
+    }
+
+    public PropertySet getPropertySet() {
+        return propertySet;
+    }
+
+    public FileNode setPropertySet(PropertySet propertySet) {
+        this.propertySet = propertySet;
+        return this;
+    }
+
+    public boolean isFileData() {
+        return isFileData;
+    }
+
+    public FileNode setFileData(boolean fileData) {
+        isFileData = fileData;
+        return this;
+    }
+
+    public FileNodeList getChildFileNodeList() {
+        return childFileNodeList;
+    }
+
+    public FileNode setChildFileNodeList(FileNodeList childFileNodeList) {
+        this.childFileNodeList = childFileNodeList;
+        return this;
+    }
+
+    public FileNodeUnion getSubType() {
+        return subType;
+    }
+
+    public FileNode setSubType(FileNodeUnion subType) {
+        this.subType = subType;
+        return this;
+    }
+
+    /**
+     * Writes a debug-level description of this node, its property set and, recursively, its
+     * child nodes to the logger.
+     *
+     * @param document passed through to {@code PropertySet.print}
+     * @param pointer passed through to {@code PropertySet.print}
+     * @param indentLevel current nesting depth; this node's lines are indented one level deeper
+     * @throws IOException declared because {@code propertySet.print} and the recursive calls are
+     * invoked with it in scope
+     * @throws TikaMemoryLimitException declared for the same reason
+     */
+    public void print(OneNoteDocument document, OneNotePtr pointer, int indentLevel) throws IOException, TikaMemoryLimitException {
+        boolean shouldPrintHeader = FndStructureConstants.nameOf(id).contains("ObjectDec");
+        // The begin and end markers must share one guard so they always appear as a pair.
+        // Previously "beg" fired only for a nil gosid while "end" compared by reference against
+        // a freshly created nil value, which is always true.
+        // NOTE(review): "beg" is assumed to be meant for non-nil ids like "end" - confirm.
+        boolean printGosidMarkers = isPresent(gosid) && shouldPrintHeader;
+        if (printGosidMarkers) {
+            LOG.debug("{}[beg {}]:{}", IndentUtil.getIndent(indentLevel + 1), FndStructureConstants.nameOf(id), gosid);
+        }
+        propertySet.print(document, pointer, indentLevel + 1);
+        if (!childFileNodeList.children.isEmpty()) {
+            if (shouldPrintHeader) {
+                LOG.debug("{}children", IndentUtil.getIndent(indentLevel + 1));
+            }
+            for (FileNode child : childFileNodeList.children) {
+                child.print(document, pointer, indentLevel + 1);
+            }
+        }
+        if (id == FndStructureConstants.RevisionRoleDeclarationFND
+                || id == FndStructureConstants.RevisionRoleAndContextDeclarationFND) {
+            LOG.debug("{}[Revision Role {}]", IndentUtil.getIndent(indentLevel + 1),
+                    subType.revisionRoleDeclaration.revisionRole);
+
+        }
+        if (id == FndStructureConstants.RevisionManifestStart4FND || id == FndStructureConstants.RevisionManifestStart6FND
+                || id == FndStructureConstants.RevisionManifestStart7FND) {
+            LOG.debug("{}[revisionRole {}]", IndentUtil.getIndent(indentLevel + 1),
+                    subType.revisionManifest.revisionRole);
+
+        }
+        // Value comparison: a reference comparison against ExtendedGUID.nil() can never match.
+        if ((isPresent(gctxid) || id == FndStructureConstants.RevisionManifestStart7FND)
+                && shouldPrintHeader) {
+            LOG.debug("{}[gctxid {}]", IndentUtil.getIndent(indentLevel + 1), gctxid);
+        }
+        if (printGosidMarkers) {
+            LOG.debug("{}[end {}]:{}", IndentUtil.getIndent(indentLevel + 1), FndStructureConstants.nameOf(id),
+                    gosid);
+
+        }
+    }
+
+    /**
+     * Tells whether an identifier carries a real value.
+     *
+     * @param candidate the identifier to inspect, may be {@code null}
+     * @return {@code true} when {@code candidate} is neither {@code null} nor equal to
+     * {@link ExtendedGUID#nil()}
+     */
+    private static boolean isPresent(ExtendedGUID candidate) {
+        return candidate != null && !ExtendedGUID.nil().equals(candidate);
+    }
+
+    /**
+     * A description of what this GUID id means in this context.
+     *
+     * @return A description of what this GUID id means in this context.
+     */
+    public String getIdDesc() {
+        return idDesc;
+    }
+
+    /**
+     * @return the node id and base type in hexadecimal together with the gosid
+     */
+    @Override
+    public String toString() {
+        return new StringBuilder().append("FileNodeID=0x")
+                .append(Long.toHexString(id))
+                .append(", gosid=")
+                .append(gosid)
+                .append(", baseType=0x")
+                .append(Long.toHexString(baseType))
+                .toString();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeList.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeList.java
new file mode 100644
index 0000000..aa01c18
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeList.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A file node list: an optional header plus the {@link FileNode}s it contains. The child list
+ * starts out empty and mutable.
+ */
+class FileNodeList {
+    FileNodeListHeader fileNodeListHeader;
+    List<FileNode> children = new ArrayList<>();
+
+    /**
+     * @return the header of this list, or {@code null} if none has been set
+     */
+    public FileNodeListHeader getFileNodeListHeader() {
+        return this.fileNodeListHeader;
+    }
+
+    /**
+     * @param fileNodeListHeader the header to store
+     * @return this instance, for chaining
+     */
+    public FileNodeList setFileNodeListHeader(FileNodeListHeader fileNodeListHeader) {
+        this.fileNodeListHeader = fileNodeListHeader;
+        return this;
+    }
+
+    /**
+     * @return the live list of child nodes (not a copy)
+     */
+    public List<FileNode> getChildren() {
+        return this.children;
+    }
+
+    /**
+     * @param children the list to use as the child nodes; stored as-is, not copied
+     * @return this instance, for chaining
+     */
+    public FileNodeList setChildren(List<FileNode> children) {
+        this.children = children;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeListHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeListHeader.java
new file mode 100644
index 0000000..1f8ee22
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeListHeader.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Value holder for the header that starts a FileNodeListFragment structure.
+ * <p>
+ * The constructor validates the magic number and the minimum list id; the setters perform
+ * no validation. Setters return {@code this} so that calls can be chained.
+ */
+class FileNodeListHeader {
+    /** Expected value of the {@code uintMagic} field (the constant name keeps its historical spelling). */
+    public static final long UNIT_MAGIC_CONSTANT = 0xA4567AB1F5F7F4C4L;
+    long position;
+    long fileNodeListId;
+    long nFragmentSequence;
+
+    /**
+     * The FileNodeListHeader structure specifies the beginning of a FileNodeListFragment structure.
+     *
+     * @param position          Position of the file where this header starts.
+     * @param uintMagic         An unsigned integer; MUST be "0xA4567AB1F5F7F4C4"
+     * @param fileNodeListId    An unsigned integer that specifies the identity of the file node list
+     *                          this fragment belongs to. MUST be equal to or greater than 0x00000010. The pair of
+     *                          FileNodeListID and nFragmentSequence fields MUST be unique relative to other
+     *                          FileNodeListFragment structures in the file.
+     * @param nFragmentSequence An unsigned integer that specifies the index of the fragment in the
+     *                          file node list containing the fragment. The nFragmentSequence field of the first fragment in a
+     *                          given file node list MUST be 0 and the nFragmentSequence fields of all subsequent fragments in
+     *                          this list MUST be sequential.
+     * @throws RuntimeException if {@code uintMagic} is not {@link #UNIT_MAGIC_CONSTANT} or if
+     *                          {@code fileNodeListId} is smaller than 0x00000010.
+     */
+    public FileNodeListHeader(long position, long uintMagic, long fileNodeListId, long nFragmentSequence) {
+        // Validate everything first so that no field is written when the header is rejected.
+        if (uintMagic != UNIT_MAGIC_CONSTANT) {
+            throw new RuntimeException("uintMagic must always be: 0x" + Long.toHexString(UNIT_MAGIC_CONSTANT)
+                    + " but was: 0x" + Long.toHexString(uintMagic));
+        }
+        if (fileNodeListId < 0x00000010) {
+            throw new RuntimeException("FileNodeListHeader.fileNodeListId MUST be equal to or greater than 0x00000010"
+                    + " but was: 0x" + Long.toHexString(fileNodeListId));
+        }
+        this.position = position;
+        this.fileNodeListId = fileNodeListId;
+        this.nFragmentSequence = nFragmentSequence;
+    }
+
+    public long getFileNodeListId() {
+        return fileNodeListId;
+    }
+
+    public FileNodeListHeader setFileNodeListId(long fileNodeListId) {
+        this.fileNodeListId = fileNodeListId;
+        return this;
+    }
+
+    public long getnFragmentSequence() {
+        return nFragmentSequence;
+    }
+
+    public FileNodeListHeader setnFragmentSequence(long nFragmentSequence) {
+        this.nFragmentSequence = nFragmentSequence;
+        return this;
+    }
+
+    public long getPosition() {
+        return position;
+    }
+
+    public FileNodeListHeader setPosition(long position) {
+        this.position = position;
+        return this;
+    }
+
+    /**
+     * @return the position as lower-case hex, prefixed with "0x" and left-padded with zeros
+     * to at least 8 digits.
+     */
+    public String getPositionHex() {
+        return "0x" + StringUtils.leftPad(Long.toHexString(position), 8, "0");
+    }
+
+    @Override
+    public String toString() {
+        // Reuse getPositionHex() so the hex formatting lives in one place.
+        return "FileNodeListHeader{" +
+                "position=" + getPositionHex() +
+                ", fileNodeListId=" + fileNodeListId +
+                ", nFragmentSequence=" + nFragmentSequence +
+                '}';
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtr.java
new file mode 100644
index 0000000..a0e9e25
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtr.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A path of list indexes describing how to reach one {@link FileNode} starting from the
+ * document root.
+ */
+class FileNodePtr {
+    List<Integer> nodeListPositions = new ArrayList<>();
+
+    /**
+     * Creates a pointer with an empty path.
+     */
+    public FileNodePtr() {
+        // nothing to initialise beyond the field default
+    }
+
+    /**
+     * Creates a pointer whose path is a copy of another pointer's path.
+     *
+     * @param copyFrom pointer whose positions are copied into the new instance.
+     */
+    public FileNodePtr(FileNodePtr copyFrom) {
+        nodeListPositions.addAll(copyFrom.nodeListPositions);
+    }
+
+    /**
+     * Walks the document.root hierarchy along nodeListPositions and returns the node found there.
+     * <p>
+     * The first position is an index into the root's children; each following position is an
+     * index into the children of the node selected by the position before it.
+     * <p>
+     * For example the path 0, 4, 15 resolves to
+     * <p>
+     * document.root.children.get(0).childFileNodeList.children.get(4).childFileNodeList.children.get(15)
+     *
+     * @param document document whose root file node list is the starting point of the walk.
+     * @return the node the path leads to, or {@code null} when the path is empty.
+     * @throws RuntimeException if the first position is not smaller than the number of root children.
+     */
+    public FileNode dereference(OneNoteDocument document) {
+        if (nodeListPositions.isEmpty()) {
+            return null;
+        }
+        int rootIndex = nodeListPositions.get(0);
+        if (rootIndex >= document.root.children.size()) {
+            throw new RuntimeException("Exceeded root child size");
+        }
+        FileNode node = document.root.children.get(rootIndex);
+        // Remaining positions each step one level down into the current node's child list.
+        for (int childIndex : nodeListPositions.subList(1, nodeListPositions.size())) {
+            node = node.childFileNodeList.children.get(childIndex);
+        }
+        return node;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtrBackPush.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtrBackPush.java
new file mode 100644
index 0000000..b79ef8a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodePtrBackPush.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Appends a new trailing position (initially 0) to a {@link FileNodePtr} on construction and
+ * removes the trailing position again when {@link #dec()} is called.
+ */
+class FileNodePtrBackPush {
+    FileNodePtr parent;
+
+    /**
+     * @param parent pointer whose position list receives an extra trailing 0.
+     */
+    public FileNodePtrBackPush(FileNodePtr parent) {
+        parent.nodeListPositions.add(0);
+        this.parent = parent;
+    }
+
+    /**
+     * Drops the last position from the parent pointer's list.
+     */
+    public void dec() {
+        // An int argument selects List.remove(int index), i.e. removal by position.
+        final int lastIndex = parent.nodeListPositions.size() - 1;
+        parent.nodeListPositions.remove(lastIndex);
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeUnion.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeUnion.java
new file mode 100644
index 0000000..169c394
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FileNodeUnion.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Container with one member per supported file node data structure. Every member is
+ * initialised eagerly with a default-constructed instance.
+ * <p>
+ * NOTE(review): judging by the name, presumably only the member matching the owning file
+ * node's type is meaningful - confirm against the callers.
+ * <p>
+ * All setters return {@code this} so that calls can be chained.
+ */
+class FileNodeUnion {
+
+    // --- revision manifest list start ---
+    RevisionManifestListStart revisionManifestListStart = new RevisionManifestListStart();
+
+    public RevisionManifestListStart getRevisionManifestListStart() {
+        return this.revisionManifestListStart;
+    }
+
+    public FileNodeUnion setRevisionManifestListStart(final RevisionManifestListStart revisionManifestListStart) {
+        this.revisionManifestListStart = revisionManifestListStart;
+        return this;
+    }
+
+    // --- revision manifest ---
+    RevisionManifest revisionManifest = new RevisionManifest();
+
+    public RevisionManifest getRevisionManifest() {
+        return this.revisionManifest;
+    }
+
+    public FileNodeUnion setRevisionManifest(final RevisionManifest revisionManifest) {
+        this.revisionManifest = revisionManifest;
+        return this;
+    }
+
+    // --- revision role declaration ---
+    RevisionRoleDeclaration revisionRoleDeclaration = new RevisionRoleDeclaration();
+
+    public RevisionRoleDeclaration getRevisionRoleDeclaration() {
+        return this.revisionRoleDeclaration;
+    }
+
+    public FileNodeUnion setRevisionRoleDeclaration(final RevisionRoleDeclaration revisionRoleDeclaration) {
+        this.revisionRoleDeclaration = revisionRoleDeclaration;
+        return this;
+    }
+
+    // --- global id table start ---
+    GlobalIdTableStartFNDX globalIdTableStartFNDX = new GlobalIdTableStartFNDX();
+
+    public GlobalIdTableStartFNDX getGlobalIdTableStartFNDX() {
+        return this.globalIdTableStartFNDX;
+    }
+
+    public FileNodeUnion setGlobalIdTableStartFNDX(final GlobalIdTableStartFNDX globalIdTableStartFNDX) {
+        this.globalIdTableStartFNDX = globalIdTableStartFNDX;
+        return this;
+    }
+
+    // --- global id table entry ---
+    GlobalIdTableEntryFNDX globalIdTableEntryFNDX = new GlobalIdTableEntryFNDX();
+
+    public GlobalIdTableEntryFNDX getGlobalIdTableEntryFNDX() {
+        return this.globalIdTableEntryFNDX;
+    }
+
+    public FileNodeUnion setGlobalIdTableEntryFNDX(final GlobalIdTableEntryFNDX globalIdTableEntryFNDX) {
+        this.globalIdTableEntryFNDX = globalIdTableEntryFNDX;
+        return this;
+    }
+
+    // --- global id table entry 2 ---
+    GlobalIdTableEntry2FNDX globalIdTableEntry2FNDX = new GlobalIdTableEntry2FNDX();
+
+    public GlobalIdTableEntry2FNDX getGlobalIdTableEntry2FNDX() {
+        return this.globalIdTableEntry2FNDX;
+    }
+
+    public FileNodeUnion setGlobalIdTableEntry2FNDX(final GlobalIdTableEntry2FNDX globalIdTableEntry2FNDX) {
+        this.globalIdTableEntry2FNDX = globalIdTableEntry2FNDX;
+        return this;
+    }
+
+    // --- global id table entry 3 ---
+    GlobalIdTableEntry3FNDX globalIdTableEntry3FNDX = new GlobalIdTableEntry3FNDX();
+
+    public GlobalIdTableEntry3FNDX getGlobalIdTableEntry3FNDX() {
+        return this.globalIdTableEntry3FNDX;
+    }
+
+    public FileNodeUnion setGlobalIdTableEntry3FNDX(final GlobalIdTableEntry3FNDX globalIdTableEntry3FNDX) {
+        this.globalIdTableEntry3FNDX = globalIdTableEntry3FNDX;
+        return this;
+    }
+
+    // --- object revision with ref count ---
+    ObjectRevisionWithRefCountFNDX objectRevisionWithRefCountFNDX = new ObjectRevisionWithRefCountFNDX();
+
+    public ObjectRevisionWithRefCountFNDX getObjectRevisionWithRefCountFNDX() {
+        return this.objectRevisionWithRefCountFNDX;
+    }
+
+    public FileNodeUnion setObjectRevisionWithRefCountFNDX(final ObjectRevisionWithRefCountFNDX objectRevisionWithRefCountFNDX) {
+        this.objectRevisionWithRefCountFNDX = objectRevisionWithRefCountFNDX;
+        return this;
+    }
+
+    // --- object info dependency overrides ---
+    ObjectInfoDependencyOverrides objectInfoDependencyOverrides = new ObjectInfoDependencyOverrides();
+
+    public ObjectInfoDependencyOverrides getObjectInfoDependencyOverrides() {
+        return this.objectInfoDependencyOverrides;
+    }
+
+    public FileNodeUnion setObjectInfoDependencyOverrides(final ObjectInfoDependencyOverrides objectInfoDependencyOverrides) {
+        this.objectInfoDependencyOverrides = objectInfoDependencyOverrides;
+        return this;
+    }
+
+    // --- object declaration with ref count ---
+    ObjectDeclarationWithRefCount objectDeclarationWithRefCount = new ObjectDeclarationWithRefCount();
+
+    public ObjectDeclarationWithRefCount getObjectDeclarationWithRefCount() {
+        return this.objectDeclarationWithRefCount;
+    }
+
+    public FileNodeUnion setObjectDeclarationWithRefCount(final ObjectDeclarationWithRefCount objectDeclarationWithRefCount) {
+        this.objectDeclarationWithRefCount = objectDeclarationWithRefCount;
+        return this;
+    }
+
+    // --- root object reference ---
+    RootObjectReference rootObjectReference = new RootObjectReference();
+
+    public RootObjectReference getRootObjectReference() {
+        return this.rootObjectReference;
+    }
+
+    public FileNodeUnion setRootObjectReference(final RootObjectReference rootObjectReference) {
+        this.rootObjectReference = rootObjectReference;
+        return this;
+    }
+
+    // --- file data store object reference ---
+    FileDataStoreObjectReference fileDataStoreObjectReference = new FileDataStoreObjectReference();
+
+    public FileDataStoreObjectReference getFileDataStoreObjectReference() {
+        return this.fileDataStoreObjectReference;
+    }
+
+    public FileNodeUnion setFileDataStoreObjectReference(final FileDataStoreObjectReference fileDataStoreObjectReference) {
+        this.fileDataStoreObjectReference = fileDataStoreObjectReference;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FndStructureConstants.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FndStructureConstants.java
new file mode 100644
index 0000000..88f543e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/FndStructureConstants.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Some types of FileNodes have an "fnd" variable.
+ * FND stands for "File Node Data"
+ * <p>
+ * These are the different types of FND variables there are.
+ * <p>
+ * The value of each constant corresponds to the FileNodeID property for the file node.
+ */
+final class FndStructureConstants {
+    private FndStructureConstants() {
+        // no op
+    }
+
+    static final long ObjectSpaceManifestRootFND = 0x04;
+    static final long ObjectSpaceManifestListReferenceFND = 0x08;
+    static final long ObjectSpaceManifestListStartFND = 0x0c;
+    static final long RevisionManifestListReferenceFND = 0x10;
+    static final long RevisionManifestListStartFND = 0x14;
+    static final long RevisionManifestStart4FND = 0x1b;
+    static final long RevisionManifestEndFND = 0x1c;
+    static final long RevisionManifestStart6FND = 0x1e;
+    static final long RevisionManifestStart7FND = 0x1f;
+    static final long GlobalIdTableStartFNDX = 0x21;
+    static final long GlobalIdTableStart2FND = 0x22;
+    static final long GlobalIdTableEntryFNDX = 0x24;
+    static final long GlobalIdTableEntry2FNDX = 0x25;
+    static final long GlobalIdTableEntry3FNDX = 0x26;
+    static final long GlobalIdTableEndFNDX = 0x28;
+
+    /**
+     * FileNodeID values grouped under the name "CanRevise".
+     */
+    public static final class CanRevise {
+        private CanRevise() {
+            // no op
+        }
+
+        static final long ObjectDeclarationWithRefCountFNDX = 0x2d;
+        static final long ObjectDeclarationWithRefCount2FNDX = 0x2e;
+        static final long ObjectRevisionWithRefCountFNDX = 0x041;
+        static final long ObjectRevisionWithRefCount2FNDX = 0x42;
+        static final long ObjectDeclaration2RefCountFND = 0x0A4;
+        static final long ObjectDeclaration2LargeRefCountFND = 0xA5;
+        static final long ReadOnlyObjectDeclaration2RefCountFND = 0xc4;
+        static final long ReadOnlyObjectDeclaration2LargeRefCountFND = 0xc5;
+        static final long ObjectDeclarationFileData3RefCountFND = 0x72;
+        static final long ObjectDeclarationFileData3LargeRefCountFND = 0x73;
+    }
+
+    static final long RootObjectReference2FNDX = 0x59;
+    // NOTE(review): the original comment on this constant was cut off ("each root object must
+    // have a differe..."); presumably it means each root object needs a different root role -
+    // confirm against [MS-ONESTORE].
+    static final long RootObjectReference3FND = 0x5a;
+    static final long RevisionRoleDeclarationFND = 0x5c;
+    static final long RevisionRoleAndContextDeclarationFND = 0x5d;
+    static final long ObjectDataEncryptionKeyV2FNDX = 0x7c;
+    static final long ObjectInfoDependencyOverridesFND = 0x84;
+    static final long DataSignatureGroupDefinitionFND = 0x8c;
+    static final long FileDataStoreListReferenceFND = 0x90;
+    static final long FileDataStoreObjectReferenceFND = 0x94;
+    static final long ObjectGroupListReferenceFND = 0xb0;
+    static final long ObjectGroupStartFND = 0xb4;
+    static final long ObjectGroupEndFND = 0xb8;
+    static final long HashedChunkDescriptor2FND = 0xc2;
+
+    static final long ChunkTerminatorFND = 0xff;
+
+    /**
+     * Maps a FileNodeID value to the name of the matching constant in this class.
+     *
+     * @param type the FileNodeID value to look up.
+     * @return the constant's name, or "UnknownFND" when no constant has that value.
+     */
+    static String nameOf(long type) {
+        // The switch needs an int. Reject values outside the int range up front so that a
+        // narrowing cast cannot alias an unrelated long onto a known id.
+        if (type < 0 || type > Integer.MAX_VALUE) {
+            return "UnknownFND";
+        }
+        switch ((int) type) {
+            case (int) ObjectSpaceManifestRootFND:
+                return "ObjectSpaceManifestRootFND";
+            case (int) ObjectSpaceManifestListReferenceFND:
+                return "ObjectSpaceManifestListReferenceFND";
+            case (int) ObjectSpaceManifestListStartFND:
+                return "ObjectSpaceManifestListStartFND";
+            case (int) RevisionManifestListReferenceFND:
+                return "RevisionManifestListReferenceFND";
+            case (int) RevisionManifestListStartFND:
+                return "RevisionManifestListStartFND";
+            case (int) RevisionManifestStart4FND:
+                return "RevisionManifestStart4FND";
+            case (int) RevisionManifestEndFND:
+                return "RevisionManifestEndFND";
+            case (int) RevisionManifestStart6FND:
+                return "RevisionManifestStart6FND";
+            case (int) RevisionManifestStart7FND:
+                return "RevisionManifestStart7FND";
+            case (int) GlobalIdTableStartFNDX:
+                return "GlobalIdTableStartFNDX";
+            case (int) GlobalIdTableStart2FND:
+                return "GlobalIdTableStart2FND";
+            case (int) GlobalIdTableEntryFNDX:
+                return "GlobalIdTableEntryFNDX";
+            case (int) GlobalIdTableEntry2FNDX:
+                return "GlobalIdTableEntry2FNDX";
+            case (int) GlobalIdTableEntry3FNDX:
+                return "GlobalIdTableEntry3FNDX";
+            case (int) GlobalIdTableEndFNDX:
+                return "GlobalIdTableEndFNDX";
+            case (int) CanRevise.ObjectDeclarationWithRefCountFNDX:
+                return "ObjectDeclarationWithRefCountFNDX";
+            case (int) CanRevise.ObjectDeclarationWithRefCount2FNDX:
+                return "ObjectDeclarationWithRefCount2FNDX";
+            case (int) CanRevise.ObjectRevisionWithRefCountFNDX:
+                return "ObjectRevisionWithRefCountFNDX";
+            case (int) CanRevise.ObjectRevisionWithRefCount2FNDX:
+                return "ObjectRevisionWithRefCount2FNDX";
+            case (int) CanRevise.ObjectDeclaration2RefCountFND:
+                return "ObjectDeclaration2RefCountFND";
+            case (int) CanRevise.ObjectDeclaration2LargeRefCountFND:
+                return "ObjectDeclaration2LargeRefCountFND";
+            case (int) CanRevise.ReadOnlyObjectDeclaration2RefCountFND:
+                return "ReadOnlyObjectDeclaration2RefCountFND";
+            case (int) CanRevise.ReadOnlyObjectDeclaration2LargeRefCountFND:
+                return "ReadOnlyObjectDeclaration2LargeRefCountFND";
+            case (int) CanRevise.ObjectDeclarationFileData3RefCountFND:
+                return "ObjectDeclarationFileData3RefCountFND";
+            case (int) CanRevise.ObjectDeclarationFileData3LargeRefCountFND:
+                return "ObjectDeclarationFileData3LargeRefCountFND";
+            case (int) RootObjectReference2FNDX:
+                return "RootObjectReference2FNDX";
+            case (int) RootObjectReference3FND:
+                return "RootObjectReference3FND";
+            case (int) RevisionRoleDeclarationFND:
+                return "RevisionRoleDeclarationFND";
+            case (int) RevisionRoleAndContextDeclarationFND:
+                return "RevisionRoleAndContextDeclarationFND";
+            case (int) ObjectDataEncryptionKeyV2FNDX:
+                return "ObjectDataEncryptionKeyV2FNDX";
+            case (int) ObjectInfoDependencyOverridesFND:
+                return "ObjectInfoDependencyOverridesFND";
+            case (int) DataSignatureGroupDefinitionFND:
+                return "DataSignatureGroupDefinitionFND";
+            case (int) FileDataStoreListReferenceFND:
+                return "FileDataStoreListReferenceFND";
+            case (int) FileDataStoreObjectReferenceFND:
+                return "FileDataStoreObjectReferenceFND";
+            case (int) ObjectGroupListReferenceFND:
+                return "ObjectGroupListReferenceFND";
+            case (int) ObjectGroupStartFND:
+                return "ObjectGroupStartFND";
+            case (int) ObjectGroupEndFND:
+                return "ObjectGroupEndFND";
+            case (int) HashedChunkDescriptor2FND:
+                return "HashedChunkDescriptor2FND";
+
+            case (int) ChunkTerminatorFND:
+                return "ChunkTerminatorFND";
+            default:
+                return "UnknownFND";
+        }
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GUID.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GUID.java
new file mode 100644
index 0000000..371e328
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GUID.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Locale;
+
+/**
+ * A GUID held as an array of ints, one array element per GUID byte.
+ * <p>
+ * The array is stored and returned by reference (no defensive copies), so changing it from
+ * outside also changes the result of {@link #equals(Object)}, {@link #hashCode()} and
+ * {@link #compareTo(GUID)}.
+ */
+class GUID implements Comparable<GUID> {
+    int[] guid;
+
+    /**
+     * Converts a GUID of format: {AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE} (in bytes) to a GUID object.
+     * <p>
+     * Curly braces and dashes are dropped and the remaining characters are read two at a time
+     * as hexadecimal values. Positions not covered by the input stay 0.
+     *
+     * @param guid The bytes that contain the string {AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE},
+     *             decoded here as UTF-16LE.
+     * @return GUID object parsed from guid bytes.
+     * @throws NumberFormatException if a character pair is not valid hexadecimal.
+     */
+    public static GUID fromCurlyBraceUTF16Bytes(byte[] guid) {
+        int[] intGuid = new int[16];
+        // Literal replace: none of the removed characters needs regular-expression handling.
+        String utf16Str = new String(guid, StandardCharsets.UTF_16LE)
+                .replace("{", "").replace("-", "").replace("}", "");
+        for (int i = 0; i < utf16Str.length(); i += 2) {
+            intGuid[i / 2] = Integer.parseUnsignedInt("" + utf16Str.charAt(i) + utf16Str.charAt(i + 1), 16);
+        }
+        return new GUID(intGuid);
+    }
+
+    /**
+     * @param guid the GUID values; expected to hold 16 elements. Stored by reference.
+     */
+    public GUID(int[] guid) {
+        this.guid = guid;
+    }
+
+    /**
+     * Orders GUIDs by their first 16 elements, using {@link #memcmp(int[], int[], int)}.
+     */
+    @Override
+    public int compareTo(GUID o) {
+        return memcmp(guid, o.guid, 16);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        GUID guid1 = (GUID) o;
+        return Arrays.equals(guid, guid1.guid);
+    }
+
+    @Override
+    public int hashCode() {
+        return Arrays.hashCode(guid);
+    }
+
+    /**
+     * Compares the first {@code sz} elements of two arrays, element by element.
+     * <p>
+     * A negative element is ordered after a non-negative one. Elements of the same sign are
+     * ordered by their difference.
+     *
+     * @param b1 first array.
+     * @param b2 second array.
+     * @param sz number of leading elements to compare.
+     * @return 0 when all compared elements are equal, otherwise a negative or positive value
+     * depending on whether {@code b1} sorts before or after {@code b2}.
+     */
+    public static int memcmp(int b1[], int b2[], int sz) {
+        for (int i = 0; i < sz; i++) {
+            int left = b1[i];
+            int right = b2[i];
+            if (left == right) {
+                continue;
+            }
+            if ((left < 0) == (right < 0)) {
+                // Same sign, so the subtraction cannot overflow.
+                return left - right;
+            }
+            return left < 0 ? 1 : -1;
+        }
+        return 0;
+    }
+
+    /**
+     * @return the GUID as upper-case text in the form {AAAAAAAA-BBBB-CCCC-DDDD-EEEEEEEEEEEE}.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("{");
+        appendHex(sb, 0, 4);
+        sb.append("-");
+        appendHex(sb, 4, 6);
+        sb.append("-");
+        appendHex(sb, 6, 8);
+        sb.append("-");
+        appendHex(sb, 8, 10);
+        sb.append("-");
+        appendHex(sb, 10, 16);
+        sb.append("}");
+        return sb.toString().toUpperCase(Locale.US);
+    }
+
+    /**
+     * Appends elements {@code from} (inclusive) to {@code to} (exclusive) as hex, each padded to two digits.
+     */
+    private void appendHex(StringBuilder sb, int from, int to) {
+        for (int i = from; i < to; ++i) {
+            sb.append(StringUtils.leftPad(Integer.toHexString(guid[i]), 2, '0'));
+        }
+    }
+
+    /**
+     * @return a new GUID whose 16 elements are all 0.
+     */
+    public static GUID nil() {
+        return new GUID(new int[16]);
+    }
+
+    public int[] getGuid() {
+        return guid;
+    }
+
+    public GUID setGuid(int[] guid) {
+        this.guid = guid;
+        return this;
+    }
+
+    /**
+     * @return the same text as {@link #toString()}.
+     */
+    public String getGuidString() {
+        // Calling toString() on the int[] itself would yield the array's identity string
+        // (e.g. "[I@1b2c3d"), not the GUID text.
+        return toString();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry2FNDX.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry2FNDX.java
new file mode 100644
index 0000000..c799c0e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry2FNDX.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Value holder for the two fields {@code indexMapFrom} and {@code indexMapTo}.
+ * <p>
+ * Setters return {@code this} so that calls can be chained.
+ */
+class GlobalIdTableEntry2FNDX {
+    long indexMapFrom;
+    long indexMapTo;
+
+    /**
+     * @return the current {@code indexMapFrom} value (0 until set).
+     */
+    public long getIndexMapFrom() {
+        return this.indexMapFrom;
+    }
+
+    /**
+     * @return the current {@code indexMapTo} value (0 until set).
+     */
+    public long getIndexMapTo() {
+        return this.indexMapTo;
+    }
+
+    /**
+     * @param indexMapFrom new {@code indexMapFrom} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntry2FNDX setIndexMapFrom(final long indexMapFrom) {
+        this.indexMapFrom = indexMapFrom;
+        return this;
+    }
+
+    /**
+     * @param indexMapTo new {@code indexMapTo} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntry2FNDX setIndexMapTo(final long indexMapTo) {
+        this.indexMapTo = indexMapTo;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry3FNDX.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry3FNDX.java
new file mode 100644
index 0000000..0cc3050
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntry3FNDX.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Value holder for the three fields {@code indexCopyFromStart}, {@code entriesToCopy} and
+ * {@code indexCopyToStart}.
+ * <p>
+ * Setters return {@code this} so that calls can be chained.
+ */
+public class GlobalIdTableEntry3FNDX {
+    long indexCopyFromStart;
+    long entriesToCopy;
+    long indexCopyToStart;
+
+    /**
+     * @return the current {@code indexCopyFromStart} value (0 until set).
+     */
+    public long getIndexCopyFromStart() {
+        return this.indexCopyFromStart;
+    }
+
+    /**
+     * @return the current {@code entriesToCopy} value (0 until set).
+     */
+    public long getEntriesToCopy() {
+        return this.entriesToCopy;
+    }
+
+    /**
+     * @return the current {@code indexCopyToStart} value (0 until set).
+     */
+    public long getIndexCopyToStart() {
+        return this.indexCopyToStart;
+    }
+
+    /**
+     * @param indexCopyFromStart new {@code indexCopyFromStart} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntry3FNDX setIndexCopyFromStart(final long indexCopyFromStart) {
+        this.indexCopyFromStart = indexCopyFromStart;
+        return this;
+    }
+
+    /**
+     * @param entriesToCopy new {@code entriesToCopy} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntry3FNDX setEntriesToCopy(final long entriesToCopy) {
+        this.entriesToCopy = entriesToCopy;
+        return this;
+    }
+
+    /**
+     * @param indexCopyToStart new {@code indexCopyToStart} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntry3FNDX setIndexCopyToStart(final long indexCopyToStart) {
+        this.indexCopyToStart = indexCopyToStart;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntryFNDX.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntryFNDX.java
new file mode 100644
index 0000000..16d0016
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableEntryFNDX.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Value holder pairing an {@code index} with a {@link GUID}.
+ * <p>
+ * Setters return {@code this} so that calls can be chained.
+ */
+public class GlobalIdTableEntryFNDX {
+    long index;
+    GUID guid;
+
+    /**
+     * @return the current index value (0 until set).
+     */
+    public long getIndex() {
+        return this.index;
+    }
+
+    /**
+     * @return the current GUID; {@code null} until one is set.
+     */
+    public GUID getGuid() {
+        return this.guid;
+    }
+
+    /**
+     * @param index new index value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntryFNDX setIndex(final long index) {
+        this.index = index;
+        return this;
+    }
+
+    /**
+     * @param guid new GUID; stored by reference.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableEntryFNDX setGuid(final GUID guid) {
+        this.guid = guid;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableStartFNDX.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableStartFNDX.java
new file mode 100644
index 0000000..3c09449
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/GlobalIdTableStartFNDX.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Value holder for the single {@code reserved} field.
+ */
+class GlobalIdTableStartFNDX {
+    char reserved;
+
+    /**
+     * @return the current {@code reserved} value ('\0' until set).
+     */
+    public char getReserved() {
+        return this.reserved;
+    }
+
+    /**
+     * @param reserved new {@code reserved} value.
+     * @return this instance, for chaining.
+     */
+    public GlobalIdTableStartFNDX setReserved(final char reserved) {
+        this.reserved = reserved;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/IndentUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/IndentUtil.java
new file mode 100644
index 0000000..50c381f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/IndentUtil.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Utility for producing indentation strings.
+ */
+class IndentUtil {
+    /**
+     * Builds a string made of one {@code " "} unit per indent level.
+     *
+     * @param indentLevel number of indent units to emit; zero or a negative value yields an empty string
+     * @return the indentation string, never {@code null}
+     */
+    public static String getIndent(int indentLevel) {
+        if (indentLevel <= 0) {
+            return "";
+        }
+        // A pre-sized StringBuilder avoids re-copying the whole string on every iteration,
+        // which is what repeated String concatenation does.
+        StringBuilder indent = new StringBuilder(indentLevel);
+        for (int i = 0; i < indentLevel; ++i) {
+            indent.append(" ");
+        }
+        return indent.toString();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Int24.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Int24.java
new file mode 100644
index 0000000..8fd7133
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Int24.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Three integer parts that {@link #value()} combines into one number, with the first
+ * constructor argument ending up in the lowest-order position.
+ */
+class Int24 {
+    int[] val;
+
+    /**
+     * @param b1 part placed in the lowest-order position
+     * @param b2 part shifted left by 8 bits
+     * @param b3 part shifted left by 16 bits
+     */
+    public Int24(int b1, int b2, int b3) {
+        val = new int[] {b1, b2, b3};
+    }
+
+    /**
+     * Combines the three parts with shifts and bitwise OR. No masking is applied, so a part
+     * outside the range 0..255 also affects the bits of the parts above it.
+     *
+     * @return {@code (b3 << 16) | (b2 << 8) | b1}
+     */
+    public int value() {
+        return (val[2] << 16) | (val[1] << 8) | val[0];
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCID.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCID.java
new file mode 100644
index 0000000..745ff55
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCID.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * The JCID structure specifies the type of object and the type of data the object contains. A JCID structure can be
+ * considered to be an unsigned integer of size four bytes as specified by property set and
+ * file data object.
+ *
+ * <pre>[0,15] - the index</pre>
+ * <pre>16 - A</pre>
+ * <pre>17 - B</pre>
+ * <pre>18 - C</pre>
+ * <pre>19 - D</pre>
+ * <pre>20 - E</pre>
+ * <pre>21 - 31 = reserved</pre>
+ * <p>
+ * index (2 bytes): An unsigned integer that specifies the type of object.
+ * <p>
+ * A - IsBinary (1 bit): Specifies whether the object contains encryption data transmitted over the File Synchronization via SOAP over
+ * HTTP Protocol, as specified in [MS-FSSHTTP].
+ * <p>
+ * B - IsPropertySet (1 bit): Specifies whether the object contains a property set.
+ * <p>
+ * C - IsGraphNode (1 bit): Undefined and MUST be ignored.
+ * <p>
+ * D - IsFileData (1 bit): Specifies whether the object is a file data object. If the value of IsFileData is "true", then the values of
+ * the IsBinary, IsPropertySet, IsGraphNode, and IsReadOnly fields MUST all be false.
+ * <p>
+ * E - IsReadOnly (1 bit): Specifies whether the object's data MUST NOT be changed when the object is revised.
+ * <p>
+ * reserved (11 bits): MUST be zero, and MUST be ignored.
+ */
+class JCID {
+ long jcid;
+ long index;
+ boolean isBinary;
+ boolean isPropertySet;
+ boolean isGraphNode;
+ boolean isFileData;
+ boolean isReadOnly;
+
+ /**
+ * If the value of the JCID.IsPropertySet field is "true" or if only JCID.index is specified, then the data
+ * for the Object Space Object structure MUST be an ObjectSpaceObjectPropSet structure.
+ *
+ * @return true if is ObjectSpaceObjectPropSet. false otherwise.
+ */
+ public boolean isObjectSpaceObjectPropSet() {
+ return isPropertySet || !isBinary && !isGraphNode && !isFileData && !isReadOnly && index > 0;
+ }
+
+ public void loadFrom32BitIndex(long fullIndex) {
+ jcid = fullIndex;
+ index = fullIndex & 0xffff;
+ isBinary = ((fullIndex >> 16) & 1) == 1;
+ isPropertySet = ((fullIndex >> 17) & 1) == 1;
+ isGraphNode = ((fullIndex >> 18) & 1) == 1;
+ isFileData = ((fullIndex >> 19) & 1) == 1;
+ isReadOnly = ((fullIndex >> 20) & 1) == 1;
+ if ((fullIndex >> 21) != 0) {
+ throw new RuntimeException("RESERVED_NONZERO");
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "JCID{" +
+ "jcid=" + JCIDPropertySetTypeEnum.of(jcid) + " (0x" + Long.toHexString(jcid) + ")" +
+ ", index=" + index +
+ ", isBinary=" + isBinary +
+ ", isPropertySet=" + isPropertySet +
+ ", isGraphNode=" + isGraphNode +
+ ", isFileData=" + isFileData +
+ ", isReadOnly=" + isReadOnly +
+ '}';
+ }
+
+ public long getJcid() {
+ return jcid;
+ }
+
+ public void setJcid(long jcid) {
+ this.jcid = jcid;
+ }
+
+ public long getIndex() {
+ return index;
+ }
+
+ public void setIndex(long index) {
+ this.index = index;
+ }
+
+ public boolean isBinary() {
+ return isBinary;
+ }
+
+ public void setBinary(boolean binary) {
+ isBinary = binary;
+ }
+
+ public boolean isPropertySet() {
+ return isPropertySet;
+ }
+
+ public void setPropertySet(boolean propertySet) {
+ isPropertySet = propertySet;
+ }
+
+ public boolean isGraphNode() {
+ return isGraphNode;
+ }
+
+ public void setGraphNode(boolean graphNode) {
+ isGraphNode = graphNode;
+ }
+
+ public boolean isFileData() {
+ return isFileData;
+ }
+
+ public void setFileData(boolean fileData) {
+ isFileData = fileData;
+ }
+
+ public boolean isReadOnly() {
+ return isReadOnly;
+ }
+
+ public void setReadOnly(boolean readOnly) {
+ isReadOnly = readOnly;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCIDPropertySetTypeEnum.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCIDPropertySetTypeEnum.java
new file mode 100644
index 0000000..4b30da0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/JCIDPropertySetTypeEnum.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * The JCID property set type enum from section 2.1.13 of MS-ONE
+ * specification.
+ * <p>
+ * Two pairs of constants are declared with the same numeric value (0x00020001 and 0x0012004D).
+ * For those values {@link #of(Long)} returns the constant that is declared later.
+ */
+enum JCIDPropertySetTypeEnum {
+    jcidReadOnlyPersistablePropertyContainerForAuthor(0x00120001),
+    jcidPersistablePropertyContainerForTOC(0x00020001),
+    jcidPersistablePropertyContainerForTOCSection(0x00020001),
+    jcidSectionNode(0x00060007),
+    jcidPageSeriesNode(0x00060008),
+    jcidPageNode(0x0006000B),
+    jcidOutlineNode(0x0006000C),
+    jcidOutlineElementNode(0x0006000D),
+    jcidRichTextOENode(0x0006000E),
+    jcidImageNode(0x00060011),
+    jcidNumberListNode(0x00060012),
+    jcidOutlineGroup(0x00060019),
+    jcidTableNode(0x00060022),
+    jcidTableRowNode(0x00060023),
+    jcidTableCellNode(0x00060024),
+    jcidTitleNode(0x0006002C),
+    jcidPageMetaData(0x00020030),
+    jcidSectionMetaData(0x00020031),
+    jcidEmbeddedFileNode(0x00060035),
+    jcidPageManifestNode(0x00060037),
+    jcidConflictPageMetaData(0x00020038),
+    jcidVersionHistoryContent(0x0006003C),
+    jcidVersionProxy(0x0006003D),
+    jcidNoteTagSharedDefinitionContainer(0x00120043),
+    jcidRevisionMetaData(0x00020044),
+    jcidVersionHistoryMetaData(0x00020046),
+    jcidParagraphStyleObject(0x0012004D),
+    jcidParagraphStyleObjectForText(0x0012004D),
+    unknown(0x0);
+
+    /** Lookup table from numeric value to constant, filled once when the enum is initialized. */
+    private static final Map<Long, JCIDPropertySetTypeEnum> BY_ID = new HashMap<>();
+
+    static {
+        // values() iterates in declaration order, so when two constants share a value
+        // the later one replaces the earlier one in the table.
+        for (JCIDPropertySetTypeEnum type : values()) {
+            BY_ID.put(type.jcid, type);
+        }
+    }
+
+    private final long jcid;
+
+    JCIDPropertySetTypeEnum(long jcid) {
+        this.jcid = jcid;
+    }
+
+    /**
+     * Looks up the constant registered for a numeric JCID value.
+     *
+     * @param id the numeric value to look up; may be {@code null}
+     * @return the matching constant, or {@link #unknown} when the value is {@code null} or not registered
+     */
+    public static JCIDPropertySetTypeEnum of(Long id) {
+        return BY_ID.getOrDefault(id, unknown);
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCount.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCount.java
new file mode 100644
index 0000000..f3831e5
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCount.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder that groups an {@link ObjectSpaceObjectPropSet} reference, an
+ * {@link ObjectDeclarationWithRefCountBody}, a {@code cRef} value and a {@link ReadOnly} part.
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class ObjectDeclarationWithRefCount {
+ ObjectSpaceObjectPropSet objectRef;
+ // Starts out as an empty body instance rather than null.
+ ObjectDeclarationWithRefCountBody body = new ObjectDeclarationWithRefCountBody();
+ // NOTE(review): presumably the reference count of the declared object - confirm against MS-ONE.
+ long cRef;
+
+ /**
+ * Holder for the {@code md5} byte array.
+ * NOTE(review): presumably an MD5 digest of read-only object data - confirm against the caller.
+ */
+ public static class ReadOnly {
+ byte[] md5;
+
+ /**
+ * @return the stored array itself (no defensive copy is made)
+ */
+ public byte[] getMd5() {
+ return md5;
+ }
+
+ /**
+ * @param md5 array to store; it is kept by reference, not copied
+ * @return this instance, so that setter calls can be chained
+ */
+ public ReadOnly setMd5(byte[] md5) {
+ this.md5 = md5;
+ return this;
+ }
+ }
+
+ // Starts out as an empty ReadOnly instance whose md5 array is null.
+ ReadOnly readOnly = new ReadOnly();
+
+ /**
+ * @return the value currently held in the {@code objectRef} field; null until it is set
+ */
+ public ObjectSpaceObjectPropSet getObjectRef() {
+ return objectRef;
+ }
+
+ /**
+ * @param objectRef value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCount setObjectRef(ObjectSpaceObjectPropSet objectRef) {
+ this.objectRef = objectRef;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code body} field
+ */
+ public ObjectDeclarationWithRefCountBody getBody() {
+ return body;
+ }
+
+ /**
+ * @param body value to store, replacing the instance created at construction time
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCount setBody(ObjectDeclarationWithRefCountBody body) {
+ this.body = body;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code cRef} field
+ */
+ public long getcRef() {
+ return cRef;
+ }
+
+ /**
+ * @param cRef value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCount setcRef(long cRef) {
+ this.cRef = cRef;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code readOnly} field
+ */
+ public ReadOnly getReadOnly() {
+ return readOnly;
+ }
+
+ /**
+ * @param readOnly value to store, replacing the instance created at construction time
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCount setReadOnly(ReadOnly readOnly) {
+ this.readOnly = readOnly;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCountBody.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCountBody.java
new file mode 100644
index 0000000..476aeb5
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectDeclarationWithRefCountBody.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for an object id, its {@link JCID} and three boolean flags.
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class ObjectDeclarationWithRefCountBody {
+ CompactID oid;
+ JCID jcid = new JCID(); // if this is an ObjectDeclarationWithRefCountBody, jcid = 0x01
+ boolean fHasOidReferences;
+ boolean hasOsidReferences;
+ // the obj is a GUID in the file_data_store_reference
+ // for an ObjectDeclarationFileData3RefCountFND
+ boolean file_data_store_reference;
+
+ /**
+ * @return the value currently held in the {@code oid} field; null until it is set
+ */
+ public CompactID getOid() {
+ return oid;
+ }
+
+ /**
+ * @param oid value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCountBody setOid(CompactID oid) {
+ this.oid = oid;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code jcid} field
+ */
+ public JCID getJcid() {
+ return jcid;
+ }
+
+ /**
+ * @param jcid value to store, replacing the instance created at construction time
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCountBody setJcid(JCID jcid) {
+ this.jcid = jcid;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code fHasOidReferences} field
+ */
+ public boolean isfHasOidReferences() {
+ return fHasOidReferences;
+ }
+
+ /**
+ * @param fHasOidReferences value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCountBody setfHasOidReferences(boolean fHasOidReferences) {
+ this.fHasOidReferences = fHasOidReferences;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code hasOsidReferences} field
+ */
+ public boolean isHasOsidReferences() {
+ return hasOsidReferences;
+ }
+
+ /**
+ * @param hasOsidReferences value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCountBody setHasOsidReferences(boolean hasOsidReferences) {
+ this.hasOsidReferences = hasOsidReferences;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code file_data_store_reference} field
+ */
+ public boolean isFile_data_store_reference() {
+ return file_data_store_reference;
+ }
+
+ /**
+ * @param file_data_store_reference value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectDeclarationWithRefCountBody setFile_data_store_reference(boolean file_data_store_reference) {
+ this.file_data_store_reference = file_data_store_reference;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrideData.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrideData.java
new file mode 100644
index 0000000..c9a16a1
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrideData.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Mutable holder for two override counts, a {@code crc} value and two lists of override values.
+ * Every setter returns {@code this} so that calls can be chained.
+ * NOTE(review): presumably {@code overrides1} carries the 8-bit overrides and {@code overrides2}
+ * the 32-bit ones, matching the two counters - confirm against the code that fills them.
+ */
+class ObjectInfoDependencyOverrideData {
+ long c8bitOverrides;
+ long c32bitOverrides;
+ long crc;
+ // Both lists start out empty rather than null.
+ List<Integer> overrides1 = new ArrayList<>();
+ List<Long> overrides2 = new ArrayList<>();
+
+ /**
+ * @return the value currently held in the {@code c8bitOverrides} field
+ */
+ public long getC8bitOverrides() {
+ return c8bitOverrides;
+ }
+
+ /**
+ * @param c8bitOverrides value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrideData setC8bitOverrides(long c8bitOverrides) {
+ this.c8bitOverrides = c8bitOverrides;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code c32bitOverrides} field
+ */
+ public long getC32bitOverrides() {
+ return c32bitOverrides;
+ }
+
+ /**
+ * @param c32bitOverrides value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrideData setC32bitOverrides(long c32bitOverrides) {
+ this.c32bitOverrides = c32bitOverrides;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code crc} field
+ */
+ public long getCrc() {
+ return crc;
+ }
+
+ /**
+ * @param crc value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrideData setCrc(long crc) {
+ this.crc = crc;
+ return this;
+ }
+
+ /**
+ * @return the stored list itself (not a copy), so changes made by the caller are visible here
+ */
+ public List<Integer> getOverrides1() {
+ return overrides1;
+ }
+
+ /**
+ * @param overrides1 list to store; it is kept by reference, not copied
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrideData setOverrides1(List<Integer> overrides1) {
+ this.overrides1 = overrides1;
+ return this;
+ }
+
+ /**
+ * @return the stored list itself (not a copy), so changes made by the caller are visible here
+ */
+ public List<Long> getOverrides2() {
+ return overrides2;
+ }
+
+ /**
+ * @param overrides2 list to store; it is kept by reference, not copied
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrideData setOverrides2(List<Long> overrides2) {
+ this.overrides2 = overrides2;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrides.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrides.java
new file mode 100644
index 0000000..94409a7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectInfoDependencyOverrides.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable wrapper around a single {@link ObjectInfoDependencyOverrideData} value.
+ */
+class ObjectInfoDependencyOverrides {
+ // Not initialized here, so it is null until setData is called or the field is assigned.
+ ObjectInfoDependencyOverrideData data;
+
+ /**
+ * @return the value currently held in the {@code data} field; may be null
+ */
+ public ObjectInfoDependencyOverrideData getData() {
+ return data;
+ }
+
+ /**
+ * @param data value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectInfoDependencyOverrides setData(ObjectInfoDependencyOverrideData data) {
+ this.data = data;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectRevisionWithRefCountFNDX.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectRevisionWithRefCountFNDX.java
new file mode 100644
index 0000000..5284edf
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectRevisionWithRefCountFNDX.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for an {@link ObjectSpaceObjectPropSet} reference, an object id, two
+ * "has references" values and a {@code cRef} value.
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class ObjectRevisionWithRefCountFNDX {
+ ObjectSpaceObjectPropSet ref;
+ CompactID oid;
+ // Both "has references" values are stored as long, not boolean.
+ long hasOidReferences;
+ long hasOsidReferences;
+ long cRef;
+
+ /**
+ * @return the value currently held in the {@code ref} field; null until it is set
+ */
+ public ObjectSpaceObjectPropSet getRef() {
+ return ref;
+ }
+
+ /**
+ * @param ref value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectRevisionWithRefCountFNDX setRef(ObjectSpaceObjectPropSet ref) {
+ this.ref = ref;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code oid} field; null until it is set
+ */
+ public CompactID getOid() {
+ return oid;
+ }
+
+ /**
+ * @param oid value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectRevisionWithRefCountFNDX setOid(CompactID oid) {
+ this.oid = oid;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code hasOidReferences} field
+ */
+ public long getHasOidReferences() {
+ return hasOidReferences;
+ }
+
+ /**
+ * @param hasOidReferences value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectRevisionWithRefCountFNDX setHasOidReferences(long hasOidReferences) {
+ this.hasOidReferences = hasOidReferences;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code hasOsidReferences} field
+ */
+ public long getHasOsidReferences() {
+ return hasOsidReferences;
+ }
+
+ /**
+ * @param hasOsidReferences value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectRevisionWithRefCountFNDX setHasOsidReferences(long hasOsidReferences) {
+ this.hasOsidReferences = hasOsidReferences;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code cRef} field
+ */
+ public long getcRef() {
+ return cRef;
+ }
+
+ /**
+ * @param cRef value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectRevisionWithRefCountFNDX setcRef(long cRef) {
+ this.cRef = cRef;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectPropSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectPropSet.java
new file mode 100644
index 0000000..d555fc9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectPropSet.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for three id streams ({@code oids}, {@code osids}, {@code contextIDs}) and a
+ * {@link PropertySet} body. All four fields start out as freshly created, empty instances.
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class ObjectSpaceObjectPropSet {
+ ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs oids = new ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+ ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs osids = new ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+ ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs contextIDs = new ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+ PropertySet body = new PropertySet();
+
+ /**
+ * @return the value currently held in the {@code oids} field
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs getOids() {
+ return oids;
+ }
+
+ /**
+ * @param oids value to store, replacing the current stream
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectPropSet setOids(ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs oids) {
+ this.oids = oids;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code osids} field
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs getOsids() {
+ return osids;
+ }
+
+ /**
+ * @param osids value to store, replacing the current stream
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectPropSet setOsids(ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs osids) {
+ this.osids = osids;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code contextIDs} field
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs getContextIDs() {
+ return contextIDs;
+ }
+
+ /**
+ * @param contextIDs value to store, replacing the current stream
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectPropSet setContextIDs(ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs contextIDs) {
+ this.contextIDs = contextIDs;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code body} field
+ */
+ public PropertySet getBody() {
+ return body;
+ }
+
+ /**
+ * @param body value to store, replacing the current property set
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectPropSet setBody(PropertySet body) {
+ this.body = body;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java
new file mode 100644
index 0000000..458b69a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Mutable holder for one stream of {@link CompactID} values together with its header values
+ * ({@code count}, {@code extendedStreamsPresent}, {@code osidsStreamNotPresent}).
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs {
+ long count; // 24 bits
+ // Both header flags are stored as long, not boolean.
+ long extendedStreamsPresent;
+ long osidsStreamNotPresent;
+ // Starts out as an empty list rather than null.
+ List<CompactID> data = new ArrayList<>();
+
+ /**
+ * @return the value currently held in the {@code count} field
+ */
+ public long getCount() {
+ return count;
+ }
+
+ /**
+ * @param count value to store; nothing here keeps it in sync with the size of {@code data}
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs setCount(long count) {
+ this.count = count;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code extendedStreamsPresent} field
+ */
+ public long getExtendedStreamsPresent() {
+ return extendedStreamsPresent;
+ }
+
+ /**
+ * @param extendedStreamsPresent value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs setExtendedStreamsPresent(long extendedStreamsPresent) {
+ this.extendedStreamsPresent = extendedStreamsPresent;
+ return this;
+ }
+
+ /**
+ * @return the value currently held in the {@code osidsStreamNotPresent} field
+ */
+ public long getOsidsStreamNotPresent() {
+ return osidsStreamNotPresent;
+ }
+
+ /**
+ * @param osidsStreamNotPresent value to store
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs setOsidsStreamNotPresent(long osidsStreamNotPresent) {
+ this.osidsStreamNotPresent = osidsStreamNotPresent;
+ return this;
+ }
+
+ /**
+ * @return the stored list itself (not a copy), so changes made by the caller are visible here
+ */
+ public List<CompactID> getData() {
+ return data;
+ }
+
+ /**
+ * @param data list to store; it is kept by reference, not copied
+ * @return this instance, so that setter calls can be chained
+ */
+ public ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs setData(List<CompactID> data) {
+ this.data = data;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectStreamCounters.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectStreamCounters.java
new file mode 100644
index 0000000..c97739f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/ObjectStreamCounters.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Three independent counters, each of which starts at zero.
+ */
+class ObjectStreamCounters {
+    long oids_count = 0;
+    long osids_count = 0;
+    long context_ids_count = 0;
+
+    /**
+     * Creates a set of counters that are all zero.
+     */
+    public ObjectStreamCounters() {
+        // Nothing to do: the field initializers already set every counter to zero.
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDirectFileResource.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDirectFileResource.java
new file mode 100644
index 0000000..475b680
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDirectFileResource.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+
+/**
+ * Read-only, seekable view over a file, backed by a {@link RandomAccessFile}.
+ * <p>
+ * This is copied mostly from the {@link org.apache.tika.parser.mp4.DirectFileReadDataSource}. It is meant
+ * for the seek, read, repeat access pattern needed while parsing OneNote contents.
+ */
+class OneNoteDirectFileResource implements Closeable {
+
+    private static final int TRANSFER_SIZE = 8192;
+
+    private final RandomAccessFile raf;
+
+    /**
+     * Opens the given file for reading.
+     *
+     * @param f file to open
+     * @throws IOException if the file cannot be opened
+     */
+    public OneNoteDirectFileResource(File f) throws IOException {
+        raf = new RandomAccessFile(f, "r");
+    }
+
+    /**
+     * Reads one byte at the current position.
+     *
+     * @return whatever {@link RandomAccessFile#read()} returns
+     * @throws IOException if the underlying read fails
+     */
+    public int read() throws IOException {
+        return raf.read();
+    }
+
+    /**
+     * Fills the buffer from the current position, copying through an intermediate array in pieces of at
+     * most {@code TRANSFER_SIZE} bytes.
+     *
+     * @param byteBuffer destination; as many bytes as it has remaining are requested
+     * @return the number of bytes transferred, or -1 if the underlying read reported end of file before
+     * anything was transferred
+     * @throws IOException if the underlying read fails, or if end of file is reported while the position is
+     *                     at the end of the file and the buffer still has room
+     */
+    public int read(ByteBuffer byteBuffer) throws IOException {
+        final int wanted = byteBuffer.remaining();
+        final byte[] chunk = new byte[TRANSFER_SIZE];
+        int transferred = 0;
+        int lastRead = 0;
+        while (transferred < wanted) {
+            lastRead = raf.read(chunk, 0, Math.min(wanted - transferred, TRANSFER_SIZE));
+            if (lastRead < 0) {
+                break;
+            }
+            transferred += lastRead;
+            byteBuffer.put(chunk, 0, lastRead);
+        }
+        final boolean sawEndOfFile = lastRead < 0;
+        if (sawEndOfFile && position() == size() && byteBuffer.hasRemaining()) {
+            throw new IOException("End of stream reached earlier than expected");
+        }
+        if (sawEndOfFile && transferred == 0) {
+            return -1;
+        }
+        return transferred;
+    }
+
+    /**
+     * @return the length of the underlying file
+     * @throws IOException if the length cannot be determined
+     */
+    public long size() throws IOException {
+        return raf.length();
+    }
+
+    /**
+     * @return the current offset of the file pointer
+     * @throws IOException if the offset cannot be determined
+     */
+    public long position() throws IOException {
+        return raf.getFilePointer();
+    }
+
+    /**
+     * Moves the file pointer. Seeking exactly to the end of the file is allowed.
+     *
+     * @param nuPos the new offset
+     * @throws IOException if the offset is greater than the file length, or the seek itself fails
+     */
+    public void position(long nuPos) throws IOException {
+        if (nuPos <= raf.length()) {
+            raf.seek(nuPos);
+            return;
+        }
+        throw new IOException("requesting seek past end of stream");
+    }
+
+    @Override
+    public void close() throws IOException {
+        raf.close();
+    }
+
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java
new file mode 100644
index 0000000..83402a2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.lang3.tuple.Pair;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * In-memory index of a OneNote file: the parsed header plus lookup tables keyed by
+ * {@link ExtendedGUID} and the root file node list.
+ */
+class OneNoteDocument {
+    /** Parsed file header. */
+    OneNoteHeader header;
+    /** Revision manifest list ids, in the order they were registered. */
+    List<ExtendedGUID> revisionListOrder = new ArrayList<>();
+    /** Revisions keyed by the id of the revision manifest that declared them. */
+    Map<ExtendedGUID, Revision> revisionMap = new HashMap<>();
+    /** Pointer to each registered revision manifest list, keyed by its id. */
+    Map<ExtendedGUID, FileNodePtr> revisionManifestLists = new HashMap<>();
+    /** File chunk reference associated with each id. */
+    Map<ExtendedGUID, FileChunkReference> guidToRef = new HashMap<>();
+    /** File node pointer associated with each id. */
+    Map<ExtendedGUID, FileNodePtr> guidToObject = new HashMap<>();
+
+    /** Additional (role, context id) pair registered per id. */
+    Map<ExtendedGUID, Pair<Long, ExtendedGUID>> revisionRoleMap = new HashMap<>();
+    /** Id of the most recently registered revision manifest; nil until one is registered. */
+    ExtendedGUID currentRevision = ExtendedGUID.nil();
+    /** Root of the file node list tree. */
+    FileNodeList root = new FileNodeList();
+
+    public OneNoteDocument() {
+
+    }
+
+    /**
+     * @param guid the id to look up.
+     * @return the file chunk reference stored for {@code guid}, or {@code null} if there is none.
+     */
+    FileChunkReference getAssocGuidToRef(ExtendedGUID guid) {
+        return guidToRef.get(guid);
+    }
+
+    /**
+     * Associates {@code ref} with {@code guid}, replacing any previous association.
+     */
+    void setAssocGuidToRef(ExtendedGUID guid, FileChunkReference ref) {
+        guidToRef.put(guid, ref);
+    }
+
+    /**
+     * Records the pointer to a revision manifest list and appends its id to the ordered list.
+     */
+    void registerRevisionManifestList(ExtendedGUID guid, FileNodePtr ptr) {
+        revisionManifestLists.put(guid, ptr);
+        revisionListOrder.add(guid);
+    }
+
+    /**
+     * Creates (or updates) the revision identified by {@code fn.gosid}, copies the dependent
+     * revision id from the node's revision manifest, and makes it the current revision.
+     *
+     * @param fn the revision manifest file node.
+     */
+    void registerRevisionManifest(FileNode fn) {
+        // computeIfAbsent only allocates a Revision when the key is really missing and avoids
+        // the second map lookup.
+        Revision toModify = revisionMap.computeIfAbsent(fn.gosid, key -> new Revision());
+        toModify.gosid = fn.gosid;
+        toModify.dependent = fn.subType.revisionManifest.ridDependent;
+        currentRevision = fn.gosid;
+    }
+
+    /**
+     * Stores the (role, context id) pair for {@code gosid}, replacing any previous pair.
+     */
+    public void registerAdditionalRevisionRole(ExtendedGUID gosid, long revisionRole, ExtendedGUID gctxid) {
+        revisionRoleMap.put(gosid, Pair.of(revisionRole, gctxid));
+    }
+
+    public List<ExtendedGUID> getRevisionListOrder() {
+        return revisionListOrder;
+    }
+
+    public OneNoteDocument setRevisionListOrder(List<ExtendedGUID> revisionListOrder) {
+        this.revisionListOrder = revisionListOrder;
+        return this;
+    }
+
+    public Map<ExtendedGUID, Revision> getRevisionMap() {
+        return revisionMap;
+    }
+
+    public OneNoteDocument setRevisionMap(Map<ExtendedGUID, Revision> revisionMap) {
+        this.revisionMap = revisionMap;
+        return this;
+    }
+
+    public Map<ExtendedGUID, FileNodePtr> getRevisionManifestLists() {
+        return revisionManifestLists;
+    }
+
+    public OneNoteDocument setRevisionManifestLists(Map<ExtendedGUID, FileNodePtr> revisionManifestLists) {
+        this.revisionManifestLists = revisionManifestLists;
+        return this;
+    }
+
+    public Map<ExtendedGUID, FileChunkReference> getGuidToRef() {
+        return guidToRef;
+    }
+
+    public OneNoteDocument setGuidToRef(Map<ExtendedGUID, FileChunkReference> guidToRef) {
+        this.guidToRef = guidToRef;
+        return this;
+    }
+
+    public Map<ExtendedGUID, FileNodePtr> getGuidToObject() {
+        return guidToObject;
+    }
+
+    public OneNoteDocument setGuidToObject(Map<ExtendedGUID, FileNodePtr> guidToObject) {
+        this.guidToObject = guidToObject;
+        return this;
+    }
+
+    public Map<ExtendedGUID, Pair<Long, ExtendedGUID>> getRevisionRoleMap() {
+        return revisionRoleMap;
+    }
+
+    public OneNoteDocument setRevisionRoleMap(Map<ExtendedGUID, Pair<Long, ExtendedGUID>> revisionRoleMap) {
+        this.revisionRoleMap = revisionRoleMap;
+        return this;
+    }
+
+    public ExtendedGUID getCurrentRevision() {
+        return currentRevision;
+    }
+
+    public OneNoteDocument setCurrentRevision(ExtendedGUID currentRevision) {
+        this.currentRevision = currentRevision;
+        return this;
+    }
+
+    public FileNodeList getRoot() {
+        return root;
+    }
+
+    public OneNoteDocument setRoot(FileNodeList root) {
+        this.root = root;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteHeader.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteHeader.java
new file mode 100644
index 0000000..2ff811b
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteHeader.java
@@ -0,0 +1,403 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.io.Serializable;
+
+/**
+ * Plain value holder for the fields of a OneNote file header.
+ * <p>
+ * Every field has a getter and a fluent setter (the setter returns {@code this}).
+ * The class is {@link Serializable} but declares no explicit serialVersionUID.
+ */
+class OneNoteHeader implements Serializable {
+
+ // File identification GUIDs.
+ GUID guidFileType;
+ GUID guidFile;
+ GUID guidLegacyFileVersion;
+ GUID guidFileFormat;
+ // File format version codes.
+ long ffvLastCode;
+ long ffvNewestCode;
+ long ffvOldestCode;
+ long ffvOldestReader;
+ // Legacy chunk references and counters.
+ FileChunkReference fcrLegacyFreeChunkList;
+ FileChunkReference fcrLegacyTransactionLog;
+ long cTransactionsInLog;
+ long cbLegacyExpectedFileLength;
+ long rgbPlaceholder;
+ FileChunkReference fcrLegacyFileNodeListRoot;
+ long cbLegacyFreeSpaceInFreeChunkList;
+ long ignoredZeroA;
+ long ignoredZeroB;
+ long ignoredZeroC;
+ long ignoredZeroD;
+ GUID guidAncestor;
+ long crcName;
+ // Current chunk references; fcrFileNodeListRoot is where the parser starts reading the root file node list.
+ FileChunkReference fcrHashedChunkList;
+ FileChunkReference fcrTransactionLog;
+ FileChunkReference fcrFileNodeListRoot;
+ FileChunkReference fcrFreeChunkList;
+ long cbExpectedFileLength;
+ long cbFreeSpaceInFreeChunkList;
+ GUID guidFileVersion;
+ long nFileVersionGeneration;
+ GUID guidDenyReadFileVersion;
+ long grfDebugLogFlags;
+ FileChunkReference fcrDebugLogA;
+ FileChunkReference fcrDebugLogB;
+ // Build numbers recorded in the header.
+ long buildNumberCreated;
+ long buildNumberLastWroteToFile;
+ long buildNumberOldestWritten;
+ long buildNumberNewestWritten;
+ // NOTE: the getter and setter below expose and store this array without copying it.
+ byte[] reserved;
+
+ public GUID getGuidFileType() {
+ return guidFileType;
+ }
+
+ public OneNoteHeader setGuidFileType(GUID guidFileType) {
+ this.guidFileType = guidFileType;
+ return this;
+ }
+
+ public GUID getGuidFile() {
+ return guidFile;
+ }
+
+ public OneNoteHeader setGuidFile(GUID guidFile) {
+ this.guidFile = guidFile;
+ return this;
+ }
+
+ public GUID getGuidLegacyFileVersion() {
+ return guidLegacyFileVersion;
+ }
+
+ public OneNoteHeader setGuidLegacyFileVersion(GUID guidLegacyFileVersion) {
+ this.guidLegacyFileVersion = guidLegacyFileVersion;
+ return this;
+ }
+
+ public GUID getGuidFileFormat() {
+ return guidFileFormat;
+ }
+
+ public OneNoteHeader setGuidFileFormat(GUID guidFileFormat) {
+ this.guidFileFormat = guidFileFormat;
+ return this;
+ }
+
+ public long getFfvLastCode() {
+ return ffvLastCode;
+ }
+
+ public OneNoteHeader setFfvLastCode(long ffvLastCode) {
+ this.ffvLastCode = ffvLastCode;
+ return this;
+ }
+
+ public long getFfvNewestCode() {
+ return ffvNewestCode;
+ }
+
+ public OneNoteHeader setFfvNewestCode(long ffvNewestCode) {
+ this.ffvNewestCode = ffvNewestCode;
+ return this;
+ }
+
+ public long getFfvOldestCode() {
+ return ffvOldestCode;
+ }
+
+ public OneNoteHeader setFfvOldestCode(long ffvOldestCode) {
+ this.ffvOldestCode = ffvOldestCode;
+ return this;
+ }
+
+ public long getFfvOldestReader() {
+ return ffvOldestReader;
+ }
+
+ public OneNoteHeader setFfvOldestReader(long ffvOldestReader) {
+ this.ffvOldestReader = ffvOldestReader;
+ return this;
+ }
+
+ public FileChunkReference getFcrLegacyFreeChunkList() {
+ return fcrLegacyFreeChunkList;
+ }
+
+ public OneNoteHeader setFcrLegacyFreeChunkList(FileChunkReference fcrLegacyFreeChunkList) {
+ this.fcrLegacyFreeChunkList = fcrLegacyFreeChunkList;
+ return this;
+ }
+
+ public FileChunkReference getFcrLegacyTransactionLog() {
+ return fcrLegacyTransactionLog;
+ }
+
+ public OneNoteHeader setFcrLegacyTransactionLog(FileChunkReference fcrLegacyTransactionLog) {
+ this.fcrLegacyTransactionLog = fcrLegacyTransactionLog;
+ return this;
+ }
+
+ public long getcTransactionsInLog() {
+ return cTransactionsInLog;
+ }
+
+ public OneNoteHeader setcTransactionsInLog(long cTransactionsInLog) {
+ this.cTransactionsInLog = cTransactionsInLog;
+ return this;
+ }
+
+ public long getCbLegacyExpectedFileLength() {
+ return cbLegacyExpectedFileLength;
+ }
+
+ public OneNoteHeader setCbLegacyExpectedFileLength(long cbLegacyExpectedFileLength) {
+ this.cbLegacyExpectedFileLength = cbLegacyExpectedFileLength;
+ return this;
+ }
+
+ public long getRgbPlaceholder() {
+ return rgbPlaceholder;
+ }
+
+ public OneNoteHeader setRgbPlaceholder(long rgbPlaceholder) {
+ this.rgbPlaceholder = rgbPlaceholder;
+ return this;
+ }
+
+ public FileChunkReference getFcrLegacyFileNodeListRoot() {
+ return fcrLegacyFileNodeListRoot;
+ }
+
+ public OneNoteHeader setFcrLegacyFileNodeListRoot(FileChunkReference fcrLegacyFileNodeListRoot) {
+ this.fcrLegacyFileNodeListRoot = fcrLegacyFileNodeListRoot;
+ return this;
+ }
+
+ public long getCbLegacyFreeSpaceInFreeChunkList() {
+ return cbLegacyFreeSpaceInFreeChunkList;
+ }
+
+ public OneNoteHeader setCbLegacyFreeSpaceInFreeChunkList(long cbLegacyFreeSpaceInFreeChunkList) {
+ this.cbLegacyFreeSpaceInFreeChunkList = cbLegacyFreeSpaceInFreeChunkList;
+ return this;
+ }
+
+ public long getIgnoredZeroA() {
+ return ignoredZeroA;
+ }
+
+ public OneNoteHeader setIgnoredZeroA(long ignoredZeroA) {
+ this.ignoredZeroA = ignoredZeroA;
+ return this;
+ }
+
+ public long getIgnoredZeroB() {
+ return ignoredZeroB;
+ }
+
+ public OneNoteHeader setIgnoredZeroB(long ignoredZeroB) {
+ this.ignoredZeroB = ignoredZeroB;
+ return this;
+ }
+
+ public long getIgnoredZeroC() {
+ return ignoredZeroC;
+ }
+
+ public OneNoteHeader setIgnoredZeroC(long ignoredZeroC) {
+ this.ignoredZeroC = ignoredZeroC;
+ return this;
+ }
+
+ public long getIgnoredZeroD() {
+ return ignoredZeroD;
+ }
+
+ public OneNoteHeader setIgnoredZeroD(long ignoredZeroD) {
+ this.ignoredZeroD = ignoredZeroD;
+ return this;
+ }
+
+ public GUID getGuidAncestor() {
+ return guidAncestor;
+ }
+
+ public OneNoteHeader setGuidAncestor(GUID guidAncestor) {
+ this.guidAncestor = guidAncestor;
+ return this;
+ }
+
+ public long getCrcName() {
+ return crcName;
+ }
+
+ public OneNoteHeader setCrcName(long crcName) {
+ this.crcName = crcName;
+ return this;
+ }
+
+ public FileChunkReference getFcrHashedChunkList() {
+ return fcrHashedChunkList;
+ }
+
+ public OneNoteHeader setFcrHashedChunkList(FileChunkReference fcrHashedChunkList) {
+ this.fcrHashedChunkList = fcrHashedChunkList;
+ return this;
+ }
+
+ public FileChunkReference getFcrTransactionLog() {
+ return fcrTransactionLog;
+ }
+
+ public OneNoteHeader setFcrTransactionLog(FileChunkReference fcrTransactionLog) {
+ this.fcrTransactionLog = fcrTransactionLog;
+ return this;
+ }
+
+ public FileChunkReference getFcrFileNodeListRoot() {
+ return fcrFileNodeListRoot;
+ }
+
+ public OneNoteHeader setFcrFileNodeListRoot(FileChunkReference fcrFileNodeListRoot) {
+ this.fcrFileNodeListRoot = fcrFileNodeListRoot;
+ return this;
+ }
+
+ public FileChunkReference getFcrFreeChunkList() {
+ return fcrFreeChunkList;
+ }
+
+ public OneNoteHeader setFcrFreeChunkList(FileChunkReference fcrFreeChunkList) {
+ this.fcrFreeChunkList = fcrFreeChunkList;
+ return this;
+ }
+
+ public long getCbExpectedFileLength() {
+ return cbExpectedFileLength;
+ }
+
+ public OneNoteHeader setCbExpectedFileLength(long cbExpectedFileLength) {
+ this.cbExpectedFileLength = cbExpectedFileLength;
+ return this;
+ }
+
+ public long getCbFreeSpaceInFreeChunkList() {
+ return cbFreeSpaceInFreeChunkList;
+ }
+
+ public OneNoteHeader setCbFreeSpaceInFreeChunkList(long cbFreeSpaceInFreeChunkList) {
+ this.cbFreeSpaceInFreeChunkList = cbFreeSpaceInFreeChunkList;
+ return this;
+ }
+
+ public GUID getGuidFileVersion() {
+ return guidFileVersion;
+ }
+
+ public OneNoteHeader setGuidFileVersion(GUID guidFileVersion) {
+ this.guidFileVersion = guidFileVersion;
+ return this;
+ }
+
+ public long getnFileVersionGeneration() {
+ return nFileVersionGeneration;
+ }
+
+ public OneNoteHeader setnFileVersionGeneration(long nFileVersionGeneration) {
+ this.nFileVersionGeneration = nFileVersionGeneration;
+ return this;
+ }
+
+ public GUID getGuidDenyReadFileVersion() {
+ return guidDenyReadFileVersion;
+ }
+
+ public OneNoteHeader setGuidDenyReadFileVersion(GUID guidDenyReadFileVersion) {
+ this.guidDenyReadFileVersion = guidDenyReadFileVersion;
+ return this;
+ }
+
+ public long getGrfDebugLogFlags() {
+ return grfDebugLogFlags;
+ }
+
+ public OneNoteHeader setGrfDebugLogFlags(long grfDebugLogFlags) {
+ this.grfDebugLogFlags = grfDebugLogFlags;
+ return this;
+ }
+
+ public FileChunkReference getFcrDebugLogA() {
+ return fcrDebugLogA;
+ }
+
+ public OneNoteHeader setFcrDebugLogA(FileChunkReference fcrDebugLogA) {
+ this.fcrDebugLogA = fcrDebugLogA;
+ return this;
+ }
+
+ public FileChunkReference getFcrDebugLogB() {
+ return fcrDebugLogB;
+ }
+
+ public OneNoteHeader setFcrDebugLogB(FileChunkReference fcrDebugLogB) {
+ this.fcrDebugLogB = fcrDebugLogB;
+ return this;
+ }
+
+ public long getBuildNumberCreated() {
+ return buildNumberCreated;
+ }
+
+ public OneNoteHeader setBuildNumberCreated(long buildNumberCreated) {
+ this.buildNumberCreated = buildNumberCreated;
+ return this;
+ }
+
+ public long getBuildNumberLastWroteToFile() {
+ return buildNumberLastWroteToFile;
+ }
+
+ public OneNoteHeader setBuildNumberLastWroteToFile(long buildNumberLastWroteToFile) {
+ this.buildNumberLastWroteToFile = buildNumberLastWroteToFile;
+ return this;
+ }
+
+ public long getBuildNumberOldestWritten() {
+ return buildNumberOldestWritten;
+ }
+
+ public OneNoteHeader setBuildNumberOldestWritten(long buildNumberOldestWritten) {
+ this.buildNumberOldestWritten = buildNumberOldestWritten;
+ return this;
+ }
+
+ public long getBuildNumberNewestWritten() {
+ return buildNumberNewestWritten;
+ }
+
+ public OneNoteHeader setBuildNumberNewestWritten(long buildNumberNewestWritten) {
+ this.buildNumberNewestWritten = buildNumberNewestWritten;
+ return this;
+ }
+
+ public byte[] getReserved() {
+ return reserved;
+ }
+
+ public OneNoteHeader setReserved(byte[] reserved) {
+ this.reserved = reserved;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java
new file mode 100644
index 0000000..22756e3
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * OneNote tika parser capable of parsing Microsoft OneNote files.
+ * <p>
+ * Based on the Microsoft specs MS-ONE and MS-ONESTORE.
+ */
+public class OneNoteParser extends AbstractParser {
+
+    private static final Map<MediaType, List<String>> typesMap = new HashMap<>();
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -5504243905998074168L;
+
+    static {
+        // All types should be 4 bytes long, space padded as needed
+        typesMap.put(MediaType.application("onenote; format=one"), Arrays.asList("ONE "));
+        // TODO - add onetoc and other onenote mime types
+    }
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(typesMap.keySet());
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a OneNote file: spools the stream to a file, reads the header and file node lists,
+     * copies selected header values into {@code metadata} as hex strings, walks the document tree
+     * to emit XHTML, and finally records author and timestamp metadata gathered by the walker.
+     * <p>
+     * The supplied stream is consumed but not closed; closing it remains the caller's job.
+     *
+     * @param stream   the document stream.
+     * @param handler  receives the XHTML SAX events.
+     * @param metadata receives the header and author/timestamp metadata.
+     * @param context  the parse context, handed on to the tree walker.
+     * @throws IOException   on failure to read the document.
+     * @throws SAXException  if the content handler fails.
+     * @throws TikaException if the document cannot be parsed.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+
+        // Only the temporary resources and the random-access handle are closed here. The
+        // TikaInputStream wrapping the caller's stream is deliberately left open: closing it would
+        // close the caller's stream. Resources close in reverse order, so the random-access handle
+        // is released before the temporary resources are cleaned up.
+        try (TemporaryResources temporaryResources = new TemporaryResources();
+             OneNoteDirectFileResource oneNoteDirectFileResource = new OneNoteDirectFileResource(
+                     TikaInputStream.get(stream, temporaryResources).getFile())) {
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            OneNoteDocument oneNoteDocument = createOneNoteDocumentFromDirectFileResource(oneNoteDirectFileResource);
+
+            OneNoteHeader header = oneNoteDocument.header;
+            setHexMetadata(metadata, "buildNumberCreated", header.buildNumberCreated);
+            setHexMetadata(metadata, "buildNumberLastWroteToFile", header.buildNumberLastWroteToFile);
+            setHexMetadata(metadata, "buildNumberNewestWritten", header.buildNumberNewestWritten);
+            setHexMetadata(metadata, "buildNumberOldestWritten", header.buildNumberOldestWritten);
+            setHexMetadata(metadata, "cbExpectedFileLength", header.cbExpectedFileLength);
+            setHexMetadata(metadata, "cbFreeSpaceInFreeChunkList", header.cbFreeSpaceInFreeChunkList);
+            setHexMetadata(metadata, "cbLegacyExpectedFileLength", header.cbLegacyExpectedFileLength);
+            setHexMetadata(metadata, "cbLegacyFreeSpaceInFreeChunkList", header.cbLegacyFreeSpaceInFreeChunkList);
+            setHexMetadata(metadata, "crcName", header.crcName);
+            setHexMetadata(metadata, "cTransactionsInLog", header.cTransactionsInLog);
+            setHexMetadata(metadata, "ffvLastCode", header.ffvLastCode);
+            setHexMetadata(metadata, "ffvNewestCode", header.ffvNewestCode);
+            setHexMetadata(metadata, "ffvOldestReader", header.ffvOldestReader);
+            setHexMetadata(metadata, "grfDebugLogFlags", header.grfDebugLogFlags);
+            setHexMetadata(metadata, "nFileVersionGeneration", header.nFileVersionGeneration);
+            setHexMetadata(metadata, "rgbPlaceholder", header.rgbPlaceholder);
+
+            Pair<Long, ExtendedGUID> roleAndContext = Pair.of(1L, ExtendedGUID.nil());
+            OneNoteTreeWalker oneNoteTreeWalker = new OneNoteTreeWalker(
+                    new OneNoteTreeWalkerOptions(), oneNoteDocument,
+                    oneNoteDirectFileResource, xhtml, metadata, context, roleAndContext);
+
+            oneNoteTreeWalker.walkTree();
+
+            if (!oneNoteTreeWalker.getAuthors().isEmpty()) {
+                metadata.set(Property.externalTextBag("authors"), oneNoteTreeWalker.getAuthors().toArray(new String[] {}));
+            }
+            if (!oneNoteTreeWalker.getMostRecentAuthors().isEmpty()) {
+                metadata.set(Property.externalTextBag("mostRecentAuthors"), oneNoteTreeWalker.getMostRecentAuthors().toArray(new String[] {}));
+            }
+            if (!oneNoteTreeWalker.getOriginalAuthors().isEmpty()) {
+                metadata.set(Property.externalTextBag("originalAuthors"), oneNoteTreeWalker.getOriginalAuthors().toArray(new String[] {}));
+            }
+            // Instant.MAX / Instant.MIN / Long.MIN_VALUE are treated as "no value found" markers.
+            // NOTE(review): creationTimestamp is written as the Instant's string form while
+            // lastModifiedTimestamp is written as epoch milliseconds - confirm this is intended.
+            if (!Instant.MAX.equals(oneNoteTreeWalker.getCreationTimestamp())) {
+                metadata.set("creationTimestamp", String.valueOf(oneNoteTreeWalker.getCreationTimestamp()));
+            }
+            if (!Instant.MIN.equals(oneNoteTreeWalker.getLastModifiedTimestamp())) {
+                metadata.set("lastModifiedTimestamp", String.valueOf(oneNoteTreeWalker.getLastModifiedTimestamp().toEpochMilli()));
+            }
+            if (oneNoteTreeWalker.getLastModified() > Long.MIN_VALUE) {
+                metadata.set("lastModified", String.valueOf(oneNoteTreeWalker.getLastModified()));
+            }
+            xhtml.endDocument();
+        }
+    }
+
+    /**
+     * Stores {@code value} under {@code name} as a "0x"-prefixed lower-case hex string.
+     */
+    private static void setHexMetadata(Metadata metadata, String name, long value) {
+        metadata.set(name, "0x" + Long.toHexString(value));
+    }
+
+    /**
+     * Create a OneNoteDocument object.
+     * <p>
+     * This won't actually have the binary data of any of the sections, but it's more of a metadata structure that contains
+     * the general structure of the container and contains offset positions of where to find the binary data we care about.
+     * <p>
+     * OneNote files are of format:
+     * <p>
+     * The header (section 2.3.1 in MS-ONESTORE) is the first 1024 bytes of the file. It contains references to the other structures in the
+     * file as well as metadata about the file.
+     * The free chunk list (section 2.3.2 in MS-ONESTORE) defines where there are free spaces in the file where data can be written.
+     * The transaction log (section 2.3.3 in MS-ONESTORE) stores the state and length of each file node list (section 2.4 in MS-ONESTORE)
+     * in the file.
+     * The hashed chunk list (section 2.3.4 in MS-ONESTORE) stores read-only objects in the file that can be referenced by multiple
+     * revisions (section 2.1.8 in MS-ONESTORE).
+     * The root file node list (section 2.1.14 in MS-ONESTORE) is the file node list that is the root of the tree of all file node lists in
+     * the file.
+     * <p>
+     * In this method we first parse the header.
+     * <p>
+     * After parsing the header, header.fcrFileNodeListRoot points to the root file node list. We reposition to it and
+     * deserialize it into the document's root.
+     *
+     * @param oneNoteDirectFileResource A random access file resource used as the source of the content.
+     * @return A parsed one note document. This document does not contain any of the binary data, rather it just contains
+     * the data pointers and metadata.
+     * @throws IOException Will throw IOException in typical IO issue situations.
+     * @throws TikaException If the header or the file node lists cannot be deserialized.
+     */
+    public OneNoteDocument createOneNoteDocumentFromDirectFileResource(OneNoteDirectFileResource oneNoteDirectFileResource) throws IOException, TikaException {
+        OneNoteDocument oneNoteDocument = new OneNoteDocument();
+        OneNotePtr oneNotePtr = new OneNotePtr(oneNoteDocument, oneNoteDirectFileResource);
+        // First parse out the header.
+        oneNoteDocument.header = oneNotePtr.deserializeHeader();
+
+        // Now that we parsed the header, jump to the "root file node list" it references
+        // and deserialize that list into the document's root.
+        oneNotePtr.reposition(oneNoteDocument.header.fcrFileNodeListRoot);
+        FileNodePtr curPath = new FileNodePtr();
+        oneNotePtr.deserializeFileNodeList(oneNoteDocument.root, curPath);
+
+        return oneNoteDocument;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
new file mode 100644
index 0000000..c47a5f7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyEnum.java
@@ -0,0 +1,210 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Known OneNote property ids.
+ * <p>
+ * Each constant carries the raw 32-bit property id. Bits 26-30 of that id hold the property type
+ * and bit 31 holds the inline boolean value for properties of type 0x2.
+ * <p>
+ * Several constants deliberately share the same id (for example the {@code ElementChildNodesOf*}
+ * family). For such ids {@link #of(Long)} returns the constant that is declared last.
+ */
+@SuppressWarnings("unused")
+enum OneNotePropertyEnum {
+    LayoutTightLayout(0x08001C00),
+    PageWidth(0x14001C01),
+    PageHeight(0x14001C02),
+    OutlineElementChildLevel(0x0C001C03),
+    Bold(0x08001C04),
+    Italic(0x08001C05),
+    Underline(0x08001C06),
+    Strikethrough(0x08001C07),
+    Superscript(0x08001C08),
+    Subscript(0x08001C09),
+    Font(0x1C001C0A),
+    FontSize(0x10001C0B),
+    FontColor(0x14001C0C),
+    Highlight(0x14001C0D),
+    RgOutlineIndentDistance(0x1C001C12),
+    BodyTextAlignment(0x0C001C13),
+    OffsetFromParentHoriz(0x14001C14),
+    OffsetFromParentVert(0x14001C15),
+    NumberListFormat(0x1C001C1A),
+    LayoutMaxWidth(0x14001C1B),
+    LayoutMaxHeight(0x14001C1C),
+    ContentChildNodesOfOutlineElement(0x24001C1F),
+    ContentChildNodesOfPageManifest(0x24001C1F),
+    ElementChildNodesOfSection(0x24001C20),
+    ElementChildNodesOfPage(0x24001C20),
+    ElementChildNodesOfTitle(0x24001C20),
+    ElementChildNodesOfOutline(0x24001C20),
+    ElementChildNodesOfOutlineElement(0x24001C20),
+    ElementChildNodesOfTable(0x24001C20),
+    ElementChildNodesOfTableRow(0x24001C20),
+    ElementChildNodesOfTableCell(0x24001C20),
+    ElementChildNodesOfVersionHistory(0x24001C20),
+    EnableHistory(0x08001E1E),
+    RichEditTextUnicode(0x1C001C22),
+    ListNodes(0x24001C26),
+    NotebookManagementEntityGuid(0x1C001C30),
+    OutlineElementRTL(0x08001C34),
+    LanguageID(0x14001C3B),
+    LayoutAlignmentInParent(0x14001C3E),
+    PictureContainer(0x20001C3F),
+    PageMarginTop(0x14001C4C),
+    PageMarginBottom(0x14001C4D),
+    PageMarginLeft(0x14001C4E),
+    PageMarginRight(0x14001C4F),
+    ListFont(0x1C001C52),
+    TopologyCreationTimeStamp(0x18001C65),
+    LayoutAlignmentSelf(0x14001C84),
+    IsTitleTime(0x08001C87),
+    IsBoilerText(0x08001C88),
+    PageSize(0x14001C8B),
+    PortraitPage(0x08001C8E),
+    EnforceOutlineStructure(0x08001C91),
+    EditRootRTL(0x08001C92),
+    CannotBeSelected(0x08001CB2),
+    IsTitleText(0x08001CB4),
+    IsTitleDate(0x08001CB5),
+    ListRestart(0x14001CB7),
+    IsLayoutSizeSetByUser(0x08001CBD),
+    ListSpacingMu(0x14001CCB),
+    LayoutOutlineReservedWidth(0x14001CDB),
+    LayoutResolveChildCollisions(0x08001CDC),
+    IsReadOnly(0x08001CDE),
+    LayoutMinimumOutlineWidth(0x14001CEC),
+    LayoutCollisionPriority(0x14001CF1),
+    CachedTitleString(0x1C001CF3),
+    DescendantsCannotBeMoved(0x08001CF9),
+    RichEditTextLangID(0x10001CFE),
+    LayoutTightAlignment(0x08001CFF),
+    Charset(0x0C001D01),
+    CreationTimeStamp(0x14001D09),
+    Deletable(0x08001D0C),
+    ListMSAAIndex(0x10001D0E),
+    IsBackground(0x08001D13),
+    IRecordMedia(0x14001D24),
+    CachedTitleStringFromPage(0x1C001D3C),
+    RowCount(0x14001D57),
+    ColumnCount(0x14001D58),
+    TableBordersVisible(0x08001D5E),
+    StructureElementChildNodes(0x24001D5F),
+    ChildGraphSpaceElementNodes(0x2C001D63),
+    TableColumnWidths(0x1C001D66),
+    Author(0x1C001D75),
+    LastModifiedTimeStamp(0x18001D77),
+    AuthorOriginal(0x20001D78),
+    AuthorMostRecent(0x20001D79),
+    LastModifiedTime(0x14001D7A),
+    IsConflictPage(0x08001D7C),
+    TableColumnsLocked(0x1C001D7D),
+    SchemaRevisionInOrderToRead(0x14001D82),
+    IsConflictObjectForRender(0x08001D96),
+    EmbeddedFileContainer(0x20001D9B),
+    EmbeddedFileName(0x1C001D9C),
+    SourceFilepath(0x1C001D9D),
+    ConflictingUserName(0x1C001D9E),
+    ImageFilename(0x1C001DD7),
+    IsConflictObjectForSelection(0x08001DDB),
+    PageLevel(0x14001DFF),
+    TextRunIndex(0x1C001E12),
+    TextRunFormatting(0x24001E13),
+    Hyperlink(0x08001E14),
+    UnderlineType(0x0C001E15),
+    Hidden(0x08001E16),
+    HyperlinkProtected(0x08001E19),
+    TextRunIsEmbeddedObject(0x08001E22),
+    ImageAltText(0x1C001E58),
+    MathFormatting(0x08003401),
+    ParagraphStyle(0x2000342C),
+    ParagraphSpaceBefore(0x1400342E),
+    ParagraphSpaceAfter(0x1400342F),
+    ParagraphLineSpacingExact(0x14003430),
+    MetaDataObjectsAboveGraphSpace(0x24003442),
+    TextRunDataObject(0x24003458),
+    TextRunData(0x40003499),
+    ParagraphStyleId(0x1C00345A),
+    HasVersionPages(0x08003462),
+    ActionItemType(0x10003463),
+    NoteTagShape(0x10003464),
+    NoteTagHighlightColor(0x14003465),
+    NoteTagTextColor(0x14003466),
+    NoteTagPropertyStatus(0x14003467),
+    NoteTagLabel(0x1C003468),
+    NoteTagCreated(0x1400346E),
+    NoteTagCompleted(0x1400346F),
+    NoteTagDefinitionOid(0x20003488),
+    NoteTagStates(0x04003489),
+    ActionItemStatus(0x10003470),
+    ActionItemSchemaVersion(0x0C003473),
+    ReadingOrderRTL(0x08003476),
+    ParagraphAlignment(0x0C003477),
+    VersionHistoryGraphSpaceContextNodes(0x3400347B),
+    DisplayedPageNumber(0x14003480),
+    SectionDisplayName(0x1C00349B),
+    NextStyle(0x1C00348A),
+    WebPictureContainer14(0x200034C8),
+    ImageUploadState(0x140034CB),
+    TextExtendedAscii(0x1C003498),
+    PictureWidth(0x140034CD),
+    PictureHeight(0x140034CE),
+    PageMarginOriginX(0x14001D0F),
+    PageMarginOriginY(0x14001D10),
+    WzHyperlinkUrl(0x1C001E20),
+    TaskTagDueDate(0x1400346B),
+    Unknown(0x00000000);
+
+    /** Raw property id of this constant. */
+    private final long id;
+
+    OneNotePropertyEnum(long id) {
+        this.id = id;
+    }
+
+    /** Reverse lookup from raw id to constant; for shared ids the last declared constant wins. */
+    private static final Map<Long, OneNotePropertyEnum> BY_ID = new HashMap<>();
+
+    static {
+        for (OneNotePropertyEnum e : values()) {
+            BY_ID.put(e.id, e);
+        }
+    }
+
+    /**
+     * Looks up the constant registered for a raw property id.
+     *
+     * @param id the raw property id; may be {@code null}.
+     * @return the matching constant, or {@link #Unknown} when the id is not registered.
+     */
+    public static OneNotePropertyEnum of(Long id) {
+        OneNotePropertyEnum result = BY_ID.get(id);
+        return result == null ? Unknown : result;
+    }
+
+    /**
+     * Extracts the property type, i.e. bits 26-30 of the constant's raw id.
+     *
+     * @param propertyEnum the property to inspect.
+     * @return the 5-bit type value.
+     */
+    public static long getType(OneNotePropertyEnum propertyEnum) {
+        return propertyEnum.id >> 26 & 0x1f;
+    }
+
+    /**
+     * Extracts the inline boolean carried in bit 31 of the constant's raw id.
+     *
+     * @param propertyEnum the property to inspect.
+     * @return the value of bit 31 when the property type is 0x2, otherwise {@code false}.
+     * @throws RuntimeException if bit 31 is set on a property whose type is not 0x2.
+     */
+    public static boolean getInlineBool(OneNotePropertyEnum propertyEnum) {
+        long pid = propertyEnum.id;
+        long type = pid >> 26 & 0x1f;
+        boolean highBitSet = ((pid >> 31) & 0x1) > 0;
+        if (type == 0x2) {
+            return highBitSet; // the bool value is stored in the header itself
+        }
+        if (highBitSet) {
+            throw new RuntimeException("Reserved non-zero");
+        }
+        return false;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyId.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyId.java
new file mode 100644
index 0000000..661b03e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePropertyId.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+class OneNotePropertyId {
+ OneNotePropertyEnum propertyEnum;
+ long pid;
+ long type;
+ boolean inlineBool;
+
+ public OneNotePropertyId() {
+ }
+
+ public OneNotePropertyId(long pid) {
+ this.pid = pid;
+ propertyEnum = OneNotePropertyEnum.of(pid);
+ type = pid >> 26 & 0x1f;
+ inlineBool = false;
+ if (type == 0x2) {
+ inlineBool = ((pid >> 31) & 0x1) > 0; // set the bool value from header
+ } else {
+ if (((pid >> 31) & 0x1) > 0) {
+ throw new RuntimeException("Reserved non-zero");
+ }
+ }
+ }
+
+ public OneNotePropertyEnum getPropertyEnum() {
+ return propertyEnum;
+ }
+
+ public OneNotePropertyId setPropertyEnum(OneNotePropertyEnum propertyEnum) {
+ this.propertyEnum = propertyEnum;
+ return this;
+ }
+
+ public long getPid() {
+ return pid;
+ }
+
+ public OneNotePropertyId setPid(long pid) {
+ this.pid = pid;
+ return this;
+ }
+
+ public long getType() {
+ return type;
+ }
+
+ public OneNotePropertyId setType(long type) {
+ this.type = type;
+ return this;
+ }
+
+ public boolean isInlineBool() {
+ return inlineBool;
+ }
+
+ public OneNotePropertyId setInlineBool(boolean inlineBool) {
+ this.inlineBool = inlineBool;
+ return this;
+ }
+
+ @Override
+ public String toString() {
+ return "{" + propertyEnum +
+ ", pid=0x" + Long.toHexString(pid) +
+ ", type=0x" + Long.toHexString(type) +
+ ", inlineBool=" + inlineBool +
+ '}';
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
new file mode 100644
index 0000000..408cc27
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
@@ -0,0 +1,1158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.io.EndianUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * This is the main class used during parsing. This will contain an offset position and end position for reading bytes
+ * from the byte stream.
+ * <p>
+ * It contains all the deserialize methods used to read the different data elements from a one note file.
+ * <p>
+ * You can construct a new one note pointer and it will reposition the byte channel and will read until the end position is reached.
+ */
+class OneNotePtr {
+
+ private static final Logger LOG = LoggerFactory.getLogger(OneNoteParser.class); // NOTE(review): logger category is OneNoteParser rather than OneNotePtr - presumably deliberate; confirm
+
+ public static final long FOOTER_CONST = 0x8BC215C38233BA4BL; // value deserializeFileNodeListFragment expects in the last 8 bytes of a fragment
+ public static final String UNKNOWN = "unknown";
+ private static final byte[] IFNDF = new byte[] { // the characters "<ifndf>" encoded as UTF-16LE bytes
+ 60, 0, 105, 0, 102, 0, 110, 0, 100, 0, 102, 0, 62, 0
+ };
+
+ private static final GUID FILE_DATA_STORE_OBJ_HEADER = new GUID(new int[] { // in the visible part of this file only referenced from commented-out validation
+ 0xBD,
+ 0xE3,
+ 0x16,
+ 0xE7,
+ 0x26,
+ 0x65,
+ 0x45,
+ 0x11,
+ 0xA4,
+ 0xC4,
+ 0x8D,
+ 0x4D,
+ 0x0B,
+ 0x7A,
+ 0x9E,
+ 0xAC
+ });
+
+ private static final GUID FILE_DATA_STORE_OBJ_FOOTER = new GUID(new int[] { // in the visible part of this file only referenced from commented-out validation
+ 0x71,
+ 0xFB,
+ 0xA7,
+ 0x22,
+ 0x0F,
+ 0x79,
+ 0x4A,
+ 0x0B,
+ 0xBB,
+ 0x13,
+ 0x89,
+ 0x92,
+ 0x56,
+ 0x42,
+ 0x6B,
+ 0x24});
+
+ public static final int IFNDF_GUID_LENGTH = 38; // 36 char guid with a { and a } char.
+ int indentLevel = 0; // nesting depth; getIndent() turns it into the prefix used in log lines
+
+ long offset; // current read position of this pointer
+ long end; // upper bound (start + length) of the region this pointer is reading
+
+ OneNoteDocument document;
+ OneNoteDirectFileResource dif; // underlying resource; the copy constructor shares this same instance between pointers
+
+    /**
+     * Creates a pointer that starts at the resource's current position and ends at the resource's size.
+     *
+     * @param document                  the document being populated while parsing
+     * @param oneNoteDirectFileResource the resource the bytes are read from
+     * @throws IOException if the position or size of the resource cannot be determined
+     */
+    public OneNotePtr(OneNoteDocument document, OneNoteDirectFileResource oneNoteDirectFileResource) throws IOException {
+        this.dif = oneNoteDirectFileResource;
+        this.document = document;
+        this.offset = this.dif.position();
+        this.end = this.dif.size();
+    }
+
+    /**
+     * Copy constructor: the new pointer has the same offset, end and indent level and shares the
+     * document and the underlying resource with {@code oneNotePtr}.
+     *
+     * @param oneNotePtr the pointer to copy
+     */
+    public OneNotePtr(OneNotePtr oneNotePtr) {
+        this.offset = oneNotePtr.offset;
+        this.end = oneNotePtr.end;
+        this.indentLevel = oneNotePtr.indentLevel;
+        this.dif = oneNotePtr.dif;
+        this.document = oneNotePtr.document;
+    }
+
+ public OneNoteHeader deserializeHeader() throws IOException { // reads the header fields one after another from the current offset; the order of the calls below is the order of the bytes consumed
+ OneNoteHeader data = new OneNoteHeader();
+ data.setGuidFileType(deserializeGUID())
+ .setGuidFile(deserializeGUID())
+ .setGuidLegacyFileVersion(deserializeGUID())
+ .setGuidFileFormat(deserializeGUID())
+ .setFfvLastCode(deserializeLittleEndianInt())
+ .setFfvNewestCode(deserializeLittleEndianInt())
+ .setFfvOldestCode(deserializeLittleEndianInt())
+ .setFfvOldestReader(deserializeLittleEndianInt())
+ .setFcrLegacyFreeChunkList(deserializeFileChunkReference64())
+ .setFcrLegacyTransactionLog(deserializeFileChunkReference64())
+ .setcTransactionsInLog(deserializeLittleEndianInt())
+ .setCbExpectedFileLength(deserializeLittleEndianInt()) // NOTE(review): the same setter name is called again further down with a 64-bit value - presumably this 32-bit slot is a legacy length field; confirm against OneNoteHeader
+ .setRgbPlaceholder(deserializeLittleEndianLong())
+ .setFcrLegacyFileNodeListRoot(deserializeFileChunkReference64())
+ .setCbLegacyFreeSpaceInFreeChunkList(deserializeLittleEndianInt())
+ .setIgnoredZeroA(deserializeLittleEndianChar())
+ .setIgnoredZeroB(deserializeLittleEndianChar())
+ .setIgnoredZeroC(deserializeLittleEndianChar())
+ .setIgnoredZeroD(deserializeLittleEndianChar())
+ .setGuidAncestor(deserializeGUID())
+ .setCrcName(deserializeLittleEndianInt())
+ .setFcrHashedChunkList(deserializeFileChunkReference64x32())
+ .setFcrTransactionLog(deserializeFileChunkReference64x32())
+ .setFcrFileNodeListRoot(deserializeFileChunkReference64x32())
+ .setFcrFreeChunkList(deserializeFileChunkReference64x32())
+ .setCbExpectedFileLength(deserializeLittleEndianLong()) // second call to this setter name, this time with the 8-byte value read here
+ .setCbFreeSpaceInFreeChunkList(deserializeLittleEndianLong())
+ .setGuidFileVersion(deserializeGUID())
+ .setnFileVersionGeneration(deserializeLittleEndianLong())
+ .setGuidDenyReadFileVersion(deserializeGUID())
+ .setGrfDebugLogFlags(deserializeLittleEndianInt())
+ .setFcrDebugLogA(deserializeFileChunkReference64x32())
+ .setFcrDebugLogB(deserializeFileChunkReference64x32())
+ .setBuildNumberCreated(deserializeLittleEndianInt())
+ .setBuildNumberLastWroteToFile(deserializeLittleEndianInt())
+ .setBuildNumberOldestWritten(deserializeLittleEndianInt())
+ .setBuildNumberNewestWritten(deserializeLittleEndianInt())
+ .setReserved(deserializedReservedHeader()); // the trailing 728-byte reserved area is kept as raw bytes
+ return data;
+ }
+
+ private GUID deserializeGUID() throws IOException {
+ int[] guid = new int[16];
+ for (int i = 0; i < 16; ++i) {
+ guid[i] = dif.read();
+ }
+ offset = dif.position();
+ return new GUID(guid);
+ }
+
+    /**
+     * Reads the 728-byte reserved area that ends the header.
+     *
+     * @return the backing array of the 728-byte buffer the resource was read into
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private byte[] deserializedReservedHeader() throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        ByteBuffer reservedArea = ByteBuffer.allocate(728);
+        dif.read(reservedArea);
+        offset = dif.position();
+        return reservedArea.array();
+    }
+
+ private FileChunkReference deserializeFileChunkReference64() throws IOException {
+ long stp = deserializeLittleEndianInt();
+ long cb = deserializeLittleEndianInt();
+ offset = dif.position();
+ return new FileChunkReference(stp, cb);
+ }
+
+ private FileChunkReference deserializeFileChunkReference64x32() throws IOException {
+ long stp = deserializeLittleEndianLong();
+ long cb = deserializeLittleEndianInt();
+ offset = dif.position();
+ return new FileChunkReference(stp, cb);
+ }
+
+    /**
+     * Reads one value from the resource at {@code offset} and advances {@code offset} by one.
+     *
+     * @return the value read, cast to {@code char}
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private char deserializeLittleEndianChar() throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        int raw = dif.read();
+        offset++;
+        return (char) raw;
+    }
+
+    /**
+     * Reads four bytes at {@code offset} and interprets them as an unsigned little endian 32-bit integer.
+     *
+     * @return the unsigned value, widened to {@code long}
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private long deserializeLittleEndianInt() throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        ByteBuffer fourBytes = ByteBuffer.allocate(4);
+        dif.read(fourBytes);
+        offset = dif.position();
+        return EndianUtils.readSwappedUnsignedInteger(fourBytes.array(), 0);
+    }
+
+    /**
+     * Reads eight bytes at {@code offset} and interprets them as a little endian 64-bit integer.
+     *
+     * @return the (signed) 64-bit value
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private long deserializeLittleEndianLong() throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        ByteBuffer eightBytes = ByteBuffer.allocate(8);
+        dif.read(eightBytes);
+        offset = dif.position();
+        return EndianUtils.readSwappedLong(eightBytes.array(), 0);
+    }
+
+    /**
+     * Reads two bytes at {@code offset} and combines them as a little endian 16-bit value
+     * (first byte is the low byte, second byte is the high byte).
+     *
+     * @return the 16-bit value, widened to {@code long}
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private long deserializeLittleEndianShort() throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        int lowByte = dif.read();
+        int highByte = dif.read();
+        offset = dif.position();
+        // the two masked bytes occupy disjoint bits, so OR-ing them equals adding them
+        return (lowByte & 0xff) | ((highByte & 0xff) << 8);
+    }
+
+    /**
+     * Builds the prefix used in log lines: one indent unit per nesting level.
+     *
+     * @return the indent string for the current {@code indentLevel}; empty when the level is zero or negative
+     */
+    private String getIndent() {
+        // StringBuilder avoids re-copying the whole string on every iteration
+        StringBuilder indent = new StringBuilder(Math.max(indentLevel, 0));
+        for (int i = 0; i < indentLevel; ++i) {
+            indent.append(" ");
+        }
+        return indent.toString();
+    }
+
+ public void reposition(FileChunkReference loc) throws IOException {
+ reposition(loc.stp);
+ this.end = offset + loc.cb;
+ }
+
+    /**
+     * Sets this pointer's offset and moves the underlying resource to the same position.
+     *
+     * @param offset the new absolute position
+     * @throws IOException if the resource cannot be positioned
+     */
+    private void reposition(final long offset) throws IOException {
+        this.offset = offset;
+        this.dif.position(offset);
+    }
+
+    /**
+     * Parses file node list fragments one after another until a fragment reports a nil reference
+     * for its successor.
+     * <p>
+     * A file node list can be split into one or more FileNodeListFragment structures. Every fragment
+     * holds a sub-sequence of the list's FileNode structures and states where the next fragment, if any,
+     * is located.
+     * <p>
+     * Where the structure of a specific file node list is described, the split into fragments is ignored
+     * and FileNode structures whose FileNode.FileNodeID equals 0x0FF ("ChunkTerminatorFND") are not listed.
+     *
+     * @param ptr          The OneNotePtr positioned at the first fragment.
+     * @param fileNodeList The file node list that is filled while parsing.
+     * @param curPath      The current FileNodePtr.
+     * @return The pointer that was used to read the last fragment.
+     */
+    public OneNotePtr internalDeserializeFileNodeList(OneNotePtr ptr, FileNodeList fileNodeList, FileNodePtr curPath) throws IOException,
+            TikaException {
+        // one extra pointer is reused for every follow-up fragment
+        OneNotePtr followUpPtr = new OneNotePtr(document, dif);
+        FileNodePtrBackPush backPush = new FileNodePtrBackPush(curPath);
+        try {
+            OneNotePtr cursor = ptr;
+            boolean hasMore;
+            do {
+                FileChunkReference nextFragment = FileChunkReference.nil();
+                cursor.deserializeFileNodeListFragment(fileNodeList, nextFragment, curPath);
+                hasMore = !FileChunkReference.nil().equals(nextFragment);
+                if (hasMore) {
+                    followUpPtr.reposition(nextFragment);
+                    cursor = followUpPtr;
+                }
+            } while (hasMore);
+            return cursor;
+        } finally {
+            backPush.dec();
+        }
+    }
+
+
+    /**
+     * Parses a complete file node list starting at this pointer.
+     *
+     * @param fileNodeList the file node list that is filled while parsing
+     * @param curPath      the current FileNodePtr
+     * @return the pointer that was used to read the last fragment of the list
+     */
+    public OneNotePtr deserializeFileNodeList(FileNodeList fileNodeList, FileNodePtr curPath) throws IOException, TikaException {
+        final OneNotePtr start = this;
+        return internalDeserializeFileNodeList(start, fileNodeList, curPath);
+    }
+
+    /**
+     * Reads one FileNodeListFragment.
+     * <p>
+     * A FileNodeListFragment holds a sequence of file nodes belonging to a file node list. Its size is
+     * given by the structure that references it.
+     * <p>
+     * All fragments of one file node list MUST carry the same FileNodeListFragment.header.FileNodeListID.
+     *
+     * @param data    The file node list that receives the nodes read from this fragment.
+     * @param next    Out parameter: receives the location and size of the next fragment.
+     * @param curPath The current FileNodePtr.
+     */
+    void deserializeFileNodeListFragment(FileNodeList data, FileChunkReference next, FileNodePtr curPath) throws IOException,
+            TikaException {
+        data.fileNodeListHeader = deserializeFileNodeListHeader();
+        boolean sawTerminator = false;
+        // keep reading while at least 24 bytes remain:
+        // 24 = sizeof(nextFragment) [12 bytes] + sizeof(footer) [8 bytes] + 4 bytes for the FileNode header
+        while (!sawTerminator && offset + 24 <= end) {
+            CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data);
+            try {
+                FileNode target = data.children.get(data.children.size() - 1);
+                FileNode parsed = deserializeFileNode(target, curPath);
+                sawTerminator = parsed.id == FndStructureConstants.ChunkTerminatorFND || parsed.id == 0;
+                if (!sawTerminator) {
+                    pushBack.commit();
+                    FileNode dereference = curPath.dereference(document);
+                    FileNode lastChild = data.children.get(data.children.size() - 1);
+                    assert dereference.equals(lastChild); // is this correct? or should we be checking the pointer?
+                    int lastSlot = curPath.nodeListPositions.size() - 1;
+                    Integer position = curPath.nodeListPositions.get(lastSlot);
+                    curPath.nodeListPositions.set(lastSlot, position + 1);
+                }
+            } finally {
+                pushBack.popBackIfNotCommitted();
+            }
+        }
+        // the last 20 bytes of the fragment are the next-fragment reference (12) and the footer (8)
+        reposition(end - 20);
+        FileChunkReference nextChunkRef = deserializeFileChunkReference64x32();
+        next.cb = nextChunkRef.cb;
+        next.stp = nextChunkRef.stp;
+        if (sawTerminator) {
+            LOG.debug("{}Chunk terminator found NextChunkRef.cb={}, NextChunkRef.stp={}, Offset={}, End={}", getIndent(),
+                    nextChunkRef.cb, nextChunkRef.stp, offset, end);
+            // TODO check that next is OK
+        }
+        long footer = deserializeLittleEndianLong();
+        if (footer != FOOTER_CONST) {
+            throw new TikaException("Invalid footer constant. Expected " + FOOTER_CONST + " but was " + footer);
+        }
+    }
+
+    /**
+     * Deserializes a single FileNode at the current offset into {@code data}.
+     * <p>
+     * The 32-bit node header is split into the node id (bits 0-9), the node size (bits 10-22), the stp format
+     * (bits 23-24), the cb format (bits 25-26), the base type (bits 27-30) and a reserved bit (bit 31).
+     * While the node body is read, {@code end} is narrowed to the node's own size. On return {@code offset}
+     * points just past the node and {@code end} has its previous value again. Nodes with base type 2
+     * reference a child file node list, which is parsed recursively.
+     *
+     * @param data    the node to populate
+     * @param curPath path of the node being parsed
+     * @return {@code data}; when the node id is 0 it is returned right after the header was read
+     * @throws IOException   if the underlying resource cannot be positioned or read
+     * @throws TikaException if the reserved header bit is not 1, a reserved field is non-zero, a referenced
+     *                       compact id is missing, or a declared length does not fit
+     */
+    private FileNode deserializeFileNode(FileNode data, FileNodePtr curPath) throws IOException, TikaException {
+        OneNotePtr backup = new OneNotePtr(this);
+
+        data.isFileData = false;
+        data.gosid = ExtendedGUID.nil();
+        long fileNodeHeader = deserializeLittleEndianInt();
+        data.id = fileNodeHeader & 0x3ff;
+        if (data.id == 0) {
+            return data;
+        }
+        LOG.debug("{}Start Node {} ({}) - Offset={}, End={}", getIndent(), FndStructureConstants.nameOf(data.id), data.id, offset, end);
+
+        ++indentLevel;
+
+        data.size = (fileNodeHeader >> 10) & 0x1fff;
+        // reset the size to only be in scope of this FileNode
+        end = backup.offset + data.size;
+
+        long stpFormat = (fileNodeHeader >> 23) & 0x3;
+        long cbFormat = (fileNodeHeader >> 25) & 0x3;
+        data.baseType = (fileNodeHeader >> 27) & 0xf;
+        long reserved = (fileNodeHeader >> 31);
+        data.ref = FileChunkReference.nil();
+        if (data.baseType == 1 || data.baseType == 2) {
+            data.ref = deserializeVarFileChunkReference(stpFormat, cbFormat);
+        } // otherwise ignore the data ref, since we're a type 0
+        if (data.baseType == 1 && !data.ref.equals(FileChunkReference.nil())) {
+            OneNotePtr content = new OneNotePtr(this);
+            content.reposition(data.ref);
+            // would have thrown an error if invalid.
+        }
+        if (data.id == FndStructureConstants.ObjectGroupStartFND) {
+            data.idDesc = "oid(group)";
+            data.gosid = deserializeExtendedGUID();
+        } else if (data.id == FndStructureConstants.ObjectGroupEndFND) {
+            // no data
+        } else if (data.id == FndStructureConstants.ObjectSpaceManifestRootFND
+                || data.id == FndStructureConstants.ObjectSpaceManifestListStartFND) {
+            if (data.id == FndStructureConstants.ObjectSpaceManifestRootFND) {
+                data.idDesc = "gosidRoot";
+            } else {
+                data.idDesc = "gosid";
+            }
+            // Specifies the identity of the object space being specified by this object space manifest list.
+            // MUST match the ObjectSpaceManifestListReferenceFND.gosid field of the FileNode structure that referenced
+            // this file node list.
+            data.gosid = deserializeExtendedGUID();
+        } else if (data.id == FndStructureConstants.ObjectSpaceManifestListReferenceFND) {
+            data.gosid = deserializeExtendedGUID();
+            data.idDesc = "gosid";
+            //children parsed in generic base_type 2 parser
+        } else if (data.id == FndStructureConstants.RevisionManifestListStartFND) {
+            data.gosid = deserializeExtendedGUID();
+            data.idDesc = "gosid";
+            FileNodePtr parentPath = new FileNodePtr(curPath);
+            parentPath.nodeListPositions.remove(parentPath.nodeListPositions.size() - 1);
+            document.registerRevisionManifestList(data.gosid, parentPath);
+
+            data.subType.revisionManifestListStart.nInstanceIgnored = deserializeLittleEndianInt();
+        } else if (data.id == FndStructureConstants.RevisionManifestStart4FND) {
+            data.gosid = deserializeExtendedGUID(); // the rid
+            data.idDesc = "rid";
+            data.subType.revisionManifest.ridDependent = deserializeExtendedGUID(); // the rid
+            LOG.debug("{}dependent gosid {}", getIndent(), data.subType.revisionManifest.ridDependent);
+            data.subType.revisionManifest.timeCreation = deserializeLittleEndianLong();
+            data.subType.revisionManifest.revisionRole = deserializeLittleEndianInt();
+            data.subType.revisionManifest.odcsDefault = deserializeLittleEndianShort();
+
+            data.gctxid = ExtendedGUID.nil();
+            document.registerRevisionManifest(data);
+        } else if (data.id == FndStructureConstants.RevisionManifestStart6FND
+                || data.id == FndStructureConstants.RevisionManifestStart7FND) {
+            data.gosid = deserializeExtendedGUID(); // the rid
+            data.idDesc = "rid";
+            data.subType.revisionManifest.ridDependent = deserializeExtendedGUID(); // the rid
+            LOG.debug("{}dependent gosid {}", getIndent(), data.subType.revisionManifest.ridDependent);
+            data.subType.revisionManifest.revisionRole = deserializeLittleEndianInt();
+            data.subType.revisionManifest.odcsDefault = deserializeLittleEndianShort();
+
+            data.gctxid = ExtendedGUID.nil();
+            if (data.id == FndStructureConstants.RevisionManifestStart7FND) {
+                data.gctxid = deserializeExtendedGUID(); // the rid
+            }
+            document.registerAdditionalRevisionRole(data.gosid, data.subType.revisionManifest.revisionRole, data.gctxid);
+            document.registerRevisionManifest(data);
+        } else if (data.id == FndStructureConstants.GlobalIdTableStartFNDX) {
+            data.subType.globalIdTableStartFNDX.reserved = deserializeLittleEndianChar();
+
+        } else if (data.id == FndStructureConstants.GlobalIdTableEntryFNDX) {
+            data.subType.globalIdTableEntryFNDX.index = deserializeLittleEndianInt();
+
+            data.subType.globalIdTableEntryFNDX.guid = deserializeGUID();
+
+            document.revisionMap.get(document.currentRevision).globalId.put(data.subType.globalIdTableEntryFNDX.index,
+                    data.subType.globalIdTableEntryFNDX.guid);
+        } else if (data.id == FndStructureConstants.GlobalIdTableEntry2FNDX) {
+            data.subType.globalIdTableEntry2FNDX.indexMapFrom = deserializeLittleEndianInt();
+            data.subType.globalIdTableEntry2FNDX.indexMapTo = deserializeLittleEndianInt();
+
+            ExtendedGUID dependentRevision =
+                    document.revisionMap.get(document.currentRevision).dependent;
+            // Get the compactId from the revisionMap's globalId map.
+            GUID compactId = document.revisionMap.get(dependentRevision).globalId.get(data.subType.globalIdTableEntry2FNDX.indexMapFrom);
+            if (compactId == null) {
+                throw new TikaException("COMPACT_ID_MISSING");
+            }
+            document.revisionMap.get(document.currentRevision).globalId.put(data.subType.globalIdTableEntry2FNDX.indexMapTo, compactId);
+        } else if (data.id == FndStructureConstants.GlobalIdTableEntry3FNDX) {
+            data.subType.globalIdTableEntry3FNDX.indexCopyFromStart = deserializeLittleEndianInt();
+
+            data.subType.globalIdTableEntry3FNDX.entriesToCopy = deserializeLittleEndianInt();
+
+            data.subType.globalIdTableEntry3FNDX.indexCopyToStart = deserializeLittleEndianInt();
+
+            ExtendedGUID dependent_revision = document.revisionMap.get(document.currentRevision).dependent;
+            for (int i = 0; i < data.subType.globalIdTableEntry3FNDX.entriesToCopy; ++i) {
+                Map<Long, GUID> globalIdMap = document.revisionMap.get(dependent_revision).globalId;
+                GUID compactId = globalIdMap.get(data.subType.globalIdTableEntry3FNDX.indexCopyFromStart + i);
+                if (compactId == null) {
+                    throw new TikaException("COMPACT_ID_MISSING");
+                }
+                document.revisionMap.get(document.currentRevision).globalId.put(data.subType.globalIdTableEntry3FNDX.indexCopyToStart + i
+                        , compactId);
+            }
+        } else if (data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCountFNDX
+                || data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCount2FNDX) {
+            data.subType.objectRevisionWithRefCountFNDX.oid = deserializeCompactID(); // the oid
+
+            if (data.id == FndStructureConstants.CanRevise.ObjectRevisionWithRefCountFNDX) {
+                int ref = deserializeLittleEndianChar();
+
+                data.subType.objectRevisionWithRefCountFNDX.hasOidReferences = ref & 1;
+                data.subType.objectRevisionWithRefCountFNDX.hasOsidReferences = ref & 2;
+                data.subType.objectRevisionWithRefCountFNDX.cRef = (ref >> 2);
+            } else {
+                long ref = deserializeLittleEndianInt();
+
+                data.subType.objectRevisionWithRefCountFNDX.hasOidReferences = ref & 1;
+                data.subType.objectRevisionWithRefCountFNDX.hasOsidReferences = ref & 2;
+                if ((ref >> 2) != 0) {
+                    throw new TikaException("Reserved non-zero");
+                }
+                data.subType.objectRevisionWithRefCountFNDX.cRef = deserializeLittleEndianInt();
+            }
+        } else if (data.id == FndStructureConstants.RootObjectReference2FNDX) {
+            data.subType.rootObjectReference.oidRoot = deserializeCompactID();
+
+            data.idDesc = "oidRoot";
+            data.gosid = data.subType.rootObjectReference.oidRoot.guid;
+            data.subType.rootObjectReference.rootObjectReferenceBase.rootRole = deserializeLittleEndianInt();
+
+            LOG.debug("{}Root role {}", getIndent(),
+                    data.subType.rootObjectReference.rootObjectReferenceBase.rootRole);
+        } else if (data.id == FndStructureConstants.RootObjectReference3FND) {
+            data.idDesc = "oidRoot";
+            data.gosid = deserializeExtendedGUID();
+
+            data.subType.rootObjectReference.rootObjectReferenceBase.rootRole = deserializeLittleEndianInt();
+
+            LOG.debug("{}Root role {}", getIndent(),
+                    data.subType.rootObjectReference.rootObjectReferenceBase.rootRole);
+        } else if (data.id == FndStructureConstants.RevisionRoleDeclarationFND
+                || data.id == FndStructureConstants.RevisionRoleAndContextDeclarationFND) {
+            data.gosid = deserializeExtendedGUID();
+
+            data.subType.revisionRoleDeclaration.revisionRole = deserializeLittleEndianInt();
+
+            if (data.id == FndStructureConstants.RevisionRoleAndContextDeclarationFND) {
+                data.gctxid = deserializeExtendedGUID();
+
+            }
+            document.registerAdditionalRevisionRole(data.gosid,
+                    data.subType.revisionRoleDeclaration.revisionRole,
+                    data.gctxid);
+            // FIXME: deal with ObjectDataEncryptionKey
+        } else if (data.id == FndStructureConstants.ObjectInfoDependencyOverridesFND) {
+            OneNotePtr content = new OneNotePtr(this);
+            if (!data.ref.equals(FileChunkReference.nil())) {
+                content.reposition(data.ref); // otherwise it's positioned right at this node
+            }
+            data.subType.objectInfoDependencyOverrides.data = content.deserializeObjectInfoDependencyOverrideData();
+        } else if (data.id == FndStructureConstants.FileDataStoreListReferenceFND) {
+            // already processed this
+        } else if (data.id == FndStructureConstants.FileDataStoreObjectReferenceFND) {
+            FileChunkReference ref = deserializeFileChunkReference64();
+            GUID guid = deserializeGUID();
+            ExtendedGUID extendedGuid = new ExtendedGUID(guid, 0);
+            LOG.trace("found extended guid {}", extendedGuid);
+            document.guidToRef.put(extendedGuid, ref);
+            OneNotePtr fileDataStorePtr = new OneNotePtr(this);
+            fileDataStorePtr.reposition(data.ref);
+
+            data.subType.fileDataStoreObjectReference.ref = fileDataStorePtr.deserializeFileDataStoreObject();
+
+        } else if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX
+                || data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCount2FNDX
+                || data.id == FndStructureConstants.CanRevise.ObjectDeclaration2RefCountFND
+                || data.id == FndStructureConstants.CanRevise.ObjectDeclaration2LargeRefCountFND
+                || data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND
+                || data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2LargeRefCountFND) {
+            data.subType.objectDeclarationWithRefCount.body.file_data_store_reference =
+                    false;
+            if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX
+                    || data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCount2FNDX) {
+                data.subType.objectDeclarationWithRefCount.body = deserializeObjectDeclarationWithRefCountBody();
+            } else { // one of the other 4 that use the ObjectDeclaration2Body
+                data.subType.objectDeclarationWithRefCount.body = deserializeObjectDeclaration2Body();
+            }
+            if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationWithRefCountFNDX
+                    || data.id == FndStructureConstants.CanRevise.ObjectDeclaration2RefCountFND
+                    || data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND) {
+                long refCnt = deserializeLittleEndianChar();
+                data.subType.objectDeclarationWithRefCount.cRef = refCnt;
+            } else {
+                data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianInt();
+            }
+
+            if (data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2RefCountFND
+                    || data.id == FndStructureConstants.CanRevise.ReadOnlyObjectDeclaration2LargeRefCountFND) {
+                ByteBuffer md5Buffer = ByteBuffer.allocate(16);
+                deserializeBytes(md5Buffer);
+                data.subType.objectDeclarationWithRefCount.readOnly.md5 = md5Buffer.array();
+            }
+            data.idDesc = "oid";
+            postprocessObjectDeclarationContents(data, curPath);
+
+            LOG.debug("{}Ref Count JCID {}", getIndent(),
+                    data.subType.objectDeclarationWithRefCount.body.jcid);
+        } else if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationFileData3RefCountFND
+                || data.id == FndStructureConstants.CanRevise.ObjectDeclarationFileData3LargeRefCountFND) {
+            data.subType.objectDeclarationWithRefCount.body.oid = deserializeCompactID();
+
+            long jcid = deserializeLittleEndianInt();
+
+            data.subType.objectDeclarationWithRefCount.body.jcid.loadFrom32BitIndex(jcid);
+
+            if (data.id == FndStructureConstants.CanRevise.ObjectDeclarationFileData3RefCountFND) {
+                data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianChar();
+            } else {
+                data.subType.objectDeclarationWithRefCount.cRef = deserializeLittleEndianInt();
+            }
+
+            long cch = deserializeLittleEndianInt();
+
+            long roomLeftLong = roomLeft();
+            if (cch > roomLeftLong) { // not a valid guid
+                throw new TikaException("Data out of bounds - cch " + cch + " is > room left = " + roomLeftLong);
+            }
+
+            if (cch > dif.size()) {
+                throw new TikaMemoryLimitException("CCH=" + cch + " was found that was great than file size " + dif.size());
+            }
+            // cch counts 2-byte characters; refuse counts whose byte length does not fit in an int instead of
+            // letting the narrowing cast / multiplication wrap around to a wrong (or negative) buffer size
+            if (cch > Integer.MAX_VALUE / 2) {
+                throw new TikaMemoryLimitException("CCH=" + cch + " is too large to buffer");
+            }
+            ByteBuffer dataSpaceBuffer = ByteBuffer.allocate((int) cch * 2);
+            dif.read(dataSpaceBuffer);
+            byte[] dataSpaceBufferBytes = dataSpaceBuffer.array();
+            offset += dataSpaceBufferBytes.length;
+            if (dataSpaceBufferBytes.length == (IFNDF_GUID_LENGTH * 2 + IFNDF.length) &&
+                    Arrays.equals(IFNDF, Arrays.copyOfRange(dataSpaceBufferBytes, 0, IFNDF.length))) {
+                data.subType.objectDeclarationWithRefCount.body.file_data_store_reference = true;
+                GUID guid = GUID.fromCurlyBraceUTF16Bytes(Arrays.copyOfRange(dataSpaceBufferBytes, IFNDF.length,
+                        dataSpaceBufferBytes.length));
+                ExtendedGUID extendedGUID = new ExtendedGUID(guid, 0);
+                FileChunkReference fileChunk = document.getAssocGuidToRef(extendedGUID);
+                if (fileChunk == null) {
+                    LOG.debug("{} have not seen GUID {} yet", getIndent(), extendedGUID);
+                } else {
+                    // TODO - call postprocessObjectDeclarationContents on this object?
+                }
+            } else {
+                LOG.debug("{}Ignoring an external reference {}", getIndent(), new String(dataSpaceBufferBytes, StandardCharsets.UTF_16LE));
+            }
+        } else if (data.id == FndStructureConstants.ObjectGroupListReferenceFND) {
+            data.idDesc = "object_group_id";
+            data.gosid = deserializeExtendedGUID(); // the object group id
+
+            // the ref populates the FileNodeList children
+        } else if (data.id == FndStructureConstants.DataSignatureGroupDefinitionFND) {
+            // (ObjectGroupStartFND / ObjectGroupEndFND are handled by the first two branches of this chain)
+            data.idDesc = "data_sig";
+            data.gosid = deserializeExtendedGUID(); // the DataSignatureGroup
+
+        } else if (data.id == FndStructureConstants.RevisionManifestListReferenceFND) {
+            document.revisionMap.putIfAbsent(document.currentRevision, new Revision());
+            Revision currentRevision = document.revisionMap.get(document.currentRevision);
+            currentRevision.manifestList.add(curPath);
+        } else {
+            LOG.debug("No fnd needed to be parsed for data.id=0x{} ({})", Long.toHexString(data.id),
+                    FndStructureConstants.nameOf(data.id));
+        }
+        if (data.baseType == 2) {
+            // Generic baseType == 2 parser - means we have children to parse.
+            OneNotePtr subList = new OneNotePtr(this);
+            // position the subList pointer to the data.ref and deserialize recursively.
+            subList.reposition(data.ref);
+            subList.deserializeFileNodeList(data.childFileNodeList, curPath);
+        }
+
+        offset = backup.offset + data.size;
+        end = backup.end;
+
+        // The reserved header bit is expected to be 1. Report a violation as a parse failure of this
+        // document; a parsing library must never terminate the JVM of the application embedding it.
+        if (reserved != 1) {
+            throw new TikaException("RESERVED_NONZERO");
+        }
+
+        if (data.baseType == 1 && !(data.ref.equals(FileChunkReference.nil()))) {
+            document.setAssocGuidToRef(data.gosid, data.ref);
+            OneNotePtr content = new OneNotePtr(this);
+            content.reposition(data.ref);
+            if (data.hasGctxid()) {
+                LOG.debug("{}gctxid {}", getIndent(), data.gctxid);
+            }
+        } else if (!data.gosid.equals(ExtendedGUID.nil())) {
+            LOG.trace("Non base type == 1 guid {}", data.gosid);
+        }
+        --indentLevel;
+        if (data.gosid.equals(ExtendedGUID.nil())) {
+            LOG.debug("{}End Node {} ({}) - Offset={}, End={}", getIndent(), FndStructureConstants.nameOf(data.id), (int) data.id, offset
+                    , end);
+        } else {
+            LOG.debug("{}End Node {} ({}) {}:[{}] - Offset={}, End={}", getIndent(), FndStructureConstants.nameOf(data.id), (int) data.id
+                    , data.idDesc,
+                    data.gosid, offset, end);
+        }
+        return data;
+    }
+
+    /**
+     * Reads from the resource at {@code offset} into the given buffer and moves {@code offset}
+     * to the resource position after the read.
+     *
+     * @param byteBuffer the buffer that receives the bytes
+     * @throws IOException if the resource cannot be positioned or read
+     */
+    private void deserializeBytes(ByteBuffer byteBuffer) throws IOException {
+        long resourcePos = dif.position();
+        if (resourcePos != offset) {
+            dif.position(offset);
+        }
+        dif.read(byteBuffer);
+        this.offset = dif.position();
+    }
+
+    /**
+     * Reads an object declaration body: a compact ID, a packed value obtained through
+     * {@code deserializeLittleEndianInt()} and a reserved value obtained through
+     * {@code deserializeLittleEndianShort()}.
+     * <p>
+     * Layout of the packed value as decoded here: bits 0-9 are the JCID index, bits 10-13 and
+     * bits 14-15 must be zero, bit 16 is the "has OID references" flag, bit 17 is the
+     * "has OSID references" flag, and everything from bit 18 upwards must be zero.
+     *
+     * @return the populated body.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException with message {@code RESERVED_NONZERO} if any must-be-zero part is set.
+     */
+    private ObjectDeclarationWithRefCountBody deserializeObjectDeclarationWithRefCountBody() throws IOException, TikaException {
+        ObjectDeclarationWithRefCountBody body = new ObjectDeclarationWithRefCountBody();
+        body.oid = deserializeCompactID();
+        long packed = deserializeLittleEndianInt();
+        long reservedTail = deserializeLittleEndianShort();
+
+        body.jcid.index = packed & 0x3ffL;
+        body.fHasOidReferences = ((packed >> 16) & 0x1) != 0;
+        body.hasOsidReferences = ((packed >> 17) & 0x1) != 0;
+
+        long zeroBits10to13 = (packed >> 10) & 0xf;
+        long zeroBits14to15 = (packed >> 14) & 0x3;
+        boolean highBitsSet = packed >> 18L > 0;
+        boolean reservedSet = reservedTail != 0 || zeroBits14to15 != 0 || zeroBits10to13 != 0;
+        if (highBitsSet || reservedSet) {
+            throw new TikaException("RESERVED_NONZERO");
+        }
+        return body;
+    }
+
+    /**
+     * Reads the second form of an object declaration body: a compact ID, a 32-bit JCID value
+     * that is handed to {@code JCID.loadFrom32BitIndex}, and a flags value whose bit 0 is the
+     * "has OID references" flag and whose bit 1 is the "has OSID references" flag.
+     *
+     * @return the populated body.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if the compact ID cannot be resolved.
+     */
+    private ObjectDeclarationWithRefCountBody deserializeObjectDeclaration2Body() throws IOException, TikaException {
+        ObjectDeclarationWithRefCountBody body = new ObjectDeclarationWithRefCountBody();
+        body.oid = deserializeCompactID();
+
+        long rawJcid = deserializeLittleEndianInt();
+        body.jcid.loadFrom32BitIndex(rawJcid);
+
+        long referenceFlags = deserializeLittleEndianChar();
+        boolean oidReferences = (referenceFlags & 0x1) != 0;
+        boolean osidReferences = (referenceFlags & 0x2) != 0;
+        body.fHasOidReferences = oidReferences;
+        body.hasOsidReferences = osidReferences;
+        return body;
+    }
+
+    /**
+     * The FileDataStoreObject structure specifies the data for a file data object.
+     * <p>
+     * Read order: a header GUID, a length ({@code deserializeLittleEndianLong}), an unused value
+     * ({@code deserializeLittleEndianInt}), a reserved value ({@code deserializeLittleEndianLong}),
+     * the file data itself (skipped, only its position and length are recorded), padding up to the
+     * next 8-byte boundary, and a footer GUID.
+     * <p>
+     * The length comes straight from the file, so it is validated with an overflow-safe comparison
+     * and rejected when negative; otherwise a hostile value could wrap the bounds check and move
+     * this pointer backwards. The unused and reserved values are rejected when they are anything
+     * other than zero, including negative values.
+     *
+     * @return the object, with {@code fileData.stp} set to the start of the data and
+     *         {@code fileData.cb} set to its length.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if the declared length does not fit before {@code end}, or if the
+     *                       unused/reserved values are non-zero.
+     */
+    private FileDataStoreObject deserializeFileDataStoreObject() throws IOException, TikaException {
+        FileDataStoreObject data = new FileDataStoreObject();
+        // The header GUID is consumed to advance the pointer but is not validated.
+        // TODO - the expected header (FILE_DATA_STORE_OBJ_HEADER) is different per version of one note.
+        deserializeGUID();
+        long len = deserializeLittleEndianLong();
+        long unused = deserializeLittleEndianInt();
+        long reserved = deserializeLittleEndianLong();
+        // Same condition as "offset + len + 16 > end", rearranged so that a huge len cannot
+        // overflow; the extra 16 presumably leaves room for the footer GUID read below.
+        if (len < 0 || len > end - offset - 16) {
+            throw new TikaException("SEGV error");
+        }
+        if (unused != 0 || reserved != 0) {
+            throw new TikaException("SEGV error");
+        }
+        data.fileData.stp = offset;
+        data.fileData.cb = len;
+        offset += len;
+        // Padding is added to the end of the FileData stream to ensure that it ends on an 8-byte boundary.
+        offset = (offset + 7) & ~0x7L;
+        // The footer GUID is consumed to advance the pointer but is not validated.
+        // TODO - the expected footer (FILE_DATA_STORE_OBJ_FOOTER) is per version of one note.
+        deserializeGUID();
+        return data;
+    }
+
+    /**
+     * Reads an ObjectInfoDependencyOverrideData structure: two counts and a CRC value (each via
+     * {@code deserializeLittleEndianInt}), followed by that many 8-bit overrides and then that
+     * many 32-bit overrides. The CRC is read to advance the pointer but is not checked.
+     *
+     * @return the structure with {@code overrides1} and {@code overrides2} filled in read order.
+     * @throws IOException if reading the underlying data fails.
+     */
+    private ObjectInfoDependencyOverrideData deserializeObjectInfoDependencyOverrideData() throws IOException {
+        ObjectInfoDependencyOverrideData overrideData = new ObjectInfoDependencyOverrideData();
+        long eightBitCount = deserializeLittleEndianInt();
+        long thirtyTwoBitCount = deserializeLittleEndianInt();
+        // Consumed only so the pointer moves past it.
+        long crc = deserializeLittleEndianInt();
+
+        for (int n = 0; n < eightBitCount; ++n) {
+            int override8 = deserializeLittleEndianChar();
+            overrideData.overrides1.add(override8);
+        }
+        for (int n = 0; n < thirtyTwoBitCount; ++n) {
+            long override32 = deserializeLittleEndianInt();
+            overrideData.overrides2.add(override32);
+        }
+        return overrideData;
+    }
+
+    /**
+     * Reads a CompactID ({@code n} followed by a 24-bit GUID index) and resolves the index
+     * against the global ID table of the document's current revision.
+     *
+     * @return the compact ID whose {@code guid} carries the resolved GUID and the same {@code n}.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException with message {@code COMPACT ID MISSING} if the index is not present
+     *                       in the current revision's global ID table.
+     */
+    private CompactID deserializeCompactID() throws IOException, TikaException {
+        CompactID result = new CompactID();
+        result.n = deserializeLittleEndianChar();
+        result.guidIndex = deserializeInt24();
+        result.guid = ExtendedGUID.nil();
+        result.guid.n = result.n;
+
+        // Kept as a primitive long so that the lookup boxes to Long, matching the map's key type.
+        long lookupKey = result.guidIndex;
+        Map<Long, GUID> globalIds = document.revisionMap.get(document.currentRevision).globalId;
+        GUID resolved = globalIds.get(lookupKey);
+        if (resolved == null) {
+            throw new TikaException("COMPACT ID MISSING");
+        }
+        result.guid.guid = resolved;
+        return result;
+    }
+
+    /**
+     * Reads three consecutive values through {@code deserializeLittleEndianChar()} and combines
+     * them, in read order, with {@link Int24}.
+     *
+     * @return the value reported by {@code Int24.value()}.
+     * @throws IOException if reading the underlying data fails.
+     */
+    private long deserializeInt24() throws IOException {
+        int first = deserializeLittleEndianChar();
+        int second = deserializeLittleEndianChar();
+        int third = deserializeLittleEndianChar();
+        Int24 combined = new Int24(first, second, third);
+        return combined.value();
+    }
+
+    /**
+     * Reads an ExtendedGUID: a GUID followed by the value {@code n}, which is obtained through
+     * {@code deserializeLittleEndianInt()}.
+     *
+     * @return the extended GUID built from the two parts.
+     * @throws IOException if reading the underlying data fails.
+     */
+    private ExtendedGUID deserializeExtendedGUID() throws IOException {
+        GUID guidPart = deserializeGUID();
+        long nPart = deserializeLittleEndianInt();
+        ExtendedGUID extendedGuid = new ExtendedGUID(guidPart, nPart);
+        return extendedGuid;
+    }
+
+    /**
+     * Depending on stpFormat and cbFormat, will deserialize a FileChunkReference.
+     *
+     * @param stpFormat An unsigned integer that specifies the size and format of the
+     *                  FileNodeChunkReference.stp field specified by the fnd field if this FileNode structure has a
+     *                  value of the BaseType field equal to 1 or 2. MUST be ignored if the value of the BaseType field
+     *                  of this FileNode structure is equal to 0. The meaning of the StpFormat field is given by the
+     *                  following table.
+     *                  Value Meaning
+     *                  0 8 bytes, uncompressed.
+     *                  1 4 bytes, uncompressed.
+     *                  2 2 bytes, compressed.
+     *                  3 4 bytes, compressed.
+     *                  The value of an uncompressed file pointer specifies a location in the file. To uncompress a
+     *                  compressed file pointer, multiply the value by 8.
+     * @param cbFormat  An unsigned integer that specifies the size and format of the
+     *                  FileNodeChunkReference.cb field specified by the fnd field if this FileNode structure has a
+     *                  BaseType field value equal to 1 or 2. MUST be 0 and MUST be ignored if BaseType of this
+     *                  FileNode structure is equal to 0. The meaning of CbFormat is given by the following table.
+     *                  Value Meaning
+     *                  0 4 bytes, uncompressed.
+     *                  1 8 bytes, uncompressed.
+     *                  2 1 byte, compressed.
+     *                  3 2 bytes, compressed.
+     *                  The value of an uncompressed byte count specifies the size, in bytes, of the data referenced by a
+     *                  FileNodeChunkReference structure. To uncompress a compressed byte count,
+     *                  multiply the value by 8.
+     * @return a reference whose {@code stp} and {@code cb} hold the uncompressed position and size.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if either format value is not one of 0, 1, 2 or 3.
+     */
+    FileChunkReference deserializeVarFileChunkReference(long stpFormat, long cbFormat) throws IOException, TikaException {
+        FileChunkReference data = new FileChunkReference(0, 0);
+        // A plain narrowing cast yields the same int as the deprecated new Long(x).intValue().
+        switch ((int) stpFormat) {
+            case 0: // 8 bytes, uncompressed
+                data.stp = deserializeLittleEndianLong();
+                break;
+            case 1: // 4 bytes, uncompressed
+                data.stp = deserializeLittleEndianInt();
+                break;
+            case 2: // 2 bytes, compressed
+                data.stp = deserializeLittleEndianShort();
+                data.stp <<= 3; // uncompress: multiply by 8
+                break;
+            case 3: // 4 bytes, compressed
+                data.stp = deserializeLittleEndianInt();
+                data.stp <<= 3; // uncompress: multiply by 8
+                break;
+            default:
+                throw new TikaException("Unknown STP file node format " + stpFormat);
+        }
+        switch ((int) cbFormat) {
+            case 0: // 4 bytes, uncompressed
+                data.cb = deserializeLittleEndianInt();
+                break;
+            case 1: // 8 bytes, uncompressed
+                data.cb = deserializeLittleEndianLong();
+                break;
+            case 2: // 1 byte, compressed
+                data.cb = deserializeLittleEndianChar();
+                data.cb <<= 3; // uncompress: multiply by 8
+                break;
+            case 3: // 2 bytes, compressed
+                data.cb = deserializeLittleEndianShort();
+                data.cb <<= 3; // uncompress: multiply by 8
+                break;
+            default:
+                throw new TikaException("Unknown CB file node format " + cbFormat);
+        }
+        return data;
+    }
+
+    /**
+     * Reads a FileNodeListHeader at the current offset: the magic value
+     * ({@code deserializeLittleEndianLong}), then the file node list ID and the fragment
+     * sequence number (each via {@code deserializeLittleEndianInt}).
+     *
+     * @return the header, which also records the offset at which it started.
+     * @throws IOException if reading the underlying data fails.
+     */
+    FileNodeListHeader deserializeFileNodeListHeader() throws IOException {
+        // Captured before any read so it points at the first byte of the header.
+        long headerStart = offset;
+        long magic = deserializeLittleEndianLong();
+        long listId = deserializeLittleEndianInt();
+        long fragmentSequence = deserializeLittleEndianInt();
+        FileNodeListHeader header = new FileNodeListHeader(headerStart, magic, listId, fragmentSequence);
+        return header;
+    }
+
+    /**
+     * Processes the contents of an object declaration file node once all of its fnd variables
+     * have been parsed.
+     * <p>
+     * The node's gosid is taken from the declaration's compact ID and the node is registered in
+     * the document's GUID-to-object map. If the JCID marks the object as a property set, the
+     * property set is deserialized from the referenced chunk; otherwise the JCID must mark it as
+     * file data, in which case the node is flagged as file data and, at debug level, its raw
+     * content is dumped.
+     *
+     * @param data   The FileNode containing all the fnd variable's data.
+     * @param curPtr The current pointer.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if the JCID is neither a property set nor file data, or if the
+     *                       property set cannot be deserialized.
+     */
+    private void postprocessObjectDeclarationContents(FileNode data, FileNodePtr curPtr) throws IOException, TikaException {
+        data.gosid = data.subType.objectDeclarationWithRefCount.body.oid.guid;
+        document.guidToObject.put(data.gosid, new FileNodePtr(curPtr));
+
+        boolean isPropertySet = data.subType.objectDeclarationWithRefCount.body.jcid.isObjectSpaceObjectPropSet();
+        if (!isPropertySet) {
+            if (!data.subType.objectDeclarationWithRefCount.body.jcid.isFileData) {
+                throw new TikaException("JCID must be file data when !isObjectSpaceObjectPropSet.");
+            }
+            // this is FileData
+            data.isFileData = true;
+            if (LOG.isDebugEnabled()) {
+                OneNotePtr rawContent = new OneNotePtr(this);
+                rawContent.reposition(data.ref);
+                LOG.debug("{}Raw:", getIndent());
+                rawContent.dumpHex();
+                LOG.debug("");
+            }
+            return;
+        }
+
+        // Work on a copy of this pointer so the caller's position is left untouched.
+        OneNotePtr propSetPtr = new OneNotePtr(this);
+        propSetPtr.reposition(data.ref);
+        data.subType.objectDeclarationWithRefCount.objectRef = propSetPtr.deserializeObjectSpaceObjectPropSet();
+        ObjectStreamCounters counters = new ObjectStreamCounters();
+        data.propertySet = propSetPtr.deserializePropertySet(counters,
+                data.subType.objectDeclarationWithRefCount.objectRef);
+    }
+
+    /**
+     * Reads a property set: a count ({@code deserializeLittleEndianShort}), then that many
+     * property IDs, then one property value per ID in the same order.
+     *
+     * @param counters stream counters passed through to the property value deserialization.
+     * @param streams  the OID/OSID/context ID streams that property values may draw from.
+     * @return the property set with one entry per property ID.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if a property value cannot be deserialized.
+     */
+    private PropertySet deserializePropertySet(ObjectStreamCounters counters, ObjectSpaceObjectPropSet streams) throws IOException,
+            TikaException {
+        PropertySet propertySet = new PropertySet();
+        long count = deserializeLittleEndianShort();
+
+        // Pre-fill with placeholders so every slot exists before the IDs are read.
+        List<PropertyValue> values = Stream.generate(PropertyValue::new)
+                .limit((int) count)
+                .collect(Collectors.toList());
+        propertySet.rgPridsData = values;
+
+        // First pass: all property IDs are stored ahead of the values.
+        for (int slot = 0; slot < count; ++slot) {
+            PropertyValue placeholder = values.get(slot);
+            placeholder.propertyId = deserializePropertyID();
+            LOG.debug("{}Property {}", getIndent(), placeholder.propertyId);
+        }
+        LOG.debug("{}{} elements in property set:", getIndent(), count);
+
+        // Second pass: replace each placeholder with the value decoded for its ID.
+        for (int slot = 0; slot < count; ++slot) {
+            OneNotePropertyId id = values.get(slot).propertyId;
+            values.set(slot, deserializePropertyValueFromPropertyID(id, streams, counters));
+        }
+        LOG.debug("");
+        return propertySet;
+    }
+
+ /**
+ * Deserializes the value that belongs to the given property ID; how much is read, and from
+ * where, is selected by {@code propertyID.type}:
+ * <ul>
+ * <li>0x1 - nothing is read.</li>
+ * <li>0x2 - nothing is read; the scalar is taken from {@code propertyID.inlineBool}.</li>
+ * <li>0x3 to 0x6 - a scalar read via deserializeLittleEndianChar/Short/Int/Long.</li>
+ * <li>0x7 - a length followed by raw data; only the position and length are recorded.</li>
+ * <li>0x8 to 0xd - one (even types) or a counted number (odd types) of compact IDs taken from
+ * the OID, OSID or context ID stream.</li>
+ * <li>0x10 - a count and a property ID, followed by that many nested values of that ID.</li>
+ * <li>0x11 - a nested property set.</li>
+ * </ul>
+ * The indent level used for debug output is raised for the duration of the call and always
+ * restored.
+ *
+ * @param propertyID the property ID whose value is to be read.
+ * @param streams the OID/OSID/context ID streams that reference-typed properties draw from.
+ * @param counters the current read positions within those streams.
+ * @return the deserialized property value.
+ * @throws IOException if reading the underlying data fails.
+ * @throws TikaException if the type is unknown, raw data would extend past {@code end}, or a
+ * referenced stream has too few entries.
+ */
+ private PropertyValue deserializePropertyValueFromPropertyID(OneNotePropertyId propertyID, ObjectSpaceObjectPropSet streams,
+ ObjectStreamCounters counters) throws IOException, TikaException {
+ PropertyValue data = new PropertyValue();
+ data.propertyId = propertyID;
+ char val8;
+ long val16;
+ long val32 = 0;
+ long val64;
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("\n{}<{}", getIndent(), propertyID);
+ }
+
+ ++indentLevel;
+ try {
+ long type = propertyID.type;
+ switch ((int) type) {
+ case 0x1:
+ // No data follows for this type. Returns directly, so the closing ">" is not logged.
+ LOG.debug(" [] ");
+ return data;
+ case 0x2:
+ // The boolean is carried inside the property ID itself, nothing is read.
+ LOG.debug(" PropertyID bool({})", propertyID.inlineBool);
+ data.scalar = propertyID.inlineBool ? 1 : 0;
+ return data;
+ case 0x3:
+ val8 = deserializeLittleEndianChar();
+ data.scalar = val8;
+ LOG.debug(" PropertyID byte({})", data.scalar);
+ break;
+ case 0x4:
+ val16 = deserializeLittleEndianShort();
+ data.scalar = val16;
+ LOG.debug(" uint16 PropertyID short({})", data.scalar);
+ break;
+ case 0x5:
+ val32 = deserializeLittleEndianInt();
+ data.scalar = val32;
+ LOG.debug(" PropertyID int({})", data.scalar);
+ break;
+ case 0x6:
+ val64 = deserializeLittleEndianLong();
+ data.scalar = val64;
+ LOG.debug(" PropertyID long({})", data.scalar);
+ break;
+ case 0x7:
+ // If the value of the PropertyID.type element is "0x7" and the property specifies an array of elements, the value of
+ // the
+ // prtFourBytesOfLengthFollowedByData.cb element MUST be the sum of the sizes, in bytes, of each element in the array.
+ // Exceptions include:
+ // * The RgOutlineIndentDistance element, where the value of the prtFourBytesOfLengthFollowedByData.cb element
+ // MUST be: 4 + (4 × RgOutlineIndentDistance.count).
+ // * The TableColumnsLocked element, where the value of the prtFourBytesOfLengthFollowedByData.cb
+ // element MUST be: 1 + (TableColumnsLocked.cColumns + 7) / 8.
+ // * The TableColumnWidths element, where the value of the prtFourBytesOfLengthFollowedByData.cb
+ // element MUST be: 1 + (4 × TableColumnWidths.cColumns).
+
+ val32 = deserializeLittleEndianInt();
+ LOG.debug(" raw data: ({})[", val32);
+ {
+ // The raw bytes are not copied; only their position and length are recorded and skipped.
+ data.rawData.stp = offset;
+ data.rawData.cb = 0;
+ if (offset + val32 > end) {
+ // Clamp to what is available before failing.
+ data.rawData.cb = end - offset;
+ offset = end;
+ throw new TikaException("Offset is past end of file.");
+ }
+ data.rawData.cb = val32;
+ offset += val32;
+ if (LOG.isDebugEnabled()) {
+ OneNotePtr content = new OneNotePtr(this);
+ content.reposition(data.rawData);
+ content.dumpHex();
+ }
+ }
+ LOG.debug("]");
+ break;
+ case 0x9:
+ case 0xb:
+ case 0xd:
+ // Array forms: the number of IDs is read from the data.
+ val32 = deserializeLittleEndianInt();
+ // fallthrough
+ case 0x8:
+ case 0xa:
+ case 0xc:
+ // Single-ID forms: exactly one ID. The type is re-checked because the array forms fall through to here.
+ if (type == 0x8 || type == 0xa
+ || type == 0xc) {
+ val32 = 1;
+ }
+ {
+ // Default to the context ID stream (types 0xc/0xd); switched below for OIDs and OSIDs.
+ List<CompactID> stream = streams.contextIDs.data;
+ String xtype = "contextID";
+ long s_count = counters.context_ids_count;
+ if (type == 0x8 || type == 0x9) {
+ stream = streams.oids.data;
+ s_count = counters.oids_count;
+ xtype = "OIDs";
+ }
+ if (type == 0xa || type == 0xb) {
+ stream = streams.osids.data;
+ s_count = counters.osids_count;
+ xtype = "OSIDS";
+ }
+ // NOTE(review): s_count is a local copy and is never written back to counters in this
+ // method, so later properties start from the same stream index again - confirm intended.
+ for (int i = 0; i < val32; ++i, ++s_count) {
+ int index = (int) s_count;
+ if (index < stream.size()) {
+ data.compactIDs.add(stream.get(index));
+ LOG.debug(" {}[{}]", xtype,
+ data.compactIDs.get(data.compactIDs.size() - 1));
+ } else {
+ throw new TikaException("SEGV");
+ }
+ }
+ }
+ break;
+ case 0x10:
+ val32 = deserializeLittleEndianInt();
+ {
+ OneNotePropertyId propId = deserializePropertyID();
+ LOG.debug(" UnifiedSubPropertySet {} {}", val32, propId);
+ // NOTE(review): val32 comes from the file and is not bounded before the (int) cast and
+ // the list pre-allocation - confirm a limit is not needed here.
+ data.propertySet.rgPridsData = Stream.generate(PropertyValue::new)
+ .limit((int) val32)
+ .collect(Collectors.toList());
+ for (int i = 0; i < val32; ++i) {
+ try {
+ data.propertySet.rgPridsData.set(i, deserializePropertyValueFromPropertyID(propId, streams, counters));
+ } catch (IOException e) {
+ // NOTE(review): the IOException is swallowed and a partially filled value is
+ // returned; remaining entries stay as empty placeholders - confirm intended.
+ return data;
+ }
+ }
+ }
+ break;
+ case 0x11:
+ LOG.debug(" SubPropertySet");
+ data.propertySet = deserializePropertySet(counters, streams);
+ break;
+ default:
+ throw new TikaException("Invalid type: " + type);
+ }
+ LOG.debug(">");
+ return data;
+ } finally {
+ // Runs on every exit path, including the early returns and exceptions above.
+ --indentLevel;
+ }
+ }
+
+    /**
+     * Reads a property ID value through {@code deserializeLittleEndianInt()} and wraps it in a
+     * {@link OneNotePropertyId}.
+     *
+     * @return the property ID built from the value read.
+     * @throws IOException if reading the underlying data fails.
+     */
+    private OneNotePropertyId deserializePropertyID() throws IOException {
+        long rawPropertyId = deserializeLittleEndianInt();
+        OneNotePropertyId propertyId = new OneNotePropertyId(rawPropertyId);
+        return propertyId;
+    }
+
+    /**
+     * Reads the ID streams of an ObjectSpaceObjectPropSet. The OID stream is always read; its
+     * header flags then decide whether an OSID stream and a context ID stream follow.
+     *
+     * @return the structure holding the streams that were present; absent streams keep the
+     *         header flags assigned at the top of this method.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if a compact ID in one of the streams cannot be resolved.
+     */
+    private ObjectSpaceObjectPropSet deserializeObjectSpaceObjectPropSet() throws IOException, TikaException {
+        ObjectSpaceObjectPropSet propSet = new ObjectSpaceObjectPropSet();
+        // Flags for the optional streams, used when the corresponding stream is not read.
+        propSet.osids.extendedStreamsPresent = 0;
+        propSet.osids.osidsStreamNotPresent = 1;
+        propSet.contextIDs.extendedStreamsPresent = 0;
+        propSet.contextIDs.osidsStreamNotPresent = 0;
+
+        propSet.oids = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+
+        boolean osidStreamFollows = propSet.oids.osidsStreamNotPresent == 0;
+        if (osidStreamFollows) {
+            propSet.osids = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+        }
+
+        // Note: this tests the flag of the OID stream header, also when an OSID stream was read.
+        boolean contextStreamFollows = propSet.oids.extendedStreamsPresent != 0;
+        if (contextStreamFollows) {
+            propSet.contextIDs = deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+        }
+        return propSet;
+    }
+
+    /**
+     * Reads one stream of compact IDs: a header value ({@code deserializeLittleEndianInt})
+     * followed by {@code count} compact IDs.
+     * <p>
+     * Header layout as decoded here: the low 24 bits are the count, bit 30 is the "extended
+     * streams present" flag and bit 31 is the "OSIDs stream not present" flag.
+     *
+     * @return the stream header fields together with the compact IDs in read order.
+     * @throws IOException   if reading the underlying data fails.
+     * @throws TikaException if a compact ID cannot be resolved.
+     */
+    private ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs deserializeObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs() throws IOException
+            , TikaException {
+        ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs idStream = new ObjectSpaceObjectStreamOfOIDsOSIDsOrContextIDs();
+        long streamHeader = deserializeLittleEndianInt();
+        idStream.count = streamHeader & 0xffffff;
+        idStream.osidsStreamNotPresent = ((streamHeader >> 31) & 0x1);
+        idStream.extendedStreamsPresent = ((streamHeader >> 30) & 0x1);
+        if (LOG.isDebugEnabled()) {
+            LOG.debug(
+                    "{}Deserialized Stream Header count: {} OsidsNotPresent {} Extended {}",
+                    getIndent(), idStream.count,
+                    idStream.osidsStreamNotPresent,
+                    idStream.extendedStreamsPresent);
+        }
+        int read = 0;
+        while (read < idStream.count) {
+            idStream.data.add(deserializeCompactID());
+            ++read;
+        }
+        return idStream;
+    }
+
+    /**
+     * @return the distance from the current offset to {@code end}.
+     */
+    long roomLeft() {
+        long remaining = end - offset;
+        return remaining;
+    }
+
+ public void dumpHex() throws TikaMemoryLimitException, IOException {
+ if (end - offset > dif.size()) {
+ throw new TikaMemoryLimitException("Exceeded memory limit when trying to dumpHex - " + (end - offset) + " > " + dif.size());
+ }
+ ByteBuffer byteBuffer = ByteBuffer.allocate((int) (end - offset));
+ LOG.debug(Hex.encodeHexString(byteBuffer.array()));
+ }
+
+ public int size() {
+ return (int) (end - offset);
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
new file mode 100644
index 0000000..14b3745
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -0,0 +1,579 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.Month;
+import java.time.ZoneOffset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Walk the one note tree and create a Map while it goes.
+ * Also writes user input text to a print writer as it parses.
+ */
+class OneNoteTreeWalker {
+
+    private static final String P = "p";
+
+    /**
+     * Matches text that consists of the character U+FDDF followed by {@code HYPERLINK},
+     * whitespace and a double-quoted string (group 1), and then trailing text without double
+     * quotes up to the end of the input (group 2).
+     */
+    private static final Pattern HYPERLINK_PATTERN = Pattern.compile("\uFDDFHYPERLINK\\s+\"([^\"]+)\"([^\"]+)$");
+
+    /**
+     * See spec MS-ONE - 2.3.1 - TIME32 - epoch of jan 1 1980 UTC.
+     * This is the number of seconds between that epoch and {@link Instant#EPOCH}.
+     */
+    private static final long TIME32_EPOCH_DIFF_1980 =
+            LocalDateTime.of(1980, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);
+
+    /**
+     * See spec MS-DTYP - 2.3.3 - DATETIME dates are based on epoch of jan 1 1601 UTC.
+     * This is the (negative) number of seconds between that epoch and {@link Instant#EPOCH}.
+     */
+    private static final long DATETIME_EPOCH_DIFF_1601 =
+            LocalDateTime.of(1601, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);
+
+    private OneNoteTreeWalkerOptions options;
+    private OneNoteDocument oneNoteDocument;
+    private OneNoteDirectFileResource dif;
+    private XHTMLContentHandler xhtml;
+    private Pair<Long, ExtendedGUID> roleAndContext;
+    private final Metadata parentMetadata;
+    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+    private final Set<String> authors = new HashSet<>();
+    private final Set<String> mostRecentAuthors = new HashSet<>();
+    private final Set<String> originalAuthors = new HashSet<>();
+    // Running extremes, seeded so that the first timestamp seen always replaces them.
+    private Instant lastModifiedTimestamp = Instant.MIN;
+    private long creationTimestamp = Long.MAX_VALUE;
+    private long lastModified = Long.MIN_VALUE;
+    // Set when an AuthorMostRecent / AuthorOriginal property has just been seen, so that the
+    // next Author property can be attributed accordingly.
+    private boolean mostRecentAuthorProp = false;
+    private boolean originalAuthorProp = false;
+
+    /**
+     * Creates a tree walker for one OneNote document.
+     *
+     * @param options         The options for how to walk this tree.
+     * @param oneNoteDocument The one note document we want to walk.
+     * @param dif             The random file access structure we read and reposition while extracting the content.
+     * @param xhtml           The XHTMLContentHandler to populate as you walk the tree.
+     * @param parentMetadata  The metadata of the containing document; embedded stream exceptions are recorded here.
+     * @param parseContext    The parse context from which the embedded document extractor is obtained.
+     * @param roleAndContext  The role and context value we want to use when crawling. Set this to null if you are
+     *                        crawling all root file nodes, and don't care about revisions.
+     */
+    public OneNoteTreeWalker(OneNoteTreeWalkerOptions options, OneNoteDocument oneNoteDocument,
+                             OneNoteDirectFileResource dif, XHTMLContentHandler xhtml,
+                             Metadata parentMetadata, ParseContext parseContext, Pair<Long, ExtendedGUID> roleAndContext) {
+        // What to walk and how.
+        this.oneNoteDocument = oneNoteDocument;
+        this.options = options;
+        this.roleAndContext = roleAndContext;
+        // Where to read from and where to write to.
+        this.dif = dif;
+        this.xhtml = xhtml;
+        this.parentMetadata = parentMetadata;
+        this.embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+    }
+
+    /**
+     * Walks the whole document.
+     *
+     * @return Map with the document header under {@code "header"} and the walked root file
+     *         nodes under {@code "rootFileNodes"}.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if the document content cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    public Map<String, Object> walkTree() throws IOException, TikaException, SAXException {
+        Map<String, Object> tree = new HashMap<>();
+        tree.put("header", oneNoteDocument.header);
+        List<Map<String, Object>> rootFileNodes = walkRootFileNodes();
+        tree.put("rootFileNodes", rootFileNodes);
+        return tree;
+    }
+
+    /**
+     * Walks the root file nodes. Depending on the options this either walks the complete root
+     * file node list, or walks each revision manifest list in revision list order.
+     *
+     * @return List of the root file nodes.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if the document content cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    public List<Map<String, Object>> walkRootFileNodes() throws IOException, TikaException, SAXException {
+        List<Map<String, Object>> roots = new ArrayList<>();
+        if (options.isCrawlAllFileNodesFromRoot()) {
+            roots.add(walkFileNodeList(oneNoteDocument.root));
+            return roots;
+        }
+        for (ExtendedGUID revisionListGuid : oneNoteDocument.revisionListOrder) {
+            Map<String, Object> revision = new HashMap<>();
+            revision.put("oneNoteType", "Revision");
+            revision.put("revisionListGuid", revisionListGuid.toString());
+            FileNodePtr manifestListPtr = oneNoteDocument.revisionManifestLists.get(revisionListGuid);
+            revision.put("fileNode", walkRevision(manifestListPtr));
+            roots.add(revision);
+        }
+        return roots;
+    }
+
+    /**
+     * Checks whether the document's revision role map associates the given revision with
+     * exactly the given role.
+     *
+     * @param rid          The revision id.
+     * @param revisionRole The revision role Long,GUID pair.
+     * @return True if the revision is mapped and its role equals {@code revisionRole}, false otherwise.
+     */
+    private boolean hasRevisionRole(ExtendedGUID rid, Pair<Long, ExtendedGUID> revisionRole) {
+        Pair<Long, ExtendedGUID> registeredRole = oneNoteDocument.revisionRoleMap.get(rid);
+        if (registeredRole == null) {
+            return false;
+        }
+        return registeredRole.equals(revisionRole);
+    }
+
+    /**
+     * Walks the revisions found below a revision manifest list.
+     * <p>
+     * First the revisions whose role matches {@code roleAndContext} are collected, scanning the
+     * children from last to first (and stopping at the first match when only the latest revision
+     * is wanted). Then the children are scanned in order: every revision manifest start node
+     * switches walking on or off depending on whether its revision was collected, and while
+     * walking is on, root object references with root role 1 are resolved and walked.
+     *
+     * @param fileNodePtr The file node pointer to start with.
+     * @return A map of the parsed data.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if the document content cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    private Map<String, Object> walkRevision(FileNodePtr fileNodePtr) throws IOException, TikaException, SAXException {
+        Map<String, Object> revisionMap = new HashMap<>();
+        revisionMap.put("oneNoteType", "FileNodePointer");
+        revisionMap.put("offsets", fileNodePtr.nodeListPositions);
+        FileNode revisionFileNode = fileNodePtr.dereference(oneNoteDocument);
+        revisionMap.put("fileNodeId", revisionFileNode.id);
+        if (revisionFileNode.gosid != null) {
+            revisionMap.put("gosid", revisionFileNode.gosid.toString());
+        }
+        revisionMap.put("subType", revisionFileNode.subType);
+        revisionMap.put("size", revisionFileNode.size);
+        revisionMap.put("isFileData", revisionFileNode.isFileData);
+
+        List<FileNode> manifestNodes = revisionFileNode.childFileNodeList.children;
+
+        // Pass 1: newest first, collect the revisions that carry the requested role.
+        Set<ExtendedGUID> acceptedRevisions = new HashSet<>();
+        if (roleAndContext != null) {
+            int position = manifestNodes.size() - 1;
+            while (position >= 0) {
+                FileNode candidate = manifestNodes.get(position);
+                if (hasRevisionRole(candidate.gosid, roleAndContext)) {
+                    acceptedRevisions.add(candidate.gosid);
+                    if (options.isOnlyLatestRevision()) {
+                        break;
+                    }
+                }
+                --position;
+            }
+        }
+
+        // Pass 2: in file order, walk the root objects of the accepted revisions only.
+        List<Map<String, Object>> walkedChildren = new ArrayList<>();
+        boolean insideAcceptedRevision = false;
+        for (FileNode node : manifestNodes) {
+            boolean startsRevisionManifest = node.id == FndStructureConstants.RevisionManifestStart4FND
+                    || node.id == FndStructureConstants.RevisionManifestStart6FND
+                    || node.id == FndStructureConstants.RevisionManifestStart7FND;
+            if (startsRevisionManifest) {
+                insideAcceptedRevision = acceptedRevisions.contains(node.gosid);
+            }
+            if (!insideAcceptedRevision) {
+                continue;
+            }
+            boolean isRootObjectReference = node.id == FndStructureConstants.RootObjectReference2FNDX
+                    || node.id == FndStructureConstants.RootObjectReference3FND;
+            if (isRootObjectReference
+                    && node.subType.rootObjectReference.rootObjectReferenceBase.rootRole == 1) {
+                FileNodePtr rootObjectPtr = oneNoteDocument.guidToObject.get(node.gosid);
+                walkedChildren.add(walkFileNodePtr(rootObjectPtr));
+            }
+        }
+
+        if (!walkedChildren.isEmpty()) {
+            Map<String, Object> fileNodeListMap = new HashMap<>();
+            fileNodeListMap.put("fileNodeListHeader", revisionFileNode.childFileNodeList.fileNodeListHeader);
+            fileNodeListMap.put("children", walkedChildren);
+            revisionMap.put("revisionFileNodeList", fileNodeListMap);
+        }
+        return revisionMap;
+    }
+
+    /**
+     * Dereferences a file node pointer and walks the file node it points to.
+     *
+     * @param fileNodePtr The file node pointer, may be null.
+     * @return Returns a map of the main data, or an empty map when the pointer is null.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if the document content cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr) throws IOException, TikaException, SAXException {
+        if (fileNodePtr == null) {
+            return Collections.emptyMap();
+        }
+        FileNode target = fileNodePtr.dereference(oneNoteDocument);
+        return walkFileNode(target);
+    }
+
+    /**
+     * Walks every file node of a file node list.
+     *
+     * @param fileNodeList The file node list to parse.
+     * @return A map with the list header and, when the list is not empty, the walked children
+     *         under {@code "children"}.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if the document content cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList) throws IOException, TikaException, SAXException {
+        Map<String, Object> listMap = new HashMap<>();
+        listMap.put("oneNoteType", "FileNodeList");
+        listMap.put("fileNodeListHeader", fileNodeList.fileNodeListHeader);
+        if (fileNodeList.children.isEmpty()) {
+            return listMap;
+        }
+        List<Map<String, Object>> walked = new ArrayList<>();
+        for (FileNode node : fileNodeList.children) {
+            walked.add(walkFileNode(node));
+        }
+        listMap.put("children", walked);
+        return listMap;
+    }
+
+ /**
+ * Walk a single file node.
+ *
+ * @param fileNode The file node.
+ * @return Map which is result of the parsed file node.
+ * @throws IOException Can throw these when manipulating the seekable byte channel.
+ */
+ public Map<String, Object> walkFileNode(FileNode fileNode) throws IOException, TikaException, SAXException {
+ Map<String, Object> structure = new HashMap<>();
+ structure.put("oneNoteType", "FileNode");
+ structure.put("gosid", fileNode.gosid.toString());
+ structure.put("size", fileNode.size);
+ structure.put("fileNodeId", "0x" + Long.toHexString(fileNode.id));
+ structure.put("fileNodeIdName", FndStructureConstants.nameOf(fileNode.id));
+ structure.put("fileNodeBaseType", "0x" + Long.toHexString(fileNode.baseType));
+ structure.put("isFileData", fileNode.isFileData);
+ structure.put("idDesc", fileNode.idDesc);
+ if (fileNode.childFileNodeList != null && fileNode.childFileNodeList.fileNodeListHeader != null) {
+ structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList));
+ }
+ if (fileNode.propertySet != null) {
+ List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet);
+ if (!propSet.isEmpty()) {
+ structure.put("propertySet", propSet);
+ }
+ }
+ if (fileNode.subType.fileDataStoreObjectReference.ref != null &&
+ !FileChunkReference.nil().equals(fileNode.subType.fileDataStoreObjectReference.ref.fileData)) {
+ structure.put("fileDataStoreObjectReference",
+ walkFileDataStoreObjectReference(fileNode.subType.fileDataStoreObjectReference));
+ }
+ return structure;
+ }
+
+    /**
+     * Walk a file data store object reference: positions the file resource at the referenced
+     * file data and hands that data to the embedded document extractor.
+     * <p>
+     * The declared size comes from the file, so it is validated before it is narrowed to an
+     * {@code int} for buffer allocation: it must not be negative, must not exceed the document
+     * size and must fit in a single byte array.
+     *
+     * @param fileDataStoreObjectReference The file data store object reference we are parsing.
+     * @return Map containing parsed content.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws SAXException  if writing to the content handler fails.
+     * @throws TikaException if the declared size is negative; a {@link TikaMemoryLimitException}
+     *                       if it exceeds the document size or {@code Integer.MAX_VALUE}.
+     */
+    private Map<String, Object> walkFileDataStoreObjectReference(
+            FileDataStoreObjectReference fileDataStoreObjectReference) throws IOException, SAXException, TikaException {
+        Map<String, Object> structure = new HashMap<>();
+        OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
+        content.reposition(fileDataStoreObjectReference.ref.fileData);
+        long cb = fileDataStoreObjectReference.ref.fileData.cb;
+        if (cb < 0) {
+            // Would otherwise surface as an unchecked IllegalArgumentException from ByteBuffer.allocate.
+            throw new TikaException("File data store cb is negative: " + cb);
+        }
+        if (cb > dif.size()) {
+            throw new TikaMemoryLimitException("File data store cb " + cb +
+                    " exceeds document size: " + dif.size());
+        }
+        if (cb > Integer.MAX_VALUE) {
+            // The (int) cast below would otherwise wrap and read the wrong number of bytes.
+            throw new TikaMemoryLimitException("File data store cb " + cb +
+                    " exceeds maximum supported size: " + Integer.MAX_VALUE);
+        }
+        handleEmbedded((int) cb);
+        structure.put("fileDataStoreObjectMetadata", fileDataStoreObjectReference);
+        return structure;
+    }
+
+    /**
+     * Reads {@code length} bytes from the file resource at its current position and passes them
+     * to the embedded document extractor, then emits an empty {@code div} with class
+     * {@code embedded}.
+     * <p>
+     * An {@link IOException} while reading the bytes is recorded in the parent metadata and the
+     * embedded document is skipped.
+     *
+     * @param length number of bytes to read for the embedded document.
+     * @throws TikaException if the embedded document extractor fails.
+     * @throws IOException   if the embedded document extractor fails with an I/O error.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    private void handleEmbedded(int length) throws TikaException, IOException, SAXException {
+        ByteBuffer payload;
+        try {
+            payload = ByteBuffer.allocate(length);
+            dif.read(payload);
+        } catch (IOException e) {
+            //store this exception in the parent's metadata
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+            return;
+        }
+
+        Metadata embeddedMetadata = new Metadata();
+        TikaInputStream embeddedStream = null;
+        try {
+            embeddedStream = TikaInputStream.get(payload.array());
+            embeddedDocumentExtractor.parseEmbedded(
+                    embeddedStream,
+                    new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+            AttributesImpl divAttributes = new AttributesImpl();
+            divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            xhtml.startElement("div", divAttributes);
+            xhtml.endElement("div");
+        } finally {
+            // Close failures are deliberately ignored here.
+            IOUtils.closeQuietly(embeddedStream);
+        }
+    }
+
+    /**
+     * Processes every property value of a property set, in order.
+     *
+     * @param propertySet the property set whose values are processed.
+     * @return one map per property value, in the order of the property set.
+     * @throws IOException   Can throw these when manipulating the seekable byte channel.
+     * @throws TikaException if a property value cannot be processed.
+     * @throws SAXException  if writing to the content handler fails.
+     */
+    private List<Map<String, Object>> processPropertySet(PropertySet propertySet) throws IOException, TikaException,
+            SAXException {
+        List<Map<String, Object>> processed = new ArrayList<>();
+        for (PropertyValue value : propertySet.rgPridsData) {
+            Map<String, Object> valueMap = processPropertyValue(value);
+            processed.add(valueMap);
+        }
+        return processed;
+    }
+
+    /**
+     * Tells whether a property is one of the properties this walker treats as binary:
+     * {@code RgOutlineIndentDistance}, {@code NotebookManagementEntityGuid} or
+     * {@code RichEditTextUnicode}.
+     *
+     * @param property The property, may be null.
+     * @return True for the properties listed above, false for anything else (including null).
+     */
+    private boolean propertyIsBinary(OneNotePropertyEnum property) {
+        // Identity comparisons (rather than a switch) keep a null property from throwing.
+        if (property == OneNotePropertyEnum.RgOutlineIndentDistance) {
+            return true;
+        }
+        if (property == OneNotePropertyEnum.NotebookManagementEntityGuid) {
+            return true;
+        }
+        return property == OneNotePropertyEnum.RichEditTextUnicode;
+    }
+
+ /**
+  * Process a property value and populate a map containing all the property value data.
+  * <p>
+  * Timestamp and author properties update the metadata aggregated by this walker. Textual properties are
+  * decoded into the returned map and, where applicable, written to the XHTML output as paragraphs.
+  *
+  * @param propertyValue The property value we are parsing.
+  * @return The map parsed by this property value.
+  * @throws IOException   Can throw these when manipulating the seekable byte channel.
+  * @throws TikaException If the declared property size exceeds the size of the document.
+  * @throws SAXException  If writing to the XHTML output fails.
+  */
+ private Map<String, Object> processPropertyValue(PropertyValue propertyValue) throws IOException, TikaException,
+         SAXException {
+     Map<String, Object> propMap = new HashMap<>();
+     propMap.put("oneNoteType", "PropertyValue");
+     propMap.put("propertyId", propertyValue.propertyId.toString());
+
+     OneNotePropertyEnum propertyEnum = propertyValue.propertyId.propertyEnum;
+     if (propertyEnum == OneNotePropertyEnum.LastModifiedTimeStamp) {
+         long fullval = propertyValue.scalar;
+         Instant instant = Instant.ofEpochSecond(fullval / 10000000 + DATETIME_EPOCH_DIFF_1601);
+         if (instant.isAfter(lastModifiedTimestamp)) {
+             lastModifiedTimestamp = instant;
+         }
+     } else if (propertyEnum == OneNotePropertyEnum.CreationTimeStamp) {
+         // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not 1970
+         long creationTs = propertyValue.scalar + TIME32_EPOCH_DIFF_1980;
+         if (creationTs < creationTimestamp) {
+             creationTimestamp = creationTs;
+         }
+     } else if (propertyEnum == OneNotePropertyEnum.LastModifiedTime) {
+         // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not 1970
+         long lastMod = propertyValue.scalar + TIME32_EPOCH_DIFF_1980;
+         if (lastMod > lastModified) {
+             lastModified = lastMod;
+         }
+     } else if (propertyEnum == OneNotePropertyEnum.Author) {
+         // An Author value is qualified by the marker property (most recent / original) seen just before it.
+         String author = getAuthor(propertyValue);
+         if (mostRecentAuthorProp) {
+             propMap.put("MostRecentAuthor", author);
+             mostRecentAuthors.add(author);
+         } else if (originalAuthorProp) {
+             propMap.put("OriginalAuthor", author);
+             originalAuthors.add(author);
+         } else {
+             propMap.put("Author", author);
+             authors.add(author);
+         }
+         mostRecentAuthorProp = false;
+         originalAuthorProp = false;
+     } else if (propertyEnum == OneNotePropertyEnum.AuthorMostRecent) {
+         mostRecentAuthorProp = true;
+     } else if (propertyEnum == OneNotePropertyEnum.AuthorOriginal) {
+         originalAuthorProp = true;
+     } else if (propertyValue.propertyId.type > 0 && propertyValue.propertyId.type <= 6) {
+         propMap.put("scalar", propertyValue.scalar);
+     } else {
+         OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
+         content.reposition(propertyValue.rawData);
+         boolean isBinary = propertyIsBinary(propertyEnum);
+         propMap.put("isBinary", isBinary);
+         if (propertyEnum == OneNotePropertyEnum.TextExtendedAscii) {
+             String text = new String(readPropertyBytes(content), StandardCharsets.US_ASCII);
+             propMap.put("dataAscii", text);
+             writeParagraph(text);
+         } else if (!isBinary) {
+             String text = new String(readPropertyBytes(content), StandardCharsets.UTF_16LE);
+             propMap.put("dataUnicode16LE", text);
+             // The configured set holds OneNotePropertyEnum values, so it must be queried with the enum;
+             // querying with the OneNotePropertyId wrapper can never match.
+             if (options.getUtf16PropertiesToPrint().contains(propertyEnum)) {
+                 writeParagraph(text);
+             }
+         } else {
+             checkContentSize(content);
+             if (propertyEnum == OneNotePropertyEnum.RichEditTextUnicode) {
+                 handleRichEditTextUnicode(content.size());
+             } else {
+                 //TODO -- these seem to be somewhat broken font files and other
+                 //odds and ends...what are they and how should we process them?
+                 //handleEmbedded(content.size());
+             }
+         }
+     }
+     if (propertyValue.compactIDs != null) {
+         List<Map<String, Object>> children = new ArrayList<>();
+         for (CompactID compactID : propertyValue.compactIDs) {
+             FileNodePtr childFileNodePointer = oneNoteDocument.guidToObject.get(compactID.guid);
+             children.add(walkFileNodePtr(childFileNodePointer));
+         }
+         if (!children.isEmpty()) {
+             propMap.put("children", children);
+         }
+     }
+     if (propertyValue.propertySet != null && propertyValue.propertySet.rgPridsData != null) {
+         List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet);
+         if (!propSet.isEmpty()) {
+             propMap.put("propertySet", propSet);
+         }
+     }
+     return propMap;
+ }
+
+ /**
+  * returns a UTF-16LE author string.
+  * @param propertyValue The property value of an author.
+  * @return The author bytes decoded as UTF-16LE.
+  * @throws IOException Can throw these when manipulating the seekable byte channel.
+  * @throws TikaMemoryLimitException If the declared size exceeds the size of the document.
+  */
+ private String getAuthor(PropertyValue propertyValue) throws IOException, TikaMemoryLimitException {
+     OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
+     content.reposition(propertyValue.rawData);
+     return new String(readPropertyBytes(content), StandardCharsets.UTF_16LE);
+ }
+
+ /**
+  * Rejects content whose declared size is larger than the whole document, so that a corrupt length
+  * cannot trigger an oversized allocation.
+  */
+ private void checkContentSize(OneNotePtr content) throws IOException, TikaMemoryLimitException {
+     if (content.size() > dif.size()) {
+         throw new TikaMemoryLimitException("File data store cb " + content.size() +
+                 " exceeds document size: " + dif.size());
+     }
+ }
+
+ /**
+  * Validates the size of the content and reads that many bytes from the document.
+  */
+ private byte[] readPropertyBytes(OneNotePtr content) throws IOException, TikaMemoryLimitException {
+     checkContentSize(content);
+     ByteBuffer buf = ByteBuffer.allocate(content.size());
+     dif.read(buf);
+     return buf.array();
+ }
+
+ /**
+  * Writes the text to the XHTML output wrapped in a paragraph element.
+  */
+ private void writeParagraph(String text) throws SAXException {
+     xhtml.startElement(P);
+     xhtml.characters(text);
+     xhtml.endElement(P);
+ }
+
+ /**
+  * Reads {@code length} bytes holding a null-ended UTF-16LE string and writes it to the XHTML output,
+  * either as a link (when it matches {@code HYPERLINK_PATTERN}) or as a paragraph.
+  *
+  * @param length Number of bytes to read from the document.
+  */
+ private void handleRichEditTextUnicode(int length) throws SAXException, IOException, TikaException {
+     ByteBuffer buffer = ByteBuffer.allocate(length);
+     dif.read(buffer);
+     byte[] raw = buffer.array();
+
+     // Scan 16-bit units for the first two-byte null; -1 means none was found.
+     int terminator = -1;
+     for (int pos = 0; pos + 1 < raw.length; pos += 2) {
+         if (raw[pos] == 0 && raw[pos + 1] == 0) {
+             terminator = pos;
+             break;
+         }
+     }
+     // No terminator at all, or a terminator at offset 0 (empty string): nothing is emitted.
+     if (terminator <= 0) {
+         return;
+     }
+
+     String text = new String(raw, 0, terminator, StandardCharsets.UTF_16LE);
+     Matcher hyperlink = HYPERLINK_PATTERN.matcher(text);
+     if (!hyperlink.find()) {
+         xhtml.startElement(P);
+         xhtml.characters(text);
+         xhtml.endElement(P);
+         return;
+     }
+     xhtml.startElement("a", "href", hyperlink.group(1));
+     xhtml.characters(hyperlink.group(2));
+     xhtml.endElement("a");
+ }
+
+ /**
+  * @return The authors collected from Author properties that were not preceded by a
+  * most-recent or original author marker. The internal set is returned, not a copy.
+  */
+ public Set<String> getAuthors() {
+ return authors;
+ }
+
+ /**
+  * @return The authors collected from Author properties that followed an AuthorMostRecent marker.
+  * The internal set is returned, not a copy.
+  */
+ public Set<String> getMostRecentAuthors() {
+ return mostRecentAuthors;
+ }
+
+ /**
+  * @return The authors collected from Author properties that followed an AuthorOriginal marker.
+  * The internal set is returned, not a copy.
+  */
+ public Set<String> getOriginalAuthors() {
+ return originalAuthors;
+ }
+
+ /**
+  * @return The latest instant seen so far in LastModifiedTimeStamp properties.
+  */
+ public Instant getLastModifiedTimestamp() {
+ return lastModifiedTimestamp;
+ }
+
+ public void setLastModifiedTimestamp(Instant lastModifiedTimestamp) {
+ this.lastModifiedTimestamp = lastModifiedTimestamp;
+ }
+
+ /**
+  * @return The largest LastModifiedTime value seen so far, after the 1980 epoch offset was added
+  * (presumably seconds since 1970 -- TODO confirm the unit).
+  */
+ public long getLastModified() {
+ return lastModified;
+ }
+
+ public void setLastModified(long lastModified) {
+ this.lastModified = lastModified;
+ }
+
+ /**
+  * @return The smallest CreationTimeStamp value seen so far, after the 1980 epoch offset was added
+  * (presumably seconds since 1970 -- TODO confirm the unit).
+  */
+ public long getCreationTimestamp() {
+ return creationTimestamp;
+ }
+
+ public void setCreationTimestamp(long creationTimestamp) {
+ this.creationTimestamp = creationTimestamp;
+ }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalkerOptions.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalkerOptions.java
new file mode 100644
index 0000000..b25fd05
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalkerOptions.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Settings that control how the one note tree is walked.
+ * <p>
+ * Every setter returns {@code this} so that calls can be chained.
+ */
+class OneNoteTreeWalkerOptions {
+    private boolean crawlAllFileNodesFromRoot = true;
+    private boolean onlyLatestRevision = true;
+    private Set<OneNotePropertyEnum> utf16PropertiesToPrint = defaultUtf16PropertiesToPrint();
+
+    /**
+     * Builds a fresh, mutable set with the properties that are printed by default.
+     */
+    private static Set<OneNotePropertyEnum> defaultUtf16PropertiesToPrint() {
+        return new HashSet<>(
+                Arrays.asList(OneNotePropertyEnum.ImageFilename,
+                        OneNotePropertyEnum.Author,
+                        OneNotePropertyEnum.CachedTitleString));
+    }
+
+    /**
+     * Whether revisions are ignored and all file nodes are parsed recursively from the root.
+     *
+     * @return The current setting; {@code true} unless changed.
+     */
+    public boolean isCrawlAllFileNodesFromRoot() {
+        return crawlAllFileNodesFromRoot;
+    }
+
+    /**
+     * Chooses whether to ignore revisions and just parse all file nodes from the root recursively.
+     *
+     * @param crawlAllFileNodesFromRoot The new setting.
+     * @return Returns this, as per builder pattern.
+     */
+    public OneNoteTreeWalkerOptions setCrawlAllFileNodesFromRoot(boolean crawlAllFileNodesFromRoot) {
+        this.crawlAllFileNodesFromRoot = crawlAllFileNodesFromRoot;
+        return this;
+    }
+
+    /**
+     * Whether only the latest revision is parsed.
+     *
+     * @return The current setting; {@code true} unless changed.
+     */
+    public boolean isOnlyLatestRevision() {
+        return onlyLatestRevision;
+    }
+
+    /**
+     * Chooses whether only the latest revision is parsed.
+     *
+     * @param onlyLatestRevision The new setting.
+     * @return Returns this, as per builder pattern.
+     */
+    public OneNoteTreeWalkerOptions setOnlyLatestRevision(boolean onlyLatestRevision) {
+        this.onlyLatestRevision = onlyLatestRevision;
+        return this;
+    }
+
+    /**
+     * The properties whose file node data is printed in UTF-16 format.
+     *
+     * @return The configured set itself, not a copy.
+     */
+    public Set<OneNotePropertyEnum> getUtf16PropertiesToPrint() {
+        return utf16PropertiesToPrint;
+    }
+
+    /**
+     * Replaces the properties whose file node data is printed in UTF-16 format.
+     *
+     * @param utf16PropertiesToPrint The set of UTF properties you want to print UTF-16 for. Defaults are usually ok here.
+     * @return Returns this, as per builder pattern.
+     */
+    public OneNoteTreeWalkerOptions setUtf16PropertiesToPrint(Set<OneNotePropertyEnum> utf16PropertiesToPrint) {
+        this.utf16PropertiesToPrint = utf16PropertiesToPrint;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyIDType.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyIDType.java
new file mode 100644
index 0000000..87782e6
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyIDType.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Kinds of reference-valued properties: a single id or an array of ids, for object ids,
+ * object space ids and context ids.
+ * <p>
+ * NOTE(review): presumably these correspond, in declaration order, to the PropertyID.type
+ * codes 0x8 to 0xD -- confirm before relying on ordinals.
+ */
+enum PropertyIDType {
+ ObjectID,
+ ArrayOfObjectIDs,
+ ObjectSpaceID,
+ ArrayOfObjectSpaceIDs,
+ ContextID,
+ // NOTE(review): lower-case "of" differs from the sibling ArrayOf... constants; kept because renaming would break users.
+ ArrayofContextIDs;
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertySet.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertySet.java
new file mode 100644
index 0000000..a23d671
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertySet.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.tika.exception.TikaMemoryLimitException;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * A property set is a collection of properties that specify the attributes of an object (section 2.1.5).
+ * <p>
+ * The PropertySet structure specifies the format of a property set and is contained by an ObjectSpaceObjectPropSet structure
+ * (section 2.6.1). The meaning of each property in the set is specified in [MS-ONE] section 2.1.12.
+ * <p>
+ * A PropertySet structure can contain references to other objects.
+ * <p>
+ * The data for a property that is not an object reference is contained in the PropertySet.rgData stream field. The rgData stream is read
+ * sequentially beginning with the first property in a PropertySet.rgPrids array until every property has been read.
+ * <p>
+ * The number of bytes read for each property is specified by the PropertyID.type field.
+ * <p>
+ * The data for a property that is a reference to one or more objects (section 2.1.5) is contained in the streams within an
+ * ObjectSpaceObjectPropSet structure (OIDs.body, OSIDs.body, ContextIDs.body).
+ * <p>
+ * The streams are read sequentially beginning with the first property in a PropertySet.rgPrids array.
+ * <p>
+ * If the PropertyID.type field specifies a single object (0x8, 0xA, 0xC), a single CompactID (4 bytes) is read from the corresponding
+ * stream in the ObjectSpaceObjectPropSet structure.
+ * <p>
+ * If the PropertyID.type field specifies an array of objects (0x9, 0xB, 0xD), an unsigned integer (4 bytes) is read from the
+ * PropertySet.rgData stream and specifies the number of CompactID structures (section 2.2.2) to read from the corresponding stream in the
+ * ObjectSpaceObjectPropSet structure.
+ * <p>
+ * The streams for each PropertyID.type field are given by the following table.
+ * <p>
+ * 0x8 (ObjectID, section 2.6.6) - ObjectSpaceObjectPropSet.OIDs.body
+ * 0x9 (ArrayOfObjectIDs, section 2.6.6) - ObjectSpaceObjectPropSet.OIDs.body
+ * 0xA (ObjectSpaceID, section 2.6.6) - ObjectSpaceObjectPropSet.OSIDs.body
+ * 0xB (ArrayOfObjectSpaceIDs, section 2.6.6) - ObjectSpaceObjectPropSet.OSIDs.body
+ * 0xC (ContextID, section 2.6.6) - ObjectSpaceObjectPropSet.ContextIDs.body
+ * 0xD (ArrayOfContextIDs, section 2.6.6) - ObjectSpaceObjectPropSet.ContextIDs.body
+ */
+
+class PropertySet {
+    /** The property values of this set, in the order they are printed and processed. */
+    List<PropertyValue> rgPridsData = new ArrayList<>();
+
+    /**
+     * Prints every property value of this set, in list order, by delegating to the value's own print method.
+     *
+     * @param document    Passed through unchanged to each property value.
+     * @param pointer     Passed through unchanged to each property value.
+     * @param indentLevel Passed through unchanged to each property value.
+     */
+    public void print(OneNoteDocument document, OneNotePtr pointer, int indentLevel) throws IOException, TikaMemoryLimitException {
+        for (PropertyValue value : rgPridsData) {
+            value.print(document, pointer, indentLevel);
+        }
+    }
+
+    /**
+     * Two property sets are equal when they are of the same class and their value lists are equal.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null) {
+            return false;
+        }
+        if (o.getClass() != getClass()) {
+            return false;
+        }
+        PropertySet other = (PropertySet) o;
+        return Objects.equals(rgPridsData, other.rgPridsData);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(rgPridsData);
+    }
+
+    /**
+     * @return The internal list of property values, not a copy.
+     */
+    public List<PropertyValue> getRgPridsData() {
+        return rgPridsData;
+    }
+
+    /**
+     * @param rgPridsData The list of property values to use from now on.
+     * @return Returns this, as per builder pattern.
+     */
+    public PropertySet setRgPridsData(List<PropertyValue> rgPridsData) {
+        this.rgPridsData = rgPridsData;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
new file mode 100644
index 0000000..454a3ea
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/PropertyValue.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.tika.exception.TikaMemoryLimitException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * One property of a property set: its id plus the value, which is stored in one of several fields
+ * depending on the type of the property id (scalar, raw data, compact ids or a nested property set).
+ * <p>
+ * Setters return {@code this} so that calls can be chained.
+ */
+class PropertyValue {
+
+    private static final Logger LOG = LoggerFactory.getLogger(PropertyValue.class);
+
+    OneNotePropertyId propertyId = new OneNotePropertyId();
+    // union of one of these things based on the type of the corresponding PropertyID
+    long scalar; // holds a boolean value if type = 0x2, retrieved from header
+    // either ObjectID or ObjectSpaceID or ContextID (single value in array)
+    // either ArrayOfObjectIDs or ArrayOfObjectSpaceIDs or ArrayOfContextID
+    List<CompactID> compactIDs = new ArrayList<>();
+    PropertySet propertySet = new PropertySet(); // or used to house a single value
+    FileChunkReference rawData = new FileChunkReference(); // FourBytesOfLengthFollowedByData
+
+    /**
+     * Writes a debug-level dump of this property value to the log.
+     * <p>
+     * Depending on the type of the property id this logs the scalar (types 1 to 6), a hex dump of the raw
+     * data (type 7), the referenced objects (types 0x8 to 0xd) or the nested property set (types 0x10, 0x11).
+     *
+     * @param document    The document used to resolve compact ids to file nodes.
+     * @param pointer     The pointer that is copied to read raw data and passed on to nested prints.
+     * @param indentLevel The current indentation level of the dump.
+     */
+    public void print(OneNoteDocument document, OneNotePtr pointer, int indentLevel) throws IOException, TikaMemoryLimitException {
+        long type = propertyId.type;
+
+        LOG.debug("{}<{}", IndentUtil.getIndent(indentLevel + 1), propertyId);
+        if (type > 0 && type <= 6) {
+            // SLF4J only substitutes "{}" anchors; a printf-style "%d" would be logged literally.
+            LOG.debug("({})", scalar);
+        } else if (type == 7) {
+            OneNotePtr content = new OneNotePtr(pointer);
+            content.reposition(rawData);
+            LOG.debug(" [");
+            content.dumpHex();
+            LOG.debug("]");
+        } else if (type == 0x9 || type == 0x8
+                || type == 0xb || type == 0xc
+                || type == 0xa || type == 0xd) {
+            String xtype = "contextID";
+            if (type == 0x8 || type == 0x9) {
+                xtype = "OIDs";
+            }
+            if (type == 0xa || type == 0xb) {
+                xtype = "OSIDS";
+            }
+            if (!compactIDs.isEmpty()) {
+                LOG.debug("");
+            }
+            for (CompactID compactID : compactIDs) {
+                LOG.debug("{}{}[{}]", IndentUtil.getIndent(indentLevel + 1), xtype, compactID);
+                FileNodePtr where = document.guidToObject.get(compactID.guid);
+                // Ids that do not resolve to a known object are only listed, not expanded.
+                if (where != null) {
+                    where.dereference(document).print(document, pointer, indentLevel + 1);
+                }
+            }
+        } else if (type == 0x10 || type == 0x11) {
+            LOG.debug("SubProperty");
+            propertySet.print(document, pointer, indentLevel + 1);
+        }
+        LOG.debug(">");
+    }
+
+    public OneNotePropertyId getPropertyId() {
+        return propertyId;
+    }
+
+    public PropertyValue setPropertyId(OneNotePropertyId propertyId) {
+        this.propertyId = propertyId;
+        return this;
+    }
+
+    public long getScalar() {
+        return scalar;
+    }
+
+    public PropertyValue setScalar(long scalar) {
+        this.scalar = scalar;
+        return this;
+    }
+
+    public List<CompactID> getCompactIDs() {
+        return compactIDs;
+    }
+
+    public PropertyValue setCompactIDs(List<CompactID> compactIDs) {
+        this.compactIDs = compactIDs;
+        return this;
+    }
+
+    public PropertySet getPropertySet() {
+        return propertySet;
+    }
+
+    public PropertyValue setPropertySet(PropertySet propertySet) {
+        this.propertySet = propertySet;
+        return this;
+    }
+
+    public FileChunkReference getRawData() {
+        return rawData;
+    }
+
+    public PropertyValue setRawData(FileChunkReference rawData) {
+        this.rawData = rawData;
+        return this;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Revision.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Revision.java
new file mode 100644
index 0000000..e6ca0fc
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/Revision.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Mutable holder for the data of one revision: a table of global ids, the list of manifest
+ * file node pointers and two extended GUIDs that default to {@code ExtendedGUID.nil()}.
+ */
+class Revision {
+ // Maps a Long key to a GUID; presumably the index into the global id table -- TODO confirm.
+ Map<Long, GUID> globalId = new HashMap<>();
+ List<FileNodePtr> manifestList = new ArrayList<>();
+ // NOTE(review): presumably the id of the object space this revision belongs to -- confirm.
+ ExtendedGUID gosid = ExtendedGUID.nil();
+ // NOTE(review): presumably the id of the revision this one depends on -- confirm.
+ ExtendedGUID dependent = ExtendedGUID.nil();
+
+ public Map<Long, GUID> getGlobalId() {
+ return globalId;
+ }
+
+ public void setGlobalId(Map<Long, GUID> globalId) {
+ this.globalId = globalId;
+ }
+
+ public List<FileNodePtr> getManifestList() {
+ return manifestList;
+ }
+
+ public void setManifestList(List<FileNodePtr> manifestList) {
+ this.manifestList = manifestList;
+ }
+
+ public ExtendedGUID getGosid() {
+ return gosid;
+ }
+
+ public void setGosid(ExtendedGUID gosid) {
+ this.gosid = gosid;
+ }
+
+ public ExtendedGUID getDependent() {
+ return dependent;
+ }
+
+ public void setDependent(ExtendedGUID dependent) {
+ this.dependent = dependent;
+ }
+
+ /**
+  * Creates a revision with an empty id table, an empty manifest list and nil GUIDs.
+  */
+ public Revision() {
+
+ }
+
+ /**
+  * Creates a revision from the given values; the map and list are stored as passed, not copied.
+  */
+ public Revision(Map<Long, GUID> globalId, List<FileNodePtr> manifestList, ExtendedGUID gosid, ExtendedGUID dependent) {
+ this.globalId = globalId;
+ this.manifestList = manifestList;
+ this.gosid = gosid;
+ this.dependent = dependent;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifest.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifest.java
new file mode 100644
index 0000000..4bd18b5
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for the fields of a revision manifest. Setters return {@code this} for chaining.
+ */
+class RevisionManifest {
+ // Not initialized here, so it is null until a value is assigned.
+ ExtendedGUID ridDependent;
+ long timeCreation; //ignored
+ long revisionRole;
+ long odcsDefault;
+
+ public ExtendedGUID getRidDependent() {
+ return ridDependent;
+ }
+
+ public RevisionManifest setRidDependent(ExtendedGUID ridDependent) {
+ this.ridDependent = ridDependent;
+ return this;
+ }
+
+ public long getTimeCreation() {
+ return timeCreation;
+ }
+
+ public RevisionManifest setTimeCreation(long timeCreation) {
+ this.timeCreation = timeCreation;
+ return this;
+ }
+
+ public long getRevisionRole() {
+ return revisionRole;
+ }
+
+ public RevisionManifest setRevisionRole(long revisionRole) {
+ this.revisionRole = revisionRole;
+ return this;
+ }
+
+ public long getOdcsDefault() {
+ return odcsDefault;
+ }
+
+ public RevisionManifest setOdcsDefault(long odcsDefault) {
+ this.odcsDefault = odcsDefault;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifestListStart.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifestListStart.java
new file mode 100644
index 0000000..613ce5e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionManifestListStart.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for the single value carried by the start of a revision manifest list.
+ * The field name marks the value as ignored. The setter returns {@code this} for chaining.
+ */
+class RevisionManifestListStart {
+ long nInstanceIgnored;
+
+ public long getnInstanceIgnored() {
+ return nInstanceIgnored;
+ }
+
+ public RevisionManifestListStart setnInstanceIgnored(long nInstanceIgnored) {
+ this.nInstanceIgnored = nInstanceIgnored;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionRoleDeclaration.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionRoleDeclaration.java
new file mode 100644
index 0000000..258292a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RevisionRoleDeclaration.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for a revision role value. The setter returns {@code this} for chaining.
+ */
+class RevisionRoleDeclaration {
+ long revisionRole;
+
+ public long getRevisionRole() {
+ return revisionRole;
+ }
+
+ public RevisionRoleDeclaration setRevisionRole(long revisionRole) {
+ this.revisionRole = revisionRole;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReference.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReference.java
new file mode 100644
index 0000000..94017b9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReference.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder that pairs the compact id of a root object with its
+ * {@code RootObjectReferenceBase}. Both fields start out as freshly constructed instances.
+ * Setters return {@code this} for chaining.
+ */
+class RootObjectReference {
+ CompactID oidRoot = new CompactID();
+ RootObjectReferenceBase rootObjectReferenceBase = new RootObjectReferenceBase();
+
+ public CompactID getOidRoot() {
+ return oidRoot;
+ }
+
+ public RootObjectReference setOidRoot(CompactID oidRoot) {
+ this.oidRoot = oidRoot;
+ return this;
+ }
+
+ public RootObjectReferenceBase getRootObjectReferenceBase() {
+ return rootObjectReferenceBase;
+ }
+
+ public RootObjectReference setRootObjectReferenceBase(RootObjectReferenceBase rootObjectReferenceBase) {
+ this.rootObjectReferenceBase = rootObjectReferenceBase;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReferenceBase.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReferenceBase.java
new file mode 100644
index 0000000..05cdd70
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/RootObjectReferenceBase.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+/**
+ * Mutable holder for the role value of a root object reference.
+ * The setter returns {@code this} for chaining.
+ */
+class RootObjectReferenceBase {
+ long rootRole;
+
+ public long getRootRole() {
+ return rootRole;
+ }
+
+ public RootObjectReferenceBase setRootRole(long rootRole) {
+ this.rootRole = rootRole;
+ return this;
+ }
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 83e78d4..763e2f7 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -38,6 +38,7 @@ org.apache.tika.parser.iwork.IWorkPackageParser
org.apache.tika.parser.jpeg.JpegParser
org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
+org.apache.tika.parser.onenote.OneNoteParser
org.apache.tika.parser.mbox.OutlookPSTParser
org.apache.tika.parser.microsoft.EMFParser
org.apache.tika.parser.microsoft.WMFParser
@@ -46,6 +47,7 @@ org.apache.tika.parser.microsoft.MSOwnerFileParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
+org.apache.tika.parser.microsoft.onenote.OneNoteParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
org.apache.tika.parser.microsoft.xml.WordMLParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index 3046303..2d641b6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -17,12 +17,12 @@
package org.apache.tika.mime;
// Junit imports
-import static java.nio.charset.StandardCharsets.UTF_16BE;
-import static java.nio.charset.StandardCharsets.UTF_16LE;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
+
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.File;
@@ -30,11 +30,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Before;
-import org.junit.Test;
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
/**
*
@@ -1177,8 +1178,11 @@ public class TestMimeTypes {
public void testOneNote() throws Exception {
// With name or data we can get the full details
assertTypeByName("application/onenote; format=one", "testOneNote.one");
- assertTypeByData("application/onenote; format=one", "testOneNote.one");
-
+ assertTypeByData("application/onenote; format=one", "testOneNote2.one");
+ assertTypeByData("application/onenote; format=one", "testOneNote3.one");
+ assertTypeByData("application/onenote; format=one", "testOneNote4.one");
+ assertTypeByData("application/onenote; format=one", "testOneNote1.one");
+
// TODO Get sample .onetoc2 and .onepkg files
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
index 2c584a5..6c800eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
@@ -73,12 +73,6 @@ public class TiffParserTest extends TikaTest {
(long)getXML("testTIFF_multipage.tif")
.metadata
.getInt(TIFF.EXIF_PAGE_COUNT));
-
- //Comment \u000A System.out.println("actual code");
}
- @Test
- public void testUnicodeNewLine() throws Exception {
- //Comment \u000A System.out.println("actual code");
- }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
new file mode 100644
index 0000000..c72ebec
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.onenote;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.List;
+
+public class OneNoteParserTest extends TikaTest { // parses .one fixtures through TikaTest helpers and checks the extracted text and metadata
+
+ // Original note, kept as written: "test recursive parser wrapper for image files". NOTE(review): no test below covers image files; the only live getRecursiveMetadata call is in testOneNoteEmbeddedWordDoc.
+
+ /**
+ * This is the sample document that is automatically created from OneNote 2013 (testOneNote1.one); checks that the text has no junk and pins its author and timestamp metadata.
+ */
+ @Test
+ public void testOneNote2013Doc1() throws Exception {
+// List<Metadata> metadataList = getRecursiveMetadata("testOneNote1.one");
+ // debug(metadataList);
+ Metadata metadata = new Metadata();
+ String txt = getText("testOneNote1.one", metadata); // the same Metadata object is read by the assertions below
+ assertNoJunk(txt);
+
+ List<String> authors = Arrays.asList(metadata.getValues("authors")); // NOTE(review): every expected author below ends in U+0000, so the test pins a trailing NUL in the values; confirm that is intended
+ assertContains("Olya Veselova\u0000", authors);
+ assertContains("Microsoft\u0000", authors);
+ assertContains("Scott\u0000", authors);
+ assertContains("Scott H. W. Snyder\u0000", authors);
+
+ List<String> mostRecentAuthors = Arrays.asList(metadata.getValues("mostRecentAuthors"));
+ assertContains("Microsoft\u0000", mostRecentAuthors);
+
+ List<String> originalAuthors = Arrays.asList(metadata.getValues("originalAuthors"));
+ assertContains("Microsoft\u0000", originalAuthors);
+
+ Assert.assertEquals(Instant.ofEpochSecond(1336059427), Instant.ofEpochSecond(Long.parseLong(metadata.get("creationTimestamp")))); // both sides are built with ofEpochSecond, so the metadata string is compared as epoch seconds
+ Assert.assertEquals(Instant.ofEpochMilli(1383613114000L), Instant.ofEpochMilli(Long.parseLong(metadata.get("lastModifiedTimestamp")))); // this key is compared as epoch milliseconds
+ Assert.assertEquals(Instant.ofEpochSecond(1446572147), Instant.ofEpochSecond(Long.parseLong(metadata.get("lastModified")))); // compared as epoch seconds
+ }
+
+ @Test
+ public void testOneNote2013Doc2() throws Exception { // testOneNote2.one: expected text snippets, authors and timestamps
+ Metadata metadata = new Metadata();
+ String txt = getText("testOneNote2.one", metadata);
+ assertContains("wow this is neat", txt);
+ assertContains("neat info about totally killin it bro", txt);
+ assertContains("Section1TextArea1", txt);
+ assertContains("Section1HeaderTitle", txt);
+ assertContains("Section1TextArea2", txt);
+ assertNoJunk(txt);
+
+ List<String> authors = Arrays.asList(metadata.getValues("authors"));
+ assertContains("Olya Veselova\u0000", authors);
+ assertContains("Microsoft\u0000", authors);
+ assertContains("Scott\u0000", authors);
+ assertContains("Scott H. W. Snyder\u0000", authors);
+ assertContains("ndipiazza\u0000", authors);
+
+ List<String> mostRecentAuthors = Arrays.asList(metadata.getValues("mostRecentAuthors"));
+ assertContains("ndipiazza\u0000", mostRecentAuthors);
+ assertContains("Microsoft\u0000", mostRecentAuthors);
+
+ List<String> originalAuthors = Arrays.asList(metadata.getValues("originalAuthors"));
+ assertContains("Microsoft\u0000", originalAuthors);
+ assertContains("ndipiazza\u0000", mostRecentAuthors); // NOTE(review): repeats the mostRecentAuthors check made above; originalAuthors may have been intended, so verify against the fixture before changing
+
+ Assert.assertEquals(Instant.ofEpochSecond(1336059427), Instant.ofEpochSecond(Long.parseLong(metadata.get("creationTimestamp")))); // same expected creation value as in testOneNote2013Doc1
+ Assert.assertEquals(Instant.ofEpochMilli(1574426629000L), Instant.ofEpochMilli(Long.parseLong(metadata.get("lastModifiedTimestamp")))); // compared as epoch milliseconds
+ Assert.assertEquals(Instant.ofEpochSecond(1574426628), Instant.ofEpochSecond(Long.parseLong(metadata.get("lastModified")))); // compared as epoch seconds
+ }
+
+ @Test
+ public void testOneNote2013Doc3() throws Exception { // testOneNote3.one: expected text snippets, authors (only ndipiazza expected) and timestamps
+ Metadata metadata = new Metadata();
+ String txt = getText("testOneNote3.one", metadata);
+ assertContains("awesome information about sports or some crap like that.", txt);
+ assertContains("Quit doing horrible things to me. Dang you. ", txt); // the expected snippet includes the trailing space
+ assertContains("Section2TextArea1", txt);
+ assertContains("Section2HeaderTitle", txt);
+ assertContains("Section2TextArea2", txt);
+ assertNoJunk(txt);
+
+ List<String> authors = Arrays.asList(metadata.getValues("authors"));
+ assertNotContained("Olya Veselova\u0000", authors);
+ assertNotContained("Microsoft\u0000", authors);
+ assertNotContained("Scott\u0000", authors);
+ assertNotContained("Scott H. W. Snyder\u0000", authors);
+ assertContains("ndipiazza\u0000", authors);
+
+ List<String> mostRecentAuthors = Arrays.asList(metadata.getValues("mostRecentAuthors"));
+ assertContains("ndipiazza\u0000", mostRecentAuthors);
+ assertNotContained("Microsoft\u0000", mostRecentAuthors);
+
+ List<String> originalAuthors = Arrays.asList(metadata.getValues("originalAuthors"));
+ assertNotContained("Microsoft\u0000", originalAuthors);
+ assertContains("ndipiazza\u0000", mostRecentAuthors); // NOTE(review): repeats the mostRecentAuthors check made above; originalAuthors may have been intended, so verify against the fixture before changing
+
+ Assert.assertEquals(Instant.ofEpochSecond(1574426349), Instant.ofEpochSecond(Long.parseLong(metadata.get("creationTimestamp")))); // compared as epoch seconds
+ Assert.assertEquals(Instant.ofEpochMilli(1574426623000L), Instant.ofEpochMilli(Long.parseLong(metadata.get("lastModifiedTimestamp")))); // compared as epoch milliseconds
+ Assert.assertEquals(Instant.ofEpochSecond(1574426624), Instant.ofEpochSecond(Long.parseLong(metadata.get("lastModified")))); // compared as epoch seconds
+ }
+
+ @Test
+ public void testOneNote2013Doc4() throws Exception { // testOneNote4.one: expected text snippets, authors (only ndipiazza expected) and timestamps
+ Metadata metadata = new Metadata();
+ String txt = getText("testOneNote4.one", metadata);
+
+ assertContains("way too much information about poptarts to handle.", txt);
+ assertContains("Section3TextArea1", txt);
+ assertContains("Section3HeaderTitle", txt);
+ assertContains("Section3TextArea2", txt);
+ assertNoJunk(txt);
+
+ List<String> authors = Arrays.asList(metadata.getValues("authors"));
+ assertNotContained("Olya Veselova\u0000", authors);
+ assertNotContained("Microsoft\u0000", authors);
+ assertNotContained("Scott\u0000", authors);
+ assertNotContained("Scott H. W. Snyder\u0000", authors);
+ assertContains("ndipiazza\u0000", authors);
+
+ List<String> mostRecentAuthors = Arrays.asList(metadata.getValues("mostRecentAuthors"));
+ assertContains("ndipiazza\u0000", mostRecentAuthors);
+ assertNotContained("Microsoft\u0000", mostRecentAuthors);
+
+ List<String> originalAuthors = Arrays.asList(metadata.getValues("originalAuthors"));
+ assertNotContained("Microsoft\u0000", originalAuthors);
+ assertContains("ndipiazza\u0000", mostRecentAuthors); // NOTE(review): repeats the mostRecentAuthors check made above; originalAuthors may have been intended, so verify against the fixture before changing
+
+ Assert.assertEquals(Instant.ofEpochSecond(1574426385), Instant.ofEpochSecond(Long.parseLong(metadata.get("creationTimestamp")))); // compared as epoch seconds
+ Assert.assertEquals(Instant.ofEpochMilli(1574426548000L), Instant.ofEpochMilli(Long.parseLong(metadata.get("lastModifiedTimestamp")))); // compared as epoch milliseconds
+ Assert.assertEquals(Instant.ofEpochSecond(1574426547), Instant.ofEpochSecond(Long.parseLong(metadata.get("lastModified")))); // compared as epoch seconds
+ }
+
+ @Test
+ public void testOneNote2016() throws Exception { // testOneNote2016.one: expected text snippets, a single expected author in all three author lists, and timestamps
+ Metadata metadata = new Metadata();
+ String txt = getText("testOneNote2016.one", metadata);
+
+ assertContains("So good", txt);
+ assertContains("This is one note 2016", txt);
+ assertNoJunk(txt);
+
+ List<String> authors = Arrays.asList(metadata.getValues("authors"));
+ assertContains("nicholas dipiazza\u0000", authors);
+
+ List<String> mostRecentAuthors = Arrays.asList(metadata.getValues("mostRecentAuthors"));
+ assertContains("nicholas dipiazza\u0000", mostRecentAuthors);
+
+ List<String> originalAuthors = Arrays.asList(metadata.getValues("originalAuthors"));
+ assertContains("nicholas dipiazza\u0000", originalAuthors);
+
+ Assert.assertEquals(Instant.ofEpochSecond(1576107472), Instant.ofEpochSecond(Long.parseLong(metadata.get("creationTimestamp")))); // compared as epoch seconds
+ Assert.assertEquals(Instant.ofEpochMilli(1576107481000L), Instant.ofEpochMilli(Long.parseLong(metadata.get("lastModifiedTimestamp")))); // compared as epoch milliseconds
+ Assert.assertEquals(Instant.ofEpochSecond(1576107480), Instant.ofEpochSecond(Long.parseLong(metadata.get("lastModified")))); // compared as epoch seconds
+ }
+
+ @Test
+ public void testOneNoteEmbeddedWordDoc() throws Exception { // testOneNoteEmbeddedWordDoc.one: checks the list returned by getRecursiveMetadata for a Word OOXML entry
+ List<Metadata> metadataList = getRecursiveMetadata("testOneNoteEmbeddedWordDoc.one");
+
+ Assert.assertTrue(metadataList.stream().anyMatch(ml ->
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document".equals(ml.get("Content-Type")))); // passes if at least one returned Metadata has exactly this Content-Type
+ }
+
+ private void assertNoJunk(String txt) { // shared negative checks applied to the extracted text by the text-based tests above
+ //Should not include font names in the text
+ assertNotContained("Calibri", txt);
+ //Should not include UTF-16 property values that are garbage
+ assertNotContained("\u5902", txt);
+ assertNotContained("\u83F2", txt);
+ assertNotContained("\u432F", txt);
+ assertNotContained("\u01E1", txt);
+ }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote1.one b/tika-parsers/src/test/resources/test-documents/testOneNote1.one
new file mode 100644
index 0000000..d410ed4
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNote1.one differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote2.one b/tika-parsers/src/test/resources/test-documents/testOneNote2.one
new file mode 100755
index 0000000..ad27e9a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNote2.one differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote2016.one b/tika-parsers/src/test/resources/test-documents/testOneNote2016.one
new file mode 100644
index 0000000..5bc5f33
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNote2016.one differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote3.one b/tika-parsers/src/test/resources/test-documents/testOneNote3.one
new file mode 100755
index 0000000..b750828
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNote3.one differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNote4.one b/tika-parsers/src/test/resources/test-documents/testOneNote4.one
new file mode 100755
index 0000000..bc6126e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNote4.one differ
diff --git a/tika-parsers/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one b/tika-parsers/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one
new file mode 100644
index 0000000..6c31d05
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOneNoteEmbeddedWordDoc.one differ