You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@corinthia.apache.org by pm...@apache.org on 2015/04/24 18:18:13 UTC

[46/55] [abbrv] incubator-corinthia git commit: Use document hash to protect against misuse of put

Use document hash to protect against misuse of put

The put operation was potentially quite dangerous if you didn't know
what you were doing, or just accidentally used it with the wrong source
document and/or HTML file. The algorithms used to update an existing
Word document (and, in the future, other formats) depend on a correct
mapping between HTML elements and the XML elements in the source
document from which they were generated. If the wrong HTML file was used
to update a .docx file, an incorrect mapping could lead to file
corruption in the sense of the XML data not matching the expecetd
schema.

To protect against this, in the get operation, we now compute a hash of
all the XML documents inside a .docx package, and store this in a <meta>
tag in the HTML document. In the put operation, we re-compute this hash,
and check that it matches the one given in the <meta> tag of the HTML
document being used to update the source.

A common situation where this protection helps is if you try to invoke
put twice on the same pair of files. For example:

    # dfconvert get test.docx test.html
    ... Make some changes to test.html ...
    # dfconvert put test.docx test.html
    # dfconvert put test.docx test.html

The second put will now fail, because the first changed the XML
structure of the docx file in a way that made the node identifiers
referenced from the HTML id attributes no longer match in the intended
way. Now, we detect that on th second put, the hash of test.docx will be
different to that given in test.html, and we abort with an error message
stating: "HTML document was generated from a different file to the one
being updated".

To avoid having to update all the test cases, we strip out the hash when
doing a test of get, and set the special value "ignore" in the <meta>
tag before doing a put, which disables the check.


Project: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/commit/c0d96a24
Tree: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/tree/c0d96a24
Diff: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/diff/c0d96a24

Branch: refs/heads/stable
Commit: c0d96a24a498ed8bd04280f4be735d609297d169
Parents: 4c4d03a
Author: Peter Kelly <pe...@uxproductivity.com>
Authored: Sun Apr 12 23:46:10 2015 +0700
Committer: Peter Kelly <pe...@uxproductivity.com>
Committed: Sun Apr 12 23:46:10 2015 +0700

----------------------------------------------------------------------
 DocFormats/api/src/Operations.c                 | 76 ++++++++++++++++++++
 DocFormats/core/src/html/DFHTML.c               | 50 +++++++++++++
 DocFormats/core/src/html/DFHTML.h               |  4 ++
 .../filters/ooxml/src/word/WordConverter.c      |  1 +
 DocFormats/filters/ooxml/tests/word/WordTests.c |  3 +
 5 files changed, 134 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c0d96a24/DocFormats/api/src/Operations.c
----------------------------------------------------------------------
diff --git a/DocFormats/api/src/Operations.c b/DocFormats/api/src/Operations.c
index d13efbc..328e1a5 100644
--- a/DocFormats/api/src/Operations.c
+++ b/DocFormats/api/src/Operations.c
@@ -27,6 +27,8 @@
 #include "DFXML.h"
 #include "DFZipFile.h"
 #include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
 
 struct DFConcreteDocument {
     size_t retainCount;
@@ -39,6 +41,55 @@ struct DFAbstractDocument {
     DFDocument *htmlDoc;
 };
 
+/**
+ * Compute a hash of the set of all XML files in the archive. When the get operation is executed,
+ * this hash is stored in the HTML file, as a record of the document from which it was generated.
+ * When the put operation is executed, the hash is compared with that of the HTML file, and an error
+ * reported if a mismatch occurs.
+ *
+ * This check ensures that put can only be executed with HTML documents that were genuinely
+ * generated from this exact (version of the) document, and thus can be safely assumed to have id
+ * attributes that correctly match elements in the HTML document to elements in the original XML
+ * file(s) from which they were generated, avoiding corruption during the update process.
+ *
+ * If someone tries to call put with a HTML document that was not originally created from this exact
+ * concrete document, the operation will fail.
+ */
+static int computeXMLHash(DFStorage *storage, DFHashCode *result, DFError **error)
+{
+    int ok = 0;
+    *result = 0;
+
+    DFHashCode hash = 0;
+    DFHashBegin(hash);
+    const char **filenames = DFStorageList(storage,error);
+    if (filenames == NULL)
+        goto end;
+    for (int i = 0; filenames[i]; i++) {
+        unsigned char *buf = NULL;
+        size_t nbytes = 0;
+        if (!DFStorageRead(storage,filenames[i],(void **)&buf,&nbytes,error)) {
+            DFErrorFormat(error,"%s: %s",filenames[i],DFErrorMessage(error));
+            goto end;
+        }
+        // The hash algorithm works on 32-bit integers; add 4 NULL bytes at the end of the buffer to
+        // ensure its entire contents are taken into account when computing the hash.
+        buf = xrealloc(buf,nbytes+4);
+        memset(&buf[nbytes],0,4);
+        uint32_t *intbuf = (uint32_t *)buf;
+        for (size_t pos = 0; pos < (nbytes+3)/4; pos++)
+            DFHashUpdate(hash,intbuf[pos]);
+        free(buf);
+    }
+    DFHashEnd(hash);
+    *result = hash;
+    ok = 1;
+
+end:
+    free(filenames);
+    return ok;
+}
+
 DFConcreteDocument *DFConcreteDocumentNew(DFStorage *storage)
 {
     DFConcreteDocument *concrete = 
@@ -188,7 +239,15 @@ int DFGet(DFConcreteDocument *concrete,
     }
 
     if (htmlDoc == NULL)
+        return 0;;
+
+    // Store a hash of the concrete document in the HTML file, so we can check it in DFPut()
+    DFHashCode hash = 0;
+    if (!computeXMLHash(concrete->storage,&hash,error))
         return 0;
+    char hashstr[100];
+    snprintf(hashstr,100,"%X",hash);
+    HTMLMetaSet(htmlDoc,"corinthia-document-hash",hashstr);
 
     DFDocumentRelease(abstract->htmlDoc);
     abstract->htmlDoc = htmlDoc;
@@ -206,6 +265,23 @@ int DFPut(DFConcreteDocument *concreteDoc,
         return 0;
     }
 
+    // Check that the document hash in the HTML file matches that of the concrete document. This
+    // ensures that we're using a HTML file that was generated from this exact document (see above)
+    // and can rely on the element mappings from the id attributes. This comparison is ignored
+    // for test cases, which specify the special value "ignore" in the meta tag.
+    DFHashCode expectedHash = 0;
+    if (!computeXMLHash(concreteDoc->storage,&expectedHash,error))
+        return 0;;
+    DFHashCode actualHash = 0;
+    int hashMatches = 0;
+    const char *hashstr = HTMLMetaGet(abstractDoc->htmlDoc,"corinthia-document-hash");
+    if ((hashstr != NULL) && (sscanf(hashstr,"%X",&actualHash) == 1))
+        hashMatches = (expectedHash == actualHash);
+    if (!hashMatches && !DFStringEquals(hashstr,"ignore")) {
+        DFErrorFormat(error,"HTML document was generated from a different file to the one being updated");
+        return 0;
+    }
+
     int ok = 0;
     switch (DFStorageFormat(concreteDoc->storage)) {
         case DFFileFormatDocx:

http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c0d96a24/DocFormats/core/src/html/DFHTML.c
----------------------------------------------------------------------
diff --git a/DocFormats/core/src/html/DFHTML.c b/DocFormats/core/src/html/DFHTML.c
index 8716d66..b6538ee 100644
--- a/DocFormats/core/src/html/DFHTML.c
+++ b/DocFormats/core/src/html/DFHTML.c
@@ -362,6 +362,56 @@ int isHashRRGGBB(const char *str)
     return (strlen(str) == 7) && (str[0] == '#') && isRRGGBB(&str[1]);
 }
 
+const char *HTMLMetaGet(DFDocument *htmlDoc, const char *name)
+{
+    assert(htmlDoc->root != NULL);
+    assert(htmlDoc->root->tag == HTML_HTML);
+    DFNode *head = DFChildWithTag(htmlDoc->root,HTML_HEAD);
+    if (head == NULL)
+        return NULL;
+    for (DFNode *meta = head->first; meta != NULL; meta = meta->next) {
+        if ((meta->tag == HTML_META) && DFStringEquals(DFGetAttribute(meta,HTML_NAME),name))
+            return DFGetAttribute(meta,HTML_CONTENT);
+    }
+    return NULL;
+}
+
+void HTMLMetaSet(DFDocument *htmlDoc, const char *name, const char *content)
+{
+    assert(htmlDoc->root != NULL);
+    assert(htmlDoc->root->tag == HTML_HTML);
+    DFNode *head = DFChildWithTag(htmlDoc->root,HTML_HEAD);
+    if (head == NULL) {
+        head = DFCreateElement(htmlDoc,HTML_HEAD);
+        DFNode *body = DFChildWithTag(htmlDoc->root,HTML_BODY);
+        DFInsertBefore(htmlDoc->root,head,body);
+    }
+    for (DFNode *meta = head->first; meta != NULL; meta = meta->next) {
+        if ((meta->tag == HTML_META) && DFStringEquals(DFGetAttribute(meta,HTML_NAME),name)) {
+            DFSetAttribute(meta,HTML_CONTENT,content);
+            return;
+        }
+    }
+    DFNode *meta = DFCreateChildElement(head,HTML_META);
+    DFSetAttribute(meta,HTML_NAME,name);
+    DFSetAttribute(meta,HTML_CONTENT,content);
+}
+
+void HTMLMetaRemove(DFDocument *htmlDoc, const char *name)
+{
+    assert(htmlDoc->root != NULL);
+    assert(htmlDoc->root->tag == HTML_HTML);
+    DFNode *head = DFChildWithTag(htmlDoc->root,HTML_HEAD);
+    if (head == NULL)
+        return;;
+    DFNode *next;
+    for (DFNode *meta = head->first; meta != NULL; meta = next) {
+        next = meta->next;
+        if ((meta->tag == HTML_META) && DFStringEquals(DFGetAttribute(meta,HTML_NAME),name))
+            DFRemoveNode(meta);
+    }
+}
+
 DFDocument *DFParseHTMLString(const char *str, int removeSpecial, DFError **error)
 {
     DFHTDocument *htdoc = DFHTDocumentNew();

http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c0d96a24/DocFormats/core/src/html/DFHTML.h
----------------------------------------------------------------------
diff --git a/DocFormats/core/src/html/DFHTML.h b/DocFormats/core/src/html/DFHTML.h
index abbf320..81c9592 100644
--- a/DocFormats/core/src/html/DFHTML.h
+++ b/DocFormats/core/src/html/DFHTML.h
@@ -50,6 +50,10 @@ CSSSize HTML_getImageDimensions(DFNode *img);
 int isRRGGBB(const char *str);
 int isHashRRGGBB(const char *str);
 
+const char *HTMLMetaGet(DFDocument *htmlDoc, const char *name);
+void HTMLMetaSet(DFDocument *htmlDoc, const char *name, const char *content);
+void HTMLMetaRemove(DFDocument *htmlDoc, const char *name);
+
 DFDocument *DFParseHTMLString(const char *str, int removeSpecial, DFError **error);
 DFDocument *DFParseHTMLFile(const char *filename, int removeSpecial, DFError **error);
 const char **DFHTMLGetImages(DFDocument *htmlDoc);

http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c0d96a24/DocFormats/filters/ooxml/src/word/WordConverter.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/ooxml/src/word/WordConverter.c b/DocFormats/filters/ooxml/src/word/WordConverter.c
index 1d754f6..a68629e 100644
--- a/DocFormats/filters/ooxml/src/word/WordConverter.c
+++ b/DocFormats/filters/ooxml/src/word/WordConverter.c
@@ -707,6 +707,7 @@ int WordConverterGet(DFDocument *html, DFStorage *abstractStorage, WordPackage *
     get.conv = converter;
     DFNode *abstract = WordDocumentLens.get(&get,wordDocument);
     DFAppendChild(converter->html->docNode,abstract);
+    converter->html->root = abstract;
     Word_postProcessHTMLDoc(converter);
 
     HTMLAddExternalStyleSheet(converter->html,"reset.css");

http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c0d96a24/DocFormats/filters/ooxml/tests/word/WordTests.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/ooxml/tests/word/WordTests.c b/DocFormats/filters/ooxml/tests/word/WordTests.c
index 5e9b30f..dd85701 100644
--- a/DocFormats/filters/ooxml/tests/word/WordTests.c
+++ b/DocFormats/filters/ooxml/tests/word/WordTests.c
@@ -22,6 +22,7 @@
 #include "Word.h"
 #include "HTMLPlain.h"
 #include "DFString.h"
+#include "DFHTML.h"
 #include <DocFormats/Operations.h>
 #include <stddef.h>
 #include <string.h>
@@ -109,6 +110,7 @@ static void test_get(void)
         goto end;
     }
 
+    HTMLMetaRemove(htmlDoc,"corinthia-document-hash");
     htmlPlain = HTML_toPlain(htmlDoc,abstractStorage,&error);
 
 end:
@@ -214,6 +216,7 @@ static void test_put(void)
     if (htmlDoc == NULL)
         goto end;
 
+    HTMLMetaSet(htmlDoc,"corinthia-document-hash","ignore");
     DFAbstractDocumentSetHTML(abstractDoc,htmlDoc);
 
     // Update the docx file based on the contents of the HTML file