You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@corinthia.apache.org by gb...@apache.org on 2015/05/10 03:13:26 UTC
[3/3] incubator-corinthia git commit: Produce a list of html nodes.
Produce a list of html nodes.
This is the current blueprint I hope to use; I think it is ready to be
reviewed to check whether this approach is useful.
I can and probably will add function pointers to struct
ODF_to_HTML_key where needed.
What works: This produces a list of HTML nodes. H1..H6 tags are
correctly reported and corresponding nodes are generated. Missing
TEXT_H entries also are correctly reported. Text nodes are created.
To test, from the build directory:
~/odf-branch/incubator-corinthia/build>
$ ./bin/dfconvert get ../gbg_samples/headers.odt foo.html;
* gbg_test_output.txt: Output of test above for convenience.
* DocFormats/filters/odf/src/text/ODFText.c
(ODFTextGet): add full report for newly generated htmlNode list.
(traverseContent): plug in the traversal algorithm.
* DocFormats/filters/odf/src/text/gbg_test.c
(struct ODF_to_HTML_key): Move declaration to gbg_test.h.
(report_tags_found): Minor changes.
(listODF_keys): New analysis function to only print out the current
mappings.
(locate_HTML): Remove magic number.
(show_nodes): New function. Traverse a list of nodes and print every
node out.
(printNode): Function now prints a full DFNode profile (where
applicable).
(printMissingTag): New function. Create string to alert to broken
nodes.
* DocFormats/filters/odf/src/text/gbg_test.h
Add prototypes for the new functions mentioned in gbg_test.c.
(translateXMLEnumName): Add 10 dummy entries so the magic number
'10' can be removed from code.
* gbg_samples: New directory with debugging odt files.
* gbg_samples/headers.odt: odt file containing all ten possible header tags.
Project: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/commit/c81e6862
Tree: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/tree/c81e6862
Diff: http://git-wip-us.apache.org/repos/asf/incubator-corinthia/diff/c81e6862
Branch: refs/heads/odf-filter-attempt2
Commit: c81e68626489b9515e7e8f3a5ce5d38ac8f59af0
Parents: 88d7f15
Author: Gabriela Gibson <gb...@apache.org>
Authored: Sun May 10 02:14:03 2015 +0100
Committer: Gabriela Gibson <gb...@apache.org>
Committed: Sun May 10 02:14:03 2015 +0100
----------------------------------------------------------------------
DocFormats/filters/odf/src/text/ODFText.c | 48 +-
DocFormats/filters/odf/src/text/gbg_test.c | 99 +-
DocFormats/filters/odf/src/text/gbg_test.h | 6719 ++++++++++++-----------
gbg_samples/headers.odt | Bin 0 -> 9780 bytes
gbg_test_output.txt | 427 ++
5 files changed, 3895 insertions(+), 3398 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c81e6862/DocFormats/filters/odf/src/text/ODFText.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/odf/src/text/ODFText.c b/DocFormats/filters/odf/src/text/ODFText.c
index de1baed..3025f0b 100644
--- a/DocFormats/filters/odf/src/text/ODFText.c
+++ b/DocFormats/filters/odf/src/text/ODFText.c
@@ -34,24 +34,33 @@ typedef struct {
DFHashTable *htmlIdByNumId;
} ODFPutData;
-
static void traverseContent(ODFTextConverter *conv, DFNode *odfNode, DFNode *htmlNode)
{
- for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next)
- {
- // printNode(odfChild);
- if (odfChild->tag == 2) {
- // we have some text here.
+ DFNode *child;
+
+ for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next) {
+ if (odfChild->tag == 2) { // we have some text here.
+ child = DFCreateChildElement(htmlNode, odfChild->tag);
+ child->value = xstrdup(odfChild->value);
+ }
+ else {
+ Tag newTag = locate_HTML(odfChild);
+ if (newTag) { // we find an already mapped ODF -> HTML tag
+ child = DFCreateChildElement(htmlNode, newTag);
}
- else {
- Tag newTag = locate_HTML(odfChild);
+ else { // We found a missing tag
+ child = DFCreateChildElement(htmlNode, 2);
+ child->value = printMissingTag(odfChild->tag);
+ if (odfChild->attrs)
+ DFSetAttribute(child, odfChild->attrs->tag, odfChild->attrs->value);
}
- traverseContent(conv,odfChild,htmlNode);
}
- // TODO: Add a switch statement here to check the type of ODF element, and use
- // DFCreateChildElement to create a new element in the HTML document as a child of htmlNode
- // based on the type. As this function gets more complicated, it will likely be useful to
- // split it up into several functions
+ traverseContent(conv,odfChild,htmlNode);
+ }
+ // TODO: Add a switch statement here to check the type of ODF element, and use
+ // DFCreateChildElement to create a new element in the HTML document as a child of htmlNode
+ // based on the type. As this function gets more complicated, it will likely be useful to
+ // split it up into several functions
}
DFDocument *ODFTextGet(DFStorage *concreteStorage, DFStorage *abstractStorage, const char *idPrefix, DFError **error)
@@ -74,12 +83,19 @@ DFDocument *ODFTextGet(DFStorage *concreteStorage, DFStorage *abstractStorage, c
// contentDoc is loaded from content.xml, and represents the most important information in
// the document, i.e. the text, tables, lists, etc.
tagSeen = " ";
- // Tag newTag = locate_HTML(package->contentDoc->root);
+
traverseContent(conv, package->contentDoc->root, body);
-
- if (REPORT_TAG_FOUND)
+
+ if (REPORT_TAG_FOUND)
free(tagSeen);
+ printf("============================================================\n"
+ "Showing the result of the traverseContent function\n"
+ "============================================================\n"
+ );
+ show_nodes(body);
+
+
// TODO: Once this basic traversal is implemented and is capable of producing paragraphs,
// tables, lists, and spans, add ids to the HTML elements as they are created. That is, set
// the id attribute of each new HTML element to a string containing the idPrefix followed by
http://git-wip-us.apache.org/repos/asf/incubator-corinthia/blob/c81e6862/DocFormats/filters/odf/src/text/gbg_test.c
----------------------------------------------------------------------
diff --git a/DocFormats/filters/odf/src/text/gbg_test.c b/DocFormats/filters/odf/src/text/gbg_test.c
index f56c14f..ea7b92f 100644
--- a/DocFormats/filters/odf/src/text/gbg_test.c
+++ b/DocFormats/filters/odf/src/text/gbg_test.c
@@ -9,18 +9,6 @@
#include "DFXMLNames.h"
#include "gbg_test.h"
-void printNode(DFNode *node); // temp func, just for some convenience
-/// Helper functions
-
-
-// it may be that the Tag Attribute is not needed.
-typedef struct {
- Tag ODF_KEY;
- Tag HTML_KEY;
- Tag attribute;
- char *attribute_value;
-} ODF_to_HTML_key;
-
#define ENDMARKER 555555
ODF_to_HTML_key ODF_to_HTML_keys [] = {
{ 1, HTML_A, 0, NULL},
@@ -111,7 +99,7 @@ ODF_to_HTML_key ODF_to_HTML_keys [] = {
{ TEXT_H, HTML_H6, 2310, "Heading_20_7" },
{ TEXT_H, HTML_H6, 2310, "Heading_20_8" },
{ TEXT_H, HTML_H6, 2310, "Heading_20_9" },
- { TEXT_H, HTML_H6, 2310, "Heading_20_10" },
+ // { TEXT_H, HTML_H6, 2310, "Heading_20_10" },
{ 1, HTML_HEAD, 0, NULL},
{ 1, HTML_HEADER, 0, NULL},
{ 1, HTML_HEADERS, 0, NULL},
@@ -329,7 +317,9 @@ ODF_to_HTML_key ODF_to_HTML_keys [] = {
{ 0,ENDMARKER, 0, NULL},
};
-
+// strictly speaking because printing the generated node list out
+// gives the same information, this is no longer necessary, but it is
+// an easier to read report.
void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
{
if (!REPORT_TAG_FOUND) return;
@@ -345,9 +335,9 @@ void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
snprintf(newTagSeen, len,"%s%s",tagSeen,name);
tagSeen = xstrdup(newTagSeen);
free(newTagSeen);
-
+
if (missing_tag == 1) {
- printf("Missing: { %s,\"Add HTML key here\" },\n",name);
+ printf("ODF Key not matched: %s --- %zu\n",name, HTML);
}
else if (missing_tag == 2) {
printf("Error: No entry found in DFXMLNames: DFNodeName = %s Tag: %d\n", name, HTML);
@@ -359,34 +349,45 @@ void report_tags_found(const char *name, Tag HTML, Tag missing_tag)
}
}
+void listODF_keys(int how)
+{
+ for (int i = 0; ODF_to_HTML_keys[i].HTML_KEY != ENDMARKER; i++) {
+ if (ODF_to_HTML_keys[i].ODF_KEY > 3 && ODF_to_HTML_keys[i].HTML_KEY > 3) {
+ printf("%-16s <---> %s\n",
+ translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY],
+ translateXMLEnumName[ODF_to_HTML_keys[i].HTML_KEY]);
+ } else if (ODF_to_HTML_keys[i].ODF_KEY > 3 && ODF_to_HTML_keys[i].HTML_KEY < 3) {
+ printf("No suitable match found: %-16s\n",
+ translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY]);
+ }
+ }
+}
+
+
Tag locate_HTML(DFNode *odfNode)
{
- // subtract the offset of 10 in the enum defined in DFXMLNames.h
- int index = (int)odfNode->tag - 10;
+ int index = (int)odfNode->tag;
int attrib_not_found = 0;
if (index > -1) {
for (int i = 0; ODF_to_HTML_keys[i].HTML_KEY != ENDMARKER; i++) {
- //printf("Seen: %s\n", translateXMLEnumName[ODF_to_HTML_keys[i].ODF_KEY - 10]);
- if (ODF_to_HTML_keys[i].ODF_KEY - 10 == index) {
+ if (ODF_to_HTML_keys[i].ODF_KEY == index) {
if (ODF_to_HTML_keys[i].attribute_value) {
if (strcmp(odfNode->attrs->value, ODF_to_HTML_keys[i].attribute_value)) {
attrib_not_found = 1;
continue;
} else {
- report_tags_found(translateXMLEnumName[index], ODF_to_HTML_keys[i].HTML_KEY - 10, 0);
- attrib_not_found = 0;
+ report_tags_found(translateXMLEnumName[index], ODF_to_HTML_keys[i].HTML_KEY, 0);
return ODF_to_HTML_keys[i].HTML_KEY;
}
}
- if (attrib_not_found == 1) { // we have attribs, but one is missing
- report_tags_found(odfNode->attrs->value, ODF_to_HTML_keys[i-1].ODF_KEY - 10, 3);
+ if (attrib_not_found) { // we have attribs, but one is missing
+ report_tags_found(odfNode->attrs->value, ODF_to_HTML_keys[i-1].ODF_KEY, 3);
return 0;
}
}
- }
- // Valid Tag not found in array
- report_tags_found(translateXMLEnumName[index], 0, 1);
+ }
+ report_tags_found(translateXMLEnumName[index], index, 1);
return 0;
}
else {
@@ -397,18 +398,46 @@ Tag locate_HTML(DFNode *odfNode)
}
+void show_nodes(DFNode *odfNode)
+{
+ for (DFNode *odfChild = odfNode->first; odfChild != NULL; odfChild = odfChild->next) {
+ printNode(odfChild);
+ }
+}
+
void printNode(DFNode *n)
{
if (n == NULL) return;
- // printf("Tag = %d Attrcount = %d\t", n->tag, n->attrcount);
- // printf("seqNo = %zu \t", n->seqNo);
- printf("value = %s \t\t", n->value);
+
+ printf("Tag tag: %zu\n",n->tag);
+ printf("unsigned int seqNo: %d\n",n->seqNo);
+ // printf("struct DFDocument *doc: %p\n",n->doc);
+ if (n->js) printf("void *js: %p\n",n->js);
+ if (n->changed) printf("int changed: %d\n",n->changed);
+ if (n->childrenChanged) printf("int childrenChanged %d\n",n->childrenChanged);
+ if (n->seqNoHashNext) printf("DFNode *seqNoHashNext %p\n", n->seqNoHashNext);
if (n->attrs) {
- printf("HTML TAG = %d %s \t", n->attrs->tag,
- translateXMLEnumName[locate_HTML(n)-10]);
- // printf("attr value = %s \t", n->attrs->value);
+ printf("DFAttribute *attrs: %p ",n->attrs);
+ printf(", Tag tags: %zu ",n->attrs->tag);
+ printf(", char *value: %s ",n->attrs->value);
+ printf("HTML TAG = %d: %s \n", n->attrs->tag,
+ translateXMLEnumName[locate_HTML(n)]);
+
}
+ if (n->attrsCount) printf("unsigned int attrsCount: %d\n",n->attrsCount);
+ if (n->attrsAlloc) printf("unsigned int attrsAlloc: %d\n", n->attrsAlloc);
+ if (n->target) printf("char *target: %s\n", n->target);
+ if (n->value) printf("char *value: %s\n", n->value);
if (n->tag > 2)
- printf("ODFKey = %s ", translateXMLEnumName[n->tag-10]);
- printf("\n");
+ printf("Tag Text = %s ", translateXMLEnumName[n->tag]);
+ printf("\n==================================================\n");
+}
+
+char *printMissingTag(Tag tag)
+{
+ char *s = translateXMLEnumName[tag];
+ int len = strlen(s)+14;
+ char *r = malloc(len);
+ snprintf(r, len,"Missing tag: %s",s);
+ return r;
}