You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/28 00:48:32 UTC
svn commit: r1331618 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
Author: nick
Date: Fri Apr 27 22:48:32 2012
New Revision: 1331618
URL: http://svn.apache.org/viewvc?rev=1331618&view=rev
Log:
TIKA-906 Support extracting Headers, Footers and Footnotes in iWorks Pages files. As part of this, make the parser a little more aware of where in the file it is, and start tracking some of the earlier parts of the file ready for when we hit the main text
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1331618&r1=1331617&r2=1331618&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri Apr 27 22:48:32 2012
@@ -33,11 +33,22 @@ class PagesContentHandler extends Defaul
private final XHTMLContentHandler xhtml;
private final Metadata metadata;
- private boolean inMetaDataPart = false;
+ /**
+  * The (interesting) part of the document we're currently in.
+  * Covers the metadata, the main parsable text, the header and footer
+  * containers along with their odd / even / first variants, and the
+  * footnotes. Should be more structured...
+  */
+ private enum DocumentPart {
+     METADATA,
+     PARSABLE_TEXT,
+     HEADERS,
+     HEADER_ODD,
+     HEADER_EVEN,
+     HEADER_FIRST,
+     FOOTERS,
+     FOOTER_ODD,
+     FOOTER_EVEN,
+     FOOTER_FIRST,
+     FOOTNOTES
+ }
+ private DocumentPart inPart = null;
+
private boolean parseProperty = false;
- private boolean inParsableText = false;
private int pageCount = 0;
+ private HeaderFooter headers = null;
+ private HeaderFooter footers = null;
+ private Footnotes footnotes = null;
+
private Map<String, List<List<String>>> tableData =
new HashMap<String, List<List<String>>>();
private String activeTableId;
@@ -56,6 +67,7 @@ class PagesContentHandler extends Defaul
public void endDocument() throws SAXException {
metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
if (pageCount > 0) {
+ doFooter();
xhtml.endElement("div");
}
}
@@ -77,17 +89,19 @@ class PagesContentHandler extends Defaul
}
if ("sl:publication-info".equals(qName)) {
- inMetaDataPart = true;
+ inPart = DocumentPart.METADATA;
} else if ("sf:metadata".equals(qName)) {
- inMetaDataPart = true;
+ inPart = DocumentPart.METADATA;
} else if ("sf:page-start".equals(qName)) {
if (pageCount > 0) {
+ doFooter();
xhtml.endElement("div");
}
xhtml.startElement("div");
pageCount++;
+ doHeader();
} else if ("sf:p".equals(qName) && pageCount > 0) {
- inParsableText = true;
+ inPart = DocumentPart.PARSABLE_TEXT;
xhtml.startElement("p");
} else if ("sf:attachment".equals(qName)) {
String kind = attributes.getValue("sf:kind");
@@ -98,13 +112,40 @@ class PagesContentHandler extends Defaul
} else if ("sf:attachment-ref".equals(qName)) {
String idRef = attributes.getValue("sfa:IDREF");
outputTable(idRef);
+ } else if ("sf:headers".equals(qName)) {
+ headers = new HeaderFooter(qName);
+ inPart = DocumentPart.HEADERS;
+ } else if ("sf:footers".equals(qName)) {
+ footers = new HeaderFooter(qName);
+ inPart = DocumentPart.FOOTERS;
+ } else if ("sf:header".equals(qName)) {
+ inPart = headers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:footer".equals(qName)) {
+ inPart = footers.identifyPart(attributes.getValue("sf:name"));
+ } else if ("sf:footnotes".equals(qName)) {
+ footnotes = new Footnotes();
+ inPart = DocumentPart.FOOTNOTES;
+ } else if ("sf:footnote-mark".equals(qName)) {
+ footnotes.recordMark(attributes.getValue("sf:mark"));
+ } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ // What about non auto-numbered?
+ String footnoteMark = attributes.getValue("sf:autonumber");
+ if (footnotes != null) {
+ String footnoteText = footnotes.footnotes.get(footnoteMark);
+ if (footnoteText != null) {
+ xhtml.startElement("div", "style", "footnote");
+ xhtml.characters("Footnote:" ); // As shown in Pages
+ xhtml.characters(footnoteText);
+ xhtml.endElement("div");
+ }
+ }
}
if (activeTableId != null) {
parseTableData(qName, attributes);
}
- if (inMetaDataPart) {
+ if (inPart == DocumentPart.METADATA) {
metaDataLocalName = localName;
metaDataQName = qName;
parseProperty = true;
@@ -120,11 +161,11 @@ class PagesContentHandler extends Defaul
}
if ("sl:publication-info".equals(qName)) {
- inMetaDataPart = false;
+ inPart = null;
} else if ("sf:metadata".equals(qName)) {
- inMetaDataPart = false;
+ inPart = null;
} else if ("sf:p".equals(qName) && pageCount > 0) {
- inParsableText = false;
+ inPart = null;
xhtml.endElement("p");
} else if ("sf:attachment".equals(qName)) {
activeTableId = null;
@@ -133,8 +174,19 @@ class PagesContentHandler extends Defaul
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
- if (inParsableText && length > 0) {
- xhtml.characters(ch, start, length);
+ if (length > 0) {
+ if (inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.characters(ch, start, length);
+ } else if(inPart != null) {
+ String str = new String(ch, start, length);
+ if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+ if (inPart == DocumentPart.HEADER_EVEN) headers.defaultEven = str;
+ if (inPart == DocumentPart.HEADER_ODD) headers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+ if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
+ if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
+ if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
+ }
}
}
@@ -217,5 +269,86 @@ class PagesContentHandler extends Defaul
return null;
}
+
+ /**
+  * Writes out the header, if a headers section has been recorded.
+  * Does nothing when no headers were seen.
+  */
+ private void doHeader() throws SAXException {
+     if (headers == null) {
+         return;
+     }
+     headers.output("header");
+ }
+ /**
+  * Writes out the footer, if a footers section has been recorded.
+  * Does nothing when no footers were seen.
+  */
+ private void doFooter() throws SAXException {
+     if (footers == null) {
+         return;
+     }
+     footers.output("footer");
+ }
+ /**
+  * Represents the Headers or Footers in a document.
+  * Holds the default odd, even and first page text, and writes out
+  * whichever one applies to the current page count.
+  */
+ private class HeaderFooter {
+     private String type; // sf:headers or sf:footers
+     private String defaultOdd;
+     private String defaultEven;
+     private String defaultFirst;
+     // TODO Can there be custom ones?
+
+     private HeaderFooter(String type) {
+         this.type = type;
+     }
+
+     /**
+      * Maps the sf:name of a header or footer element onto the
+      * document part it represents.
+      *
+      * @param name the value of the sf:name attribute, may be null
+      * @return the matching part, or null if the name is not one of
+      *         the known default identifiers
+      */
+     private DocumentPart identifyPart(String name) {
+         if ("SFWPDefaultOddHeaderIdentifier".equals(name)) {
+             return DocumentPart.HEADER_ODD;
+         }
+         if ("SFWPDefaultEvenHeaderIdentifier".equals(name)) {
+             return DocumentPart.HEADER_EVEN;
+         }
+         if ("SFWPDefaultFirstHeaderIdentifier".equals(name)) {
+             return DocumentPart.HEADER_FIRST;
+         }
+
+         if ("SFWPDefaultOddFooterIdentifier".equals(name)) {
+             return DocumentPart.FOOTER_ODD;
+         }
+         if ("SFWPDefaultEvenFooterIdentifier".equals(name)) {
+             return DocumentPart.FOOTER_EVEN;
+         }
+         if ("SFWPDefaultFirstFooterIdentifier".equals(name)) {
+             return DocumentPart.FOOTER_FIRST;
+         }
+
+         return null;
+     }
+
+     /**
+      * Outputs the text which applies to the current page, wrapped in
+      * a div. The first page text is used on page one when present,
+      * the even text on even pages when present, and otherwise the
+      * odd text. Nothing is written if the chosen text is null.
+      *
+      * @param what the value for the div's class attribute, as passed
+      *             by the caller ("header" or "footer")
+      */
+     private void output(String what) throws SAXException {
+         String text = null;
+         if (pageCount == 1 && defaultFirst != null) {
+             text = defaultFirst;
+         } else if (pageCount % 2 == 0 && defaultEven != null) {
+             text = defaultEven;
+         } else {
+             text = defaultOdd;
+         }
+
+         if (text != null) {
+             // Use the caller supplied kind as the class, so that
+             // footers are marked as footers rather than as headers
+             xhtml.startElement("div", "class", what);
+             xhtml.characters(text);
+             xhtml.endElement("div");
+         }
+     }
+ }
+ /**
+  * Represents Footnotes in a document. Text is accumulated against
+  * the most recently recorded footnote mark.
+  */
+ private static class Footnotes {
+     /** Mark -> Text */
+     Map<String,String> footnotes = new HashMap<String, String>();
+     String lastSeenMark = null;
+
+     /**
+      * Remembers the mark that following text belongs to.
+      * Normally happens before the text of the mark
+      */
+     private void recordMark(String mark) {
+         lastSeenMark = mark;
+     }
+
+     /**
+      * Stores the text against the last seen mark, appending to any
+      * text already held for it. Ignored if no mark has been seen yet.
+      */
+     private void text(String text) {
+         if (lastSeenMark == null) {
+             return;
+         }
+         String combined = footnotes.containsKey(lastSeenMark)
+                 ? footnotes.get(lastSeenMark) + text
+                 : text;
+         footnotes.put(lastSeenMark, combined);
+     }
+ }
}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331618&r1=1331617&r2=1331618&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 22:48:32 2012
@@ -159,4 +159,34 @@ public class IWorkParserTest extends Tes
// Will have been identified as encrypted
assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
}
+
+ /**
+  * Check we get headers, footers and footnotes from Pages
+  */
+ public void testParsePagesHeadersFootersFootnotes() throws Exception {
+     String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+     String header = "THIS IS SOME HEADER TEXT";
+     String footer = "THIS IS SOME FOOTER TEXT";
+
+     InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
+     Metadata metadata = new Metadata();
+     ContentHandler handler = new BodyContentHandler();
+
+     try {
+         iWorkParser.parse(input, handler, metadata, parseContext);
+     } finally {
+         // Always release the test document, even if parsing fails
+         if (input != null) {
+             input.close();
+         }
+     }
+     String contents = handler.toString();
+
+     // Check regular text
+     assertContains(contents, "Both Pages 1.x"); // P1
+     assertContains(contents, "understanding the Pages document"); // P1
+     assertContains(contents, "should be page 2"); // P2
+
+     // Check for headers, footers and footnotes
+     assertContains(contents, header);
+     assertContains(contents, footer);
+     assertContains(contents, footnote);
+ }
+
+ /**
+  * Asserts that the needle occurs somewhere within the haystack,
+  * reporting the whole haystack in the failure message if not.
+  */
+ public void assertContains(String haystack, String needle) {
+     String failureMessage = needle + " not found in:\n" + haystack;
+     boolean found = haystack.contains(needle);
+     assertTrue(failureMessage, found);
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages?rev=1331618&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
------------------------------------------------------------------------------
svn:mime-type = application/zip