You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/28 00:48:32 UTC

svn commit: r1331618 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/iwork/PagesContentHandler.java test/java/org/apache/tika/parser/iwork/IWorkParserTest.java test/resources/test-documents/testPagesHeadersFootersFootnotes.pages

Author: nick
Date: Fri Apr 27 22:48:32 2012
New Revision: 1331618

URL: http://svn.apache.org/viewvc?rev=1331618&view=rev
Log:
TIKA-906 Support extracting Headers, Footers and Footnotes in iWorks Pages files. As part of this, make the parser a little more aware of where in the file it is, and start tracking some of the earlier parts of the file ready for when we hit the main text

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1331618&r1=1331617&r2=1331618&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri Apr 27 22:48:32 2012
@@ -33,11 +33,22 @@ class PagesContentHandler extends Defaul
     private final XHTMLContentHandler xhtml;
     private final Metadata metadata;
 
-    private boolean inMetaDataPart = false;
+    /** The (interesting) part of the document we're in. Should be more structured... */
+    private enum DocumentPart {
+       METADATA, PARSABLE_TEXT, // sl:publication-info or sf:metadata; sf:p once a page has started
+       HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST, // sf:headers, then per sf:header (by sf:name)
+       FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST, // sf:footers, then per sf:footer (by sf:name)
+       FOOTNOTES; // set when sf:footnotes starts
+    }
+    private DocumentPart inPart = null;
+    
     private boolean parseProperty = false;
-    private boolean inParsableText = false;
     private int pageCount = 0;
 
+    private HeaderFooter headers = null;
+    private HeaderFooter footers = null;
+    private Footnotes footnotes = null; 
+    
     private Map<String, List<List<String>>> tableData =
         new HashMap<String, List<List<String>>>();
     private String activeTableId;
@@ -56,6 +67,7 @@ class PagesContentHandler extends Defaul
     public void endDocument() throws SAXException {
         metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
         if (pageCount > 0) {
+            doFooter();
             xhtml.endElement("div");
         }
     }
@@ -77,17 +89,19 @@ class PagesContentHandler extends Defaul
         }
 
         if ("sl:publication-info".equals(qName)) {
-            inMetaDataPart = true;
+            inPart = DocumentPart.METADATA;
         } else if ("sf:metadata".equals(qName)) {
-            inMetaDataPart = true;
+           inPart = DocumentPart.METADATA;
         } else if ("sf:page-start".equals(qName)) {
             if (pageCount > 0) {
+                doFooter();
                 xhtml.endElement("div");
             }
             xhtml.startElement("div");
             pageCount++;
+            doHeader();
         } else if ("sf:p".equals(qName) && pageCount > 0) {
-            inParsableText = true;
+            inPart = DocumentPart.PARSABLE_TEXT;
             xhtml.startElement("p");
         } else if ("sf:attachment".equals(qName)) {
             String kind = attributes.getValue("sf:kind");
@@ -98,13 +112,40 @@ class PagesContentHandler extends Defaul
         } else if ("sf:attachment-ref".equals(qName)) {
             String idRef = attributes.getValue("sfa:IDREF");
             outputTable(idRef);
+        } else if ("sf:headers".equals(qName)) {
+            headers = new HeaderFooter(qName);
+            inPart = DocumentPart.HEADERS;
+        } else if ("sf:footers".equals(qName)) {
+           footers = new HeaderFooter(qName);
+           inPart = DocumentPart.FOOTERS;
+        } else if ("sf:header".equals(qName)) {
+            inPart = headers.identifyPart(attributes.getValue("sf:name"));
+        } else if ("sf:footer".equals(qName)) {
+           inPart = footers.identifyPart(attributes.getValue("sf:name"));
+        } else if ("sf:footnotes".equals(qName)) {
+           footnotes = new Footnotes();
+           inPart = DocumentPart.FOOTNOTES;
+        } else if ("sf:footnote-mark".equals(qName)) {
+           footnotes.recordMark(attributes.getValue("sf:mark"));
+        } else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+           // What about non auto-numbered?
+           String footnoteMark = attributes.getValue("sf:autonumber");
+           if (footnotes != null) {
+              String footnoteText = footnotes.footnotes.get(footnoteMark);
+              if (footnoteText != null) {
+                 xhtml.startElement("div", "style", "footnote");
+                 xhtml.characters("Footnote:" ); // As shown in Pages
+                 xhtml.characters(footnoteText);
+                 xhtml.endElement("div");
+              }
+           }
         }
 
         if (activeTableId != null) {
             parseTableData(qName, attributes);
         }
 
-        if (inMetaDataPart) {
+        if (inPart == DocumentPart.METADATA) {
             metaDataLocalName = localName;
             metaDataQName = qName;
             parseProperty = true;
@@ -120,11 +161,11 @@ class PagesContentHandler extends Defaul
         }
 
         if ("sl:publication-info".equals(qName)) {
-            inMetaDataPart = false;
+            inPart = null;
         } else if ("sf:metadata".equals(qName)) {
-            inMetaDataPart = false;
+            inPart = null;
         } else if ("sf:p".equals(qName) && pageCount > 0) {
-            inParsableText = false;
+            inPart = null;
             xhtml.endElement("p");
         } else if ("sf:attachment".equals(qName)) {
             activeTableId = null;
@@ -133,8 +174,19 @@ class PagesContentHandler extends Defaul
 
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
-        if (inParsableText && length > 0) {
-            xhtml.characters(ch, start, length);
+        if (length > 0) {
+           if (inPart == DocumentPart.PARSABLE_TEXT) {
+              xhtml.characters(ch, start, length);
+          } else if(inPart != null) {
+              String str = new String(ch, start, length);
+              if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
+              if (inPart == DocumentPart.HEADER_EVEN)  headers.defaultEven = str;
+              if (inPart == DocumentPart.HEADER_ODD)   headers.defaultOdd = str;
+              if (inPart == DocumentPart.FOOTER_FIRST) footers.defaultFirst = str;
+              if (inPart == DocumentPart.FOOTER_EVEN)  footers.defaultEven = str;
+              if (inPart == DocumentPart.FOOTER_ODD)   footers.defaultOdd = str;
+              if (inPart == DocumentPart.FOOTNOTES)    footnotes.text(str);
+          }
         }
     }
 
@@ -217,5 +269,86 @@ class PagesContentHandler extends Defaul
 
         return null;
     }
+    
+    private void doHeader() throws SAXException {
+       // No sf:headers element seen so far, so there is no header text to write
+       if (headers == null) return;
+       headers.output("header");
+    }
+    private void doFooter() throws SAXException {
+       // No sf:footers element seen so far, so there is no footer text to write
+       if (footers == null) return;
+       footers.output("footer");
+    }
 
+    /**
+     * Represents the Headers or Footers in a document. Holds the text of the
+     * default first / even / odd variants and writes the right one for a page.
+     */
+    private class HeaderFooter {
+       private String type; // sf:headers or sf:footers
+       private String defaultOdd;
+       private String defaultEven;
+       private String defaultFirst;
+       // TODO Can there be custom ones?
+       private HeaderFooter(String type) {
+          this.type = type;
+       }
+       private DocumentPart identifyPart(String name) {
+          if("SFWPDefaultOddHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_ODD;
+          if("SFWPDefaultEvenHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_EVEN;
+          if("SFWPDefaultFirstHeaderIdentifier".equals(name))
+             return DocumentPart.HEADER_FIRST;
+          
+          if("SFWPDefaultOddFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_ODD;
+          if("SFWPDefaultEvenFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_EVEN;
+          if("SFWPDefaultFirstFooterIdentifier".equals(name))
+             return DocumentPart.FOOTER_FIRST;
+          // Not one of the default header / footer names (this also covers null)
+          return null;
+       }
+       private void output(String what) throws SAXException {
+          String text = null;
+          if (pageCount == 1 && defaultFirst != null) {
+             text = defaultFirst;
+          } else if (pageCount % 2 == 0 && defaultEven != null) {
+             text = defaultEven;
+          } else {
+             text = defaultOdd;
+          }
+          // "what" is the div class ("header" or "footer"), so footers are no longer tagged as headers
+          if (text != null) {
+             xhtml.startElement("div", "class", what);
+             xhtml.characters(text);
+             xhtml.endElement("div");
+          }
+       }
+    }
+    /**
+     * Represents Footnotes in a document
+     */
+    private static class Footnotes {
+       /** Mark -> Text */
+       Map<String,String> footnotes = new HashMap<String, String>();
+       String lastSeenMark = null;
+       
+       /**
+        * Normally happens before the text of the mark
+        */
+       private void recordMark(String mark) {
+          lastSeenMark = mark;
+       }
+       private void text(String text) {
+          if (lastSeenMark != null) {
+             if (footnotes.containsKey(lastSeenMark)) {
+                text = footnotes.get(lastSeenMark) + text;
+             }
+             footnotes.put(lastSeenMark, text);
+          }
+       }
+    }
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331618&r1=1331617&r2=1331618&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 22:48:32 2012
@@ -159,4 +159,34 @@ public class IWorkParserTest extends Tes
        // Will have been identified as encrypted
        assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
     }
+    
+    /**
+     * Check we get headers, footers and footnotes from a Pages document (TIKA-906)
+     */
+    public void testParsePagesHeadersFootersFootnotes() throws Exception {
+       String footnote = "Footnote: Do a lot of people really use iWork?!?!";
+       String header = "THIS IS SOME HEADER TEXT";
+       String footer = "THIS IS SOME FOOTER TEXT";
+       
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesHeadersFootersFootnotes.pages");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, metadata, parseContext);
+       String contents = handler.toString();
+
+       // Check regular text
+       assertContains(contents, "Both Pages 1.x"); // P1
+       assertContains(contents, "understanding the Pages document"); // P1
+       assertContains(contents, "should be page 2"); // P2
+       
+       // Check for headers, footers and footnotes
+       assertContains(contents, header);
+       assertContains(contents, footer);
+       assertContains(contents, footnote);
+    }
+    
+    public void assertContains(String haystack, String needle) {
+       assertTrue(needle + " not found in:\n" + haystack, haystack.indexOf(needle) >= 0);
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages?rev=1331618&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesHeadersFootersFootnotes.pages
------------------------------------------------------------------------------
    svn:mime-type = application/zip