You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/28 01:55:10 UTC

svn commit: r1331640 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/iwork/PagesContentHandler.java test/java/org/apache/tika/parser/iwork/IWorkParserTest.java test/resources/test-documents/testPagesComments.pages

Author: nick
Date: Fri Apr 27 23:55:09 2012
New Revision: 1331640

URL: http://svn.apache.org/viewvc?rev=1331640&view=rev
Log:
TIKA-907 Comments in iWorks Pages files

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1331640&r1=1331639&r2=1331640&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri Apr 27 23:55:09 2012
@@ -38,7 +38,7 @@ class PagesContentHandler extends Defaul
        METADATA, PARSABLE_TEXT, 
        HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
        FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
-       FOOTNOTES;
+       FOOTNOTES, ANNOTATIONS;
     }
     private DocumentPart inPart = null;
     
@@ -48,6 +48,7 @@ class PagesContentHandler extends Defaul
     private HeaderFooter headers = null;
     private HeaderFooter footers = null;
     private Footnotes footnotes = null; 
+    private Annotations annotations = null; 
     
     private Map<String, List<List<String>>> tableData =
         new HashMap<String, List<List<String>>>();
@@ -139,6 +140,20 @@ class PagesContentHandler extends Defaul
                  xhtml.endElement("div");
               }
            }
+        } else if ("sf:annotations".equals(qName)) {
+           annotations = new Annotations();
+           inPart = DocumentPart.ANNOTATIONS;
+        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+           annotations.start(attributes.getValue("sf:target"));
+        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+           xhtml.startElement("div", "style", "annotated");
+           
+           String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+           if (annotationText != null) {
+              xhtml.startElement("div", "style", "annotation");
+              xhtml.characters(annotationText);
+              xhtml.endElement("div");
+           }
         }
 
         if (activeTableId != null) {
@@ -169,6 +184,10 @@ class PagesContentHandler extends Defaul
             xhtml.endElement("p");
         } else if ("sf:attachment".equals(qName)) {
             activeTableId = null;
+        } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+           annotations.end();
+        } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+           xhtml.endElement("div");
         }
     }
 
@@ -186,6 +205,7 @@ class PagesContentHandler extends Defaul
               if (inPart == DocumentPart.FOOTER_EVEN)  footers.defaultEven = str;
               if (inPart == DocumentPart.FOOTER_ODD)   footers.defaultOdd = str;
               if (inPart == DocumentPart.FOOTNOTES)    footnotes.text(str);
+              if (inPart == DocumentPart.ANNOTATIONS)  annotations.text(str);
           }
         }
     }
@@ -329,7 +349,8 @@ class PagesContentHandler extends Defaul
        }
     }
     /**
-     * Represents Footnotes in a document
+     * Represents Footnotes in a document. The way these work
+     *  in the file format isn't very clean...
      */
     private static class Footnotes {
        /** Mark -> Text */
@@ -351,4 +372,31 @@ class PagesContentHandler extends Defaul
           }
        }
     }
+    /**
+     * Represents Annotations in a document. We currently just grab all
+     *  the sf:p text passed to text() between start() and end() of each one
+     */
+    private static class Annotations {
+       /** ID -> Text, for each completed annotation that had some text */
+       Map<String,String> annotations = new HashMap<String, String>();
+       String currentID = null;
+       StringBuilder currentText = null;
+       private void start(String id) {
+          currentID = id;
+          currentText = new StringBuilder();
+       }
+       private void text(String text) {
+          if (text != null && text.length() > 0 && currentText != null) {
+             currentText.append(text);
+          }
+       }
+       private void end() {
+          // Tolerate end() without start(), and only store non-empty text
+          if (currentText != null && currentText.length() > 0) {
+             annotations.put(currentID, currentText.toString());
+          }
+          // Always reset, so an empty annotation can't leak state onwards
+          currentID = null;
+          currentText = null;
+       }
+    }
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331640&r1=1331639&r2=1331640&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 23:55:09 2012
@@ -161,7 +161,7 @@ public class IWorkParserTest extends Tes
     }
     
     /**
-     * Check we get headers, footers and footnotes from keynote
+     * Check we get headers, footers and footnotes from Pages
      */
     public void testParsePagesHeadersFootersFootnotes() throws Exception {
        String footnote = "Footnote: Do a lot of people really use iWork?!?!";
@@ -186,6 +186,31 @@ public class IWorkParserTest extends Tes
        assertContains(contents, footnote);
     }
     
+    /**
+     * Check we get annotations (eg comments) from Pages
+     */
+    public void testParsePagesAnnotations() throws Exception {
+       String commentA = "comment about the APXL file";
+       String commentB = "comment about UIMA";
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+       // The parser leaves closing the stream to the caller, so do it here
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
+       try {
+          iWorkParser.parse(input, handler, metadata, parseContext);
+       } finally {
+          input.close();
+       }
+       String contents = handler.toString();
+       // Check regular text
+       assertContains(contents, "Both Pages 1.x"); // P1
+       assertContains(contents, "understanding the Pages document"); // P1
+       assertContains(contents, "should be page 2"); // P2
+       
+       // Check for comments
+       assertContains(contents, commentA);
+       assertContains(contents, commentB);
+    }
+    
     public void assertContains(String haystack, String needle) {
        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
     }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages?rev=1331640&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages
------------------------------------------------------------------------------
    svn:mime-type = application/zip