You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/26 16:33:55 UTC

svn commit: r948452 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java test/java/org/apache/tika/parser/iwork/IWorkParserTest.java test/resources/test-documents/testKeynote.key

Author: jukka
Date: Wed May 26 14:33:54 2010
New Revision: 948452

URL: http://svn.apache.org/viewvc?rev=948452&view=rev
Log:
TIKA-402: Support for Keynote and Pages documents

Patch by Martijn van Groningen

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java Wed May 26 14:33:54 2010
@@ -34,14 +34,14 @@ class KeynoteContentHandler extends Defa
     private boolean inTheme = false;
     private boolean inTitle = false;
     private boolean inBody = false;
+    private String tableId;
+    private Integer numberOfColumns = null;
+    private Integer currentColumn = null;
 
     private boolean inMetadata = false;
     private boolean inMetaDataTitle = false;
     private boolean inMetaDataAuthors = false;
 
-    private boolean stickNote = false;
-    private boolean notes = false;
-
     private boolean inParsableText = false;
 
     private int numberOfSlides = 0;
@@ -93,6 +93,16 @@ class KeynoteContentHandler extends Defa
             metadata.set(Metadata.TITLE, attributes.getValue("sfa:string"));
         } else if (inMetaDataAuthors && "key:string".equals(qName)) {
             metadata.add(Metadata.AUTHOR, attributes.getValue("sfa:string"));
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            tableId = attributes.getValue("sfa:ID");
+            xhtml.startElement("table");
+        } else if (tableId != null && "sf:columns".equals(qName)) {
+            numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
+            currentColumn = 0;
+        } else if (tableId != null && "sf:ct".equals(qName)) {
+            parseTableData(attributes.getValue("sfa:s"));
+        } else if (tableId != null && "sf:n".equals(qName)) {
+            parseTableData(attributes.getValue("sf:v"));
         }
     }
 
@@ -122,6 +132,11 @@ class KeynoteContentHandler extends Defa
             inMetaDataTitle = false;
         } else if (inMetadata && "key:authors".equals(qName)) {
             inMetaDataAuthors = false;
+        } else if (inSlide && "sf:tabular-model".equals(qName)) {
+            xhtml.endElement("table");
+            tableId = null;
+            numberOfColumns = null;
+            currentColumn = null;
         }
     }
 
@@ -138,4 +153,16 @@ class KeynoteContentHandler extends Defa
         }
     }
 
+    /**
+     * Writes one table cell to the XHTML output and keeps the row markup
+     * balanced by counting cells against the declared column count.
+     * <p>
+     * A {@code tr} is opened before the first cell of a row and closed once
+     * {@code numberOfColumns} cells have been written, after which the column
+     * counter restarts at zero for the next row.
+     *
+     * @param value cell text as read from the cell element's attribute
+     * @throws SAXException propagated from the XHTML handler calls
+     */
+    private void parseTableData(String value) throws SAXException {
+        if (currentColumn == null || numberOfColumns == null) {
+            // Cell seen before any sf:columns element: the row width is
+            // unknown, so emit a self-contained one-cell row instead of
+            // failing on a null unboxing.
+            xhtml.startElement("tr");
+            xhtml.element("td", value);
+            xhtml.endElement("tr");
+            return;
+        }
+
+        if (currentColumn == 0) {
+            xhtml.startElement("tr");
+        }
+
+        xhtml.element("td", value);
+
+        // Advance the counter; without this every cell opened a new row and
+        // no row was ever closed.
+        currentColumn++;
+        if (currentColumn >= numberOfColumns) {
+            xhtml.endElement("tr");
+            currentColumn = 0;
+        }
+    }
+
 }
\ No newline at end of file

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Wed May 26 14:33:54 2010
@@ -25,7 +25,7 @@ import org.xml.sax.ContentHandler;
 import java.io.InputStream;
 
 /**
- * 
+ * Tests whether the different iWork parsers extract content and metadata properly.
  */
 public class IWorkParserTest extends TestCase {
 
@@ -46,20 +46,30 @@ public class IWorkParserTest extends Tes
 
         assertEquals(6, metadata.size());
         assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("2", metadata.get(Metadata.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
         assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
         assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
         assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
         assertEquals("Apache tika", metadata.get(Metadata.TITLE));
 
         String content = handler.toString();
-        System.out.println(content);
         assertTrue(content.contains("A sample presentation"));
         assertTrue(content.contains("For the Apache Tika project"));
         assertTrue(content.contains("Slide 1"));
-        //assertTrue(content.contains("Some random text for the sake of testability."));
+        assertTrue(content.contains("Some random text for the sake of testability."));
         assertTrue(content.contains("A nice comment"));
         assertTrue(content.contains("A nice note"));
+
+        // test table data
+        assertTrue(content.contains("Cell one"));
+        assertTrue(content.contains("Cell two"));
+        assertTrue(content.contains("Cell three"));
+        assertTrue(content.contains("Cell four"));
+        assertTrue(content.contains("Cell 5"));
+        assertTrue(content.contains("Cell six"));
+        assertTrue(content.contains("7"));
+        assertTrue(content.contains("Cell eight"));
+        assertTrue(content.contains("5/5/1985"));
     }
 
     public void testParsePages() throws Exception {
@@ -80,7 +90,24 @@ public class IWorkParserTest extends Tes
         assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
 
         String content = handler.toString();
-        //System.out.println(content);
+
+        // text on page 1
+        assertTrue(content.contains("Sample pages document"));
+        assertTrue(content.contains("Some plain text to parse."));
+        assertTrue(content.contains("Cell one"));
+        assertTrue(content.contains("Cell two"));
+        assertTrue(content.contains("Cell three"));
+        assertTrue(content.contains("Cell four"));
+        assertTrue(content.contains("Cell five"));
+        assertTrue(content.contains("Cell six"));
+        assertTrue(content.contains("Cell seven"));
+        assertTrue(content.contains("Cell eight"));
+        assertTrue(content.contains("Cell nine"));
+        assertTrue(content.contains("Both Pages 1.x and Keynote 2.x")); // ...
+
+        // text on page 2
+        assertTrue(content.contains("A second page...."));
+        assertTrue(content.contains("Extensible Markup Language")); // ...
     }
 
 }

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
Binary files - no diff available.