You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/26 16:33:55 UTC
svn commit: r948452 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
test/resources/test-documents/testKeynote.key
Author: jukka
Date: Wed May 26 14:33:54 2010
New Revision: 948452
URL: http://svn.apache.org/viewvc?rev=948452&view=rev
Log:
TIKA-402: Support for Keynote and Pages documents
Patch by Martijn van Groningen
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java Wed May 26 14:33:54 2010
@@ -34,14 +34,14 @@ class KeynoteContentHandler extends Defa
private boolean inTheme = false;
private boolean inTitle = false;
private boolean inBody = false;
+ private String tableId;
+ private Integer numberOfColumns = null;
+ private Integer currentColumn = null;
private boolean inMetadata = false;
private boolean inMetaDataTitle = false;
private boolean inMetaDataAuthors = false;
- private boolean stickNote = false;
- private boolean notes = false;
-
private boolean inParsableText = false;
private int numberOfSlides = 0;
@@ -93,6 +93,16 @@ class KeynoteContentHandler extends Defa
metadata.set(Metadata.TITLE, attributes.getValue("sfa:string"));
} else if (inMetaDataAuthors && "key:string".equals(qName)) {
metadata.add(Metadata.AUTHOR, attributes.getValue("sfa:string"));
+ } else if (inSlide && "sf:tabular-model".equals(qName)) {
+ tableId = attributes.getValue("sfa:ID");
+ xhtml.startElement("table");
+ } else if (tableId != null && "sf:columns".equals(qName)) {
+ numberOfColumns = Integer.parseInt(attributes.getValue("sf:count"));
+ currentColumn = 0;
+ } else if (tableId != null && "sf:ct".equals(qName)) {
+ parseTableData(attributes.getValue("sfa:s"));
+ } else if (tableId != null && "sf:n".equals(qName)) {
+ parseTableData(attributes.getValue("sf:v"));
}
}
@@ -122,6 +132,11 @@ class KeynoteContentHandler extends Defa
inMetaDataTitle = false;
} else if (inMetadata && "key:authors".equals(qName)) {
inMetaDataAuthors = false;
+ } else if (inSlide && "sf:tabular-model".equals(qName)) {
+ xhtml.endElement("table");
+ tableId = null;
+ numberOfColumns = null;
+ currentColumn = null;
}
}
@@ -138,4 +153,16 @@ class KeynoteContentHandler extends Defa
}
}
+ /**
+  * Emits one table cell as a {@code td} element, opening a {@code tr}
+  * before the first cell of a row and closing it after the last one.
+  * <p>
+  * The position within the row is tracked in {@code currentColumn}, which
+  * is advanced after every cell and wrapped back to zero once
+  * {@code numberOfColumns} cells have been written.
+  *
+  * @param value text content of the cell
+  * @throws SAXException propagated from the calls on the {@code xhtml} handler
+  */
+ private void parseTableData(String value) throws SAXException {
+     // Cell data may arrive before any sf:columns element has initialised
+     // the counter; start from the first column instead of failing on the
+     // null unboxing below.
+     if (currentColumn == null) {
+         currentColumn = 0;
+     }
+
+     if (currentColumn == 0) {
+         xhtml.startElement("tr");
+     }
+
+     xhtml.element("td", value);
+     currentColumn++;
+
+     // Compare after advancing so the row is closed right after its last
+     // cell. When numberOfColumns is null this never matches and the row is
+     // left open.
+     if (currentColumn.equals(numberOfColumns)) {
+         xhtml.endElement("tr");
+         currentColumn = 0;
+     }
+ }
+
}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Wed May 26 14:33:54 2010
@@ -25,7 +25,7 @@ import org.xml.sax.ContentHandler;
import java.io.InputStream;
/**
- *
+ * Tests whether the different iWork parsers parse the content and metadata properly.
*/
public class IWorkParserTest extends TestCase {
@@ -46,20 +46,30 @@ public class IWorkParserTest extends Tes
assertEquals(6, metadata.size());
assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("2", metadata.get(Metadata.SLIDE_COUNT));
+ assertEquals("3", metadata.get(Metadata.SLIDE_COUNT));
assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
assertEquals("Apache tika", metadata.get(Metadata.TITLE));
String content = handler.toString();
- System.out.println(content);
assertTrue(content.contains("A sample presentation"));
assertTrue(content.contains("For the Apache Tika project"));
assertTrue(content.contains("Slide 1"));
- //assertTrue(content.contains("Some random text for the sake of testability."));
+ assertTrue(content.contains("Some random text for the sake of testability."));
assertTrue(content.contains("A nice comment"));
assertTrue(content.contains("A nice note"));
+
+ // test table data
+ assertTrue(content.contains("Cell one"));
+ assertTrue(content.contains("Cell two"));
+ assertTrue(content.contains("Cell three"));
+ assertTrue(content.contains("Cell four"));
+ assertTrue(content.contains("Cell 5"));
+ assertTrue(content.contains("Cell six"));
+ assertTrue(content.contains("7"));
+ assertTrue(content.contains("Cell eight"));
+ assertTrue(content.contains("5/5/1985"));
}
public void testParsePages() throws Exception {
@@ -80,7 +90,24 @@ public class IWorkParserTest extends Tes
assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
String content = handler.toString();
- //System.out.println(content);
+
+ // text on page 1
+ assertTrue(content.contains("Sample pages document"));
+ assertTrue(content.contains("Some plain text to parse."));
+ assertTrue(content.contains("Cell one"));
+ assertTrue(content.contains("Cell two"));
+ assertTrue(content.contains("Cell three"));
+ assertTrue(content.contains("Cell four"));
+ assertTrue(content.contains("Cell five"));
+ assertTrue(content.contains("Cell six"));
+ assertTrue(content.contains("Cell seven"));
+ assertTrue(content.contains("Cell eight"));
+ assertTrue(content.contains("Cell nine"));
+ assertTrue(content.contains("Both Pages 1.x and Keynote 2.x")); // ...
+
+ // text on page 2
+ assertTrue(content.contains("A second page...."));
+ assertTrue(content.contains("Extensible Markup Language")); // ...
}
}
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key?rev=948452&r1=948451&r2=948452&view=diff
==============================================================================
Binary files - no diff available.