You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/28 01:55:10 UTC
svn commit: r1331640 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
test/resources/test-documents/testPagesComments.pages
Author: nick
Date: Fri Apr 27 23:55:09 2012
New Revision: 1331640
URL: http://svn.apache.org/viewvc?rev=1331640&view=rev
Log:
TIKA-907 Comments in iWork Pages files
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1331640&r1=1331639&r2=1331640&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri Apr 27 23:55:09 2012
@@ -38,7 +38,7 @@ class PagesContentHandler extends Defaul
METADATA, PARSABLE_TEXT,
HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
- FOOTNOTES;
+ FOOTNOTES, ANNOTATIONS;
}
private DocumentPart inPart = null;
@@ -48,6 +48,7 @@ class PagesContentHandler extends Defaul
private HeaderFooter headers = null;
private HeaderFooter footers = null;
private Footnotes footnotes = null;
+ private Annotations annotations = null;
private Map<String, List<List<String>>> tableData =
new HashMap<String, List<List<String>>>();
@@ -139,6 +140,20 @@ class PagesContentHandler extends Defaul
xhtml.endElement("div");
}
}
+ } else if ("sf:annotations".equals(qName)) {
+ annotations = new Annotations();
+ inPart = DocumentPart.ANNOTATIONS;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.start(attributes.getValue("sf:target"));
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.startElement("div", "style", "annotated");
+
+ String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
+ if (annotationText != null) {
+ xhtml.startElement("div", "style", "annotation");
+ xhtml.characters(annotationText);
+ xhtml.endElement("div");
+ }
}
if (activeTableId != null) {
@@ -169,6 +184,10 @@ class PagesContentHandler extends Defaul
xhtml.endElement("p");
} else if ("sf:attachment".equals(qName)) {
activeTableId = null;
+ } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
+ annotations.end();
+ } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
+ xhtml.endElement("div");
}
}
@@ -186,6 +205,7 @@ class PagesContentHandler extends Defaul
if (inPart == DocumentPart.FOOTER_EVEN) footers.defaultEven = str;
if (inPart == DocumentPart.FOOTER_ODD) footers.defaultOdd = str;
if (inPart == DocumentPart.FOOTNOTES) footnotes.text(str);
+ if (inPart == DocumentPart.ANNOTATIONS) annotations.text(str);
}
}
}
@@ -329,7 +349,8 @@ class PagesContentHandler extends Defaul
}
}
/**
- * Represents Footnotes in a document
+ * Represents Footnotes in a document. The way these work
+ * in the file format isn't very clean...
*/
private static class Footnotes {
/** Mark -> Text */
@@ -351,4 +372,31 @@ class PagesContentHandler extends Defaul
}
}
}
+ /**
+  * Represents Annotations in a document. We currently
+  * just grab all the sf:p text in each one.
+  * <p>
+  * Usage: call {@link #start(String)} when an annotation opens,
+  * {@link #text(String)} for each run of characters seen inside it,
+  * and {@link #end()} when it closes. Annotations that collected
+  * some text are then available from the {@link #annotations} map.
+  */
+ private static class Annotations {
+     /** ID -> Text, for every completed annotation that had some text */
+     Map<String,String> annotations = new HashMap<String, String>();
+     /** ID of the annotation being collected, or null when not inside one */
+     String currentID = null;
+     /** Text collected so far for the current annotation, or null when not inside one */
+     StringBuilder currentText = null;
+
+     /**
+      * Begins collecting text for a new annotation. Anything collected
+      * for a previous annotation that was never ended is discarded.
+      *
+      * @param id the ID the collected text will be stored under
+      */
+     private void start(String id) {
+         currentID = id;
+         currentText = new StringBuilder();
+     }
+
+     /**
+      * Appends text to the current annotation. Null or empty text, and
+      * text received while no annotation is being collected, is ignored.
+      *
+      * @param text the characters to append, may be null
+      */
+     private void text(String text) {
+         if (text != null && text.length() > 0 && currentText != null) {
+             currentText.append(text);
+         }
+     }
+
+     /**
+      * Finishes the current annotation, storing its text under its ID
+      * if any text was collected. Safe to call when no annotation is
+      * being collected, in which case nothing is stored.
+      */
+     private void end() {
+         // Only record annotations that actually had some text
+         if (currentText != null && currentText.length() > 0) {
+             annotations.put(currentID, currentText.toString());
+         }
+         // Always return to the "not inside an annotation" state, so a
+         // text-less annotation can't leave a stale ID / buffer behind
+         currentID = null;
+         currentText = null;
+     }
+ }
}
\ No newline at end of file
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331640&r1=1331639&r2=1331640&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 23:55:09 2012
@@ -161,7 +161,7 @@ public class IWorkParserTest extends Tes
}
/**
- * Check we get headers, footers and footnotes from keynote
+ * Check we get headers, footers and footnotes from Pages
*/
public void testParsePagesHeadersFootersFootnotes() throws Exception {
String footnote = "Footnote: Do a lot of people really use iWork?!?!";
@@ -186,6 +186,31 @@ public class IWorkParserTest extends Tes
assertContains(contents, footnote);
}
+ /**
+  * Check we get annotations (eg comments) from Pages.
+  * Parses testPagesComments.pages and verifies that both the regular
+  * body text and the text of the two comments end up in the output.
+  */
+ public void testParsePagesAnnotations() throws Exception {
+     String commentA = "comment about the APXL file";
+     String commentB = "comment about UIMA";
+
+     InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesComments.pages");
+     // getResourceAsStream returns null for a missing resource, so fail
+     // with a clear message here rather than somewhere inside the parser
+     assertTrue("testPagesComments.pages not found on the classpath", input != null);
+
+     Metadata metadata = new Metadata();
+     ContentHandler handler = new BodyContentHandler();
+
+     try {
+         iWorkParser.parse(input, handler, metadata, parseContext);
+     } finally {
+         // Release the stream even if parsing fails
+         input.close();
+     }
+     String contents = handler.toString();
+
+     // Check regular text
+     assertContains(contents, "Both Pages 1.x"); // P1
+     assertContains(contents, "understanding the Pages document"); // P1
+     assertContains(contents, "should be page 2"); // P2
+
+     // Check for comments
+     assertContains(contents, commentA);
+     assertContains(contents, commentB);
+ }
+
/**
 * Asserts that one string occurs somewhere within another, failing
 * with a message that shows both the missing text and the full text
 * that was searched.
 *
 * @param haystack the text to search in
 * @param needle the text expected to be found
 */
public void assertContains(String haystack, String needle) {
    String failureMessage = needle + " not found in:\n" + haystack;
    boolean found = haystack.contains(needle);
    assertTrue(failureMessage, found);
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages?rev=1331640&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesComments.pages
------------------------------------------------------------------------------
svn:mime-type = application/zip