You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/05/18 11:52:54 UTC
svn commit: r1340052 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/iwork/
tika-parsers/src/test/java/org/apache/tika/parser/iwork/
tika-parsers/src/test/resources/test-documents/
Author: mikemccand
Date: Fri May 18 09:52:53 2012
New Revision: 1340052
URL: http://svn.apache.org/viewvc?rev=1340052&view=rev
Log:
TIKA-904: handle iWork Pages documents created in layout mode
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri May 18 09:52:53 2012
@@ -19,7 +19,8 @@ Release 1.2 - Current Development
(TIKA-906). Don't throw NullPointerException on passsword
protected iWork files, even though we can't parse their contents
yet (TIKA-903). Text extracted from Keynote text boxes and bullet
- points no longer runs together (TIKA-910).
+ points no longer runs together (TIKA-910). Also extract text for
+ Pages documents created in layout mode (TIKA-904).
Release 1.1 - 3/7/2012
---------------------------------
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri May 18 09:52:53 2012
@@ -42,9 +42,11 @@ class PagesContentHandler extends Defaul
FOOTNOTES, ANNOTATIONS;
}
private DocumentPart inPart = null;
-
+ private boolean ghostText;
+
private boolean parseProperty = false;
private int pageCount = 0;
+ private int slPageCount = 0;
private HeaderFooter headers = null;
private HeaderFooter footers = null;
@@ -94,17 +96,23 @@ class PagesContentHandler extends Defaul
inPart = DocumentPart.METADATA;
} else if ("sf:metadata".equals(qName)) {
inPart = DocumentPart.METADATA;
- } else if ("sf:page-start".equals(qName)) {
+ } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
if (pageCount > 0) {
doFooter();
xhtml.endElement("div");
}
xhtml.startElement("div");
- pageCount++;
+ if ("sl:page-group".equals(qName)) {
+ slPageCount++;
+ } else {
+ pageCount++;
+ }
doHeader();
- } else if ("sf:p".equals(qName) && pageCount > 0) {
+ } else if ("sf:p".equals(qName)) {
+ if (pageCount+slPageCount > 0) {
inPart = DocumentPart.PARSABLE_TEXT;
xhtml.startElement("p");
+ }
} else if ("sf:attachment".equals(qName)) {
String kind = attributes.getValue("sf:kind");
if ("tabular-attachment".equals(kind)) {
@@ -155,6 +163,8 @@ class PagesContentHandler extends Defaul
xhtml.characters(annotationText);
xhtml.endElement("div");
}
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = true;
}
if (activeTableId != null) {
@@ -180,15 +190,17 @@ class PagesContentHandler extends Defaul
inPart = null;
} else if ("sf:metadata".equals(qName)) {
inPart = null;
- } else if ("sf:p".equals(qName) && pageCount > 0) {
+ } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
inPart = null;
xhtml.endElement("p");
} else if ("sf:attachment".equals(qName)) {
activeTableId = null;
} else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
- annotations.end();
+ annotations.end();
} else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
- xhtml.endElement("div");
+ xhtml.endElement("div");
+ } else if ("sf:ghost-text".equals(qName)) {
+ ghostText = false;
}
}
@@ -196,7 +208,9 @@ class PagesContentHandler extends Defaul
public void characters(char[] ch, int start, int length) throws SAXException {
if (length > 0) {
if (inPart == DocumentPart.PARSABLE_TEXT) {
- xhtml.characters(ch, start, length);
+ if (!ghostText) {
+ xhtml.characters(ch, start, length);
+ }
} else if(inPart != null) {
String str = new String(ch, start, length);
if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri May 18 09:52:53 2012
@@ -171,6 +171,22 @@ public class IWorkParserTest extends Tes
assertTrue(content.contains("Extensible Markup Language")); // ...
}
+ // TIKA-904
+ public void testPagesLayoutMode() throws Exception {
+ InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+
+ iWorkParser.parse(input, handler, metadata, parseContext);
+
+ String content = handler.toString();
+ assertTrue(content.contains("text box 1 - here is some text"));
+ assertTrue(content.contains("created in a text box in layout mode"));
+ assertTrue(content.contains("text box 2 - more text!@!$@#"));
+ assertTrue(content.contains("this is text inside of a green box"));
+ assertTrue(content.contains("text inside of a green circle"));
+ }
+
public void testParseNumbers() throws Exception {
InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
Metadata metadata = new Metadata();
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages?rev=1340052&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream