You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2012/05/18 11:52:54 UTC

svn commit: r1340052 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/iwork/ tika-parsers/src/test/java/org/apache/tika/parser/iwork/ tika-parsers/src/test/resources/test-documents/

Author: mikemccand
Date: Fri May 18 09:52:53 2012
New Revision: 1340052

URL: http://svn.apache.org/viewvc?rev=1340052&view=rev
Log:
TIKA-904: handle iWork Pages documents created in layout mode

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri May 18 09:52:53 2012
@@ -19,7 +19,8 @@ Release 1.2 - Current Development
     (TIKA-906).  Don't throw NullPointerException on password
     protected iWork files, even though we can't parse their contents
     yet (TIKA-903).  Text extracted from Keynote text boxes and bullet
-    points no longer runs together (TIKA-910).
+    points no longer runs together (TIKA-910). Also extract text for
+    Pages documents created in layout mode (TIKA-904). 
 
 Release 1.1 - 3/7/2012
 ---------------------------------

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Fri May 18 09:52:53 2012
@@ -42,9 +42,11 @@ class PagesContentHandler extends Defaul
        FOOTNOTES, ANNOTATIONS;
     }
     private DocumentPart inPart = null;
-    
+    private boolean ghostText;
+
     private boolean parseProperty = false;
     private int pageCount = 0;
+    private int slPageCount = 0;
 
     private HeaderFooter headers = null;
     private HeaderFooter footers = null;
@@ -94,17 +96,23 @@ class PagesContentHandler extends Defaul
             inPart = DocumentPart.METADATA;
         } else if ("sf:metadata".equals(qName)) {
            inPart = DocumentPart.METADATA;
-        } else if ("sf:page-start".equals(qName)) {
+        } else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
             if (pageCount > 0) {
                 doFooter();
                 xhtml.endElement("div");
             }
             xhtml.startElement("div");
-            pageCount++;
+            if ("sl:page-group".equals(qName)) {
+                slPageCount++;
+            } else {
+                pageCount++;
+            }
             doHeader();
-        } else if ("sf:p".equals(qName) && pageCount > 0) {
+        } else if ("sf:p".equals(qName)) {
+          if (pageCount+slPageCount > 0) {
             inPart = DocumentPart.PARSABLE_TEXT;
             xhtml.startElement("p");
+          }
         } else if ("sf:attachment".equals(qName)) {
             String kind = attributes.getValue("sf:kind");
             if ("tabular-attachment".equals(kind)) {
@@ -155,6 +163,8 @@ class PagesContentHandler extends Defaul
               xhtml.characters(annotationText);
               xhtml.endElement("div");
            }
+        } else if ("sf:ghost-text".equals(qName)) {
+            ghostText = true;
         }
 
         if (activeTableId != null) {
@@ -180,15 +190,17 @@ class PagesContentHandler extends Defaul
             inPart = null;
         } else if ("sf:metadata".equals(qName)) {
             inPart = null;
-        } else if ("sf:p".equals(qName) && pageCount > 0) {
+        } else if ("sf:p".equals(qName) && (pageCount+slPageCount) > 0) {
             inPart = null;
             xhtml.endElement("p");
         } else if ("sf:attachment".equals(qName)) {
             activeTableId = null;
         } else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
-           annotations.end();
+            annotations.end();
         } else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
-           xhtml.endElement("div");
+            xhtml.endElement("div");
+        } else if ("sf:ghost-text".equals(qName)) {
+            ghostText = false;
         }
     }
 
@@ -196,7 +208,9 @@ class PagesContentHandler extends Defaul
     public void characters(char[] ch, int start, int length) throws SAXException {
         if (length > 0) {
            if (inPart == DocumentPart.PARSABLE_TEXT) {
-              xhtml.characters(ch, start, length);
+               if (!ghostText) {
+                   xhtml.characters(ch, start, length);
+               }
           } else if(inPart != null) {
               String str = new String(ch, start, length);
               if (inPart == DocumentPart.HEADER_FIRST) headers.defaultFirst = str;

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1340052&r1=1340051&r2=1340052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri May 18 09:52:53 2012
@@ -171,6 +171,22 @@ public class IWorkParserTest extends Tes
         assertTrue(content.contains("Extensible Markup Language")); // ...
     }
 
+    // TIKA-904
+    public void testPagesLayoutMode() throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesLayout.pages");
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        iWorkParser.parse(input, handler, metadata, parseContext);
+
+        String content = handler.toString();
+        assertTrue(content.contains("text box 1 - here is some text"));
+        assertTrue(content.contains("created in a text box in layout mode"));
+        assertTrue(content.contains("text box 2 - more text!@!$@#"));
+        assertTrue(content.contains("this is text inside of a green box"));
+        assertTrue(content.contains("text inside of a green circle"));
+    }
+
     public void testParseNumbers() throws Exception {
         InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testNumbers.numbers");
         Metadata metadata = new Metadata();

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages?rev=1340052&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesLayout.pages
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream