You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/12 18:38:17 UTC

svn commit: r943569 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/parser/ tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/iwork/ tika-parsers/src/main/java/org/apache/tika/parser/xml/ tik...

Author: jukka
Date: Wed May 12 16:38:17 2010
New Revision: 943569

URL: http://svn.apache.org/viewvc?rev=943569&view=rev
Log:
TIKA-402: Support for Keynote and Pages documents

Based on the patch by Martijn van Groningen.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java   (with props)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java   (with props)
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java   (with props)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages   (with props)
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java Wed May 12 16:38:17 2010
@@ -19,6 +19,16 @@ package org.apache.tika.parser;
 import java.util.HashMap;
 import java.util.Map;
 
+import javax.xml.XMLConstants;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
 /**
  * Parse context. Used to pass context information to Tika parsers.
  *
@@ -48,4 +58,60 @@ public class ParseContext {
         }
     }
 
+    /**
+     * Returns the SAX parser specified in this parsing context. If a parser
+     * is not explicitly specified, then one is created using the specified
+     * or the default SAX parser factory.
+     *
+     * @see #getSAXParserFactory()
+     * @since Apache Tika 0.8
+     * @return SAX parser
+     * @throws TikaException if a SAX parser could not be created
+     */
+    public SAXParser getSAXParser() throws TikaException {
+        SAXParser parser = get(SAXParser.class);
+        if (parser != null) {
+            return parser;
+        } else {
+            try {
+                return getSAXParserFactory().newSAXParser();
+            } catch (ParserConfigurationException e) {
+                throw new TikaException("Unable to configure a SAX parser", e);
+            } catch (SAXException e) {
+                throw new TikaException("Unable to create a SAX parser", e);
+            }
+        }
+    }
+
+    /**
+     * Returns the SAX parser factory specified in this parsing context.
+     * If a factory is not explicitly specified, then a default factory
+     * instance is created and returned. The default factory instance is
+     * configured to be namespace-aware and to use
+     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+     *
+     * @since Apache Tika 0.8
+     * @return SAX parser factory
+     */
+    public SAXParserFactory getSAXParserFactory() {
+        SAXParserFactory factory = get(SAXParserFactory.class);
+        if (factory == null) {
+            factory = SAXParserFactory.newInstance();
+            factory.setNamespaceAware(true);
+            try {
+                factory.setFeature(
+                        XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            } catch (ParserConfigurationException e) {
+            } catch (SAXNotSupportedException e) {
+            } catch (SAXNotRecognizedException e) {
+                // TIKA-271: Some XML parsers do not support the
+                // secure-processing feature, even though it's required by
+                // JAXP in Java 5. Ignoring the exception is fine here, as
+                // deployments without this feature are inherently vulnerable
+                // to XML denial-of-service attacks.
+            }
+        }
+        return factory;
+    }
+
 }

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed May 12 16:38:17 2010
@@ -556,6 +556,17 @@
   <mime-type type="application/vnd.apple.installer+xml">
     <glob pattern="*.mpkg"/>
   </mime-type>
+  <mime-type type="application/vnd.apple.keynote">
+    <sub-class-of type="application/zip"/>
+    <alias type="application/vnd.apple.pages"/>
+    <alias type="application/vnd.apple.numbers"/>
+    <magic priority="40">
+      <match value="0x504b0304140000000000" type="string" offset="0"/>
+    </magic>
+    <glob pattern="*.key"/>
+    <glob pattern="*.pages"/>
+    <glob pattern="*.numbers"/>
+  </mime-type>
   <mime-type type="application/vnd.arastra.swi">
     <glob pattern="*.swi"/>
   </mime-type>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java Wed May 12 16:38:17 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * A parser for the iWork formats.
+ *
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * </ol>
+ */
+public class IWorkParser implements Parser {
+
+    private final static Set<MediaType> supportedTypes =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("vnd.apple.keynote"),
+                MediaType.application("vnd.apple.pages"),
+                MediaType.application("vnd.apple.numbers")
+        )));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        ZipInputStream zip =
+            new ZipInputStream(new CloseShieldInputStream(stream));
+        ZipEntry entry = zip.getNextEntry();
+        while (entry != null) {
+            if ("index.apxl".equals(entry.getName())) {
+                if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+                    metadata.set(
+                            Metadata.CONTENT_TYPE,
+                            "application/vnd.apple.keynote");
+                }
+
+                context.getSAXParser().parse(
+                        new CloseShieldInputStream(zip),
+                        new OfflineContentHandler(
+                                new KeynoteContentHandler(xhtml, metadata)));
+            } else if ("index.xml".equals(entry.getName())) {
+                // TODO: Numbers has index.xml as well. Therefore the filename
+                // cannot be used for detecting the type. The XML file should
+                // be sniffed before determining the extractor.
+
+                if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+                    metadata.set(
+                            Metadata.CONTENT_TYPE,
+                            "application/vnd.apple.pages");
+                }
+
+                context.getSAXParser().parse(
+                        new CloseShieldInputStream(zip),
+                        new OfflineContentHandler(
+                                new PagesContentHandler(xhtml, metadata)));
+            }
+            entry = zip.getNextEntry();
+        }
+        zip.close();
+
+        xhtml.endDocument();
+    }
+
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java Wed May 12 16:38:17 2010
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class KeynoteContentHandler extends DefaultHandler {
+
+    public final static String PRESENTATION_WIDTH = "slides-width";
+    public final static String PRESENTATION_HEIGHT = "slides-height";
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inSlide = false;
+    private boolean inTheme = false;
+    private boolean inTitle = false;
+    private boolean inBody = false;
+
+    private boolean inMetadata = false;
+    private boolean inMetaDataTitle = false;
+    private boolean inMetaDataAuthors = false;
+
+    private boolean stickNote = false;
+    private boolean notes = false;
+
+    private boolean inParsableText = false;
+
+    private int numberOfSlides = 0;
+
+    KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = true;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = true;
+            numberOfSlides++;
+            xhtml.startElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = true;
+            xhtml.startElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+            inBody = true;
+        } else if ("key:size".equals(qName) && !inTheme) {
+            String width = attributes.getValue("sfa:w");
+            String height = attributes.getValue("sfa:h");
+            metadata.set(PRESENTATION_WIDTH, width);
+            metadata.set(PRESENTATION_HEIGHT, height);
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = true;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = true;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = true;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = true;
+        } else if (inMetaDataTitle && "key:string".equals(qName)) {
+            metadata.set(Metadata.TITLE, attributes.getValue("sfa:string"));
+        } else if (inMetaDataAuthors && "key:string".equals(qName)) {
+            metadata.add(Metadata.AUTHOR, attributes.getValue("sfa:string"));
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = false;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            inTitle = false;
+            xhtml.endElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+            inBody = false;
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = false;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = false;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = false;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = false;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (!inParsableText || !inSlide) {
+            return;
+        }
+
+        String text = new String(ch, start, length).trim();
+        if (text.length() != 0) {
+            xhtml.characters(text);
+        }
+    }
+
+}
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Wed May 12 16:38:17 2010
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class PagesContentHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inMetaDataPart = false;
+    private boolean parseProperty = false;
+    private boolean inParsableText = false;
+    private int pageCount = 0;
+
+    private Map<String, List<List<String>>> tableData =
+        new HashMap<String, List<List<String>>>();
+    private String activeTableId;
+    private int numberOfColumns = 0;
+    private List<String> activeRow = new ArrayList<String>();
+
+    private String metaDataLocalName;
+    private String metaDataQName;
+
+    PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+        if (pageCount > 0) {
+            xhtml.endElement("div");
+        }
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if (parseProperty) {
+            String value = parsePrimitiveElementValue(qName, attributes);
+            if (value != null) {
+                String metaDataKey = resolveMetaDataKey(metaDataLocalName);
+                metadata.add(metaDataKey, value);
+            }
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inMetaDataPart = true;
+        } else if ("sf:metadata".equals(qName)) {
+            inMetaDataPart = true;
+        } else if ("sf:page-start".equals(qName)) {
+            if (pageCount > 0) {
+                xhtml.endElement("div");
+            }
+            xhtml.startElement("div");
+            pageCount++;
+        } else if ("sf:p".equals(qName) && pageCount > 0) {
+            inParsableText = true;
+            xhtml.startElement("p");
+        } else if ("sf:attachment".equals(qName)) {
+            String kind = attributes.getValue("sf:kind");
+            if ("tabular-attachment".equals(kind)) {
+                activeTableId = attributes.getValue("sfa:ID");
+                tableData.put(activeTableId, new ArrayList<List<String>>());
+            }
+        } else if ("sf:attachment-ref".equals(qName)) {
+            String idRef = attributes.getValue("sfa:IDREF");
+            outputTable(idRef);
+        }
+
+        if (activeTableId != null) {
+            parseTableData(qName, attributes);
+        }
+
+        if (inMetaDataPart) {
+            metaDataLocalName = localName;
+            metaDataQName = qName;
+            parseProperty = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+            metaDataLocalName = null;
+            parseProperty = false;
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inMetaDataPart = false;
+        } else if ("sf:metadata".equals(qName)) {
+            inMetaDataPart = false;
+        } else if ("sf:p".equals(qName) && pageCount > 0) {
+            inParsableText = false;
+            xhtml.endElement("p");
+        } else if ("sf:attachment".equals(qName)) {
+            activeTableId = null;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (!inParsableText) {
+            return;
+        }
+
+        String text = new String(ch, start, length).trim();
+        if (text.length() != 0) {
+            xhtml.characters(text);
+        }
+    }
+
+    private void parseTableData(String qName, Attributes attributes) {
+        if ("sf:grid".equals(qName)) {
+            String numberOfColumns = attributes.getValue("sf:numcols");
+            this.numberOfColumns = Integer.parseInt(numberOfColumns);
+        } else if ("sf:ct".equals(qName)) {
+            activeRow.add(attributes.getValue("sfa:s"));
+
+            if (activeRow.size() >= 3) {
+                tableData.get(activeTableId).add(activeRow);
+                activeRow = new ArrayList<String>();
+            }
+        }
+    }
+
+    private void outputTable(String idRef) throws SAXException {
+        List<List<String>> tableData = this.tableData.get(idRef);
+        if (tableData != null) {
+            xhtml.startElement("table");
+            for (List<String> row : tableData) {
+                xhtml.startElement("tr");
+                for (String cell : row) {
+                    xhtml.element("td", cell);
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("table");
+        }
+    }
+
+    /**
+     * Returns a resolved key that is common in other document types or
+     * returns the specified metaDataLocalName if no common key could be found.
+     *
+     * @param metaDataLocalName The localname of the element containing metadata
+     * @return a resolved key that is common in other document types
+     */
+    private String resolveMetaDataKey(String metaDataLocalName) {
+        String metaDataKey = metaDataLocalName;
+        if ("sf:authors".equals(metaDataQName)) {
+            metaDataKey = Metadata.AUTHOR;
+        } else if ("sf:title".equals(metaDataQName)) {
+            metaDataKey = Metadata.TITLE;
+        } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+            metaDataKey = Metadata.CREATION_DATE;
+        } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+            metaDataKey = Metadata.LAST_MODIFIED;
+        } else if ("sl:language".equals(metaDataQName)) {
+            metaDataKey = Metadata.LANGUAGE;
+        }
+        return metaDataKey;
+    }
+
+    /**
+     * Returns the value of a primitive element e.g.:
+     * &lt;sl:number sfa:number="0" sfa:type="f"/&gt; - the number attribute
+     * &lt;sl:string sfa:string="en"/&gt; - the string attribute
+     * <p>
+     * Returns <code>null</code> if the value could not be extracted from
+     * the list of attributes.
+     *
+     * @param qName      The qualified (prefixed) name of the element containing
+     *                   the value to extract
+     * @param attributes The list of attributes of which one contains the
+     *                   value to be extracted
+     * @return the value of a primitive element
+     */
+    private String parsePrimitiveElementValue(
+            String qName, Attributes attributes) {
+        if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+            return attributes.getValue("sfa:string");
+        } else if ("sl:number".equals(qName)) {
+            return attributes.getValue("sfa:number");
+        } else if ("sl:date".equals(qName)) {
+            return attributes.getValue("sf:val");
+        }
+
+        return null;
+    }
+
+}
\ No newline at end of file

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java Wed May 12 16:38:17 2010
@@ -16,18 +16,6 @@
  */
 package org.apache.tika.parser.xml;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.xml.XMLConstants;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -39,25 +27,16 @@ import org.apache.tika.sax.TextContentHa
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
 
 /**
  * XML parser.
- * <p>
- * This class uses the following parsing context entries:
- * <dl>
- *   <dt>javax.xml.parsers.SAXParser</dt>
- *   <dd>
- *     The SAX parser ({@link SAXParser} instance) to be used for parsing
- *     the XML input documents. Optional.
- *   </dd>
- *   <dt>javax.xml.parsers.SAXParserFactory</dt>
- *   <dd>
- *     The SAX parser factory ({@link SAXParserFactory} instance) used to
- *     create a SAX parser if one has not been explicitly specified. Optional.
- *   </dd>
- * </dl>
  */
 public class XMLParser implements Parser {
 
@@ -83,7 +62,7 @@ public class XMLParser implements Parser
         xhtml.startDocument();
         xhtml.startElement("p");
 
-        getSAXParser(context).parse(
+        context.getSAXParser().parse(
                 new CloseShieldInputStream(stream),
                 new OfflineContentHandler(
                         getContentHandler(handler, metadata)));
@@ -106,71 +85,4 @@ public class XMLParser implements Parser
         return new TextContentHandler(handler);
     }
 
-    /**
-     * Returns the SAX parser specified in the parsing context. If a parse
-     * is not explicitly specified, then one is created using the specified
-     * or the default SAX parser factory.
-     *
-     * @see #getSAXParserFactory()
-     * @param context parsing context
-     * @return SAX parser
-     * @throws TikaException if a SAX parser could not be created
-     */
-    private SAXParser getSAXParser(ParseContext context)
-            throws TikaException {
-        SAXParser parser = context.get(SAXParser.class);
-        if (parser instanceof SAXParser) {
-            return parser;
-        } else {
-            try {
-                return getSAXParserFactory(context).newSAXParser();
-            } catch (ParserConfigurationException e) {
-                throw new TikaException("Unable to configure a SAX parser", e);
-            } catch (SAXException e) {
-                throw new TikaException("Unable to create a SAX parser", e);
-            }
-        }
-    }
-
-    /**
-     * Returns the SAX parser factory specified in the parsing context.
-     * If a factory is not explicitly specified, then a default factory
-     * instance is created and returned.
-     *
-     * @see #getDefaultSAXParserFactory()
-     * @param context parsing context
-     * @return SAX parser factory
-     */
-    private SAXParserFactory getSAXParserFactory(ParseContext context) {
-        SAXParserFactory factory = context.get(SAXParserFactory.class);
-        if (factory != null) {
-            return factory;
-        } else {
-            return getDefaultSAXParserFactory();
-        }
-    }
-
-    /**
-     * Creates and returns a default SAX parser factory. The factory is
-     * configured to be namespace-aware and to use secure XML processing.
-     *
-     * @see XMLConstants#FEATURE_SECURE_PROCESSING
-     * @return default SAX parser factory
-     */
-    private SAXParserFactory getDefaultSAXParserFactory() {
-        SAXParserFactory factory = SAXParserFactory.newInstance();
-        factory.setNamespaceAware(true);
-        try {
-            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
-        } catch (ParserConfigurationException e) {
-        } catch (SAXNotSupportedException e) {
-        } catch (SAXNotRecognizedException e) {
-            // TIKA-271: Some XML parsers do not support the secure-processing
-            // feature, even though it's required by JAXP in Java 5. Ignoring
-            // the exception is fine here, as deployments without this feature
-            // are inherently vulnerable to XML denial-of-service attacks.
-        }
-        return factory;
-    }
-
 }

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed May 12 16:38:17 2010
@@ -32,3 +32,4 @@ org.apache.tika.parser.rtf.RTFParser
 org.apache.tika.parser.txt.TXTParser
 org.apache.tika.parser.video.FLVParser
 org.apache.tika.parser.xml.DcXMLParser
+org.apache.tika.parser.iwork.IWorkParser

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Wed May 12 16:38:17 2010
@@ -16,15 +16,14 @@
  */
 package org.apache.tika.parser;
 
-import java.io.IOException;
-import java.io.InputStream;
-
+import junit.framework.TestCase;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
-import junit.framework.TestCase;
+import java.io.IOException;
+import java.io.InputStream;
 
 public class AutoDetectParserTest extends TestCase {
 
@@ -34,6 +33,7 @@ public class AutoDetectParserTest extend
     private static final String HTML       = "text/html";
     private static final String PDF        = "application/pdf";
     private static final String POWERPOINT = "application/vnd.ms-powerpoint";
+    private static final String KEYNOTE    = "application/vnd.apple.keynote";
     private static final String RTF        = "application/rtf";
     private static final String PLAINTEXT  = "text/plain";
     private static final String WORD       = "application/msword";
@@ -123,6 +123,10 @@ public class AutoDetectParserTest extend
         assertAutoDetect(resource, badResource, type, wrongMimeType, content);
     }
 
+    public void testKeynote() throws Exception {
+        assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
+    }
+
     public void testEpub() throws Exception {
         assertAutoDetect(
                 "testEPUB.epub", "application/epub+zip",

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Wed May 12 16:38:17 2010
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import junit.framework.TestCase;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import java.io.InputStream;
+
+/**
+ * Tests for {@link IWorkParser}, run against the sample Keynote and Pages
+ * documents stored under <code>/test-documents</code> on the test classpath.
+ */
+public class IWorkParserTest extends TestCase {
+
+    private IWorkParser iWorkParser;
+
+    @Override
+    protected void setUp() throws Exception {
+        iWorkParser = new IWorkParser();
+    }
+
+    /**
+     * Parses the sample Keynote presentation and verifies the extracted
+     * metadata and body text.
+     */
+    public void testParseKeynote() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        parse("/test-documents/testKeynote.key", handler, metadata);
+
+        assertEquals(6, metadata.size());
+        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2", metadata.get(Metadata.SLIDE_COUNT));
+        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+        assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache tika", metadata.get(Metadata.TITLE));
+
+        String content = handler.toString();
+        assertTrue(content.contains("A sample presentation"));
+        assertTrue(content.contains("For the Apache Tika project"));
+        assertTrue(content.contains("Slide 1"));
+        // TODO(review): this check was already disabled when the test was added;
+        // presumably the text is not extracted yet -- confirm before enabling.
+        //assertTrue(content.contains("Some random text for the sake of testability."));
+        assertTrue(content.contains("A nice comment"));
+        assertTrue(content.contains("A nice note"));
+    }
+
+    /**
+     * Parses the sample Pages document and verifies the extracted metadata.
+     */
+    public void testParsePages() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        parse("/test-documents/testPages.pages", handler, metadata);
+
+        assertEquals(51, metadata.size());
+        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache tika", metadata.get(Metadata.TITLE));
+        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("en", metadata.get(Metadata.LANGUAGE));
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+    }
+
+    /**
+     * Runs the parser under test over a classpath resource with a fresh
+     * {@link ParseContext}, closing the stream afterwards. The calling test
+     * fails immediately if the resource cannot be found.
+     *
+     * @param resource classpath location of the test document
+     * @param handler receives the extracted content
+     * @param metadata receives the extracted metadata
+     * @throws Exception if parsing fails or the stream cannot be closed
+     */
+    private void parse(
+            String resource, ContentHandler handler, Metadata metadata)
+            throws Exception {
+        InputStream input =
+                IWorkParserTest.class.getResourceAsStream(resource);
+        assertNotNull("Test document not found: " + resource, input);
+        try {
+            iWorkParser.parse(input, handler, metadata, new ParseContext());
+        } finally {
+            // The stream is opened here, so it is closed here as well.
+            input.close();
+        }
+    }
+
+}

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key?rev=943569&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages?rev=943569&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream