You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/05/12 18:38:17 UTC
svn commit: r943569 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/parser/
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/iwork/
tika-parsers/src/main/java/org/apache/tika/parser/xml/ tik...
Author: jukka
Date: Wed May 12 16:38:17 2010
New Revision: 943569
URL: http://svn.apache.org/viewvc?rev=943569&view=rev
Log:
TIKA-402: Support for Keynote and Pages documents
Based on the patch by Martijn van Groningen.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java (with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (with props)
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (with props)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages (with props)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java Wed May 12 16:38:17 2010
@@ -19,6 +19,16 @@ package org.apache.tika.parser;
import java.util.HashMap;
import java.util.Map;
+import javax.xml.XMLConstants;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
+
/**
* Parse context. Used to pass context information to Tika parsers.
*
@@ -48,4 +58,60 @@ public class ParseContext {
}
}
+    /**
+     * Looks up the SAX parser to use for this parsing context.
+     * <p>
+     * A {@link SAXParser} instance stored in this context takes precedence.
+     * When none is stored, a new parser is created from the factory returned
+     * by {@link #getSAXParserFactory()}.
+     *
+     * @see #getSAXParserFactory()
+     * @since Apache Tika 0.8
+     * @return SAX parser
+     * @throws TikaException if a SAX parser could not be created
+     */
+    public SAXParser getSAXParser() throws TikaException {
+        SAXParser configured = get(SAXParser.class);
+        if (configured != null) {
+            return configured;
+        }
+
+        // Nothing was configured explicitly, so build a parser on demand.
+        try {
+            SAXParserFactory parserFactory = getSAXParserFactory();
+            return parserFactory.newSAXParser();
+        } catch (ParserConfigurationException e) {
+            throw new TikaException("Unable to configure a SAX parser", e);
+        } catch (SAXException e) {
+            throw new TikaException("Unable to create a SAX parser", e);
+        }
+    }
+
+    /**
+     * Looks up the SAX parser factory to use for this parsing context.
+     * <p>
+     * A {@link SAXParserFactory} instance stored in this context is returned
+     * as-is. Otherwise a fresh factory is created, made namespace-aware and,
+     * where the underlying XML parser allows it, switched to
+     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+     *
+     * @since Apache Tika 0.8
+     * @return SAX parser factory
+     */
+    public SAXParserFactory getSAXParserFactory() {
+        SAXParserFactory configured = get(SAXParserFactory.class);
+        if (configured != null) {
+            return configured;
+        }
+
+        SAXParserFactory fallback = SAXParserFactory.newInstance();
+        fallback.setNamespaceAware(true);
+        try {
+            fallback.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+        } catch (ParserConfigurationException e) {
+            // Deliberately ignored, see the TIKA-271 note below.
+        } catch (SAXNotSupportedException e) {
+            // Deliberately ignored, see the TIKA-271 note below.
+        } catch (SAXNotRecognizedException e) {
+            // TIKA-271: Some XML parsers do not support the
+            // secure-processing feature, even though it's required by
+            // JAXP in Java 5. Ignoring the exception is fine here, as
+            // deployments without this feature are inherently vulnerable
+            // to XML denial-of-service attacks.
+        }
+        return fallback;
+    }
+
}
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Wed May 12 16:38:17 2010
@@ -556,6 +556,17 @@
<mime-type type="application/vnd.apple.installer+xml">
<glob pattern="*.mpkg"/>
</mime-type>
+ <!-- iWork documents. Pages and Numbers are declared as aliases of the
+      Keynote type, so the three glob patterns below all map to this single
+      entry, which is a sub-class of application/zip. NOTE(review): the magic
+      value looks like the leading bytes of a ZIP local file header
+      (PK 03 04 ...); confirm it is specific enough to tell iWork files from
+      other ZIP archives. -->
+ <mime-type type="application/vnd.apple.keynote">
+ <sub-class-of type="application/zip"/>
+ <alias type="application/vnd.apple.pages"/>
+ <alias type="application/vnd.apple.numbers"/>
+ <magic priority="40">
+ <match value="0x504b0304140000000000" type="string" offset="0"/>
+ </magic>
+ <glob pattern="*.key"/>
+ <glob pattern="*.pages"/>
+ <glob pattern="*.numbers"/>
+ </mime-type>
<mime-type type="application/vnd.arastra.swi">
<glob pattern="*.swi"/>
</mime-type>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java Wed May 12 16:38:17 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * A parser for the IWork formats.
+ * <p>
+ * The input is read as a ZIP archive. The entry named <code>index.apxl</code>
+ * is handled as a Keynote document and the entry named
+ * <code>index.xml</code> as a Pages document; all other entries are skipped.
+ * <p>
+ * Currently supported formats:
+ * <ol>
+ * <li>Keynote format version 2.x. Currently only tested with Keynote version 5.x
+ * <li>Pages format version 1.x. Currently only tested with Pages version 4.0.x
+ * </ol>
+ */
+public class IWorkParser implements Parser {
+
+    private final static Set<MediaType> supportedTypes =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.application("vnd.apple.keynote"),
+                MediaType.application("vnd.apple.pages"),
+                MediaType.application("vnd.apple.numbers")
+        )));
+
+    /**
+     * Returns the media types this parser is registered for.
+     *
+     * @param context parsing context (not consulted)
+     * @return unmodifiable set of the Keynote, Pages and Numbers media types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return supportedTypes;
+    }
+
+    /**
+     * Parses an iWork document by walking the entries of its ZIP container
+     * and feeding the index entry to the matching SAX content handler.
+     *
+     * @param stream   the document stream; it is wrapped in a
+     *                 {@link CloseShieldInputStream} before being handed to
+     *                 the ZIP reader
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the extracted metadata; the content type is
+     *                 only set when it is not already present
+     * @param context  parsing context used to obtain the SAX parser
+     * @throws IOException   if the ZIP container cannot be read
+     * @throws SAXException  if the index XML cannot be processed
+     * @throws TikaException if a SAX parser could not be created
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        ZipInputStream zip =
+            new ZipInputStream(new CloseShieldInputStream(stream));
+        try {
+            ZipEntry entry = zip.getNextEntry();
+            while (entry != null) {
+                if ("index.apxl".equals(entry.getName())) {
+                    parseIndexEntry(
+                            zip, "application/vnd.apple.keynote",
+                            new KeynoteContentHandler(xhtml, metadata),
+                            metadata, context);
+                } else if ("index.xml".equals(entry.getName())) {
+                    // TODO: Numbers has index.xml as well. Therefore the filename
+                    // cannot be used for detecting type. The xml file should be
+                    // sniffed before determining the extractor
+                    parseIndexEntry(
+                            zip, "application/vnd.apple.pages",
+                            new PagesContentHandler(xhtml, metadata),
+                            metadata, context);
+                }
+                entry = zip.getNextEntry();
+            }
+        } finally {
+            // Close the ZIP reader even when parsing an entry fails, so it
+            // is released deterministically rather than left open.
+            zip.close();
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Records the default content type (unless one is already set) and runs
+     * the SAX parser over the current ZIP entry.
+     *
+     * @param entryStream  stream positioned at the index entry; shielded so
+     *                     that the SAX parser cannot close the ZIP stream
+     * @param defaultType  content type to record when none is set yet
+     * @param indexHandler handler that interprets the index XML
+     * @param metadata     document metadata
+     * @param context      parsing context used to obtain the SAX parser
+     */
+    private static void parseIndexEntry(
+            InputStream entryStream, String defaultType,
+            ContentHandler indexHandler, Metadata metadata,
+            ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (metadata.get(Metadata.CONTENT_TYPE) == null) {
+            metadata.set(Metadata.CONTENT_TYPE, defaultType);
+        }
+
+        context.getSAXParser().parse(
+                new CloseShieldInputStream(entryStream),
+                new OfflineContentHandler(indexHandler));
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java Wed May 12 16:38:17 2010
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that converts the XML events of a Keynote index document into
+ * XHTML output and document metadata.
+ * <p>
+ * Each <code>key:slide</code> becomes a <code>div</code>; title placeholders
+ * become <code>h1</code> elements; sticky notes, presenter notes and body
+ * placeholders become <code>p</code> elements. Text is only emitted while
+ * inside both a slide and an <code>sf:text-body</code> element. The slide
+ * count, the presentation size, the title and the authors are written to the
+ * metadata object.
+ */
+class KeynoteContentHandler extends DefaultHandler {
+
+    /** Metadata key under which the slide width is stored. */
+    public final static String PRESENTATION_WIDTH = "slides-width";
+
+    /** Metadata key under which the slide height is stored. */
+    public final static String PRESENTATION_HEIGHT = "slides-height";
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    // Position flags for the element currently being processed.
+    private boolean inSlide = false;
+    private boolean inTheme = false;
+
+    private boolean inMetadata = false;
+    private boolean inMetaDataTitle = false;
+    private boolean inMetaDataAuthors = false;
+
+    private boolean inParsableText = false;
+
+    private int numberOfSlides = 0;
+
+    /**
+     * @param xhtml    target for the generated XHTML events
+     * @param metadata target for the extracted metadata
+     */
+    KeynoteContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Stores the number of slides seen once the whole document has been read.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.SLIDE_COUNT, String.valueOf(numberOfSlides));
+    }
+
+    /**
+     * Opens the XHTML element that corresponds to the Keynote element and
+     * updates the position flags. Elements are matched on their qualified
+     * name; the first matching branch wins, so the branch order matters.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = true;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = true;
+            numberOfSlides++;
+            xhtml.startElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            xhtml.startElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.startElement("p");
+        } else if ("key:size".equals(qName) && !inTheme) {
+            // Size elements inside the theme are skipped; only a size seen
+            // outside of it is recorded as the presentation size.
+            String width = attributes.getValue("sfa:w");
+            String height = attributes.getValue("sfa:h");
+            metadata.set(PRESENTATION_WIDTH, width);
+            metadata.set(PRESENTATION_HEIGHT, height);
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = true;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = true;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = true;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = true;
+        } else if (inMetaDataTitle && "key:string".equals(qName)) {
+            metadata.set(Metadata.TITLE, attributes.getValue("sfa:string"));
+        } else if (inMetaDataAuthors && "key:string".equals(qName)) {
+            metadata.add(Metadata.AUTHOR, attributes.getValue("sfa:string"));
+        }
+    }
+
+    /**
+     * Closes the XHTML element opened by the matching
+     * {@link #startElement(String, String, String, Attributes)} call and
+     * clears the corresponding position flag.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if ("key:theme".equals(qName)) {
+            inTheme = false;
+        } else if ("key:slide".equals(qName)) {
+            inSlide = false;
+            xhtml.endElement("div");
+        } else if ("key:title-placeholder".equals(qName) && inSlide) {
+            xhtml.endElement("h1");
+        } else if ("sf:sticky-note".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:notes".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("key:body-placeholder".equals(qName) && inSlide) {
+            xhtml.endElement("p");
+        } else if ("sf:text-body".equals(qName)) {
+            inParsableText = false;
+        } else if ("key:metadata".equals(qName)) {
+            inMetadata = false;
+        } else if (inMetadata && "key:title".equals(qName)) {
+            inMetaDataTitle = false;
+        } else if (inMetadata && "key:authors".equals(qName)) {
+            inMetaDataAuthors = false;
+        }
+    }
+
+    /**
+     * Forwards character data to the XHTML output, but only for text bodies
+     * that belong to a slide. Each chunk is trimmed and empty chunks are
+     * dropped.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (!inParsableText || !inSlide) {
+            return;
+        }
+
+        String text = new String(ch, start, length).trim();
+        if (text.length() != 0) {
+            xhtml.characters(text);
+        }
+    }
+
+}
\ No newline at end of file
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/KeynoteContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java Wed May 12 16:38:17 2010
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that converts the XML events of a Pages index document into
+ * XHTML output and document metadata.
+ * <p>
+ * Every <code>sf:page-start</code> opens a new <code>div</code>, paragraphs
+ * (<code>sf:p</code>) after the first page start become <code>p</code>
+ * elements, and tabular attachments are collected while they are read and
+ * written out as a <code>table</code> when an <code>sf:attachment-ref</code>
+ * refers to them.
+ */
+class PagesContentHandler extends DefaultHandler {
+
+    /**
+     * Number of cells per row assumed when the table grid did not declare a
+     * usable column count.
+     */
+    private static final int DEFAULT_NUMBER_OF_COLUMNS = 3;
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    private boolean inMetaDataPart = false;
+    private boolean parseProperty = false;
+    private boolean inParsableText = false;
+    private int pageCount = 0;
+
+    // Rows of every tabular attachment seen so far, keyed by attachment id.
+    private Map<String, List<List<String>>> tableData =
+        new HashMap<String, List<List<String>>>();
+    private String activeTableId;
+    // Column count from the most recent sf:grid element; 0 when unknown.
+    private int numberOfColumns = 0;
+    private List<String> activeRow = new ArrayList<String>();
+
+    private String metaDataLocalName;
+    private String metaDataQName;
+
+    /**
+     * @param xhtml    target for the generated XHTML events
+     * @param metadata target for the extracted metadata
+     */
+    PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Stores the page count and closes the <code>div</code> of the last page,
+     * if any page was started.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));
+        if (pageCount > 0) {
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Handles metadata values, page and paragraph boundaries and table
+     * attachments. Elements are matched on their qualified name.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+        if (parseProperty) {
+            // The previous metadata element is still open, so this element
+            // may carry its value.
+            String value = parsePrimitiveElementValue(qName, attributes);
+            if (value != null) {
+                String metaDataKey = resolveMetaDataKey(metaDataLocalName);
+                metadata.add(metaDataKey, value);
+            }
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inMetaDataPart = true;
+        } else if ("sf:metadata".equals(qName)) {
+            inMetaDataPart = true;
+        } else if ("sf:page-start".equals(qName)) {
+            if (pageCount > 0) {
+                xhtml.endElement("div");
+            }
+            xhtml.startElement("div");
+            pageCount++;
+        } else if ("sf:p".equals(qName) && pageCount > 0) {
+            inParsableText = true;
+            xhtml.startElement("p");
+        } else if ("sf:attachment".equals(qName)) {
+            String kind = attributes.getValue("sf:kind");
+            if ("tabular-attachment".equals(kind)) {
+                activeTableId = attributes.getValue("sfa:ID");
+                tableData.put(activeTableId, new ArrayList<List<String>>());
+            }
+        } else if ("sf:attachment-ref".equals(qName)) {
+            String idRef = attributes.getValue("sfa:IDREF");
+            outputTable(idRef);
+        }
+
+        if (activeTableId != null) {
+            parseTableData(qName, attributes);
+        }
+
+        if (inMetaDataPart) {
+            metaDataLocalName = localName;
+            metaDataQName = qName;
+            parseProperty = true;
+        }
+    }
+
+    /**
+     * Clears the state that was set up by the matching
+     * {@link #startElement(String, String, String, Attributes)} call.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+        if (metaDataLocalName != null && metaDataLocalName.equals(localName)) {
+            metaDataLocalName = null;
+            parseProperty = false;
+        }
+
+        if ("sl:publication-info".equals(qName)) {
+            inMetaDataPart = false;
+        } else if ("sf:metadata".equals(qName)) {
+            inMetaDataPart = false;
+        } else if ("sf:p".equals(qName) && pageCount > 0) {
+            inParsableText = false;
+            xhtml.endElement("p");
+        } else if ("sf:attachment".equals(qName)) {
+            activeTableId = null;
+        }
+    }
+
+    /**
+     * Forwards paragraph text to the XHTML output. Each chunk is trimmed and
+     * empty chunks are dropped.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (!inParsableText) {
+            return;
+        }
+
+        String text = new String(ch, start, length).trim();
+        if (text.length() != 0) {
+            xhtml.characters(text);
+        }
+    }
+
+    /**
+     * Collects the cells of the active table. The <code>sf:grid</code>
+     * element supplies the column count and every <code>sf:ct</code> element
+     * supplies one cell; a row is complete once it holds as many cells as
+     * the grid has columns.
+     *
+     * @param qName      qualified name of the element being started
+     * @param attributes attributes of that element
+     */
+    private void parseTableData(String qName, Attributes attributes) {
+        if ("sf:grid".equals(qName)) {
+            numberOfColumns =
+                parseColumnCount(attributes.getValue("sf:numcols"));
+        } else if ("sf:ct".equals(qName)) {
+            activeRow.add(attributes.getValue("sfa:s"));
+
+            // Break rows at the declared column count; fall back to the
+            // default width when the grid gave no usable value.
+            int rowWidth = numberOfColumns > 0
+                ? numberOfColumns : DEFAULT_NUMBER_OF_COLUMNS;
+            if (activeRow.size() >= rowWidth) {
+                tableData.get(activeTableId).add(activeRow);
+                activeRow = new ArrayList<String>();
+            }
+        }
+    }
+
+    /**
+     * Parses the column count attribute of a grid element.
+     *
+     * @param value raw attribute value, possibly <code>null</code>
+     * @return the parsed count, or 0 when the value is missing or not a
+     *         valid integer
+     */
+    private static int parseColumnCount(String value) {
+        try {
+            return Integer.parseInt(value);
+        } catch (NumberFormatException e) {
+            // Missing or malformed attribute: report "unknown" instead of
+            // aborting the whole parse with an unchecked exception.
+            return 0;
+        }
+    }
+
+    /**
+     * Writes the collected rows of the referenced table as an XHTML table.
+     * Nothing is written when no table with the given id has been collected.
+     *
+     * @param idRef id of the table to output
+     */
+    private void outputTable(String idRef) throws SAXException {
+        List<List<String>> tableData = this.tableData.get(idRef);
+        if (tableData != null) {
+            xhtml.startElement("table");
+            for (List<String> row : tableData) {
+                xhtml.startElement("tr");
+                for (String cell : row) {
+                    xhtml.element("td", cell);
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("table");
+        }
+    }
+
+    /**
+     * Returns a resolved key that is common in other document types or
+     * returns the specified metaDataLocalName if no common key could be found.
+     * The lookup itself is based on the qualified name of the metadata
+     * element that was recorded most recently.
+     *
+     * @param metaDataLocalName The localname of the element containing metadata
+     * @return a resolved key that is common in other document types
+     */
+    private String resolveMetaDataKey(String metaDataLocalName) {
+        String metaDataKey = metaDataLocalName;
+        if ("sf:authors".equals(metaDataQName)) {
+            metaDataKey = Metadata.AUTHOR;
+        } else if ("sf:title".equals(metaDataQName)) {
+            metaDataKey = Metadata.TITLE;
+        } else if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
+            metaDataKey = Metadata.CREATION_DATE;
+        } else if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
+            metaDataKey = Metadata.LAST_MODIFIED;
+        } else if ("sl:language".equals(metaDataQName)) {
+            metaDataKey = Metadata.LANGUAGE;
+        }
+        return metaDataKey;
+    }
+
+    /**
+     * Returns the value of a primitive element e.g.:
+     * <ul>
+     * <li>{@code <sl:number sfa:number="0" sfa:type="f"/>} - the number attribute
+     * <li>{@code <sl:string sfa:string="en"/>} - the string attribute
+     * </ul>
+     * <p>
+     * Returns <code>null</code> if the value could not be extracted from
+     * the list of attributes.
+     *
+     * @param qName The fully qualified name of the element containing
+     *              the value to extract
+     * @param attributes The list of attributes of which one contains the
+     *                   value to be extracted
+     * @return the value of a primitive element
+     */
+    private String parsePrimitiveElementValue(
+            String qName, Attributes attributes) {
+        if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
+            return attributes.getValue("sfa:string");
+        } else if ("sl:number".equals(qName)) {
+            return attributes.getValue("sfa:number");
+        } else if ("sl:date".equals(qName)) {
+            return attributes.getValue("sf:val");
+        }
+
+        return null;
+    }
+
+}
\ No newline at end of file
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/PagesContentHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java Wed May 12 16:38:17 2010
@@ -16,18 +16,6 @@
*/
package org.apache.tika.parser.xml;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.xml.XMLConstants;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
@@ -39,25 +27,16 @@ import org.apache.tika.sax.TextContentHa
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
/**
* XML parser.
- * <p>
- * This class uses the following parsing context entries:
- * <dl>
- * <dt>javax.xml.parsers.SAXParser</dt>
- * <dd>
- * The SAX parser ({@link SAXParser} instance) to be used for parsing
- * the XML input documents. Optional.
- * </dd>
- * <dt>javax.xml.parsers.SAXParserFactory</dt>
- * <dd>
- * The SAX parser factory ({@link SAXParserFactory} instance) used to
- * create a SAX parser if one has not been explicitly specified. Optional.
- * </dd>
- * </dl>
*/
public class XMLParser implements Parser {
@@ -83,7 +62,7 @@ public class XMLParser implements Parser
xhtml.startDocument();
xhtml.startElement("p");
- getSAXParser(context).parse(
+ context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(
getContentHandler(handler, metadata)));
@@ -106,71 +85,4 @@ public class XMLParser implements Parser
return new TextContentHandler(handler);
}
- /**
- * Returns the SAX parser specified in the parsing context. If a parse
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser factory.
- *
- * @see #getSAXParserFactory()
- * @param context parsing context
- * @return SAX parser
- * @throws TikaException if a SAX parser could not be created
- */
- private SAXParser getSAXParser(ParseContext context)
- throws TikaException {
- SAXParser parser = context.get(SAXParser.class);
- if (parser instanceof SAXParser) {
- return parser;
- } else {
- try {
- return getSAXParserFactory(context).newSAXParser();
- } catch (ParserConfigurationException e) {
- throw new TikaException("Unable to configure a SAX parser", e);
- } catch (SAXException e) {
- throw new TikaException("Unable to create a SAX parser", e);
- }
- }
- }
-
- /**
- * Returns the SAX parser factory specified in the parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned.
- *
- * @see #getDefaultSAXParserFactory()
- * @param context parsing context
- * @return SAX parser factory
- */
- private SAXParserFactory getSAXParserFactory(ParseContext context) {
- SAXParserFactory factory = context.get(SAXParserFactory.class);
- if (factory != null) {
- return factory;
- } else {
- return getDefaultSAXParserFactory();
- }
- }
-
- /**
- * Creates and returns a default SAX parser factory. The factory is
- * configured to be namespace-aware and to use secure XML processing.
- *
- * @see XMLConstants#FEATURE_SECURE_PROCESSING
- * @return default SAX parser factory
- */
- private SAXParserFactory getDefaultSAXParserFactory() {
- SAXParserFactory factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- try {
- factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
- } catch (ParserConfigurationException e) {
- } catch (SAXNotSupportedException e) {
- } catch (SAXNotRecognizedException e) {
- // TIKA-271: Some XML parsers do not support the secure-processing
- // feature, even though it's required by JAXP in Java 5. Ignoring
- // the exception is fine here, as deployments without this feature
- // are inherently vulnerable to XML denial-of-service attacks.
- }
- return factory;
- }
-
}
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed May 12 16:38:17 2010
@@ -32,3 +32,4 @@ org.apache.tika.parser.rtf.RTFParser
org.apache.tika.parser.txt.TXTParser
org.apache.tika.parser.video.FLVParser
org.apache.tika.parser.xml.DcXMLParser
+org.apache.tika.parser.iwork.IWorkParser
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=943569&r1=943568&r2=943569&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Wed May 12 16:38:17 2010
@@ -16,15 +16,14 @@
*/
package org.apache.tika.parser;
-import java.io.IOException;
-import java.io.InputStream;
-
+import junit.framework.TestCase;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
-import junit.framework.TestCase;
+import java.io.IOException;
+import java.io.InputStream;
public class AutoDetectParserTest extends TestCase {
@@ -34,6 +33,7 @@ public class AutoDetectParserTest extend
private static final String HTML = "text/html";
private static final String PDF = "application/pdf";
private static final String POWERPOINT = "application/vnd.ms-powerpoint";
+ private static final String KEYNOTE = "application/vnd.apple.keynote";
private static final String RTF = "application/rtf";
private static final String PLAINTEXT = "text/plain";
private static final String WORD = "application/msword";
@@ -123,6 +123,10 @@ public class AutoDetectParserTest extend
assertAutoDetect(resource, badResource, type, wrongMimeType, content);
}
+    /**
+     * Runs the auto-detection check on the sample Keynote document, passing
+     * the resource name, the Keynote media type and a text snippet to
+     * {@code assertAutoDetect}.
+     */
+    public void testKeynote() throws Exception {
+        final String resource = "testKeynote.key";
+        final String snippet = "A sample presentation";
+        assertAutoDetect(resource, KEYNOTE, snippet);
+    }
+
public void testEpub() throws Exception {
assertAutoDetect(
"testEPUB.epub", "application/epub+zip",
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=943569&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Wed May 12 16:38:17 2010
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.iwork;
+
+import junit.framework.TestCase;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import java.io.InputStream;
+
+/**
+ * Tests {@link IWorkParser} against the sample Keynote and Pages documents.
+ */
+public class IWorkParserTest extends TestCase {
+    private IWorkParser iWorkParser;
+
+    @Override
+    protected void setUp() throws Exception {
+        iWorkParser = new IWorkParser();
+    }
+
+    /** Checks the metadata and the extracted text of the sample Keynote presentation. */
+    public void testParseKeynote() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseDocument("/test-documents/testKeynote.key", metadata);
+
+        assertEquals(6, metadata.size());
+        assertEquals("application/vnd.apple.keynote", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("2", metadata.get(Metadata.SLIDE_COUNT));
+        assertEquals("1024", metadata.get(KeynoteContentHandler.PRESENTATION_WIDTH));
+        assertEquals("768", metadata.get(KeynoteContentHandler.PRESENTATION_HEIGHT));
+        assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache tika", metadata.get(Metadata.TITLE));
+
+        assertTrue(content.contains("A sample presentation"));
+        assertTrue(content.contains("For the Apache Tika project"));
+        assertTrue(content.contains("Slide 1"));
+        // Not asserted: the check for "Some random text for the sake of testability." was left disabled.
+        assertTrue(content.contains("A nice comment"));
+        assertTrue(content.contains("A nice note"));
+    }
+
+    /** Checks the metadata of the sample Pages document; the extracted text is not verified. */
+    public void testParsePages() throws Exception {
+        Metadata metadata = new Metadata();
+        parseDocument("/test-documents/testPages.pages", metadata);
+
+        assertEquals(51, metadata.size());
+        assertEquals("application/vnd.apple.pages", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Tika user", metadata.get(Metadata.AUTHOR));
+        assertEquals("Apache tika", metadata.get(Metadata.TITLE));
+        assertEquals("2010-05-09T21:34:38+0200", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("2010-05-09T23:50:36+0200", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("en", metadata.get(Metadata.LANGUAGE));
+        assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+    }
+
+    /** Parses the classpath resource into the given metadata, always closes the stream, and returns the handler's text. */
+    private String parseDocument(String resource, Metadata metadata) throws Exception {
+        InputStream input = IWorkParserTest.class.getResourceAsStream(resource);
+        assertNotNull("Test document not found: " + resource, input);
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            iWorkParser.parse(input, handler, metadata, new ParseContext());
+            return handler.toString();
+        } finally {
+            input.close();
+        }
+    }
+}
Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key?rev=943569&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testKeynote.key
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages?rev=943569&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPages.pages
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream