You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:28 UTC
[22/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
index 75b556c..a32d406 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
@@ -1,496 +1,496 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
-import javax.xml.namespace.QName;
-import javax.xml.parsers.SAXParser;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.BitSet;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.Stack;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.ElementMappingContentHandler;
-import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
-import org.apache.tika.sax.OfflineContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Parser for ODF <code>content.xml</code> files.
- */
-public class OpenDocumentContentParser extends AbstractParser {
- private interface Style {
- }
-
- private static class TextStyle implements Style {
- public boolean italic;
- public boolean bold;
- public boolean underlined;
- }
-
- private static class ListStyle implements Style {
- public boolean ordered;
-
- public String getTag() {
- return ordered ? "ol" : "ul";
- }
- }
-
- private static final class OpenDocumentElementMappingContentHandler extends
- ElementMappingContentHandler {
- private final ContentHandler handler;
- private final BitSet textNodeStack = new BitSet();
- private int nodeDepth = 0;
- private int completelyFiltered = 0;
- private Stack<String> headingStack = new Stack<String>();
- private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
- private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
- private TextStyle textStyle;
- private TextStyle lastTextStyle;
- private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
- private ListStyle listStyle;
-
- private OpenDocumentElementMappingContentHandler(ContentHandler handler,
- Map<QName, TargetElement> mappings) {
- super(handler, mappings);
- this.handler = handler;
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- // only forward content of tags from text:-namespace
- if (completelyFiltered == 0 && nodeDepth > 0
- && textNodeStack.get(nodeDepth - 1)) {
- lazyEndSpan();
- super.characters(ch, start, length);
- }
- }
-
- // helper for checking tags which need complete filtering
- // (with sub-tags)
- private boolean needsCompleteFiltering(
- String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI)) {
- return localName.endsWith("-template")
- || localName.endsWith("-style");
- }
- return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
- }
-
- // map the heading level to <hX> HTML tags
- private String getXHTMLHeaderTagName(Attributes atts) {
- String depthStr = atts.getValue(TEXT_NS, "outline-level");
- if (depthStr == null) {
- return "h1";
- }
-
- int depth = Integer.parseInt(depthStr);
- if (depth >= 6) {
- return "h6";
- } else if (depth <= 1) {
- return "h1";
- } else {
- return "h" + depth;
- }
- }
-
- /**
- * Check if a node is a text node
- */
- private boolean isTextNode(String namespaceURI, String localName) {
- if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
- return true;
- }
- if (SVG_NS.equals(namespaceURI)) {
- return "title".equals(localName) ||
- "desc".equals(localName);
- }
- return false;
- }
-
- private void startList(String name) throws SAXException {
- String elementName = "ul";
- if (name != null) {
- ListStyle style = listStyleMap.get(name);
- elementName = style != null ? style.getTag() : "ul";
- listStyleStack.push(style);
- }
- handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
- }
-
- private void endList() throws SAXException {
- String elementName = "ul";
- if (!listStyleStack.isEmpty()) {
- ListStyle style = listStyleStack.pop();
- elementName = style != null ? style.getTag() : "ul";
- }
- handler.endElement(XHTML, elementName, elementName);
- }
-
- private void startSpan(String name) throws SAXException {
- if (name == null) {
- return;
- }
-
- TextStyle style = textStyleMap.get(name);
- if (style == null) {
- return;
- }
-
- // End tags that refer to no longer valid styles
- if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
- handler.endElement(XHTML, "u", "u");
- }
- if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
- handler.endElement(XHTML, "i", "i");
- }
- if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
- handler.endElement(XHTML, "b", "b");
- }
-
- // Start tags for new styles
- if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
- handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
- }
- if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
- handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
- }
- if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
- handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
- }
-
- textStyle = style;
- lastTextStyle = null;
- }
-
- private void endSpan() throws SAXException {
- lastTextStyle = textStyle;
- textStyle = null;
- }
-
- private void lazyEndSpan() throws SAXException {
- if (lastTextStyle == null) {
- return;
- }
-
- if (lastTextStyle.underlined) {
- handler.endElement(XHTML, "u", "u");
- }
- if (lastTextStyle.italic) {
- handler.endElement(XHTML, "i", "i");
- }
- if (lastTextStyle.bold) {
- handler.endElement(XHTML, "b", "b");
- }
-
- lastTextStyle = null;
- }
-
- @Override
- public void startElement(
- String namespaceURI, String localName, String qName,
- Attributes attrs) throws SAXException {
- // keep track of current node type. If it is a text node,
- // a bit at the current depth its set in textNodeStack.
- // characters() checks the top bit to determine, if the
- // actual node is a text node to print out nodeDepth contains
- // the depth of the current node and also marks top of stack.
- assert nodeDepth >= 0;
-
- // Set styles
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- String family = attrs.getValue(STYLE_NS, "family");
- if ("text".equals(family)) {
- textStyle = new TextStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- textStyleMap.put(name, textStyle);
- }
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = new ListStyle();
- String name = attrs.getValue(STYLE_NS, "name");
- listStyleMap.put(name, listStyle);
- } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
- && "text-properties".equals(localName)) {
- String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
- if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
- textStyle.italic = true;
- }
- String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
- if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
- || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
- && Integer.valueOf(fontWeight) > 500)) {
- textStyle.bold = true;
- }
- String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
- if (underlineStyle != null) {
- textStyle.underlined = true;
- }
- } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
- if ("list-level-style-bullet".equals(localName)) {
- listStyle.ordered = false;
- } else if ("list-level-style-number".equals(localName)) {
- listStyle.ordered = true;
- }
- }
-
- textNodeStack.set(nodeDepth++,
- isTextNode(namespaceURI, localName));
- // filter *all* content of some tags
- assert completelyFiltered >= 0;
-
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered++;
- }
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
- handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- startList(attrs.getValue(TEXT_NS, "style-name"));
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- startSpan(attrs.getValue(TEXT_NS, "style-name"));
- } else {
- super.startElement(namespaceURI, localName, qName, attrs);
- }
- }
- }
-
- @Override
- public void endElement(
- String namespaceURI, String localName, String qName)
- throws SAXException {
- if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
- textStyle = null;
- } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
- listStyle = null;
- }
-
- // call next handler if no filtering
- if (completelyFiltered == 0) {
- // special handling of text:h, that are directly passed
- // to incoming handler
- if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.pop();
- handler.endElement(XHTMLContentHandler.XHTML, el, el);
- } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
- endList();
- } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
- endSpan();
- } else {
- if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
- lazyEndSpan();
- }
- super.endElement(namespaceURI, localName, qName);
- }
-
- // special handling of tabulators
- if (TEXT_NS.equals(namespaceURI)
- && ("tab-stop".equals(localName)
- || "tab".equals(localName))) {
- this.characters(TAB, 0, TAB.length);
- }
- }
-
- // revert filter for *all* content of some tags
- if (needsCompleteFiltering(namespaceURI, localName)) {
- completelyFiltered--;
- }
- assert completelyFiltered >= 0;
-
- // reduce current node depth
- nodeDepth--;
- assert nodeDepth >= 0;
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) {
- // remove prefix mappings as they should not occur in XHTML
- }
-
- @Override
- public void endPrefixMapping(String prefix) {
- // remove prefix mappings as they should not occur in XHTML
- }
- }
-
- public static final String TEXT_NS =
- "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
-
- public static final String TABLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
-
- public static final String STYLE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
-
- public static final String FORMATTING_OBJECTS_NS =
- "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
-
- public static final String OFFICE_NS =
- "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
-
- public static final String SVG_NS =
- "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
-
- public static final String PRESENTATION_NS =
- "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
-
- public static final String DRAW_NS =
- "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
-
- public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
-
- protected static final char[] TAB = new char[]{'\t'};
-
- private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
-
- /**
- * Mappings between ODF tag names and XHTML tag names
- * (including attributes). All other tag names/attributes are ignored
- * and left out from event stream.
- */
- private static final HashMap<QName, TargetElement> MAPPINGS =
- new HashMap<QName, TargetElement>();
-
- static {
- // general mappings of text:-tags
- MAPPINGS.put(
- new QName(TEXT_NS, "p"),
- new TargetElement(XHTML, "p"));
- // text:h-tags are mapped specifically in startElement/endElement
- MAPPINGS.put(
- new QName(TEXT_NS, "line-break"),
- new TargetElement(XHTML, "br"));
- MAPPINGS.put(
- new QName(TEXT_NS, "list-item"),
- new TargetElement(XHTML, "li"));
- MAPPINGS.put(
- new QName(TEXT_NS, "note"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(OFFICE_NS, "annotation"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(PRESENTATION_NS, "notes"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(DRAW_NS, "object"),
- new TargetElement(XHTML, "object"));
- MAPPINGS.put(
- new QName(DRAW_NS, "text-box"),
- new TargetElement(XHTML, "div"));
- MAPPINGS.put(
- new QName(SVG_NS, "title"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(SVG_NS, "desc"),
- new TargetElement(XHTML, "span"));
- MAPPINGS.put(
- new QName(TEXT_NS, "span"),
- new TargetElement(XHTML, "span"));
-
- final HashMap<QName, QName> aAttsMapping =
- new HashMap<QName, QName>();
- aAttsMapping.put(
- new QName(XLINK_NS, "href"),
- new QName("href"));
- aAttsMapping.put(
- new QName(XLINK_NS, "title"),
- new QName("title"));
- MAPPINGS.put(
- new QName(TEXT_NS, "a"),
- new TargetElement(XHTML, "a", aAttsMapping));
-
- // create HTML tables from table:-tags
- MAPPINGS.put(
- new QName(TABLE_NS, "table"),
- new TargetElement(XHTML, "table"));
- // repeating of rows is ignored; for columns, see below!
- MAPPINGS.put(
- new QName(TABLE_NS, "table-row"),
- new TargetElement(XHTML, "tr"));
- // special mapping for rowspan/colspan attributes
- final HashMap<QName, QName> tableCellAttsMapping =
- new HashMap<QName, QName>();
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-spanned"),
- new QName("colspan"));
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-rows-spanned"),
- new QName("rowspan"));
- /* TODO: The following is not correct, the cell should be repeated not spanned!
- * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
- * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
- * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
- * only for empty cells.
- */
- tableCellAttsMapping.put(
- new QName(TABLE_NS, "number-columns-repeated"),
- new QName("colspan"));
- MAPPINGS.put(
- new QName(TABLE_NS, "table-cell"),
- new TargetElement(XHTML, "td", tableCellAttsMapping));
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.emptySet(); // not a top-level parser
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- parseInternal(stream,
- new XHTMLContentHandler(handler, metadata),
- metadata, context);
- }
-
- void parseInternal(
- InputStream stream, final ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
-
-
- SAXParser parser = context.getSAXParser();
- parser.parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(
- new NSNormalizerContentHandler(dh)));
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import javax.xml.namespace.QName;
+import javax.xml.parsers.SAXParser;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+ private interface Style {
+ }
+
+ /**
+  * Holds the character formatting flags of one text style. All flags
+  * default to <code>false</code>.
+  */
+ private static class TextStyle implements Style {
+     /** Whether text in this style is rendered bold. */
+     public boolean bold;
+     /** Whether text in this style is rendered italic. */
+     public boolean italic;
+     /** Whether text in this style is rendered underlined. */
+     public boolean underlined;
+ }
+
+ /**
+  * Holds the single property of a list style that matters for the XHTML
+  * output: whether the list is ordered.
+  */
+ private static class ListStyle implements Style {
+     /** <code>true</code> for a numbered list, <code>false</code> for a bullet list. */
+     public boolean ordered;
+
+     /**
+      * @return <code>"ol"</code> for an ordered list, otherwise <code>"ul"</code>
+      */
+     public String getTag() {
+         if (ordered) {
+             return "ol";
+         }
+         return "ul";
+     }
+ }
+
+ private static final class OpenDocumentElementMappingContentHandler extends
+ ElementMappingContentHandler {
+ // Handler that receives the XHTML events written directly by this class
+ private final ContentHandler handler;
+
+ // One bit per open element (indexed by depth): set if the element is a
+ // text node; nodeDepth is the number of currently open elements
+ private final BitSet textNodeStack = new BitSet();
+ private int nodeDepth = 0;
+
+ // Number of currently open elements whose complete content is dropped
+ private int completelyFiltered = 0;
+
+ // XHTML heading tag names of the currently open text:h elements
+ private final Stack<String> headingStack = new Stack<String>();
+
+ // Styles collected from the document, keyed by their style:name
+ private final Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+ private final Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+
+ // Text style currently being defined or applied, and the style of the
+ // most recently ended span whose end tags have not been written yet
+ private TextStyle textStyle;
+ private TextStyle lastTextStyle;
+
+ // List styles of the currently open lists, and the list style
+ // currently being defined
+ private final Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+ private ListStyle listStyle;
+
+ /**
+  * Creates the mapping handler.
+  *
+  * @param handler  handler that receives the generated events
+  * @param mappings element mappings handed to the superclass
+  */
+ private OpenDocumentElementMappingContentHandler(
+         ContentHandler handler, Map<QName, TargetElement> mappings) {
+     super(handler, mappings);
+     // kept as well so that this class can write events to it directly
+     this.handler = handler;
+ }
+
+ /**
+  * Forwards character data only when no filtering is active and the
+  * innermost open element is a text node; everything else is dropped.
+  */
+ @Override
+ public void characters(char[] ch, int start, int length)
+         throws SAXException {
+     if (completelyFiltered != 0 || nodeDepth <= 0) {
+         return;
+     }
+     if (!textNodeStack.get(nodeDepth - 1)) {
+         return;
+     }
+     // write the pending end tags of the previous span before new text
+     lazyEndSpan();
+     super.characters(ch, start, length);
+ }
+
+ /**
+  * Tells whether an element must be dropped together with all of its
+  * sub-tags: text: elements whose local name ends with "-template" or
+  * "-style", and table:covered-table-cell.
+  */
+ private boolean needsCompleteFiltering(
+         String namespaceURI, String localName) {
+     if (!TEXT_NS.equals(namespaceURI)) {
+         return TABLE_NS.equals(namespaceURI)
+                 && "covered-table-cell".equals(localName);
+     }
+     if (localName.endsWith("-template")) {
+         return true;
+     }
+     return localName.endsWith("-style");
+ }
+
+ /**
+  * Maps the heading level of a text:h element to an XHTML heading tag.
+  *
+  * @param atts attributes of the text:h element
+  * @return "h1" to "h6"; levels outside that range are clamped, and a
+  *         missing or non-numeric text:outline-level yields "h1"
+  */
+ private String getXHTMLHeaderTagName(Attributes atts) {
+     String depthStr = atts.getValue(TEXT_NS, "outline-level");
+     if (depthStr == null) {
+         return "h1";
+     }
+
+     int depth;
+     try {
+         depth = Integer.parseInt(depthStr);
+     } catch (NumberFormatException e) {
+         // a malformed level in the document must not abort the whole
+         // parse with an unchecked exception; treat it like a missing one
+         return "h1";
+     }
+
+     if (depth >= 6) {
+         return "h6";
+     } else if (depth <= 1) {
+         return "h1";
+     } else {
+         return "h" + depth;
+     }
+ }
+
+ /**
+  * Check if a node is a text node: every text: element except
+  * page-number and page-count, plus svg:title and svg:desc.
+  */
+ private boolean isTextNode(String namespaceURI, String localName) {
+     if (TEXT_NS.equals(namespaceURI)) {
+         boolean pageField = localName.equals("page-number")
+                 || localName.equals("page-count");
+         return !pageField;
+     }
+     if (!SVG_NS.equals(namespaceURI)) {
+         return false;
+     }
+     return "title".equals(localName) || "desc".equals(localName);
+ }
+
+ /**
+  * Writes the start tag of a list: "ol" if the named list style is known
+  * and ordered, "ul" in every other case.
+  *
+  * @param name value of text:style-name, may be <code>null</code>
+  */
+ private void startList(String name) throws SAXException {
+     // a missing or unknown style name resolves to null
+     ListStyle style = name != null ? listStyleMap.get(name) : null;
+
+     // Always push an entry, even a null one. endList() pops whenever the
+     // stack is non-empty, so a list without a style name that is nested
+     // in a styled list would otherwise pop its parent's entry and write
+     // an end tag that does not match its own start tag.
+     listStyleStack.push(style);
+
+     String elementName = style != null ? style.getTag() : "ul";
+     handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+ }
+
+ /**
+  * Writes the end tag of a list, using the style on top of the list
+  * style stack; "ul" is used if the stack is empty or the entry is null.
+  */
+ private void endList() throws SAXException {
+     ListStyle closing = listStyleStack.isEmpty() ? null : listStyleStack.pop();
+     String tag = closing == null ? "ul" : closing.getTag();
+     handler.endElement(XHTML, tag, tag);
+ }
+
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
+
+ TextStyle style = textStyleMap.get(name);
+ if (style == null) {
+ return;
+ }
+
+ // End tags that refer to no longer valid styles
+ if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ // Start tags for new styles
+ if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+ handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+ }
+ if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
+ handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+ }
+ if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
+ handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+ }
+
+ textStyle = style;
+ lastTextStyle = null;
+ }
+
+ private void endSpan() throws SAXException {
+ lastTextStyle = textStyle;
+ textStyle = null;
+ }
+
+ private void lazyEndSpan() throws SAXException {
+ if (lastTextStyle == null) {
+ return;
+ }
+
+ if (lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ lastTextStyle = null;
+ }
+
+ /**
+  * Handles the start of an element: records style definitions, keeps
+  * track of the node depth and of the text node flag for the new
+  * element, and, unless filtering is active, writes the element either
+  * directly (text:h, text:list, text:span) or through the superclass.
+  */
+ @Override
+ public void startElement(
+         String namespaceURI, String localName, String qName,
+         Attributes attrs) throws SAXException {
+     // Keep track of the current node type. If it is a text node, the
+     // bit at the current depth is set in textNodeStack. characters()
+     // checks the top bit to determine whether the current node is a
+     // text node whose content is printed out. nodeDepth contains the
+     // depth of the current node and also marks the top of the stack.
+     assert nodeDepth >= 0;
+
+     // Set styles
+     if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+         String family = attrs.getValue(STYLE_NS, "family");
+         if ("text".equals(family)) {
+             textStyle = new TextStyle();
+             String name = attrs.getValue(STYLE_NS, "name");
+             textStyleMap.put(name, textStyle);
+         }
+     } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+         listStyle = new ListStyle();
+         String name = attrs.getValue(STYLE_NS, "name");
+         listStyleMap.put(name, listStyle);
+     } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+             && "text-properties".equals(localName)) {
+         String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+         if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+             textStyle.italic = true;
+         }
+         String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+         if (isBoldFontWeight(fontWeight)) {
+             textStyle.bold = true;
+         }
+         String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+         if (underlineStyle != null) {
+             textStyle.underlined = true;
+         }
+     } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+         if ("list-level-style-bullet".equals(localName)) {
+             listStyle.ordered = false;
+         } else if ("list-level-style-number".equals(localName)) {
+             listStyle.ordered = true;
+         }
+     }
+
+     textNodeStack.set(nodeDepth++,
+             isTextNode(namespaceURI, localName));
+     // filter *all* content of some tags
+     assert completelyFiltered >= 0;
+
+     if (needsCompleteFiltering(namespaceURI, localName)) {
+         completelyFiltered++;
+     }
+     // call next handler if no filtering
+     if (completelyFiltered == 0) {
+         // special handling of text:h, that are directly passed
+         // to incoming handler
+         if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+             final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+             handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+         } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+             startList(attrs.getValue(TEXT_NS, "style-name"));
+         } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+             startSpan(attrs.getValue(TEXT_NS, "style-name"));
+         } else {
+             super.startElement(namespaceURI, localName, qName, attrs);
+         }
+     }
+ }
+
+ /**
+  * Decides whether a font-weight value means bold: the keywords "bold"
+  * and "bolder", or a number greater than 500. Values that are missing,
+  * empty or not a plain number (for example "700px") are treated as not
+  * bold instead of failing the parse with an unchecked exception.
+  *
+  * @param fontWeight attribute value, may be <code>null</code>
+  * @return <code>true</code> if the value means bold
+  */
+ private static boolean isBoldFontWeight(String fontWeight) {
+     if (fontWeight == null || fontWeight.isEmpty()) {
+         return false;
+     }
+     if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)) {
+         return true;
+     }
+     if (!Character.isDigit(fontWeight.charAt(0))) {
+         return false;
+     }
+     try {
+         return Integer.parseInt(fontWeight) > 500;
+     } catch (NumberFormatException e) {
+         // starts with a digit but is not a plain integer
+         return false;
+     }
+ }
+
+ /**
+  * Handles the end of an element: closes style definitions, writes the
+  * matching end event unless filtering is active, adds a tab character
+  * for text:tab and text:tab-stop, and finally updates the filter
+  * counter and the node depth.
+  */
+ @Override
+ public void endElement(
+         String namespaceURI, String localName, String qName)
+         throws SAXException {
+     final boolean inTextNs = TEXT_NS.equals(namespaceURI);
+
+     // a style definition ends here
+     if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+         textStyle = null;
+     } else if (inTextNs && "list-style".equals(localName)) {
+         listStyle = null;
+     }
+
+     // call next handler if no filtering
+     if (completelyFiltered == 0) {
+         if (inTextNs && "h".equals(localName)) {
+             // text:h was written directly to the incoming handler,
+             // so its end tag is written there as well
+             final String heading = headingStack.pop();
+             handler.endElement(XHTMLContentHandler.XHTML, heading, heading);
+         } else if (inTextNs && "list".equals(localName)) {
+             endList();
+         } else if (inTextNs && "span".equals(localName)) {
+             endSpan();
+         } else {
+             if (inTextNs && "p".equals(localName)) {
+                 // write pending span end tags before the paragraph ends
+                 lazyEndSpan();
+             }
+             super.endElement(namespaceURI, localName, qName);
+         }
+
+         // special handling of tabulators
+         final boolean tabulator = "tab-stop".equals(localName)
+                 || "tab".equals(localName);
+         if (inTextNs && tabulator) {
+             this.characters(TAB, 0, TAB.length);
+         }
+     }
+
+     // revert filter for *all* content of some tags
+     if (needsCompleteFiltering(namespaceURI, localName)) {
+         completelyFiltered--;
+     }
+     assert completelyFiltered >= 0;
+
+     // reduce current node depth
+     nodeDepth--;
+     assert nodeDepth >= 0;
+ }
+
+ /**
+  * Intentionally empty: prefix mappings are removed, as they should not
+  * occur in XHTML.
+  */
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+     // nothing to do
+ }
+
+ /**
+  * Intentionally empty: prefix mappings are removed, as they should not
+  * occur in XHTML.
+  */
+ @Override
+ public void endPrefixMapping(String prefix) {
+     // nothing to do
+ }
+ }
+
+ /** URI of the OpenDocument text namespace. */
+ public static final String TEXT_NS = "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+ /** URI of the OpenDocument table namespace. */
+ public static final String TABLE_NS = "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+ /** URI of the OpenDocument style namespace. */
+ public static final String STYLE_NS = "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+ /** URI of the OpenDocument XSL-FO compatible namespace. */
+ public static final String FORMATTING_OBJECTS_NS = "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+ /** URI of the OpenDocument office namespace. */
+ public static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+ /** URI of the OpenDocument SVG compatible namespace. */
+ public static final String SVG_NS = "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+ /** URI of the OpenDocument presentation namespace. */
+ public static final String PRESENTATION_NS = "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+ /** URI of the OpenDocument drawing namespace. */
+ public static final String DRAW_NS = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+ /** URI of the XLink namespace. */
+ public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+ /** A single tab character, written for text:tab and text:tab-stop. */
+ protected static final char[] TAB = {'\t'};
+
+ /** Shared empty attribute list for elements written without attributes. */
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
+
+ /**
+  * Mappings between ODF tag names and XHTML tag names
+  * (including attributes). All other tag names/attributes are ignored
+  * and left out from event stream.
+  */
+ private static final HashMap<QName, TargetElement> MAPPINGS =
+         new HashMap<QName, TargetElement>();
+
+ static {
+     // general mappings of text:-tags
+     // (text:h-tags are mapped specifically in startElement/endElement)
+     MAPPINGS.put(new QName(TEXT_NS, "p"), new TargetElement(XHTML, "p"));
+     MAPPINGS.put(new QName(TEXT_NS, "line-break"), new TargetElement(XHTML, "br"));
+     MAPPINGS.put(new QName(TEXT_NS, "list-item"), new TargetElement(XHTML, "li"));
+     MAPPINGS.put(new QName(TEXT_NS, "note"), new TargetElement(XHTML, "div"));
+     MAPPINGS.put(new QName(OFFICE_NS, "annotation"), new TargetElement(XHTML, "div"));
+     MAPPINGS.put(new QName(PRESENTATION_NS, "notes"), new TargetElement(XHTML, "div"));
+     MAPPINGS.put(new QName(DRAW_NS, "object"), new TargetElement(XHTML, "object"));
+     MAPPINGS.put(new QName(DRAW_NS, "text-box"), new TargetElement(XHTML, "div"));
+     MAPPINGS.put(new QName(SVG_NS, "title"), new TargetElement(XHTML, "span"));
+     MAPPINGS.put(new QName(SVG_NS, "desc"), new TargetElement(XHTML, "span"));
+     MAPPINGS.put(new QName(TEXT_NS, "span"), new TargetElement(XHTML, "span"));
+
+     // links: xlink:href and xlink:title become plain href and title
+     final HashMap<QName, QName> linkAttributes = new HashMap<QName, QName>();
+     linkAttributes.put(new QName(XLINK_NS, "href"), new QName("href"));
+     linkAttributes.put(new QName(XLINK_NS, "title"), new QName("title"));
+     MAPPINGS.put(new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", linkAttributes));
+
+     // create HTML tables from table:-tags
+     MAPPINGS.put(new QName(TABLE_NS, "table"), new TargetElement(XHTML, "table"));
+     // repeating of rows is ignored; for columns, see below!
+     MAPPINGS.put(new QName(TABLE_NS, "table-row"), new TargetElement(XHTML, "tr"));
+
+     // special mapping for rowspan/colspan attributes
+     final HashMap<QName, QName> cellAttributes = new HashMap<QName, QName>();
+     cellAttributes.put(new QName(TABLE_NS, "number-columns-spanned"), new QName("colspan"));
+     cellAttributes.put(new QName(TABLE_NS, "number-rows-spanned"), new QName("rowspan"));
+     /* TODO: The following is not correct, the cell should be repeated not spanned!
+      * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
+      * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
+      * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
+      * only for empty cells.
+      */
+     cellAttributes.put(new QName(TABLE_NS, "number-columns-repeated"), new QName("colspan"));
+     MAPPINGS.put(new QName(TABLE_NS, "table-cell"), new TargetElement(XHTML, "td", cellAttributes));
+ }
+
+ /**
+  * @return always an empty set, as this is not a top-level parser
+  */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+     final Set<MediaType> none = Collections.emptySet();
+     return none;
+ }
+
+ /**
+  * Parses the stream, writing the result to the given handler wrapped
+  * in an {@link XHTMLContentHandler}.
+  */
+ public void parse(
+         InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext context)
+         throws IOException, SAXException, TikaException {
+     final ContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+     parseInternal(stream, xhtml, metadata, context);
+ }
+
+ void parseInternal(
+ InputStream stream, final ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+
+ SAXParser parser = context.getSAXParser();
+ parser.parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ new NSNormalizerContentHandler(dh)));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 4713022..14b9674 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -1,199 +1,199 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.DublinCore;
-import org.apache.tika.metadata.MSOffice;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
-import org.apache.tika.parser.xml.AttributeMetadataHandler;
-import org.apache.tika.parser.xml.ElementMetadataHandler;
-import org.apache.tika.parser.xml.MetadataHandler;
-import org.apache.tika.parser.xml.XMLParser;
-import org.apache.tika.sax.TeeContentHandler;
-import org.apache.tika.sax.xpath.CompositeMatcher;
-import org.apache.tika.sax.xpath.Matcher;
-import org.apache.tika.sax.xpath.MatchingContentHandler;
-import org.apache.tika.sax.xpath.XPathParser;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for OpenDocument <code>meta.xml</code> files.
- */
-public class OpenDocumentMetaParser extends XMLParser {
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -8739250869531737584L;
-
- private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
- private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
-
- /**
- * @see OfficeOpenXMLCore#SUBJECT
- * @deprecated use OfficeOpenXMLCore#SUBJECT
- */
- @Deprecated
- private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
- Property.composite(Office.INITIAL_AUTHOR,
- new Property[]{Property.externalText("initial-creator")});
-
- private static ContentHandler getDublinCoreHandler(
- Metadata metadata, Property property, String element) {
- return new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, element,
- metadata, property);
- }
-
- private static ContentHandler getMeta(
- ContentHandler ch, Metadata md, Property property, String element) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:" + element),
- META_XPATH.parse("//meta:" + element + "//text()"));
- ContentHandler branch =
- new MatchingContentHandler(new MetadataHandler(md, property), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- private static ContentHandler getUserDefined(
- ContentHandler ch, Metadata md) {
- Matcher matcher = new CompositeMatcher(
- META_XPATH.parse("//meta:user-defined/@meta:name"),
- META_XPATH.parse("//meta:user-defined//text()"));
- // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
- ContentHandler branch = new MatchingContentHandler(
- new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
- matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- @Deprecated
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, String name, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
- ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- private static ContentHandler getStatistic(
- ContentHandler ch, Metadata md, Property property, String attribute) {
- Matcher matcher =
- META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
- ContentHandler branch = new MatchingContentHandler(
- new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
- return new TeeContentHandler(ch, branch);
- }
-
- protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
- // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
- // Process the Dublin Core Attributes
- ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
- getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
- getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
- getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
- getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
- getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
- getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
- getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
- getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
- getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
- getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
-
- // Process the OO Meta Attributes
- ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
- // ODF uses dc:date for modified
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "date",
- md, TikaCoreProperties.MODIFIED));
-
- // ODF uses dc:subject for description
- ch = new TeeContentHandler(ch, new ElementMetadataHandler(
- DublinCore.NAMESPACE_URI_DC, "subject",
- md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
- ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
-
- ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
- ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
- ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
- ch = getMeta(ch, md, Property.externalText("generator"), "generator");
-
- // Process the user defined Meta Attributes
- ch = getUserDefined(ch, md);
-
- // Process the OO Statistics Attributes
- ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
- ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
- ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
- ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
-
- // Legacy, Tika-1.0 style attributes
- // TODO Remove these in Tika 2.0
- ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
- ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
- ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
- ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
- ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
- ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
- ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
-
- // Legacy Statistics Attributes, replaced with real keys above
- // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
- ch = getStatistic(ch, md, "nbPage", "page-count");
- ch = getStatistic(ch, md, "nbPara", "paragraph-count");
- ch = getStatistic(ch, md, "nbWord", "word-count");
- ch = getStatistic(ch, md, "nbCharacter", "character-count");
- ch = getStatistic(ch, md, "nbTab", "table-count");
- ch = getStatistic(ch, md, "nbObject", "object-count");
- ch = getStatistic(ch, md, "nbImg", "image-count");
-
- // Normalise the rest
- ch = new NSNormalizerContentHandler(ch);
- return ch;
- }
-
- @Override
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- super.parse(stream, handler, metadata, context);
- // Copy subject to description for OO2
- String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
- if (odfSubject != null && !odfSubject.equals("") &&
- (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
- metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.xml.AttributeDependantMetadataHandler;
+import org.apache.tika.parser.xml.AttributeMetadataHandler;
+import org.apache.tika.parser.xml.ElementMetadataHandler;
+import org.apache.tika.parser.xml.MetadataHandler;
+import org.apache.tika.parser.xml.XMLParser;
+import org.apache.tika.sax.TeeContentHandler;
+import org.apache.tika.sax.xpath.CompositeMatcher;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for OpenDocument <code>meta.xml</code> files.
+ */
+public class OpenDocumentMetaParser extends XMLParser {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -8739250869531737584L;
+
+ private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+ private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
+
+ /**
+ * @see OfficeOpenXMLCore#SUBJECT
+ * @deprecated use OfficeOpenXMLCore#SUBJECT
+ */
+ @Deprecated
+ private static final Property TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR =
+ Property.composite(Office.INITIAL_AUTHOR,
+ new Property[]{Property.externalText("initial-creator")});
+
+ private static ContentHandler getDublinCoreHandler(
+ Metadata metadata, Property property, String element) {
+ return new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, element,
+ metadata, property);
+ }
+
+ private static ContentHandler getMeta(
+ ContentHandler ch, Metadata md, Property property, String element) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:" + element),
+ META_XPATH.parse("//meta:" + element + "//text()"));
+ ContentHandler branch =
+ new MatchingContentHandler(new MetadataHandler(md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getUserDefined(
+ ContentHandler ch, Metadata md) {
+ Matcher matcher = new CompositeMatcher(
+ META_XPATH.parse("//meta:user-defined/@meta:name"),
+ META_XPATH.parse("//meta:user-defined//text()"));
+ // eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeDependantMetadataHandler(md, "meta:name", Metadata.USER_DEFINED_METADATA_NAME_PREFIX),
+ matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ @Deprecated
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, String name, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, name), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ private static ContentHandler getStatistic(
+ ContentHandler ch, Metadata md, Property property, String attribute) {
+ Matcher matcher =
+ META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(META_NS, attribute, md, property), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md, ParseContext context) {
+ // We can no longer extend DcXMLParser due to the handling of dc:subject and dc:date
+ // Process the Dublin Core Attributes
+ ch = new TeeContentHandler(super.getContentHandler(ch, md, context),
+ getDublinCoreHandler(md, TikaCoreProperties.TITLE, "title"),
+ getDublinCoreHandler(md, TikaCoreProperties.CREATOR, "creator"),
+ getDublinCoreHandler(md, TikaCoreProperties.DESCRIPTION, "description"),
+ getDublinCoreHandler(md, TikaCoreProperties.PUBLISHER, "publisher"),
+ getDublinCoreHandler(md, TikaCoreProperties.CONTRIBUTOR, "contributor"),
+ getDublinCoreHandler(md, TikaCoreProperties.TYPE, "type"),
+ getDublinCoreHandler(md, TikaCoreProperties.FORMAT, "format"),
+ getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
+ getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
+ getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
+
+ // Process the OO Meta Attributes
+ ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
+ // ODF uses dc:date for modified
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "date",
+ md, TikaCoreProperties.MODIFIED));
+
+ // ODF uses dc:subject for description
+ ch = new TeeContentHandler(ch, new ElementMetadataHandler(
+ DublinCore.NAMESPACE_URI_DC, "subject",
+ md, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT));
+ ch = getMeta(ch, md, TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, "keyword");
+
+ ch = getMeta(ch, md, Property.externalText(MSOffice.EDIT_TIME), "editing-duration");
+ ch = getMeta(ch, md, Property.externalText("editing-cycles"), "editing-cycles");
+ ch = getMeta(ch, md, TRANSITION_INITIAL_CREATOR_TO_INITIAL_AUTHOR, "initial-creator");
+ ch = getMeta(ch, md, Property.externalText("generator"), "generator");
+
+ // Process the user defined Meta Attributes
+ ch = getUserDefined(ch, md);
+
+ // Process the OO Statistics Attributes
+ ch = getStatistic(ch, md, Office.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, Office.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, Office.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, PagedText.N_PAGES, "page-count");
+ ch = getStatistic(ch, md, Office.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, Office.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, Office.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, Office.CHARACTER_COUNT, "character-count");
+
+ // Legacy, Tika-1.0 style attributes
+ // TODO Remove these in Tika 2.0
+ ch = getStatistic(ch, md, MSOffice.OBJECT_COUNT, "object-count");
+ ch = getStatistic(ch, md, MSOffice.IMAGE_COUNT, "image-count");
+ ch = getStatistic(ch, md, MSOffice.PAGE_COUNT, "page-count");
+ ch = getStatistic(ch, md, MSOffice.TABLE_COUNT, "table-count");
+ ch = getStatistic(ch, md, MSOffice.PARAGRAPH_COUNT, "paragraph-count");
+ ch = getStatistic(ch, md, MSOffice.WORD_COUNT, "word-count");
+ ch = getStatistic(ch, md, MSOffice.CHARACTER_COUNT, "character-count");
+
+ // Legacy Statistics Attributes, replaced with real keys above
+ // TODO Remove these shortly, eg after Tika 1.1 (TIKA-770)
+ ch = getStatistic(ch, md, "nbPage", "page-count");
+ ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+ ch = getStatistic(ch, md, "nbWord", "word-count");
+ ch = getStatistic(ch, md, "nbCharacter", "character-count");
+ ch = getStatistic(ch, md, "nbTab", "table-count");
+ ch = getStatistic(ch, md, "nbObject", "object-count");
+ ch = getStatistic(ch, md, "nbImg", "image-count");
+
+ // Normalise the rest
+ ch = new NSNormalizerContentHandler(ch);
+ return ch;
+ }
+
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ super.parse(stream, handler, metadata, context);
+ // Copy subject to description for OO2
+ String odfSubject = metadata.get(OfficeOpenXMLCore.SUBJECT);
+ if (odfSubject != null && !odfSubject.equals("") &&
+ (metadata.get(TikaCoreProperties.DESCRIPTION) == null || metadata.get(TikaCoreProperties.DESCRIPTION).equals(""))) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, odfSubject);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 2739340..00145d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -1,225 +1,225 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.odf;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipInputStream;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.EndDocumentShieldingContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * OpenOffice parser
- */
-public class OpenDocumentParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = -6410276875438618287L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.application("vnd.sun.xml.writer"),
- MediaType.application("vnd.oasis.opendocument.text"),
- MediaType.application("vnd.oasis.opendocument.graphics"),
- MediaType.application("vnd.oasis.opendocument.presentation"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("vnd.oasis.opendocument.chart"),
- MediaType.application("vnd.oasis.opendocument.image"),
- MediaType.application("vnd.oasis.opendocument.formula"),
- MediaType.application("vnd.oasis.opendocument.text-master"),
- MediaType.application("vnd.oasis.opendocument.text-web"),
- MediaType.application("vnd.oasis.opendocument.text-template"),
- MediaType.application("vnd.oasis.opendocument.graphics-template"),
- MediaType.application("vnd.oasis.opendocument.presentation-template"),
- MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("vnd.oasis.opendocument.chart-template"),
- MediaType.application("vnd.oasis.opendocument.image-template"),
- MediaType.application("vnd.oasis.opendocument.formula-template"),
- MediaType.application("x-vnd.oasis.opendocument.text"),
- MediaType.application("x-vnd.oasis.opendocument.graphics"),
- MediaType.application("x-vnd.oasis.opendocument.presentation"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
- MediaType.application("x-vnd.oasis.opendocument.chart"),
- MediaType.application("x-vnd.oasis.opendocument.image"),
- MediaType.application("x-vnd.oasis.opendocument.formula"),
- MediaType.application("x-vnd.oasis.opendocument.text-master"),
- MediaType.application("x-vnd.oasis.opendocument.text-web"),
- MediaType.application("x-vnd.oasis.opendocument.text-template"),
- MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
- MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
- MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
- MediaType.application("x-vnd.oasis.opendocument.chart-template"),
- MediaType.application("x-vnd.oasis.opendocument.image-template"),
- MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
-
- private static final String META_NAME = "meta.xml";
-
- private Parser meta = new OpenDocumentMetaParser();
-
- private Parser content = new OpenDocumentContentParser();
-
- public Parser getMetaParser() {
- return meta;
- }
-
- public void setMetaParser(Parser meta) {
- this.meta = meta;
- }
-
- public Parser getContentParser() {
- return content;
- }
-
- public void setContentParser(Parser content) {
- this.content = content;
- }
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler baseHandler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- // Open the Zip stream
- // Use a File if we can, and an already open zip is even better
- ZipFile zipFile = null;
- ZipInputStream zipStream = null;
- if (stream instanceof TikaInputStream) {
- TikaInputStream tis = (TikaInputStream) stream;
- Object container = ((TikaInputStream) stream).getOpenContainer();
- if (container instanceof ZipFile) {
- zipFile = (ZipFile) container;
- } else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
- } else {
- zipStream = new ZipInputStream(stream);
- }
- } else {
- zipStream = new ZipInputStream(stream);
- }
-
- // Prepare to handle the content
- XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
-
- // As we don't know which of the metadata or the content
- // we'll hit first, catch the endDocument call initially
- EndDocumentShieldingContentHandler handler =
- new EndDocumentShieldingContentHandler(xhtml);
-
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
- }
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
- }
- }
-
- // Only now call the end document
- if (handler.getEndDocumentWasCalled()) {
- handler.reallyEndDocument();
- }
- }
-
- private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
- ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
- handleZipEntry(entry, zipStream, metadata, context, handler);
- entry = zipStream.getNextEntry();
- }
- }
-
- private void handleZipFile(ZipFile zipFile, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, TikaException, SAXException {
- // If we can, process the metadata first, then the
- // rest of the file afterwards (TIKA-1353)
- // Only possible to guarantee that when opened from a file not a stream
-
- ZipEntry entry = zipFile.getEntry(META_NAME);
- if (entry != null) {
- handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
- }
-
- Enumeration<? extends ZipEntry> entries = zipFile.entries();
- while (entries.hasMoreElements()) {
- entry = entries.nextElement();
- if (!META_NAME.equals(entry.getName())) {
- handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler);
- }
- }
- }
- private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
- ParseContext context, EndDocumentShieldingContentHandler handler)
- throws IOException, SAXException, TikaException {
- if (entry == null) return;
-
- if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, UTF_8);
- metadata.set(Metadata.CONTENT_TYPE, type);
- } else if (entry.getName().equals(META_NAME)) {
- meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith("content.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
- }
- } else if (entry.getName().endsWith("styles.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip, handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
- }
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * OpenOffice parser
+ */
+public class OpenDocumentParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -6410276875438618287L;
+
+ /**
+ * Immutable set of media types returned by getSupportedTypes: the legacy
+ * <code>vnd.sun.xml.writer</code> type, the
+ * <code>vnd.oasis.opendocument.*</code> document and template types, and
+ * the matching <code>x-vnd.oasis.opendocument.*</code> variants.
+ */
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.sun.xml.writer"),
+ MediaType.application("vnd.oasis.opendocument.text"),
+ MediaType.application("vnd.oasis.opendocument.graphics"),
+ MediaType.application("vnd.oasis.opendocument.presentation"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("vnd.oasis.opendocument.chart"),
+ MediaType.application("vnd.oasis.opendocument.image"),
+ MediaType.application("vnd.oasis.opendocument.formula"),
+ MediaType.application("vnd.oasis.opendocument.text-master"),
+ MediaType.application("vnd.oasis.opendocument.text-web"),
+ MediaType.application("vnd.oasis.opendocument.text-template"),
+ MediaType.application("vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("vnd.oasis.opendocument.chart-template"),
+ MediaType.application("vnd.oasis.opendocument.image-template"),
+ MediaType.application("vnd.oasis.opendocument.formula-template"),
+ MediaType.application("x-vnd.oasis.opendocument.text"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet"),
+ MediaType.application("x-vnd.oasis.opendocument.chart"),
+ MediaType.application("x-vnd.oasis.opendocument.image"),
+ MediaType.application("x-vnd.oasis.opendocument.formula"),
+ MediaType.application("x-vnd.oasis.opendocument.text-master"),
+ MediaType.application("x-vnd.oasis.opendocument.text-web"),
+ MediaType.application("x-vnd.oasis.opendocument.text-template"),
+ MediaType.application("x-vnd.oasis.opendocument.graphics-template"),
+ MediaType.application("x-vnd.oasis.opendocument.presentation-template"),
+ MediaType.application("x-vnd.oasis.opendocument.spreadsheet-template"),
+ MediaType.application("x-vnd.oasis.opendocument.chart-template"),
+ MediaType.application("x-vnd.oasis.opendocument.image-template"),
+ MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+
+ /** Name of the zip entry that holds the document metadata. */
+ private static final String META_NAME = "meta.xml";
+
+ /** Parser applied to the metadata entry; replaceable via setMetaParser. */
+ private Parser meta = new OpenDocumentMetaParser();
+
+ /** Parser applied to content and styles entries; replaceable via setContentParser. */
+ private Parser content = new OpenDocumentContentParser();
+
+ /**
+  * Returns the parser currently used for the metadata entry.
+  *
+  * @return the metadata parser
+  */
+ public Parser getMetaParser() {
+     return this.meta;
+ }
+
+ /**
+ * Replaces the parser used for the metadata entry.
+ *
+ * @param meta parser to use from now on
+ */
+ public void setMetaParser(Parser meta) {
+ this.meta = meta;
+ }
+
+ /**
+  * Returns the parser currently used for content and styles entries.
+  *
+  * @return the content parser
+  */
+ public Parser getContentParser() {
+     return this.content;
+ }
+
+ /**
+ * Replaces the parser used for content and styles entries.
+ *
+ * @param content parser to use from now on
+ */
+ public void setContentParser(Parser content) {
+ this.content = content;
+ }
+
+ /**
+  * Returns the media types handled by this parser.
+  *
+  * @param context parse context (not consulted)
+  * @return the unmodifiable {@code SUPPORTED_TYPES} set
+  */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+     return SUPPORTED_TYPES;
+ }
+
+ /**
+  * Parses an OpenDocument package by walking the entries of its zip
+  * container.
+  * <p>
+  * For a {@link TikaInputStream}, an already open {@link ZipFile} container
+  * is preferred, then the stream's backing file; otherwise the data is read
+  * through a {@link ZipInputStream}. The {@code endDocument} event is held
+  * back by an {@link EndDocumentShieldingContentHandler} and forwarded only
+  * once every entry has been handled.
+  * <p>
+  * The zip file or zip stream is closed before this method returns; closing
+  * the {@link ZipInputStream} also closes the stream it wraps.
+  *
+  * @param stream      the document stream
+  * @param baseHandler receiver of the XHTML SAX events
+  * @param metadata    metadata to populate
+  * @param context     parse context handed to the entry parsers
+  * @throws IOException   if the zip container cannot be read
+  * @throws SAXException  if SAX processing fails
+  * @throws TikaException if an entry cannot be parsed
+  */
+ @Override
+ public void parse(
+         InputStream stream, ContentHandler baseHandler,
+         Metadata metadata, ParseContext context)
+         throws IOException, SAXException, TikaException {
+
+     // Open the Zip stream
+     // Use a File if we can, and an already open zip is even better
+     ZipFile zipFile = null;
+     ZipInputStream zipStream = null;
+     if (stream instanceof TikaInputStream) {
+         TikaInputStream tis = (TikaInputStream) stream;
+         Object container = tis.getOpenContainer();
+         if (container instanceof ZipFile) {
+             zipFile = (ZipFile) container;
+         } else if (tis.hasFile()) {
+             zipFile = new ZipFile(tis.getFile());
+         } else {
+             zipStream = new ZipInputStream(stream);
+         }
+     } else {
+         zipStream = new ZipInputStream(stream);
+     }
+
+     // Prepare to handle the content
+     XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+
+     // As we don't know which of the metadata or the content
+     // we'll hit first, catch the endDocument call initially
+     EndDocumentShieldingContentHandler handler =
+             new EndDocumentShieldingContentHandler(xhtml);
+
+     // try-with-resources: if both parsing and close() fail, the parse
+     // failure propagates and the close() failure is attached as suppressed,
+     // instead of the close() failure replacing the real cause.
+     if (zipFile != null) {
+         try (ZipFile openZip = zipFile) {
+             handleZipFile(openZip, metadata, context, handler);
+         }
+     } else {
+         try (ZipInputStream openStream = zipStream) {
+             handleZipStream(openStream, metadata, context, handler);
+         }
+     }
+
+     // Only now call the end document
+     if (handler.getEndDocumentWasCalled()) {
+         handler.reallyEndDocument();
+     }
+ }
+
+ /**
+  * Reads a zip stream sequentially and passes every entry, in archive
+  * order, to {@code handleZipEntry} together with the zip stream itself.
+  *
+  * @param zipStream the stream to read entries from
+  * @param metadata  metadata object passed on for each entry
+  * @param context   parse context passed on for each entry
+  * @param handler   content handler passed on for each entry
+  */
+ private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
+     for (ZipEntry current = zipStream.getNextEntry();
+             current != null;
+             current = zipStream.getNextEntry()) {
+         handleZipEntry(current, zipStream, metadata, context, handler);
+     }
+ }
+
+ /**
+  * Processes all entries of a zip file, handling {@code meta.xml} first.
+  * <p>
+  * If we can, process the metadata first, then the rest of the file
+  * afterwards (TIKA-1353). That is only possible to guarantee when the
+  * package was opened from a file, not a stream.
+  *
+  * @param zipFile  the opened package
+  * @param metadata metadata object passed on for each entry
+  * @param context  parse context passed on for each entry
+  * @param handler  content handler passed on for each entry
+  */
+ private void handleZipFile(ZipFile zipFile, Metadata metadata,
+         ParseContext context, EndDocumentShieldingContentHandler handler)
+         throws IOException, TikaException, SAXException {
+     ZipEntry metaEntry = zipFile.getEntry(META_NAME);
+     if (metaEntry != null) {
+         handleZipFileEntry(zipFile, metaEntry, metadata, context, handler);
+     }
+
+     Enumeration<? extends ZipEntry> entries = zipFile.entries();
+     while (entries.hasMoreElements()) {
+         ZipEntry entry = entries.nextElement();
+         // The metadata entry has already been handled above
+         if (!META_NAME.equals(entry.getName())) {
+             handleZipFileEntry(zipFile, entry, metadata, context, handler);
+         }
+     }
+ }
+
+ /**
+  * Opens the stream of a single zip file entry, hands it to
+  * {@code handleZipEntry} and closes it again right away, so that entry
+  * streams do not pile up until the whole zip file is closed.
+  */
+ private void handleZipFileEntry(ZipFile zipFile, ZipEntry entry, Metadata metadata,
+         ParseContext context, EndDocumentShieldingContentHandler handler)
+         throws IOException, TikaException, SAXException {
+     InputStream entryStream = zipFile.getInputStream(entry);
+     try {
+         handleZipEntry(entry, entryStream, metadata, context, handler);
+     } finally {
+         if (entryStream != null) {
+             entryStream.close();
+         }
+     }
+ }
+ /**
+  * Dispatches a single package entry by name:
+  * <ul>
+  *   <li>{@code mimetype}: its text is stored as the content type</li>
+  *   <li>{@code meta.xml}: parsed by the meta parser with a no-op handler</li>
+  *   <li>names ending in {@code content.xml} or {@code styles.xml}: parsed
+  *       by the content parser into the given handler</li>
+  * </ul>
+  * Any other entry, and a {@code null} entry, is ignored.
+  *
+  * @param entry    the entry to handle, may be {@code null}
+  * @param zip      stream positioned at the data of the entry
+  * @param metadata metadata object to fill
+  * @param context  parse context passed on to the delegate parsers
+  * @param handler  receiver of the content SAX events
+  */
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata,
+         ParseContext context, EndDocumentShieldingContentHandler handler)
+         throws IOException, SAXException, TikaException {
+     if (entry == null) {
+         return;
+     }
+
+     String name = entry.getName();
+     if (name.equals("mimetype")) {
+         metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(zip, UTF_8));
+     } else if (name.equals(META_NAME)) {
+         meta.parse(zip, new DefaultHandler(), metadata, context);
+     } else if (name.endsWith("content.xml") || name.endsWith("styles.xml")) {
+         // Content and styles go through the same parser and handler
+         if (content instanceof OpenDocumentContentParser) {
+             OpenDocumentContentParser odfContent = (OpenDocumentContentParser) content;
+             odfContent.parseInternal(zip, handler, metadata, context);
+         } else {
+             // Foreign content parser was set:
+             content.parse(zip, handler, metadata, context);
+         }
+     }
+ }
+}