You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by tp...@apache.org on 2015/03/14 00:25:53 UTC
svn commit: r1666607 - in /tika/trunk: CHANGES.txt
tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Author: tpalsulich
Date: Fri Mar 13 23:25:53 2015
New Revision: 1666607
URL: http://svn.apache.org/r1666607
Log:
TIKA-1063. Add basic ODF style support, contributed by Axel Dörfler.
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Mar 13 23:25:53 2015
@@ -1,5 +1,8 @@
Release 1.8 - Current Development
+ * Added basic style support for ODF documents, contributed by
+ Axel Dörfler (TIKA-1063).
+
* Move Tika server resources and writers to separate
org.apache.tika.server.resource and writer packages (TIKA-1564).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Fri Mar 13 23:25:53 2015
@@ -54,6 +54,22 @@ import org.xml.sax.helpers.DefaultHandle
* Parser for ODF <code>content.xml</code> files.
*/
public class OpenDocumentContentParser extends AbstractParser {
+ private interface Style {
+ }
+
+ private static class TextStyle implements Style {
+ public boolean italic;
+ public boolean bold;
+ public boolean underlined;
+ }
+
+ private static class ListStyle implements Style {
+ public boolean ordered;
+
+ public String getTag() {
+ return ordered ? "ol" : "ul";
+ }
+ }
private static final class OpenDocumentElementMappingContentHandler extends
ElementMappingContentHandler {
@@ -62,6 +78,12 @@ public class OpenDocumentContentParser e
private int nodeDepth = 0;
private int completelyFiltered = 0;
private Stack<String> headingStack = new Stack<String>();
+ private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+ private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+ private TextStyle textStyle;
+ private TextStyle lastTextStyle;
+ private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+ private ListStyle listStyle;
private OpenDocumentElementMappingContentHandler(ContentHandler handler,
Map<QName, TargetElement> mappings) {
@@ -75,6 +97,7 @@ public class OpenDocumentContentParser e
// only forward content of tags from text:-namespace
if (completelyFiltered == 0 && nodeDepth > 0
&& textNodeStack.get(nodeDepth - 1)) {
+ lazyEndSpan();
super.characters(ch,start,length);
}
}
@@ -86,11 +109,8 @@ public class OpenDocumentContentParser e
if (TEXT_NS.equals(namespaceURI)) {
return localName.endsWith("-template")
|| localName.endsWith("-style");
- } else if (TABLE_NS.equals(namespaceURI)) {
- return "covered-table-cell".equals(localName);
- } else {
- return false;
}
+ return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
}
// map the heading level to <hX> HTML tags
@@ -124,10 +144,85 @@ public class OpenDocumentContentParser e
return false;
}
+ /**
+ * Emits the XHTML start tag for a <code>text:list</code> element.
+ * The tag is <code>ol</code> or <code>ul</code> according to the list
+ * style registered under the given name; <code>ul</code> is used when
+ * the list has no style name or the name is unknown.
+ *
+ * @param name value of the list's <code>text:style-name</code>
+ * attribute, may be <code>null</code>
+ * @throws SAXException if the downstream handler rejects the event
+ */
+ private void startList(String name) throws SAXException {
+ // Push exactly one entry per list (null when no style applies) so
+ // that endList() always pops the entry belonging to this list.
+ // Pushing only for named lists let an unstyled nested list pop the
+ // style of its enclosing list and emit mismatched end tags.
+ ListStyle style = name != null ? listStyleMap.get(name) : null;
+ listStyleStack.push(style);
+
+ String elementName = style != null ? style.getTag() : "ul";
+ handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+ }
+
+ private void endList() throws SAXException {
+ String elementName = "ul";
+ if (!listStyleStack.isEmpty()) {
+ ListStyle style = listStyleStack.pop();
+ elementName = style != null ? style.getTag() : "ul";
+ }
+ handler.endElement(XHTML, elementName, elementName);
+ }
+
+ /**
+ * Handles the start of a <code>text:span</code> by emitting the
+ * <code>b</code>, <code>i</code> and <code>u</code> tags needed to move
+ * from the style of the previous span (whose end tags are still
+ * pending in <code>lastTextStyle</code>) to the style of this span.
+ * Spans without a style name, or naming a style that was not
+ * registered in <code>textStyleMap</code>, are left untouched.
+ *
+ * @param name value of the span's <code>text:style-name</code>
+ * attribute, may be <code>null</code>
+ * @throws SAXException if the downstream handler rejects an event
+ */
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
+
+ TextStyle style = textStyleMap.get(name);
+ if (style == null) {
+ // The style is not declared in this stream; treat the span as
+ // unstyled instead of failing with a NullPointerException.
+ return;
+ }
+
+ boolean hadBold = lastTextStyle != null && lastTextStyle.bold;
+ boolean hadItalic = lastTextStyle != null && lastTextStyle.italic;
+ boolean hadUnderline = lastTextStyle != null && lastTextStyle.underlined;
+
+ // Tags are nested as <b><i><u>. When an outer tag changes, every tag
+ // inside it must be closed and reopened, otherwise the start and
+ // end events sent to the handler would overlap.
+ boolean changeBold = hadBold != style.bold;
+ boolean changeItalic = changeBold || hadItalic != style.italic;
+ boolean changeUnderline = changeItalic || hadUnderline != style.underlined;
+
+ // End tags that refer to no longer valid styles, innermost first
+ if (hadUnderline && changeUnderline) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (hadItalic && changeItalic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (hadBold && changeBold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ // Start tags for new styles, outermost first
+ if (style.bold && changeBold) {
+ handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+ }
+ if (style.italic && changeItalic) {
+ handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+ }
+ if (style.underlined && changeUnderline) {
+ handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+ }
+
+ textStyle = style;
+ lastTextStyle = null;
+ }
+
+ private void endSpan() throws SAXException {
+ lastTextStyle = textStyle;
+ textStyle = null;
+ }
+
+ private void lazyEndSpan() throws SAXException {
+ if (lastTextStyle == null) {
+ return;
+ }
+
+ if (lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ lastTextStyle = null;
+ }
+
@Override
public void startElement(
String namespaceURI, String localName, String qName,
- Attributes atts) throws SAXException {
+ Attributes attrs) throws SAXException {
// keep track of current node type. If it is a text node,
// a bit at the current depth is set in textNodeStack.
// characters() checks the top bit to determine, if the
@@ -135,6 +230,42 @@ public class OpenDocumentContentParser e
// the depth of the current node and also marks top of stack.
assert nodeDepth >= 0;
+ // Set styles
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ String family = attrs.getValue(STYLE_NS, "family");
+ if ("text".equals(family)) {
+ textStyle = new TextStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ textStyleMap.put(name, textStyle);
+ }
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = new ListStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ listStyleMap.put(name, listStyle);
+ } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+ && "text-properties".equals(localName)) {
+ String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+ if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+ textStyle.italic = true;
+ }
+ String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+ if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+ || (fontWeight!=null && Character.isDigit(fontWeight.charAt(0))
+ && Integer.valueOf(fontWeight) > 500)) {
+ textStyle.bold = true;
+ }
+ String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+ if (underlineStyle != null) {
+ textStyle.underlined = true;
+ }
+ } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+ if ("list-level-style-bullet".equals(localName)) {
+ listStyle.ordered = false;
+ } else if ("list-level-style-number".equals(localName)) {
+ listStyle.ordered = true;
+ }
+ }
+
textNodeStack.set(nodeDepth++,
isTextNode(namespaceURI, localName));
// filter *all* content of some tags
@@ -148,11 +279,14 @@ public class OpenDocumentContentParser e
// special handling of text:h, that are directly passed
// to incoming handler
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
- final String el = headingStack.push(getXHTMLHeaderTagName(atts));
+ final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ startList(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ startSpan(attrs.getValue(TEXT_NS, "style-name"));
} else {
- super.startElement(
- namespaceURI, localName, qName, atts);
+ super.startElement(namespaceURI, localName, qName, attrs);
}
}
}
@@ -161,6 +295,12 @@ public class OpenDocumentContentParser e
public void endElement(
String namespaceURI, String localName, String qName)
throws SAXException {
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ textStyle = null;
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = null;
+ }
+
// call next handler if no filtering
if (completelyFiltered == 0) {
// special handling of text:h, that are directly passed
@@ -168,7 +308,14 @@ public class OpenDocumentContentParser e
if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
final String el = headingStack.pop();
handler.endElement(XHTMLContentHandler.XHTML, el, el);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ endList();
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ endSpan();
} else {
+ if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+ lazyEndSpan();
+ }
super.endElement(namespaceURI,localName,qName);
}
@@ -208,6 +355,12 @@ public class OpenDocumentContentParser e
public static final String TABLE_NS =
"urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+ public static final String STYLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+ public static final String FORMATTING_OBJECTS_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
public static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
@@ -244,9 +397,6 @@ public class OpenDocumentContentParser e
new QName(TEXT_NS, "line-break"),
new TargetElement(XHTML, "br"));
MAPPINGS.put(
- new QName(TEXT_NS, "list"),
- new TargetElement(XHTML, "ul"));
- MAPPINGS.put(
new QName(TEXT_NS, "list-item"),
new TargetElement(XHTML, "li"));
MAPPINGS.put(
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Fri Mar 13 23:25:53 2015
@@ -373,4 +373,15 @@ public class ODFParserTest extends TikaT
tis.close();
}
}
+
+ @Test
+ public void testODTStyles() throws Exception {
+ String xml = getXML("testStyles.odt").xml;
+ assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
+ assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
+ assertContains("<ol>\t<li><p>One</p>", xml);
+ assertContains("</ol>", xml);
+ assertContains("<ul>\t<li><p>First</p>", xml);
+ assertContains("</ul>", xml);
+ }
}