You are viewing a plain-text version of this content. (The canonical link was not preserved in this plain-text copy.)
Posted to commits@tika.apache.org by tp...@apache.org on 2015/03/14 00:25:53 UTC

svn commit: r1666607 - in /tika/trunk: CHANGES.txt tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Author: tpalsulich
Date: Fri Mar 13 23:25:53 2015
New Revision: 1666607

URL: http://svn.apache.org/r1666607
Log:
TIKA-1063. Add basic ODF style support, contributed by Axel Dörfler.

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Mar 13 23:25:53 2015
@@ -1,5 +1,8 @@
 Release 1.8 - Current Development
 
+  * Added basic style support for ODF documents, contributed by
+    Axel Dörfler (TIKA-1063).
+
   * Move Tika server resources and writers to separate
     org.apache.tika.server.resource and writer packages (TIKA-1564).
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Fri Mar 13 23:25:53 2015
@@ -54,6 +54,22 @@ import org.xml.sax.helpers.DefaultHandle
  * Parser for ODF <code>content.xml</code> files.
  */
 public class OpenDocumentContentParser extends AbstractParser {
+    private interface Style {
+    }
+
+    private static class TextStyle implements Style {
+        public boolean italic;
+        public boolean bold;
+        public boolean underlined;
+    }
+    
+    private static class ListStyle implements Style {
+        public boolean ordered;
+        
+        public String getTag() {
+            return ordered ? "ol" : "ul";
+        }
+    }
 
     private static final class OpenDocumentElementMappingContentHandler extends
 			ElementMappingContentHandler {
@@ -62,6 +78,12 @@ public class OpenDocumentContentParser e
 		private int nodeDepth = 0;
 		private int completelyFiltered = 0;
 		private Stack<String> headingStack = new Stack<String>();
+		private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>();
+        private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>();
+        private TextStyle textStyle;
+        private TextStyle lastTextStyle;
+        private Stack<ListStyle> listStyleStack = new Stack<ListStyle>();
+        private ListStyle listStyle;
 
 		private OpenDocumentElementMappingContentHandler(ContentHandler handler,
 				Map<QName, TargetElement> mappings) {
@@ -75,6 +97,7 @@ public class OpenDocumentContentParser e
 		    // only forward content of tags from text:-namespace
 		    if (completelyFiltered == 0 && nodeDepth > 0
 		            && textNodeStack.get(nodeDepth - 1)) {
+		        lazyEndSpan();
 		        super.characters(ch,start,length);
 		    }
 		}
@@ -86,11 +109,8 @@ public class OpenDocumentContentParser e
 		    if (TEXT_NS.equals(namespaceURI)) {
 		        return localName.endsWith("-template")
 		            || localName.endsWith("-style");
-		    } else if (TABLE_NS.equals(namespaceURI)) {
-		        return "covered-table-cell".equals(localName);
-		    } else {
-		        return false;
 		    }
+            return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
 		}
 
 		// map the heading level to <hX> HTML tags
@@ -124,10 +144,85 @@ public class OpenDocumentContentParser e
 		    return false;
 		}
 
+		private void startList(String name) throws SAXException {
+		    String elementName = "ul";
+		    if (name != null) {
+		        ListStyle style = listStyleMap.get(name);
+		        elementName = style != null ? style.getTag() : "ul";
+	            listStyleStack.push(style);
+		    }
+            handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+		}
+
+		private void endList() throws SAXException {
+            String elementName = "ul";
+            if (!listStyleStack.isEmpty()) {
+                ListStyle style = listStyleStack.pop();
+                elementName = style != null ? style.getTag() : "ul";
+            }
+            handler.endElement(XHTML, elementName, elementName);
+		}
+
+		private void startSpan(String name) throws SAXException {
+		    if (name == null) {
+		        return;
+		    }
+
+            TextStyle style = textStyleMap.get(name);
+
+            // End tags that refer to no longer valid styles
+            if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            // Start tags for new styles
+            if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+                handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+            }
+            if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
+                handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+            }
+            if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
+                handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+            }
+
+            textStyle = style;
+            lastTextStyle = null;
+		}
+
+		private void endSpan() throws SAXException {
+		    lastTextStyle = textStyle;
+		    textStyle = null;
+		}
+		
+		private void lazyEndSpan() throws SAXException {
+		    if (lastTextStyle == null) {
+		        return;
+		    }
+
+            if (lastTextStyle.underlined) {
+                handler.endElement(XHTML, "u", "u");
+            }
+            if (lastTextStyle.italic) {
+                handler.endElement(XHTML, "i", "i");
+            }
+            if (lastTextStyle.bold) {
+                handler.endElement(XHTML, "b", "b");
+            }
+
+            lastTextStyle = null;
+		}
+
 		@Override
 		public void startElement(
 		        String namespaceURI, String localName, String qName,
-		        Attributes atts) throws SAXException {
+		        Attributes attrs) throws SAXException {
 		    // keep track of current node type. If it is a text node,
 		    // a bit at the current depth ist set in textNodeStack.
 		    // characters() checks the top bit to determine, if the
@@ -135,6 +230,42 @@ public class OpenDocumentContentParser e
 		    // the depth of the current node and also marks top of stack.
 		    assert nodeDepth >= 0;
 
+		    // Set styles
+		    if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+		        String family = attrs.getValue(STYLE_NS, "family");
+		        if ("text".equals(family)) {
+		            textStyle = new TextStyle();
+	                String name = attrs.getValue(STYLE_NS, "name");
+		            textStyleMap.put(name, textStyle);
+		        }
+		    } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+		        listStyle = new ListStyle();
+                String name = attrs.getValue(STYLE_NS, "name");
+                listStyleMap.put(name, listStyle);
+		    } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+		            && "text-properties".equals(localName)) {
+		        String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+		        if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+		            textStyle.italic = true;
+		        }
+		        String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+		        if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+		                || (fontWeight!=null && Character.isDigit(fontWeight.charAt(0))
+		                && Integer.valueOf(fontWeight) > 500)) {
+		            textStyle.bold = true;
+		        }
+		        String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+		        if (underlineStyle != null) {
+		            textStyle.underlined = true;
+		        }
+		    } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+		        if ("list-level-style-bullet".equals(localName)) {
+		            listStyle.ordered = false;
+		        } else if ("list-level-style-number".equals(localName)) {
+		            listStyle.ordered = true;
+		        }
+		    }
+
 		    textNodeStack.set(nodeDepth++, 
 		            isTextNode(namespaceURI, localName));
 		    // filter *all* content of some tags
@@ -148,11 +279,14 @@ public class OpenDocumentContentParser e
 		        // special handling of text:h, that are directly passed
 		        // to incoming handler
 		        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
-		            final String el = headingStack.push(getXHTMLHeaderTagName(atts));
+		            final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
 		            handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+		        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+		            startList(attrs.getValue(TEXT_NS, "style-name"));
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    startSpan(attrs.getValue(TEXT_NS, "style-name"));
 		        } else {
-		            super.startElement(
-		                    namespaceURI, localName, qName, atts);
+		            super.startElement(namespaceURI, localName, qName, attrs);
 		        }
 		    }
 		}
@@ -161,6 +295,12 @@ public class OpenDocumentContentParser e
 		public void endElement(
 		        String namespaceURI, String localName, String qName)
 		        throws SAXException {
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                textStyle = null;
+            } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+                listStyle = null;
+            }
+
 		    // call next handler if no filtering
 		    if (completelyFiltered == 0) {
 		        // special handling of text:h, that are directly passed
@@ -168,7 +308,14 @@ public class OpenDocumentContentParser e
 		        if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
 		            final String el = headingStack.pop();
 		            handler.endElement(XHTMLContentHandler.XHTML, el, el);
+		        } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+		            endList();
+                } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+                    endSpan();
 		        } else {
+		            if (TEXT_NS.equals(namespaceURI) && "p".equals(localName)) {
+		                lazyEndSpan();
+		            }
 		            super.endElement(namespaceURI,localName,qName);
 		        }
 
@@ -208,6 +355,12 @@ public class OpenDocumentContentParser e
     public static final String TABLE_NS =
         "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
 
+    public static final String STYLE_NS =
+        "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+    public static final String FORMATTING_OBJECTS_NS =
+        "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
     public static final String OFFICE_NS =
         "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
 
@@ -244,9 +397,6 @@ public class OpenDocumentContentParser e
                 new QName(TEXT_NS, "line-break"),
                 new TargetElement(XHTML, "br"));
         MAPPINGS.put(
-                new QName(TEXT_NS, "list"),
-                new TargetElement(XHTML, "ul"));
-        MAPPINGS.put(
                 new QName(TEXT_NS, "list-item"),
                 new TargetElement(XHTML, "li"));
         MAPPINGS.put(

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1666607&r1=1666606&r2=1666607&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Fri Mar 13 23:25:53 2015
@@ -373,4 +373,15 @@ public class ODFParserTest extends TikaT
           tis.close();
         }
     }
+
+    @Test
+    public void testODTStyles() throws Exception {
+        String xml = getXML("testStyles.odt").xml;
+        assertContains("This <i>is</i> <b>just</b> a <u>test</u>", xml);
+        assertContains("<p>And <b>another <i>test</i> is</b> here.</p>", xml);
+        assertContains("<ol>\t<li><p>One</p>", xml);
+        assertContains("</ol>", xml);
+        assertContains("<ul>\t<li><p>First</p>", xml);
+        assertContains("</ul>", xml);
+    }
 }