You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/12 15:42:06 UTC

tika git commit: TIKA-2191: convert Styles reader to SAX and store only styleId->styleName map.

Repository: tika
Updated Branches:
  refs/heads/master 0f3fe380c -> 0f78a314f


TIKA-2191: convert Styles reader to SAX and store only styleId->styleName map.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0f78a314
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0f78a314
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0f78a314

Branch: refs/heads/master
Commit: 0f78a314f52b64d84072758ea66fc0d797271f2f
Parents: 0f3fe38
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 12 10:41:58 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 12 10:41:58 2016 -0500

----------------------------------------------------------------------
 .../ooxml/SXWPFWordExtractorDecorator.java      | 10 +--
 .../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java  |  2 +-
 .../microsoft/ooxml/xwpf/XWPFStylesShim.java    | 73 ++++++++++++++++++--
 .../ooxml/xwpf/XWPFTikaBodyPartHandler.java     | 12 ++--
 4 files changed, 77 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index b97b690..70c7399 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -129,7 +129,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         //load the numbering/list manager and styles from the main document part
         XWPFNumbering numbering = loadNumbering(documentPart);
         XWPFListManager listManager = new XWPFListManager(numbering);
-        XWPFStyles styles = loadStyles(documentPart);
+        XWPFStylesShim styles = loadStyles(documentPart);
 
         //headers
         try {
@@ -168,7 +168,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
     }
 
-    private void handlePart(PackagePart packagePart, XWPFStyles styles,
+    private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
                             XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
 
         Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
@@ -231,7 +231,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         return linkedRelationships;
     }
 
-    private XWPFStyles loadStyles(PackagePart packagePart) {
+    private XWPFStylesShim loadStyles(PackagePart packagePart) {
         try {
             PackageRelationshipCollection stylesParts =
                     packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
@@ -245,9 +245,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
                     return null;
                 }
 
-                return new XWPFStylesShim(stylesPart);
+                return new XWPFStylesShim(stylesPart, context);
             }
-        } catch (IOException|OpenXML4JException e) {
+        } catch (OpenXML4JException e) {
             //swallow
         }
         return null;

http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 25621df..7760ce0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -44,7 +44,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
     }
 
 
-    private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+    final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
     private final static String O_NS = "urn:schemas-microsoft-com:office:office";
     private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";

http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
index 579c29a..ff7a63f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
@@ -18,20 +18,79 @@
 package org.apache.tika.parser.microsoft.ooxml.xwpf;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
 
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
- * Stub class of POI's XWPFStyles because onDocumentRead() is protected
+ * For Tika, all we need (so far) is a mapping between styleId and a style's name.
+ *
+ * This class uses SAX to scrape that info out of the styles.xml file.  If
+ * either the styleId or the style's name is null, no information is recorded.
  */
-public class XWPFStylesShim extends XWPFStyles {
+public class XWPFStylesShim {
+
+    private Map<String, String> styles = new HashMap<>();
+
+    public XWPFStylesShim(PackagePart part, ParseContext parseContext) {
+        try (InputStream is = part.getInputStream()) {
+            onDocumentLoad(parseContext, is);
+        } catch (IOException|TikaException|SAXException e) {
+            //swallow
+        }
+    }
+
+    private void onDocumentLoad(ParseContext parseContext, InputStream stream) throws TikaException, IOException, SAXException {
+        parseContext.getSAXParser().parse(stream,
+                new OfflineContentHandler(new StylesStripper()));
+    }
 
-    public XWPFStylesShim(PackagePart part) throws IOException, OpenXML4JException {
-        super(part);
-        onDocumentRead();
+    /**
+     *
+     * @param styleId
+     * @return style's name or null if styleId is null or can't be found
+     */
+    public String getStyleName(String styleId) {
+        if (styleId == null) {
+            return null;
+        }
+        return styles.get(styleId);
     }
 
+    private class StylesStripper extends DefaultHandler {
+
+        String currentStyleId = null;
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+            if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+                if ("style".equals(localName)) {
+                    currentStyleId = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "styleId");
+                } else if ("name".equals(localName)) {
+                    String name = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "val");
+                    if (currentStyleId != null && name != null) {
+                        styles.put(currentStyleId, name);
+                    }
+                }
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+                if ("style".equals(localName)) {
+                    currentStyleId = null;
+                }
+            }
+        }
+    }
 
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index 759457a..343b420 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -21,8 +21,6 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf;
 import java.math.BigInteger;
 import java.util.Date;
 
-import org.apache.poi.xwpf.usermodel.XWPFStyle;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
@@ -39,7 +37,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     private final XWPFListManager listManager;
     private final boolean includeDeletedText;
     private final boolean includeMoveFromText;
-    private final XWPFStyles styles;
+    private final XWPFStylesShim styles;
 
     private int pDepth = 0; //paragraph depth
     private int tableDepth = 0;//table depth
@@ -52,7 +50,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
     //if we're marking more that the first level <p/> element
     private String paragraphTag = null;
 
-    public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStyles styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+    public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
         this.xhtml = xhtml;
         this.styles = styles;
         this.listManager = listManager;
@@ -127,12 +125,12 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
             String styleClass = null;
             //TIKA-2144 check that styles is not null
             if (paragraphProperties.getStyleID() != null && styles != null) {
-                XWPFStyle style = styles.getStyle(
+                String styleName = styles.getStyleName(
                         paragraphProperties.getStyleID()
                 );
-                if (style != null && style.getName() != null) {
+                if (styleName != null) {
                     WordExtractor.TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
-                            style.getName(), false);
+                            styleName, false);
                     paragraphTag = tas.getTag();
                     styleClass = tas.getStyleClass();
                 }