You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/12 15:42:06 UTC
tika git commit: TIKA-2191: convert Styles reader to SAX and store only styleId->styleName map.
Repository: tika
Updated Branches:
refs/heads/master 0f3fe380c -> 0f78a314f
TIKA-2191: convert Styles reader to SAX and store only styleId->styleName map.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0f78a314
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0f78a314
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0f78a314
Branch: refs/heads/master
Commit: 0f78a314f52b64d84072758ea66fc0d797271f2f
Parents: 0f3fe38
Author: tballison <ta...@mitre.org>
Authored: Mon Dec 12 10:41:58 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Dec 12 10:41:58 2016 -0500
----------------------------------------------------------------------
.../ooxml/SXWPFWordExtractorDecorator.java | 10 +--
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 2 +-
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 73 ++++++++++++++++++--
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 12 ++--
4 files changed, 77 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index b97b690..70c7399 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -129,7 +129,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
//load the numbering/list manager and styles from the main document part
XWPFNumbering numbering = loadNumbering(documentPart);
XWPFListManager listManager = new XWPFListManager(numbering);
- XWPFStyles styles = loadStyles(documentPart);
+ XWPFStylesShim styles = loadStyles(documentPart);
//headers
try {
@@ -168,7 +168,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
- private void handlePart(PackagePart packagePart, XWPFStyles styles,
+ private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
@@ -231,7 +231,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
return linkedRelationships;
}
- private XWPFStyles loadStyles(PackagePart packagePart) {
+ private XWPFStylesShim loadStyles(PackagePart packagePart) {
try {
PackageRelationshipCollection stylesParts =
packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
@@ -245,9 +245,9 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
return null;
}
- return new XWPFStylesShim(stylesPart);
+ return new XWPFStylesShim(stylesPart, context);
}
- } catch (IOException|OpenXML4JException e) {
+ } catch (OpenXML4JException e) {
//swallow
}
return null;
http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 25621df..7760ce0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -44,7 +44,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
}
- private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+ final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
private final static String O_NS = "urn:schemas-microsoft-com:office:office";
private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
index 579c29a..ff7a63f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
@@ -18,20 +18,79 @@
package org.apache.tika.parser.microsoft.ooxml.xwpf;
import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
-import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
- * Stub class of POI's XWPFStyles because onDocumentRead() is protected
+ * For Tika, all we need (so far) is a mapping between styleId and a style's name.
+ *
+ * This class uses SAX to scrape that info out of the styles.xml file. If
+ * either the styleId or the style's name is null, no information is recorded.
*/
-public class XWPFStylesShim extends XWPFStyles {
+public class XWPFStylesShim {
+
+ private Map<String, String> styles = new HashMap<>();
+
+ public XWPFStylesShim(PackagePart part, ParseContext parseContext) {
+ try (InputStream is = part.getInputStream()) {
+ onDocumentLoad(parseContext, is);
+ } catch (IOException|TikaException|SAXException e) {
+ //swallow
+ }
+ }
+
+ private void onDocumentLoad(ParseContext parseContext, InputStream stream) throws TikaException, IOException, SAXException {
+ parseContext.getSAXParser().parse(stream,
+ new OfflineContentHandler(new StylesStripper()));
+ }
- public XWPFStylesShim(PackagePart part) throws IOException, OpenXML4JException {
- super(part);
- onDocumentRead();
+ /**
+ *
+ * @param styleId
+ * @return style's name or null if styleId is null or can't be found
+ */
+ public String getStyleName(String styleId) {
+ if (styleId == null) {
+ return null;
+ }
+ return styles.get(styleId);
}
+ private class StylesStripper extends DefaultHandler {
+
+ String currentStyleId = null;
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+ if ("style".equals(localName)) {
+ currentStyleId = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "styleId");
+ } else if ("name".equals(localName)) {
+ String name = atts.getValue(XWPFDocumentXMLBodyHandler.W_NS, "val");
+ if (currentStyleId != null && name != null) {
+ styles.put(currentStyleId, name);
+ }
+ }
+ }
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (uri == null || XWPFDocumentXMLBodyHandler.W_NS.equals(uri)) {
+ if ("style".equals(localName)) {
+ currentStyleId = null;
+ }
+ }
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0f78a314/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
index 759457a..343b420 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -21,8 +21,6 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf;
import java.math.BigInteger;
import java.util.Date;
-import org.apache.poi.xwpf.usermodel.XWPFStyle;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
@@ -39,7 +37,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
private final XWPFListManager listManager;
private final boolean includeDeletedText;
private final boolean includeMoveFromText;
- private final XWPFStyles styles;
+ private final XWPFStylesShim styles;
private int pDepth = 0; //paragraph depth
private int tableDepth = 0;//table depth
@@ -52,7 +50,7 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
//if we're marking more that the first level <p/> element
private String paragraphTag = null;
- public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStyles styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+ public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
this.xhtml = xhtml;
this.styles = styles;
this.listManager = listManager;
@@ -127,12 +125,12 @@ public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFB
String styleClass = null;
//TIKA-2144 check that styles is not null
if (paragraphProperties.getStyleID() != null && styles != null) {
- XWPFStyle style = styles.getStyle(
+ String styleName = styles.getStyleName(
paragraphProperties.getStyleID()
);
- if (style != null && style.getName() != null) {
+ if (styleName != null) {
WordExtractor.TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
- style.getName(), false);
+ styleName, false);
paragraphTag = tas.getTag();
styleClass = tas.getStyleClass();
}