You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/17 00:47:05 UTC
[2/2] tika git commit: TIKA-2210 -- add experimental SAX parser for
pptx -- this is a first cut. More refactoring is in order.
TIKA-2210 -- add experimental SAX parser for pptx -- this is a first cut. More refactoring is in order.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/90cdf1f6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/90cdf1f6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/90cdf1f6
Branch: refs/heads/master
Commit: 90cdf1f6a844e0d0541167bc0364bb3963f93b2d
Parents: 1d9445b
Author: tballison <ta...@mitre.org>
Authored: Fri Dec 16 19:46:55 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Fri Dec 16 19:46:55 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../src/test/java/org/apache/tika/TikaTest.java | 4 +
.../parser/microsoft/AbstractOfficeParser.java | 4 +
.../parser/microsoft/OfficeParserConfig.java | 19 +-
.../ooxml/AbstractDocumentXMLBodyHandler.java | 99 ++++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 74 +++
.../microsoft/ooxml/MetadataExtractor.java | 4 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 47 +-
.../microsoft/ooxml/ParagraphProperties.java | 56 ++
.../parser/microsoft/ooxml/RunProperties.java | 44 ++
.../SXSLFPowerPointExtractorDecorator.java | 428 +++++++++++++++
.../ooxml/SXWPFWordExtractorDecorator.java | 56 +-
.../ooxml/xslf/XSLFDocumentXMLBodyHandler.java | 330 ++++++++++++
.../xslf/XSLFEventBasedPowerPointExtractor.java | 161 ++++++
.../ooxml/xslf/XSLFTikaBodyPartHandler.java | 262 +++++++++
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 76 +--
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 28 +-
.../ooxml/xwpf/XWPFParagraphProperties.java | 56 --
.../microsoft/ooxml/xwpf/XWPFRunProperties.java | 44 --
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 9 +-
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 6 +-
.../microsoft/ooxml/SXSLFExtractorTest.java | 533 +++++++++++++++++++
.../testPPTX_overlappingRelations.pptx | Bin 0 -> 38135 bytes
.../test-documents/testPPT_various2.pptx | Bin 0 -> 248937 bytes
24 files changed, 2083 insertions(+), 260 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 90823c6..e215499 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??
+ * Added experimental SAX parser for .pptx files. To select this parser,
+ call setUseSAXPptxExtractor(true) on OfficeParserConfig (TIKA-2210).
+
* Upgrade to PDFBox 2.0.4 (TIKA-2209).
* Refactor MockParser to consolidate service loading
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 6644d86..11eb801 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -173,6 +173,10 @@ public abstract class TikaTest {
return getXML(filePath, new AutoDetectParser(), parseContext);
}
+ /**
+ * Convenience overload: opens the named file from "/test-documents/" as a
+ * resource stream and delegates to the stream-based getXML with a new
+ * AutoDetectParser, the supplied metadata and the supplied parse context.
+ *
+ * @param filePath file name relative to "/test-documents/"
+ * @param metadata metadata object handed to the delegate
+ * @param parseContext parse context handed to the delegate
+ * @return whatever the delegate getXML call returns
+ * @throws Exception propagated from the delegate call
+ */
+ protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext) throws Exception {
+ return getXML(getResourceAsStream("/test-documents/"+filePath), new AutoDetectParser(), metadata, parseContext);
+ }
+
protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata, null);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index d8186bc..e01fe0c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -66,4 +66,8 @@ public abstract class AbstractOfficeParser extends AbstractParser {
defaultOfficeParserConfig.setUseSAXDocxExtractor(useSAXDocxExtractor);
}
+ /**
+ * Forwards the flag to defaultOfficeParserConfig.
+ *
+ * @param useSAXPptxExtractor value passed on to
+ * OfficeParserConfig#setUseSAXPptxExtractor(boolean)
+ */
+ @Field
+ public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
+ defaultOfficeParserConfig.setUseSAXPptxExtractor(useSAXPptxExtractor);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index f3cdbfe..05275d7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -25,6 +25,7 @@ public class OfficeParserConfig implements Serializable {
private boolean includeMoveFromContent = false;
private boolean useSAXDocxExtractor = false;
+ private boolean useSAXPptxExtractor = false;
/**
* Sets whether or not the parser should include deleted content.
@@ -71,12 +72,28 @@ public class OfficeParserConfig implements Serializable {
* If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
* the new experimental parser will be used.
* <p/>
- * Default: classic parser
+ * Default: <code>false</code> (classic DOM parser)
* @param useSAXDocxExtractor
*/
public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
this.useSAXDocxExtractor = useSAXDocxExtractor;
}
+
+ /**
+ * Use the experimental SAX-based streaming PPTX parser?
+ * If set to <code>false</code>, the classic parser will be used; if <code>true</code>,
+ * the new experimental parser will be used.
+ * <p/>
+ * Default: <code>false</code> (classic DOM parser)
+ * @param useSAXPptxExtractor <code>true</code> to select the experimental SAX PPTX extractor
+ */
+ public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
+ this.useSAXPptxExtractor = useSAXPptxExtractor;
+ }
+
+ /**
+ * @return the current value of the useSAXPptxExtractor flag
+ * (<code>false</code> unless it has been set otherwise)
+ */
+ public boolean getUseSAXPptxExtractor() {
+ return useSAXPptxExtractor;
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..5037fd2
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base SAX handler that only declares shared constants and mutable parse
+ * state: XML name constants, namespace URIs, and the flags/buffers that
+ * track the current run, paragraph, picture and alternate-content nesting.
+ * It adds no methods of its own to those inherited from {@link DefaultHandler}.
+ * <p>
+ * NOTE(review): presumably extended by the docx and pptx body handlers --
+ * confirm against the subclasses.
+ */
+public class AbstractDocumentXMLBodyHandler extends DefaultHandler {
+
+ //XML name constants
+ protected final static String R = "r";
+ protected final static String FLD = "fld";
+ protected final static String RPR = "rPr";
+ protected final static String P = "p";
+ //final, like its siblings, so the shared constant cannot be reassigned
+ protected final static String P_STYLE = "pStyle";
+ protected final static String PPR = "pPr";
+ //final, like its siblings, so the shared constant cannot be reassigned
+ protected final static String T = "t";
+ protected final static String TAB = "tab";
+ protected final static String B = "b";
+ protected final static String ILVL = "ilvl";
+ protected final static String NUM_ID = "numId";
+ protected final static String TC = "tc";
+ protected final static String TR = "tr";
+ protected final static String I = "i";
+ protected final static String NUM_PR = "numPr";
+ protected final static String BR = "br";
+ protected final static String HYPERLINK = "hyperlink";
+ protected final static String TBL = "tbl";
+ protected final static String PIC = "pic";
+ protected final static String PICT = "pict";
+ protected final static String IMAGEDATA = "imagedata";
+ protected final static String BLIP = "blip";
+ protected final static String CHOICE = "Choice";
+ protected final static String FALLBACK = "Fallback";
+ protected final static String OLE_OBJECT = "OLEObject";
+ protected final static String CR = "cr";
+
+ //namespace URIs; W_NS is the only public one
+ public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+ protected final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+ protected final static String O_NS = "urn:schemas-microsoft-com:office:office";
+ protected final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+ protected final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+ protected final static String V_NS = "urn:schemas-microsoft-com:vml";
+
+ protected final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+ //single-element array holding the tab character (filled in the static block below)
+ protected final static char[] TAB_CHAR = new char[1];
+ protected final static char NEWLINE = '\n';
+
+ static {
+ TAB_CHAR[0] = '\t';
+ }
+
+ protected boolean inR = false;//in run or in field
+ protected boolean inT = false;
+ protected boolean inRPr = false;
+ protected boolean inNumPr = false;
+
+ protected boolean inPic = false;
+ //NOTE(review): inPict and picFilename are package-private while the
+ //neighbouring fields are protected -- confirm this is intended
+ boolean inPict = false;
+ protected String picDescription = null;
+ protected String picRId = null;
+ String picFilename = null;
+
+ //mechanism used to determine when to
+ //signal the start of the p, and still
+ //handle p with pPr and those without
+ protected boolean lastStartElementWasP = false;
+ //have we signaled the start of a p?
+ //pPr can happen multiple times within a p
+ //<p><pPr/><r><t>text</t></r><pPr></p>
+ protected boolean pStarted = false;
+
+ //alternate content can be embedded in itself.
+ //need to track depth.
+ //if in alternate, choose fallback, maybe make this configurable?
+ protected int inACChoiceDepth = 0;
+ protected int inACFallbackDepth = 0;
+
+ //mutable property holders for the current run and paragraph
+ protected RunProperties currRunProperties = new RunProperties();
+ protected ParagraphProperties currPProperties = new ParagraphProperties();
+
+ //text buffer; the reference is final, the contents are mutable
+ protected final StringBuilder runBuffer = new StringBuilder();
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 6bc867d..a56d43b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -18,12 +18,15 @@ package org.apache.tika.parser.microsoft.ooxml;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.poi.POIXMLDocument;
@@ -32,13 +35,16 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -64,6 +70,9 @@ import org.xml.sax.helpers.AttributesImpl;
* populates the {@link XHTMLContentHandler} object received as parameter.
*/
public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+
+
+
static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
@@ -72,6 +81,15 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
static final String RELATION_OFFICE_DOCUMENT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
private static final String TYPE_OLE_OBJECT =
"application/vnd.openxmlformats-officedocument.oleObject";
+
+ /**
+ * Relationship types that loadLinkedRelationships iterates over when it
+ * maps relationship ids to target file names.
+ * NOTE(review): RELATION_OLE_OBJECT is declared above but is not listed
+ * here -- confirm that the omission is intended.
+ */
+ protected final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
+ RELATION_AUDIO,
+ RELATION_IMAGE,
+ RELATION_PACKAGE,
+ RELATION_OFFICE_DOCUMENT
+ };
+
+
private final EmbeddedDocumentExtractor embeddedExtractor;
protected POIXMLTextExtractor extractor;
@@ -344,4 +362,60 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
throw new TikaException("Broken OOXML file", e);
}
}
+
+ /**
+ * This is used by the SAX docx and pptx decorators to load hyperlinks and
+ * other linked objects.
+ * <p>
+ * Builds a map keyed by relationship id. Hyperlink relationships map to the
+ * string form of their target URI; relationships of the types listed in
+ * EMBEDDED_RELATIONSHIPS map to the file name derived from their target URI
+ * (or to the empty string if there is no target URI). Embedded entries are
+ * stored after the hyperlink entries, so an embedded relationship with the
+ * same id replaces the hyperlink entry.
+ *
+ * @param bodyPart part whose relationships are read
+ * @param includeInternal if <code>false</code>, hyperlink relationships whose
+ * target mode is TargetMode.INTERNAL are skipped
+ * @return map from relationship id to URL or file name; may be empty or
+ * only partially filled if an InvalidFormatException was thrown
+ */
+ protected Map<String, String> loadLinkedRelationships(PackagePart bodyPart, boolean includeInternal) {
+ Map<String, String> linkedRelationships = new HashMap<>();
+ try {
+ //first pass: hyperlinks -> id to target URI string
+ PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+ for (int i = 0; i < prc.size(); i++) {
+ PackageRelationship pr = prc.getRelationship(i);
+ if (pr == null) {
+ continue;
+ }
+ if (! includeInternal && TargetMode.INTERNAL.equals(pr.getTargetMode())) {
+ continue;
+ }
+ String id = pr.getId();
+ String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+ //hyperlinks without an id or without a target are dropped
+ if (id != null && url != null) {
+ linkedRelationships.put(id, url);
+ }
+ }
+
+ //second pass: embedded/linked parts -> id to file name
+ for (String rel : EMBEDDED_RELATIONSHIPS) {
+
+ prc = bodyPart.getRelationshipsByType(rel);
+ for (int i = 0; i < prc.size(); i++) {
+ PackageRelationship pr = prc.getRelationship(i);
+ if (pr == null) {
+ continue;
+ }
+ String id = pr.getId();
+ String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+ String fileName = uriString;
+ if (pr.getTargetURI() != null) {
+ try {
+ fileName = FileHelper.getFilename(new File(fileName));
+ } catch (Exception e) {
+ //fall back to the full URI string if the name cannot be derived
+ fileName = uriString;
+ }
+ }
+ if (id != null) {
+ //unlike hyperlinks, a missing target is recorded as ""
+ fileName = (fileName == null) ? "" : fileName;
+ linkedRelationships.put(id, fileName);
+ }
+ }
+ }
+
+ } catch (InvalidFormatException e) {
+ //NOTE(review): the exception is swallowed without logging; whatever
+ //was collected before the failure is still returned -- confirm this
+ //best-effort behaviour is intended and consider logging it
+ }
+ return linkedRelationships;
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index d392346..21c6252 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.xmlbeans.impl.values.XmlValueOutOfRangeException;
import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
@@ -59,7 +60,8 @@ public class MetadataExtractor {
public void extract(Metadata metadata) throws TikaException {
if (extractor.getDocument() != null ||
((extractor instanceof XSSFEventBasedExcelExtractor ||
- extractor instanceof XWPFEventBasedWordExtractor) &&
+ extractor instanceof XWPFEventBasedWordExtractor ||
+ extractor instanceof XSLFEventBasedPowerPointExtractor) &&
extractor.getPackage() != null)) {
extractMetadata(extractor.getCoreProperties(), metadata);
extractMetadata(extractor.getExtendedProperties(), metadata);
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index bbee6b7..30ed1ec 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -30,8 +30,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@@ -43,6 +43,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.xmlbeans.XmlException;
@@ -93,6 +94,9 @@ public class OOXMLExtractorFactory {
if (config.getUseSAXDocxExtractor()) {
poiExtractor = trySXWPF(pkg);
}
+ if (poiExtractor == null && config.getUseSAXPptxExtractor()) {
+ poiExtractor = trySXSLF(pkg);
+ }
if (poiExtractor == null) {
poiExtractor = ExtractorFactory.createExtractor(pkg);
}
@@ -103,7 +107,12 @@ public class OOXMLExtractorFactory {
context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
} else if (poiExtractor instanceof XWPFEventBasedWordExtractor) {
extractor = new SXWPFWordExtractorDecorator(context,
- (XWPFEventBasedWordExtractor)poiExtractor);
+ (XWPFEventBasedWordExtractor) poiExtractor);
+ metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getSimpleName());
+ } else if (poiExtractor instanceof XSLFEventBasedPowerPointExtractor) {
+ extractor = new SXSLFPowerPointExtractorDecorator(context,
+ (XSLFEventBasedPowerPointExtractor) poiExtractor);
+ metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getSimpleName());
} else if (document == null) {
throw new TikaException(
"Expecting UserModel based POI OOXML extractor with a document, but none found. " +
@@ -111,7 +120,7 @@ public class OOXMLExtractorFactory {
);
} else if (document instanceof XMLSlideShow) {
extractor = new XSLFPowerPointExtractorDecorator(
- context, (XSLFPowerPointExtractor) poiExtractor);
+ context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) poiExtractor);
} else if (document instanceof XWPFDocument) {
extractor = new XWPFWordExtractorDecorator(
context, (XWPFWordExtractor) poiExtractor);
@@ -119,6 +128,7 @@ public class OOXMLExtractorFactory {
extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
}
+
// Get the bulk of the metadata first, so that it's accessible during
// parsing if desired by the client (see TIKA-1109)
extractor.getMetadataExtractor().extract(metadata);
@@ -146,7 +156,7 @@ public class OOXMLExtractorFactory {
private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
- if(packageRelationshipCollection.size() == 0) {
+ if (packageRelationshipCollection.size() == 0) {
packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
}
@@ -163,4 +173,33 @@ public class OOXMLExtractorFactory {
return null;
}
+ /**
+ * Returns an XSLFEventBasedPowerPointExtractor for the package if the content
+ * type of its office-document part matches one of
+ * XSLFPowerPointExtractor.SUPPORTED_TYPES or XSLFRelation.THEME_MANAGER;
+ * otherwise returns <code>null</code>.
+ *
+ * @param pkg package to inspect
+ * @return a new extractor, or <code>null</code> if the package has no
+ * officeDocument relationship or its content type does not match
+ */
+ private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
+
+ //look for the officeDocument relationship under the schemas.openxmlformats.org
+ //URI first, then under the purl.oclc.org URI
+ PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
+ if (packageRelationshipCollection.size() == 0) {
+ packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
+ }
+
+ if (packageRelationshipCollection.size() == 0) {
+ return null;
+ }
+ //only the first relationship is consulted
+ //NOTE(review): corePart is dereferenced without a null check -- confirm
+ //pkg.getPart cannot return null here
+ PackagePart corePart = pkg.getPart(packageRelationshipCollection.getRelationship(0));
+ String targetContentType = corePart.getContentType();
+
+ XSLFRelation[] xslfRelations = org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES;
+
+ for (int i = 0; i < xslfRelations.length; i++) {
+ XSLFRelation xslfRelation = xslfRelations[i];
+ if (xslfRelation.getContentType().equals(targetContentType)) {
+ return new XSLFEventBasedPowerPointExtractor(pkg);
+ }
+ }
+
+ //THEME_MANAGER is checked separately from SUPPORTED_TYPES
+ if (XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType)) {
+ return new XSLFEventBasedPowerPointExtractor(pkg);
+ }
+ return null;
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
new file mode 100644
index 0000000..62ee31e
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/ParagraphProperties.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+/**
+ * Mutable holder for three paragraph-level values: a style id, an
+ * <code>ilvl</code> and a <code>numId</code>.
+ * <p>
+ * <code>ilvl</code> and <code>numId</code> start at -1 and are restored to -1
+ * by {@link #reset()} (presumably meaning "not set" -- confirm against callers).
+ */
+public class ParagraphProperties {
+
+ private String styleId;
+ private int ilvl = -1;
+ private int numId = -1;
+
+ /** @return the style id, or <code>null</code> if none has been set since construction or the last reset */
+ public String getStyleID() {
+ return styleId;
+ }
+
+ /** @param styleId the style id to store */
+ public void setStyleID(String styleId) {
+ this.styleId = styleId;
+ }
+
+ /** Restores the initial state: style id <code>null</code>, ilvl and numId -1. */
+ public void reset() {
+ styleId = null;
+ ilvl = -1;
+ numId = -1;
+ }
+
+ /** @param ilvl the ilvl value to store */
+ public void setIlvl(int ilvl) {
+ this.ilvl = ilvl;
+ }
+
+ /** @param numId the numId value to store */
+ public void setNumId(int numId) {
+ this.numId = numId;
+ }
+
+ /** @return the stored ilvl value; -1 initially and after reset() */
+ public int getIlvl() {
+ return ilvl;
+ }
+
+ /** @return the stored numId value; -1 initially and after reset() */
+ public int getNumId() {
+ return numId;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
new file mode 100644
index 0000000..9fbfcd8
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+/**
+ * Holder for two run-level flags, italics and bold; both start as
+ * <code>false</code>.
+ * <p>
+ * WARNING: This class is mutable. Make a copy of it
+ * if you want persistence!
+ */
+
+public class RunProperties {
+ //package-private fields; the public accessors below read and write them
+ boolean italics = false;
+ boolean bold = false;
+
+ /** @return the current italics flag */
+ public boolean getItalics() {
+ return italics;
+ }
+
+ /** @return the current bold flag */
+ public boolean getBold() {
+ return bold;
+ }
+
+ /** @param italics new value of the italics flag */
+ public void setItalics(boolean italics) {
+ this.italics = italics;
+ }
+
+ /** @param bold new value of the bold flag */
+ public void setBold(boolean bold) {
+ this.bold = bold;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
new file mode 100644
index 0000000..1ab8bd3
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -0,0 +1,428 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFTikaBodyPartHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX/Streaming pptx extractor
+ */
+public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+ //relationship type used by buildXHTML to look up handout master parts
+ private final static String HANDOUT_MASTER = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/handoutMaster";
+
+ //a pptx file should have one of these "main story" parts
+ //NOTE: despite the "RELATIONS" in the name, these are content types: the constructor
+ //passes each entry to OPCPackage#getPartsByContentType, in this order, and keeps the first part found
+ private final static String[] MAIN_STORY_PART_RELATIONS = new String[]{
+ XSLFRelation.MAIN.getContentType(),
+ XSLFRelation.PRESENTATION_MACRO.getContentType(),
+ XSLFRelation.PRESENTATIONML.getContentType(),
+ XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(),
+ XSLFRelation.MACRO.getContentType(),
+ XSLFRelation.MACRO_TEMPLATE.getContentType(),
+ XSLFRelation.THEME_MANAGER.getContentType()
+
+
+ //TODO: what else
+ };
+
+ //package taken from the extractor handed to the constructor
+ private final OPCPackage opcPackage;
+ private final ParseContext context;
+ //first part matching one of MAIN_STORY_PART_RELATIONS; remains null if the constructor finds none
+ private PackagePart mainDocument = null;
+ //filled by loadCommentAuthors(); read by XSLFCommentsHandler when a comment is written
+ private final CommentAuthors commentAuthors = new CommentAuthors();
+
+ /**
+  * Stores the parse context and the extractor's package, then picks the main
+  * document part: the first part found for any of the MAIN_STORY_PART_RELATIONS
+  * content types, tried in declaration order.
+  *
+  * @param context parse context, also used later to obtain SAX parsers
+  * @param extractor extractor that supplies the OPC package
+  */
+ public SXSLFPowerPointExtractorDecorator(ParseContext context, XSLFEventBasedPowerPointExtractor extractor) {
+     super(context, extractor);
+     this.context = context;
+     this.opcPackage = extractor.getPackage();
+
+     PackagePart firstMatch = null;
+     for (int i = 0; i < MAIN_STORY_PART_RELATIONS.length && firstMatch == null; i++) {
+         List<PackagePart> candidates = opcPackage.getPartsByContentType(MAIN_STORY_PART_RELATIONS[i]);
+         if (!candidates.isEmpty()) {
+             firstMatch = candidates.get(0);
+         }
+     }
+     mainDocument = firstMatch;
+     //TODO: if mainDocument == null, throw exception
+ }
+
+ /**
+  * Writes the presentation to {@code xhtml}: first every slide related to the
+  * main document part, then the slide master part(s), then the handout master part(s).
+  *
+  * @see XSLFPowerPointExtractor#getText()
+  */
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+     loadCommentAuthors();
+
+     //TODO: should check for custShowLst and order based on sldLst
+     try {
+         PackageRelationshipCollection slideRels =
+                 mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+         int slideCount = slideRels.size();
+         for (int slideIdx = 0; slideIdx < slideCount; slideIdx++) {
+             PackagePart slidePart = mainDocument.getRelatedPart(slideRels.getRelationship(slideIdx));
+             handleSlidePart(slidePart, xhtml);
+         }
+     } catch (InvalidFormatException e) {
+         //best effort: the remaining slides are dropped and processing continues with the masters
+     }
+
+     //each related-part pass gets its own body handler chain with an empty relationship map
+     ContentHandler slideMasterHandler = new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
+             new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>()));
+     handleBasicRelatedParts(XSLFRelation.SLIDE_MASTER.getRelation(),
+             "slide-master", mainDocument, slideMasterHandler);
+
+     ContentHandler handoutMasterHandler = new XSLFDocumentXMLBodyHandler(
+             new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>());
+     handleBasicRelatedParts(HANDOUT_MASTER,
+             "slide-handout-master", mainDocument, handoutMasterHandler);
+ }
+
+ /**
+  * Parses every comment-authors part related to the main document part with an
+  * {@link XSLFCommentAuthorHandler}, which records the authors in {@link #commentAuthors}.
+  * Best effort: relationships and parts that cannot be resolved, read or parsed are skipped.
+  */
+ private void loadCommentAuthors() {
+     PackageRelationshipCollection prc = null;
+     try {
+         prc = mainDocument.getRelationshipsByType(XSLFRelation.COMMENT_AUTHORS.getRelation());
+     } catch (InvalidFormatException e) {
+         //best effort: handled below exactly like "no comment authors"
+     }
+     if (prc == null || prc.size() == 0) {
+         return;
+     }
+
+     for (int i = 0; i < prc.size(); i++) {
+         PackagePart commentAuthorsPart = null;
+         try {
+             commentAuthorsPart = mainDocument.getRelatedPart(prc.getRelationship(i));
+         } catch (InvalidFormatException e) {
+             //best effort: skip this relationship
+         }
+         if (commentAuthorsPart == null) {
+             continue;
+         }
+         try (InputStream stream = commentAuthorsPart.getInputStream()) {
+             context.getSAXParser().parse(
+                     new CloseShieldInputStream(stream),
+                     new OfflineContentHandler(new XSLFCommentAuthorHandler()));
+         } catch (TikaException | SAXException | IOException e) {
+             //do something with this
+         }
+     }
+ }
+
+ /**
+  * Writes one slide. The slide part itself is parsed into a
+  * {@code <div class="slide-content">}; afterwards the slide's layout, notes,
+  * notes master and comments parts are handled, in that order.
+  *
+  * @param slidePart the slide part to process
+  * @param xhtml handler that receives the output
+  */
+ private void handleSlidePart(PackagePart slidePart, XHTMLContentHandler xhtml) throws IOException, SAXException {
+     Map<String, String> linkedRelationships = loadLinkedRelationships(slidePart, false);
+
+     xhtml.startElement("div", "class", "slide-content");
+     try (InputStream slideStream = slidePart.getInputStream()) {
+         OfflineContentHandler slideHandler = new OfflineContentHandler(new EmbeddedContentHandler(
+                 new XSLFDocumentXMLBodyHandler(
+                         new XSLFTikaBodyPartHandler(xhtml), linkedRelationships)));
+         context.getSAXParser().parse(new CloseShieldInputStream(slideStream), slideHandler);
+     } catch (TikaException e) {
+         //do something with this
+     }
+     xhtml.endElement("div");
+
+     //placeholder shapes of the layout are filtered out by PlaceHolderSkipper
+     ContentHandler layoutHandler = new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
+             new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+     handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(),
+             "slide-master-content", slidePart, layoutHandler);
+
+     ContentHandler notesHandler = new XSLFDocumentXMLBodyHandler(
+             new XSLFTikaBodyPartHandler(xhtml), linkedRelationships);
+     handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(),
+             "slide-notes", slidePart, notesHandler);
+
+     ContentHandler notesMasterHandler = new XSLFDocumentXMLBodyHandler(
+             new XSLFTikaBodyPartHandler(xhtml), linkedRelationships);
+     handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(),
+             "slide-notes-master", slidePart, notesMasterHandler);
+
+     //comments are passed a null class label
+     handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(),
+             null, slidePart, new XSLFCommentsHandler(xhtml));
+ }
+
+ /**
+  * Handles the parts related to {@code parentPart} by one relationship type
+  * (comments, master, notes, etc). If at least one such relationship exists, a
+  * {@code div} start/end pair is sent directly to {@code contentHandler} and every
+  * related part is SAX-parsed into that handler in between. Best effort: parts that
+  * cannot be resolved, read or set up for parsing are skipped.
+  *
+  * @param contentType despite its name, the relationship type that is passed to
+  *                    {@code PackagePart#getRelationshipsByType}
+  * @param xhtmlClassLabel value of the {@code class} attribute of the wrapping div;
+  *                        may be null, in which case the div carries no class attribute
+  * @param parentPart part whose relationships are searched
+  * @param contentHandler handler that receives the div events and the parsed content
+  * @throws SAXException if the handler or the parser raises one
+  */
+ private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel,
+                                      PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
+
+     PackageRelationshipCollection relatedPartPRC = null;
+     try {
+         relatedPartPRC = parentPart.getRelationshipsByType(contentType);
+     } catch (InvalidFormatException e) {
+         //swallow: handled below exactly like "no related parts"
+     }
+     if (relatedPartPRC == null || relatedPartPRC.size() == 0) {
+         return;
+     }
+
+     AttributesImpl attributes = new AttributesImpl();
+     //the comments call passes a null label; do not emit an attribute with a null value
+     if (xhtmlClassLabel != null) {
+         attributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
+     }
+     contentHandler.startElement("", "div", "div", attributes);
+     for (int i = 0; i < relatedPartPRC.size(); i++) {
+         PackageRelationship relatedPartPackageRelationship = relatedPartPRC.getRelationship(i);
+         try {
+             PackagePart relatedPartPart = parentPart.getRelatedPart(relatedPartPackageRelationship);
+             try (InputStream stream = relatedPartPart.getInputStream()) {
+                 //shield the stream like the other parse calls in this class;
+                 //try-with-resources remains responsible for closing it
+                 context.getSAXParser().parse(new CloseShieldInputStream(stream),
+                         new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
+             } catch (IOException | TikaException e) {
+                 //do something with this
+             }
+         } catch (InvalidFormatException e) {
+             //best effort: skip this related part
+         }
+     }
+     contentHandler.endElement("", "div", "div");
+ }
+
+ /**
+  * In PowerPoint files, slides have things embedded in them,
+  * and slide drawings which have the images.
+  *
+  * @return for every slide its internal VML drawing parts followed by the slide
+  *         part itself, and finally the main document part
+  */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() {
+     List<PackagePart> mainParts = new ArrayList<>();
+     //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?"))
+     //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?"))
+     try {
+         PackageRelationshipCollection slideRels =
+                 mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+         int slideCount = slideRels.size();
+         for (int slideIdx = 0; slideIdx < slideCount; slideIdx++) {
+             PackagePart slidePart = mainDocument.getRelatedPart(slideRels.getRelationship(slideIdx));
+             for (PackageRelationship drawingRel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+                 //only drawings stored inside the package are collected
+                 if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
+                     continue;
+                 }
+                 PackagePartName drawingName = PackagingURIHelper.createPartName(drawingRel.getTargetURI());
+                 mainParts.add(drawingRel.getPackage().getPart(drawingName));
+             }
+             mainParts.add(slidePart);
+         }
+     } catch (InvalidFormatException e) {
+         //do something
+     }
+     mainParts.add(mainDocument);
+     return mainParts;
+ }
+
+ /**
+  * SAX handler for a slide comments part. All character data is buffered; at the end
+  * of each {@code cm} element one {@code <p class="slide-comment">} is written,
+  * holding the author (if known to {@code commentAuthors}) in bold followed by the
+  * buffered text.
+  */
+ private class XSLFCommentsHandler extends DefaultHandler {
+
+     private final StringBuilder commentBuffer = new StringBuilder();
+     private final XHTMLContentHandler xhtml;
+     private String commentAuthorId = null;
+
+     XSLFCommentsHandler(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+         if (!"cm".equals(localName)) {
+             return;
+         }
+         commentAuthorId = atts.getValue("", "authorId");
+         //get date (dt)?
+     }
+
+     @Override
+     public void characters(char[] ch, int start, int length) throws SAXException {
+         //TODO: require that we're in <p:text>?
+         commentBuffer.append(ch, start, length);
+     }
+
+     @Override
+     public void endElement(String uri, String localName, String qName) throws SAXException {
+         if (!"cm".equals(localName)) {
+             return;
+         }
+
+         xhtml.startElement("p", "class", "slide-comment");
+
+         String authorName = commentAuthors.getName(commentAuthorId);
+         String authorInitials = commentAuthors.getInitials(commentAuthorId);
+         boolean hasName = authorName != null;
+         boolean hasInitials = authorInitials != null;
+         if (hasName || hasInitials) {
+             xhtml.startElement("b");
+             if (hasName && hasInitials) {
+                 //"name (initials)"
+                 xhtml.characters(authorName);
+                 xhtml.characters(" (");
+                 xhtml.characters(authorInitials);
+                 xhtml.characters(")");
+             } else if (hasName) {
+                 xhtml.characters(authorName);
+             } else {
+                 xhtml.characters(authorInitials);
+             }
+             xhtml.endElement("b");
+         }
+         xhtml.characters(commentBuffer.toString());
+         xhtml.endElement("p");
+
+         //reset the per-comment state
+         commentBuffer.setLength(0);
+         commentAuthorId = null;
+     }
+ }
+
+ private class XSLFCommentAuthorHandler extends DefaultHandler {
+ String id = null;
+ String name = null;
+ String initials = null;
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if ("cmAuthor".equals(localName)) {
+ for (int i = 0; i < atts.getLength(); i++) {
+ if ("id".equals(atts.getLocalName(i))) {
+ id = atts.getValue(i);
+ } else if ("name".equals(atts.getLocalName(i))) {
+ name = atts.getValue(i);
+ } else if ("initials".equals(atts.getLocalName(i))) {
+ initials = atts.getValue(i);
+ }
+ }
+ commentAuthors.add(id, name, initials);
+ //clear out
+ id = null; name = null; initials = null;
+ }
+ }
+
+ }
+
+
+ /**
+  * Wraps an {@link XSLFDocumentXMLBodyHandler} and withholds placeholder content
+  * from it. Once the start of a {@code ph} element is seen, no further events are
+  * forwarded until after the end of the next {@code sp} element; that closing
+  * {@code sp} event itself is not forwarded either.
+  */
+ private static class PlaceHolderSkipper extends DefaultHandler {
+
+     private final XSLFDocumentXMLBodyHandler wrappedHandler;
+
+     //true from the start of a "ph" element until the end of the next "sp" element
+     boolean inPH = false;
+
+     PlaceHolderSkipper(XSLFDocumentXMLBodyHandler wrappedHandler) {
+         this.wrappedHandler = wrappedHandler;
+     }
+
+     @Override
+     public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+         if ("ph".equals(localName)) {
+             inPH = true;
+         }
+         if (!inPH) {
+             wrappedHandler.startElement(uri, localName, qName, atts);
+         }
+     }
+
+     @Override
+     public void endElement(String uri, String localName, String qName) throws SAXException {
+         if (!inPH) {
+             wrappedHandler.endElement(uri, localName, qName);
+         }
+         //cleared only after the forwarding check above, so the "sp" end tag that
+         //closes a placeholder shape is still suppressed
+         if ("sp".equals(localName)) {
+             inPH = false;
+         }
+     }
+
+     @Override
+     public void characters(char[] ch, int start, int length) throws SAXException {
+         if (!inPH) {
+             wrappedHandler.characters(ch, start, length);
+         }
+     }
+
+     @Override
+     public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+         if (!inPH) {
+             //forward to the matching callback rather than to characters()
+             wrappedHandler.ignorableWhitespace(ch, start, length);
+         }
+     }
+
+ }
+
+ /**
+  * Lookup of comment author names and initials, keyed by author id.
+  */
+ private class CommentAuthors {
+     Map<String, String> nameMap = new HashMap<>();
+     Map<String, String> initialMap = new HashMap<>();
+
+     /**
+      * Records an author. Nothing is stored for a null id; a null name or null
+      * initials leaves the corresponding map untouched.
+      */
+     void add(String id, String name, String initials) {
+         if (id != null) {
+             if (name != null) {
+                 nameMap.put(id, name);
+             }
+             if (initials != null) {
+                 initialMap.put(id, initials);
+             }
+         }
+     }
+
+     /**
+      * @return the name recorded for {@code id}, or null if the id is null or unknown
+      */
+     String getName(String id) {
+         return (id == null) ? null : nameMap.get(id);
+     }
+
+     /**
+      * @return the initials recorded for {@code id}, or null if the id is null or unknown
+      */
+     String getInitials(String id) {
+         return (id == null) ? null : initialMap.get(id);
+     }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 70c7399..d60b274 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -16,11 +16,9 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -31,10 +29,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
-import org.apache.poi.openxml4j.opc.internal.FileHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
@@ -62,12 +58,6 @@ import org.xml.sax.SAXException;
*/
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
- private final static String[] EMBEDDED_RELATIONSHIPS = new String[]{
- RELATION_AUDIO,
- RELATION_IMAGE,
- RELATION_PACKAGE,
- RELATION_OFFICE_DOCUMENT
- };
//include all parts that might have embedded objects
private final static String[] MAIN_PART_RELATIONS = new String[]{
@@ -171,7 +161,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
- Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart);
+ Map<String, String> linkedRelationships = loadLinkedRelationships(packagePart, true);
try (InputStream stream = packagePart.getInputStream()) {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
@@ -185,51 +175,7 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
- private Map<String, String> loadLinkedRelationships(PackagePart bodyPart) {
- Map<String, String> linkedRelationships = new HashMap<>();
- try {
- PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
- for (int i = 0; i < prc.size(); i++) {
- PackageRelationship pr = prc.getRelationship(i);
- if (pr == null) {
- continue;
- }
- String id = pr.getId();
- String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
- if (id != null && url != null) {
- linkedRelationships.put(id, url);
- }
- }
-
- for (String rel : EMBEDDED_RELATIONSHIPS) {
- prc = bodyPart.getRelationshipsByType(rel);
- for (int i = 0; i < prc.size(); i++) {
- PackageRelationship pr = prc.getRelationship(i);
- if (pr == null) {
- continue;
- }
- String id = pr.getId();
- String uriString = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
- String fileName = uriString;
- if (pr.getTargetURI() != null) {
- try {
- fileName = FileHelper.getFilename(new File(fileName));
- } catch (Exception e) {
- fileName = uriString;
- }
- }
- if (id != null) {
- fileName = (fileName == null) ? "" : fileName;
- linkedRelationships.put(id, fileName);
- }
- }
- }
-
- } catch (InvalidFormatException e) {
- }
- return linkedRelationships;
- }
private XWPFStylesShim loadStyles(PackagePart packagePart) {
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..b5aa449
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+
+import java.util.Map;
+
+import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, etc.
+ */
+
+public class XSLFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
+
+
+ //receives the paragraph/run/table/hyperlink/picture callbacks produced by this handler
+ private final XSLFBodyContentsHandler bodyContentsHandler;
+ //private final RelationshipsManager relationshipsManager;
+
+
+ //alternate content can be embedded in itself.
+ //need to track depth.
+ //if in alternate, choose fallback, maybe make this configurable?
+ //while inACChoiceDepth > 0 the element and character callbacks return early
+ private int inACChoiceDepth = 0;
+ //NOTE: incremented and decremented, but never read in this class
+ private int inACFallbackDepth = 0;
+
+ //set when hyperlinkStart has been sent for an hlinkClick; cleared at the end of the run
+ private boolean inHyperlink = false;
+
+ //relationship id -> target; used to resolve the id found on hlinkClick elements
+ private final Map<String, String> linkedRelationships;
+
+ /**
+ * @param bodyContentsHandler handler that receives the body callbacks
+ * @param linkedRelationships map from relationship id to link target
+ */
+ public XSLFDocumentXMLBodyHandler(XSLFBodyContentsHandler bodyContentsHandler,
+ Map<String, String> linkedRelationships) {
+ this.bodyContentsHandler = bodyContentsHandler;
+ this.linkedRelationships = linkedRelationships;
+ }
+
+
+ @Override
+ public void startDocument() throws SAXException {
+ //no-op
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ //no-op
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ //no-op
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ //no-op
+ }
+
+ /**
+ * Dispatches on the element's local name: updates the run/paragraph state flags,
+ * appends tab and newline characters to the run buffer, and forwards table,
+ * hyperlink and OLE events to the body contents handler. Everything nested inside
+ * an alternate-content Choice element is ignored.
+ */
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
+
+ //the previous start tag was a paragraph and this element is not PPR:
+ //start the paragraph now (for a PPR element, endElement starts it once PPR closes)
+ if (lastStartElementWasP && ! PPR.equals(localName)) {
+ bodyContentsHandler.startParagraph(currPProperties);
+ pStarted = true;
+ }
+
+ lastStartElementWasP = false;
+
+ //track the nesting depth of Choice/Fallback elements in the MC_NS namespace
+ if (uri != null && uri.equals(MC_NS)) {
+ if (CHOICE.equals(localName)) {
+ inACChoiceDepth++;
+ } else if (FALLBACK.equals(localName)) {
+ inACFallbackDepth++;
+ }
+ }
+
+ //skip everything inside a Choice, including the Choice element itself
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+ //these are sorted descending by frequency
+ //in our regression corpus
+ if (RPR.equals(localName)) {
+ inRPr = true;
+ } else if (R.equals(localName)) {
+ inR = true;
+ } else if (T.equals(localName)) {
+ inT = true;
+ } else if (TAB.equals(localName)) {
+ runBuffer.append(TAB_CHAR);
+ } else if (P.equals(localName)) {
+ lastStartElementWasP = true;
+ } else if (B.equals(localName)) { //TODO: add bCs
+ if(inR && inRPr) {
+ currRunProperties.setBold(true);
+ }
+ } else if (TC.equals(localName)) {
+ bodyContentsHandler.startTableCell();
+ } else if (P_STYLE.equals(localName)) {
+ String styleId = atts.getValue(W_NS, "val");
+ currPProperties.setStyleID(styleId);
+ } else if (I.equals(localName)) { //TODO: add iCs
+ //rprs don't have to be inR; ignore those that aren't
+ if (inR && inRPr) {
+ currRunProperties.setItalics(true);
+ }
+ } else if (FLD.equals(localName)) {
+ //a FLD element is treated like a run; endElement calls handleEndOfRun for it
+ inR = true;
+ } else if (TR.equals(localName)) {
+ bodyContentsHandler.startTableRow();
+ } else if (NUM_PR.equals(localName)) {
+ inNumPr = true;
+ } else if (ILVL.equals(localName)) {
+ if (inNumPr) {
+ currPProperties.setIlvl(getIntVal(atts));
+ }
+ } else if (NUM_ID.equals(localName)) {
+ if (inNumPr) {
+ currPProperties.setNumId(getIntVal(atts));
+ }
+ } else if(BR.equals(localName)) {
+ runBuffer.append(NEWLINE);
+ } else if ("hlinkClick".equals(localName)) {
+ //resolve the relationship id to its target; the link may be null if the id is unknown.
+ //the matching hyperlinkEnd is sent by handleEndOfRun
+ String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ String hyperlink = null;
+ if (hyperlinkId != null) {
+ hyperlink = linkedRelationships.get(hyperlinkId);
+ bodyContentsHandler.hyperlinkStart(hyperlink);
+ inHyperlink = true;
+ }/* else {
+ String anchor = atts.getValue(W_NS, "anchor");
+ if (anchor != null) {
+ anchor = "#" + anchor;
+ }
+ bodyContentsHandler.hyperlinkStart(anchor);
+ inHyperlink = true;
+ }*/
+ } else if(TBL.equals(localName)) {
+ bodyContentsHandler.startTable();
+ } else if (BLIP.equals(localName)) { //check for DRAWING_NS
+ //remembered until handlePict() reports the picture
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+ } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
+ picDescription = atts.getValue("", "descr");
+ } else if (PIC.equals(localName)) {
+ inPic = true; //check for PIC_NS?
+ } else if (IMAGEDATA.equals(localName)) {
+ picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ picDescription = atts.getValue(O_NS, "title");
+ } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
+ String type = null;
+ String refId = null;
+ //TODO: clean this up and ...want to get ProgID?
+ for (int i = 0; i < atts.getLength(); i++) {
+ String attLocalName = atts.getLocalName(i);
+ String attValue = atts.getValue(i);
+ if (attLocalName.equals("Type")) {
+ type = attValue;
+ } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
+ refId = attValue;
+ }
+ }
+ //only objects whose Type attribute is "Embed" are reported; refId may still be null
+ if ("Embed".equals(type)) {
+ bodyContentsHandler.embeddedOLERef(refId);
+ }
+ } else if(CR.equals(localName)) {
+ runBuffer.append(NEWLINE);
+ }
+
+ }
+
+
+ /**
+  * Reads the {@code val} attribute in the {@code W_NS} namespace as an int.
+  *
+  * @param atts attributes of the current element
+  * @return the parsed value, or -1 if the attribute is absent or not a valid integer
+  */
+ private int getIntVal(Attributes atts) {
+     String raw = atts.getValue(W_NS, "val");
+     if (raw == null) {
+         return -1;
+     }
+     try {
+         return Integer.parseInt(raw);
+     } catch (NumberFormatException e) {
+         //swallow
+         return -1;
+     }
+ }
+
+
+ /**
+  * Closes whatever the element opened in {@code startElement}: ends runs,
+  * paragraphs, tables, rows and cells, and reports pictures. Elements nested
+  * inside an alternate-content Choice element are ignored.
+  */
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+
+     //only count Choice/Fallback in the MC_NS namespace, mirroring the check that
+     //startElement makes before incrementing; otherwise the depths can get out of step
+     if (uri != null && uri.equals(MC_NS)) {
+         if (CHOICE.equals(localName)) {
+             inACChoiceDepth--;
+         } else if (FALLBACK.equals(localName)) {
+             inACFallbackDepth--;
+         }
+     }
+     if (inACChoiceDepth > 0) {
+         return;
+     }
+
+     if (PIC.equals(localName)) { //PIC_NS
+         handlePict();
+         inPic = false;
+         return;
+     } else if (RPR.equals(localName)) {
+         inRPr = false;
+     } else if (R.equals(localName)) {
+         handleEndOfRun();
+     } else if (T.equals(localName)) {
+         inT = false;
+     } else if (PPR.equals(localName)) {
+         //the paragraph start was deferred until its properties were complete
+         if (!pStarted) {
+             bodyContentsHandler.startParagraph(currPProperties);
+             pStarted = true;
+         }
+         currPProperties.reset();
+     } else if (P.equals(localName)) {
+         if (runBuffer.length() > 0) {
+             //<p><tab></p>...this will treat that as if it were
+             //a run...TODO: should we swallow whitespace that doesn't occur in a run?
+             bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+             runBuffer.setLength(0);
+         }
+         pStarted = false;
+         bodyContentsHandler.endParagraph();
+     } else if (TC.equals(localName)) {
+         bodyContentsHandler.endTableCell();
+     } else if (TR.equals(localName)) {
+         bodyContentsHandler.endTableRow();
+     } else if (TBL.equals(localName)) {
+         bodyContentsHandler.endTable();
+     } else if (FLD.equals(localName)) {
+         handleEndOfRun();
+     } else if (HYPERLINK.equals(localName)) {
+         bodyContentsHandler.hyperlinkEnd();
+     } else if (PICT.equals(localName)) {
+         handlePict();
+     }
+ }
+
+ /**
+  * Sends the buffered run text to the body contents handler, closes a hyperlink
+  * opened within the run, and resets the per-run state.
+  */
+ private void handleEndOfRun() {
+     String runText = runBuffer.toString();
+     bodyContentsHandler.run(currRunProperties, runText);
+     if (inHyperlink) {
+         bodyContentsHandler.hyperlinkEnd();
+         inHyperlink = false;
+     }
+     //reset the per-run state
+     runBuffer.setLength(0);
+     currRunProperties.setItalics(false);
+     currRunProperties.setBold(false);
+     inR = false;
+ }
+
+ /**
+  * Reports the current picture to the body contents handler and clears the
+  * picture state. The file name is a fixed placeholder when a relationship id
+  * was seen, and null otherwise.
+  */
+ private void handlePict() {
+     //TODO: linkedRelationships.get(picRId);
+     String picFileName = (picRId == null) ? null : "picId";
+     bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+     inPic = false;
+     picRId = null;
+     picDescription = null;
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+ if (inT) {
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+
+ if (inT) {
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+
+ /**
+ * Callbacks through which XSLFDocumentXMLBodyHandler reports the body content it
+ * finds while handling SAX events.
+ */
+ public interface XSLFBodyContentsHandler {
+
+ /**
+ * Called at the end of a run, and at the end of a paragraph if text is still buffered.
+ *
+ * @param runProperties properties of the run; the caller reuses this object and
+ * resets it after the call, so copy it if it must be kept
+ * @param contents the buffered text of the run
+ */
+ void run(RunProperties runProperties, String contents);
+
+ /**
+ * @param link the link; can be null
+ */
+ void hyperlinkStart(String link);
+
+ /**
+ * Called at the end of a run in which hyperlinkStart was called, and at the end
+ * of a HYPERLINK element.
+ */
+ void hyperlinkEnd();
+
+ /**
+ * @param paragraphProperties properties of the paragraph; the caller reuses and
+ * resets this object, so copy it if it must be kept
+ */
+ void startParagraph(ParagraphProperties paragraphProperties);
+
+ void endParagraph();
+
+ void startTable();
+
+ void endTable();
+
+ void startTableRow();
+
+ void endTableRow();
+
+ void startTableCell();
+
+ void endTableCell();
+
+ /**
+ * Called for OLE object elements whose Type attribute is "Embed".
+ *
+ * @param refId relationship id found on the element; can be null
+ */
+ void embeddedOLERef(String refId);
+
+ /**
+ * @param picFileName null if no relationship id was seen for the picture;
+ * otherwise currently a fixed placeholder value
+ * @param picDescription description found on the picture; can be null
+ */
+ void embeddedPicRef(String picFileName, String picDescription);
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
new file mode 100644
index 0000000..15bbd6a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
+import org.apache.xmlbeans.XmlException;
+
+public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
+
+
+ //package handed to (or opened by) the constructor; returned by getPackage()
+ private OPCPackage container;
+ //properties read from the package in the constructor
+ private POIXMLProperties properties;
+
+ /**
+ * Opens the package at {@code path} and delegates to the OPCPackage constructor.
+ *
+ * @param path path of the file to open
+ */
+ public XSLFEventBasedPowerPointExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+ this(OPCPackage.open(path));
+ }
+
+ /**
+ * @param container the package to extract from
+ */
+ public XSLFEventBasedPowerPointExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+ //no POIXMLDocument is created; the superclass is given null
+ super((POIXMLDocument) null);
+ this.container = container;
+ this.properties = new POIXMLProperties(container);
+ }
+
+
+ /**
+  * Command-line entry point: prints the result of {@link #getText()} for the given file.
+  *
+  * @param args a single argument, the path of the pptx file
+  * @throws Exception if the file cannot be opened or read
+  */
+ public static void main(String[] args) throws Exception {
+     if (args.length < 1) {
+         System.err.println("Use:");
+         System.err.println(" XSLFEventBasedPowerPointExtractor <filename.pptx>");
+         System.exit(1);
+     }
+
+     //use this pptx extractor rather than the docx (XWPF) one
+     XSLFEventBasedPowerPointExtractor extractor = new XSLFEventBasedPowerPointExtractor(args[0]);
+     try {
+         System.out.println(extractor.getText());
+     } finally {
+         //close even if getText() fails
+         extractor.close();
+     }
+ }
+
+ /**
+ * @return the package this extractor was constructed with
+ */
+ public OPCPackage getPackage() {
+ return this.container;
+ }
+
+ /**
+ * @return the core properties of the properties object built in the constructor
+ */
+ public POIXMLProperties.CoreProperties getCoreProperties() {
+ return this.properties.getCoreProperties();
+ }
+
+ /**
+ * @return the extended properties of the properties object built in the constructor
+ */
+ public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+ return this.properties.getExtendedProperties();
+ }
+
+ /**
+ * @return the custom properties of the properties object built in the constructor
+ */
+ public POIXMLProperties.CustomProperties getCustomProperties() {
+ return this.properties.getCustomProperties();
+ }
+
+
+ /**
+ * Not implemented yet.
+ *
+ * @return always the empty string
+ */
+ @Override
+ public String getText() {
+ //TODO
+ return "";
+ }
+
+
+
+ /**
+ * Body contents handler that appends plain text to a StringBuilder: run text as is,
+ * a newline after every paragraph and table row, and a tab after every table cell.
+ * NOTE: this private class is not instantiated anywhere in the enclosing class.
+ */
+ private class XSLFToTextContentHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
+ private final StringBuilder buffer;
+
+ public XSLFToTextContentHandler(StringBuilder buffer) {
+ this.buffer = buffer;
+ }
+
+ @Override
+ public void run(RunProperties runProperties, String contents) {
+ //the run properties (bold/italics) are ignored for plain text
+ buffer.append(contents);
+ }
+
+ @Override
+ public void hyperlinkStart(String link) {
+ //no-op
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ //no-op
+ }
+
+ @Override
+ public void startParagraph(ParagraphProperties paragraphProperties) {
+ //no-op
+ }
+
+ @Override
+ public void endParagraph() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTable() {
+ //no-op
+
+ }
+
+ @Override
+ public void endTable() {
+ //no-op
+
+ }
+
+ @Override
+ public void startTableRow() {
+ //no-op
+
+ }
+
+ @Override
+ public void endTableRow() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTableCell() {
+ //no-op
+
+ }
+
+ @Override
+ public void endTableCell() {
+ buffer.append("\t");
+ }
+
+
+
+ @Override
+ public void embeddedOLERef(String refId) {
+ //no-op
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //no-op
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
new file mode 100644
index 0000000..ff587f7
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xslf;
+
+
+import java.math.BigInteger;
+
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+ public class XSLFTikaBodyPartHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler { //writes body events as XHTML through an XHTMLContentHandler
+
+ private final static String P = "p";
+
+ private final static char[] NEWLINE = new char[]{'\n'};
+ private final static char[] TAB = new char[]{'\t'}; //NOTE(review): not referenced anywhere in this class
+
+ private final XHTMLContentHandler xhtml;
+
+ private int pDepth = 0; //paragraph depth: incremented in startParagraph, decremented in endParagraph
+ private int tableDepth = 0;//table depth: incremented in startTable, decremented in endTable
+ private int pWithinCell = 0;//number of paragraphs already ended within the current cell
+ private boolean isItalics = false; //true while an <i> element opened by run() is still open
+ private boolean isBold = false; //true while a <b> element opened by run() is still open
+ private boolean wroteHyperlinkStart = false; //true once hyperlinkStart has opened an <a>
+ private boolean inTableCell = false; //true between startTableCell and endTableCell
+
+ public XSLFTikaBodyPartHandler(XHTMLContentHandler xhtml) {
+ this.xhtml = xhtml;
+ }
+
+ @Override
+ public void run(RunProperties runProperties, String contents) {
+ try {
+ // isBold/isItalics track which style tags are currently open; only changes relative to them emit tags
+ if (runProperties.getBold() != isBold) {
+ if (isItalics) { //close <i> before touching <b> so the tags stay properly nested; reopened below if still needed
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (runProperties.getBold()) {
+ xhtml.startElement("b");
+ isBold = true;
+ } else {
+ xhtml.endElement("b");
+ isBold = false;
+ }
+ }
+
+ if (runProperties.getItalics() != isItalics) {
+ if (runProperties.getItalics()) {
+ xhtml.startElement("i");
+ isItalics = true;
+ } else {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ }
+
+ xhtml.characters(contents);
+
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed; the style flags may be left out of sync with the output
+ }
+ }
+
+ @Override
+ public void hyperlinkStart(String link) {
+ try {
+ if (link != null) { //a null link opens no <a>, so hyperlinkEnd will not close one either
+ xhtml.startElement("a", "href", link);
+ wroteHyperlinkStart = true;
+ }
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ try {
+ if (wroteHyperlinkStart) {
+ closeStyleTags(); //close any open <i>/<b> before closing the <a>
+ wroteHyperlinkStart = false;
+ xhtml.endElement("a");
+ }
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void startParagraph(ParagraphProperties paragraphProperties) {
+ if (pDepth == 0 && tableDepth == 0) { //only an outermost paragraph outside of any table opens a <p>
+ try {
+ xhtml.startElement(P);
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+ pDepth++;
+ }
+
+ @Override
+ public void endParagraph() {
+ try {
+ closeStyleTags();
+ if (pDepth == 1 && tableDepth == 0) { //mirror of the condition under which startParagraph opened the <p>
+ xhtml.endElement(P);
+ } else if (pWithinCell > 0){ //at least one paragraph already ended in this cell: write a newline
+ xhtml.characters(NEWLINE, 0, 1);
+ }
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed; the counters below are still updated
+ }
+ if (inTableCell) {
+ pWithinCell++;
+ }
+ pDepth--;
+ }
+
+ @Override
+ public void startTable() {
+ try {
+ xhtml.startElement("table");
+ tableDepth++; //not reached if startElement throws
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void endTable() {
+ try {
+ xhtml.endElement("table");
+ tableDepth--; //not reached if endElement throws
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void startTableRow() {
+ try {
+ xhtml.startElement("tr");
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void endTableRow() {
+ try {
+ xhtml.endElement("tr");
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void startTableCell() {
+ try {
+ xhtml.startElement("td");
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ inTableCell = true;
+ }
+
+ @Override
+ public void endTableCell() {
+ try {
+ xhtml.endElement("td");
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ inTableCell = false;
+ pWithinCell = 0; //reset the per-cell paragraph count for the next cell
+ }
+
+
+ @Override
+ public void embeddedOLERef(String relId) {
+ if (relId == null) { //nothing is written for a null relationship id
+ return;
+ }
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", relId);
+ xhtml.startElement("div", attributes); //empty <div class="embedded" id=relId> placeholder
+ xhtml.endElement("div");
+
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+ //writes an empty <img>; src and alt are each added only when the matching argument is non-null
+ try {
+ AttributesImpl attr = new AttributesImpl();
+ if (picFileName != null) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+ }
+ if (picDescription != null) {
+ attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+ }
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+
+ } catch (SAXException e) {
+ //NOTE(review): SAXException is silently swallowed
+ }
+ }
+
+ private void closeStyleTags() throws SAXException { //closes an open <i> first, then an open <b>, and clears both flags
+ if (isItalics) {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (isBold) {
+ xhtml.endElement("b");
+ isBold = false;
+ }
+ }
+
+ private void writeParagraphNumber(int numId, int ilvl,
+ XWPFListManager listManager,
+ XHTMLContentHandler xhtml) throws SAXException {
+ //NOTE(review): private and not called anywhere in this class; the xhtml parameter shadows the field
+ if (ilvl < 0 || numId < 0 || listManager == null) { //negative ids/levels or a missing list manager: write nothing
+ return;
+ }
+ String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
+ if (number != null) {
+ xhtml.characters(number);
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
index 610a2cb..d08fb07 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -21,17 +21,19 @@ package org.apache.tika.parser.microsoft.ooxml.xwpf;
import java.util.Date;
import java.util.Map;
+import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.utils.DateUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
/**
* This class is intended to handle anything that might contain IBodyElements:
* main document, headers, footers, notes, etc.
*/
-public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+public class XWPFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
enum EditType {
@@ -43,86 +45,24 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
}
- final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
- private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
- private final static String O_NS = "urn:schemas-microsoft-com:office:office";
- private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
- private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
- private static final String V_NS = "urn:schemas-microsoft-com:vml";
-
- private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
- private final static char[] TAB_CHAR = new char[1];
- private final static char NEWLINE = '\n';
-
- static {
- TAB_CHAR[0] = '\t';
- }
-
- private final static String R = "r";
- private final static String RPR = "rPr";
- private final static String P = "p";
- private final static String P_STYLE = "pStyle";
- private final static String PPR = "pPr";
- private final static String T = "t";
- private final static String TAB = "tab";
- private final static String B = "b";
- private final static String ILVL = "ilvl";
- private final static String NUM_ID = "numId";
- private final static String TC = "tc";
- private final static String TR = "tr";
- private final static String I = "i";
- private final static String NUM_PR = "numPr";
- private final static String BR = "br";
private final static String BOOKMARK_START = "bookmarkStart";
private final static String BOOKMARK_END = "bookmarkEnd";
- private final static String HYPERLINK = "hyperlink";
- private final static String TBL = "tbl";
- private final static String PIC = "pic";
- private final static String PICT = "pict";
private final static String FOOTNOTE_REFERENCE = "footnoteReference";
- private final static String IMAGEDATA = "imagedata";
- private final static String BLIP = "blip";
private final static String INS = "ins";
private final static String DEL = "del";
private final static String DEL_TEXT = "delText";
private final static String MOVE_FROM = "moveFrom";
private final static String MOVE_TO = "moveTo";
- private final static String OLE_OBJECT = "OLEObject";
- private final static String CR = "cr";
private final static String ENDNOTE_REFERENCE = "endnoteReference";
- private final static String CHOICE = "Choice";
- private final static String FALLBACK = "Fallback";
private final XWPFBodyContentsHandler bodyContentsHandler;
//private final RelationshipsManager relationshipsManager;
private final Map<String, String> linkedRelationships;
- private final StringBuilder runBuffer = new StringBuilder();
-
- private boolean inR = false;
- private boolean inT = false;
- private int pDepth = 0;
- private boolean inRPr = false;
- private boolean inNumPr = false;
private boolean inDelText = false;
- private boolean inPic = false;
- private boolean inPict = false;
- private String picDescription = null;
- private String picRId = null;
- private String picFilename = null;
- private boolean lastStartElementWasP;
-
- //alternate content can be embedded in itself.
- //need to track depth.
- //if in alternate, choose fallback, maybe make this configurable?
- private int inACChoiceDepth = 0;
- private int inACFallbackDepth = 0;
- private EditType editType = EditType.NONE;
+ private XWPFDocumentXMLBodyHandler.EditType editType = XWPFDocumentXMLBodyHandler.EditType.NONE;
- private XWPFRunProperties currRunProperties = new XWPFRunProperties();
- private XWPFParagraphProperties currPProperties = new XWPFParagraphProperties();
public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
Map<String, String> hyperlinks) {
@@ -180,7 +120,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
runBuffer.append(TAB_CHAR);
} else if (P.equals(localName)) {
lastStartElementWasP = true;
- pDepth++;
} else if (B.equals(localName)) { //TODO: add bCs
if(inR && inRPr) {
currRunProperties.setBold(true);
@@ -334,7 +273,6 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
currPProperties.reset();
} else if (P.equals(localName)) {
bodyContentsHandler.endParagraph();
- pDepth--;
} else if (TC.equals(localName)) {
bodyContentsHandler.endTableCell();
} else if (TR.equals(localName)) {
@@ -398,7 +336,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
public interface XWPFBodyContentsHandler {
- void run(XWPFRunProperties runProperties, String contents);
+ void run(RunProperties runProperties, String contents);
/**
* @param link the link; can be null
@@ -407,7 +345,7 @@ public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
void hyperlinkEnd();
- void startParagraph(XWPFParagraphProperties paragraphProperties);
+ void startParagraph(ParagraphProperties paragraphProperties);
void endParagraph();
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 3cae6d9..f61fa56 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -38,6 +38,8 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
+import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.InputSource;
@@ -209,29 +211,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
return hyperlinks;
}
-/*
- private XWPFStyles loadStyles(PackagePart packagePart) {
- try {
- PackageRelationshipCollection stylesParts =
- packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
- if (stylesParts.size() > 0) {
- PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
- if (stylesRelationShip == null) {
- return null;
- }
- PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
- if (stylesPart == null) {
- return null;
- }
- return new XWPFStyles(stylesPart);
- }
- } catch (IOException|OpenXML4JException e) {
- //swallow
- }
- return null;
- }
-*/
private XWPFNumbering loadNumbering(PackagePart packagePart) {
try {
PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
@@ -260,7 +240,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void run(XWPFRunProperties runProperties, String contents) {
+ public void run(RunProperties runProperties, String contents) {
buffer.append(contents);
}
@@ -275,7 +255,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void startParagraph(XWPFParagraphProperties paragraphProperties) {
+ public void startParagraph(ParagraphProperties paragraphProperties) {
//no-op
}
http://git-wip-us.apache.org/repos/asf/tika/blob/90cdf1f6/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
deleted file mode 100644
index fd2b022..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFParagraphProperties.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-public class XWPFParagraphProperties {
-
- private String styleId;
- private int ilvl = -1;
- private int numId = -1;
-
- String getStyleID() {
- return styleId;
- }
-
- void setStyleID(String styleId) {
- this.styleId = styleId;
- }
-
- void reset() {
- styleId = null;
- ilvl = -1;
- numId = -1;
- }
-
- public void setIlvl(int ilvl) {
- this.ilvl = ilvl;
- }
-
- public void setNumId(int numId) {
- this.numId = numId;
- }
-
- public int getIlvl() {
- return ilvl;
- }
-
- public int getNumId() {
- return numId;
- }
-}