You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 14:35:29 UTC

[tika] branch master updated: TIKA-2703 -- related...simplify XSSFB to use more of XSSF rather than copy/paste

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 5cb8d9a  TIKA-2703 -- related...simplify XSSFB to use more of XSSF rather than copy/paste
5cb8d9a is described below

commit 5cb8d9a9e091ee61c8dc0f334d23c7677f5f519b
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 10:34:45 2018 -0400

    TIKA-2703 -- related...simplify XSSFB to use more of XSSF rather than copy/paste
---
 .../ooxml/XSSFBExcelExtractorDecorator.java        | 142 ---------------------
 .../ooxml/XSSFExcelExtractorDecorator.java         |   6 +-
 2 files changed, 2 insertions(+), 146 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
index 33dbb7e..3001318 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFBExcelExtractorDecorator.java
@@ -18,8 +18,6 @@ package org.apache.tika.parser.microsoft.ooxml;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 
@@ -28,11 +26,6 @@ import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackagePartName;
-import org.apache.poi.openxml4j.opc.PackageRelationship;
-import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
-import org.apache.poi.openxml4j.opc.PackagingURIHelper;
-import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.xssf.binary.XSSFBCommentsTable;
 import org.apache.poi.xssf.binary.XSSFBSharedStringsTable;
 import org.apache.poi.xssf.binary.XSSFBSheetHandler;
@@ -40,20 +33,13 @@ import org.apache.poi.xssf.binary.XSSFBStylesTable;
 import org.apache.poi.xssf.eventusermodel.XSSFBReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor;
-import org.apache.poi.xssf.usermodel.XSSFDrawing;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xssf.usermodel.XSSFShape;
-import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
-import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
-import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -165,94 +151,6 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
         }
     }
 
-    private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
-        try {
-            for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
-                xhtml.startElement("a", "href", rel.getTargetURI().toString());
-                xhtml.characters(rel.getTargetURI().toString());
-                xhtml.endElement("a");
-            }
-        } catch (InvalidFormatException e) {
-            //swallow
-        }
-    }
-
-    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
-        if (shapes == null) {
-            return;
-        }
-        for (XSSFShape shape : shapes) {
-            if (shape instanceof XSSFSimpleShape) {
-                String sText = ((XSSFSimpleShape) shape).getText();
-                if (sText != null && sText.length() > 0) {
-                    xhtml.element("p", sText);
-                }
-                extractHyperLinksFromShape(((XSSFSimpleShape)shape).getCTShape(), xhtml);
-            }
-            XSSFDrawing drawing = shape.getDrawing();
-            if (drawing != null) {
-                //dump diagram data
-                handleGeneralTextContainingPart(
-                        AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
-                        "diagram-data",
-                        drawing.getPackagePart(),
-                        metadata,
-                        new OOXMLWordAndPowerPointTextHandler(
-                                new OOXMLTikaBodyPartHandler(xhtml),
-                                new HashMap<String, String>()//empty
-                        )
-                );
-                //dump chart data
-                handleGeneralTextContainingPart(
-                        XSSFRelation.CHART.getRelation(),
-                        "chart",
-                        drawing.getPackagePart(),
-                        metadata,
-                        new OOXMLWordAndPowerPointTextHandler(
-                                new OOXMLTikaBodyPartHandler(xhtml),
-                                new HashMap<String, String>()//empty
-                        )
-                );
-            }
-        }
-    }
-
-    private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
-
-        if (ctShape == null)
-            return;
-
-        CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
-        if (nvSpPR == null)
-            return;
-
-        CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
-        if (cNvPr == null)
-            return;
-
-        CTHyperlink ctHyperlink = cNvPr.getHlinkClick();
-        if (ctHyperlink == null)
-            return;
-
-        String url = drawingHyperlinks.get(ctHyperlink.getId());
-        if (url != null) {
-            xhtml.startElement("a", "href", url);
-            xhtml.characters(url);
-            xhtml.endElement("a");
-        }
-
-        CTHyperlink ctHoverHyperlink = cNvPr.getHlinkHover();
-        if (ctHoverHyperlink == null)
-            return;
-
-        url = drawingHyperlinks.get(ctHoverHyperlink.getId());
-        if (url != null) {
-            xhtml.startElement("a", "href", url);
-            xhtml.characters(url);
-            xhtml.endElement("a");
-        }
-
-    }
 
     private void processSheet(
             SheetContentsHandler sheetContentsExtractor,
@@ -273,44 +171,4 @@ public class XSSFBExcelExtractorDecorator extends XSSFExcelExtractorDecorator {
         );
         xssfbSheetHandler.parse();
     }
-
-    /**
-     * In Excel files, sheets have things embedded in them,
-     * and sheet drawings which have the images
-     */
-    @Override
-    protected List<PackagePart> getMainDocumentParts() throws TikaException {
-        List<PackagePart> parts = new ArrayList<PackagePart>();
-        for (PackagePart part : sheetParts) {
-            // Add the sheet
-            parts.add(part);
-
-            // If it has drawings, return those too
-            try {
-                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
-                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                        parts.add(rel.getPackage().getPart(relName));
-                    }
-                }
-                for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
-                    if (rel.getTargetMode() == TargetMode.INTERNAL) {
-                        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
-                        parts.add(rel.getPackage().getPart(relName));
-                    }
-                }
-            } catch (InvalidFormatException e) {
-                throw new TikaException("Broken OOXML file", e);
-            }
-        }
-
-        //add main document so that macros can be extracted
-        //by AbstractOOXMLExtractor
-        for (PackagePart part : extractor.getPackage().
-                getPartsByRelationshipType(PackageRelationshipTypes.CORE_DOCUMENT)) {
-            parts.add(part);
-        }
-
-        return parts;
-    }
 }
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index b92ecc2..256fd0f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -70,10 +70,8 @@ import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
-import org.xml.sax.XMLReader;
 import org.xml.sax.helpers.DefaultHandler;
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
@@ -230,7 +228,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
     }
 
 
-    private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
+    protected void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
         try {
             for (PackageRelationship rel : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                 xhtml.startElement("a", "href", rel.getTargetURI().toString());
@@ -251,7 +249,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
         }
     }
 
-    private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
+    protected void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
         if (shapes == null) {
             return;
         }