You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [12/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.Placeholder;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFCommentAuthors;
+import org.apache.poi.xslf.usermodel.XSLFComments;
+import org.apache.poi.xslf.usermodel.XSLFGraphicFrame;
+import org.apache.poi.xslf.usermodel.XSLFGroupShape;
+import org.apache.poi.xslf.usermodel.XSLFNotes;
+import org.apache.poi.xslf.usermodel.XSLFNotesMaster;
+import org.apache.poi.xslf.usermodel.XSLFPictureShape;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xslf.usermodel.XSLFShape;
+import org.apache.poi.xslf.usermodel.XSLFSheet;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.poi.xslf.usermodel.XSLFSlideLayout;
+import org.apache.poi.xslf.usermodel.XSLFTable;
+import org.apache.poi.xslf.usermodel.XSLFTableCell;
+import org.apache.poi.xslf.usermodel.XSLFTableRow;
+import org.apache.poi.xslf.usermodel.XSLFTextParagraph;
+import org.apache.poi.xslf.usermodel.XSLFTextShape;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentAuthor;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPicture;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+ /**
+  * Creates the decorator; both arguments are handed unchanged to the
+  * {@link AbstractOOXMLExtractor} constructor.
+  *
+  * @param context   parse context forwarded to the superclass
+  * @param extractor POI PowerPoint extractor forwarded to the superclass
+  */
+ public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) {
+ super(context, extractor);
+ }
+
+ /**
+  * Writes the content of every slide to the given XHTML handler.
+  * <p>
+  * For each slide, in order: the slide's own shapes (inside a
+  * {@code slide-content} div), the shapes of its layout with placeholders
+  * skipped (inside a {@code slide-master-content} div), the shapes of the
+  * layout's master with placeholders skipped (outside any wrapper div), the
+  * notes and notes master (inside a {@code slide-notes} div, only when the
+  * slide has notes), and one {@code slide-comment} paragraph per comment.
+  *
+  * @param xhtml handler receiving the generated XHTML events
+  * @throws SAXException if the handler fails while receiving an event
+  * @throws IOException declared in the signature; no statement here throws it directly
+  * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+  */
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
+     XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+     // Can be null when the package carries no comment-authors part; it is
+     // therefore only dereferenced behind a null check below.
+     XSLFCommentAuthors commentAuthors = slideShow.getCommentAuthors();
+
+     List<XSLFSlide> slides = slideShow.getSlides();
+     for (XSLFSlide slide : slides) {
+         // Prefix used to make embedded-resource ids unique per slide part.
+         String slideDesc;
+         if (slide.getPackagePart() != null && slide.getPackagePart().getPartName() != null) {
+             slideDesc = getJustFileName(slide.getPackagePart().getPartName().toString());
+             slideDesc += "_";
+         } else {
+             slideDesc = null;
+         }
+
+         // slide content
+         xhtml.startElement("div", "class", "slide-content");
+         extractContent(slide.getShapes(), false, xhtml, slideDesc);
+         xhtml.endElement("div");
+
+         // slide layout which is the master sheet for this slide
+         xhtml.startElement("div", "class", "slide-master-content");
+         XSLFSlideLayout slideLayout = slide.getMasterSheet();
+         extractContent(slideLayout.getShapes(), true, xhtml, null);
+         xhtml.endElement("div");
+
+         // slide master which is the master sheet for all text layouts
+         XSLFSheet slideMaster = slideLayout.getMasterSheet();
+         extractContent(slideMaster.getShapes(), true, xhtml, null);
+
+         // notes (if present)
+         XSLFNotes slideNotes = slide.getNotes();
+         if (slideNotes != null) {
+             xhtml.startElement("div", "class", "slide-notes");
+
+             extractContent(slideNotes.getShapes(), false, xhtml, slideDesc);
+
+             // master sheet for this notes
+             XSLFNotesMaster notesMaster = slideNotes.getMasterSheet();
+             extractContent(notesMaster.getShapes(), true, xhtml, null);
+             xhtml.endElement("div");
+         }
+
+         // comments (if present)
+         XSLFComments comments = slide.getComments();
+         if (comments != null) {
+             StringBuilder authorStringBuilder = new StringBuilder();
+             for (int i = 0; i < comments.getNumberOfComments(); i++) {
+                 authorStringBuilder.setLength(0);
+                 CTComment comment = comments.getCommentAt(i);
+                 xhtml.startElement("p", "class", "slide-comment");
+                 // Without an authors table the comment text is still written,
+                 // just without the bold "Name (initials) - " prefix.
+                 CTCommentAuthor cta = (commentAuthors == null)
+                         ? null
+                         : commentAuthors.getAuthorById(comment.getAuthorId());
+                 if (cta != null) {
+                     if (cta.getName() != null) {
+                         authorStringBuilder.append(cta.getName());
+                     }
+                     if (cta.getInitials() != null) {
+                         if (authorStringBuilder.length() > 0) {
+                             authorStringBuilder.append(" ");
+                         }
+                         authorStringBuilder.append("(").append(cta.getInitials()).append(")");
+                     }
+                     if (comment.getText() != null && authorStringBuilder.length() > 0) {
+                         authorStringBuilder.append(" - ");
+                     }
+                     if (authorStringBuilder.length() > 0) {
+                         xhtml.startElement("b");
+                         xhtml.characters(authorStringBuilder.toString());
+                         xhtml.endElement("b");
+                     }
+                 }
+                 xhtml.characters(comment.getText());
+                 xhtml.endElement("p");
+             }
+         }
+     }
+ }
+
+ /**
+  * Walks a list of shapes and writes their extractable content as XHTML.
+  * <ul>
+  * <li>text shapes: one {@code p} element per paragraph (skipped entirely when
+  *     {@code skipPlaceholders} is set and the shape has a placeholder type)</li>
+  * <li>group shapes: processed recursively</li>
+  * <li>tables: delegated to {@code extractTable}</li>
+  * <li>other graphic frames: one embedded-reference div per {@code p:oleObj}
+  *     that carries a relationship id</li>
+  * <li>pictures: one embedded-reference div for the blip's embed id (never
+  *     emitted when {@code skipPlaceholders} is set)</li>
+  * </ul>
+  *
+  * @param shapes           shapes to process
+  * @param skipPlaceholders whether placeholder text shapes and pictures are ignored
+  * @param xhtml            handler receiving the generated XHTML events
+  * @param slideDesc        prefix prepended to relationship ids, or {@code null} for none
+  * @throws SAXException if the handler fails while receiving an event
+  */
+ private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc)
+         throws SAXException {
+     for (XSLFShape sh : shapes) {
+         // The order of the instanceof tests is kept exactly as before; in
+         // particular the table test stays ahead of the graphic-frame test.
+         if (sh instanceof XSLFTextShape) {
+             XSLFTextShape txt = (XSLFTextShape) sh;
+             Placeholder ph = txt.getTextType();
+             if (skipPlaceholders && ph != null) {
+                 continue;
+             }
+             for (XSLFTextParagraph p : txt.getTextParagraphs()) {
+                 xhtml.element("p", p.getText());
+             }
+         } else if (sh instanceof XSLFGroupShape) {
+             // recurse into groups of shapes
+             XSLFGroupShape group = (XSLFGroupShape) sh;
+             extractContent(group.getShapes(), skipPlaceholders, xhtml, slideDesc);
+         } else if (sh instanceof XSLFTable) {
+             //unlike tables in Word, ppt/x can't have recursive tables...I don't think
+             extractTable((XSLFTable) sh, xhtml);
+         } else if (sh instanceof XSLFGraphicFrame) {
+             XSLFGraphicFrame frame = (XSLFGraphicFrame) sh;
+             XmlObject[] sp = frame.getXmlObject().selectPath(
+                     "declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
+             if (sp != null) {
+                 for (XmlObject emb : sp) {
+                     XmlObject relIDAtt = emb.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
+                     if (relIDAtt != null) {
+                         writeEmbeddedRefDiv(xhtml, slideDesc, relIDAtt.getDomNode().getNodeValue());
+                     }
+                 }
+             }
+         } else if (sh instanceof XSLFPictureShape) {
+             if (!skipPlaceholders && (sh.getXmlObject() instanceof CTPicture)) {
+                 CTPicture ctPic = ((CTPicture) sh.getXmlObject());
+                 if (ctPic.getBlipFill() != null && ctPic.getBlipFill().getBlip() != null) {
+                     String relID = ctPic.getBlipFill().getBlip().getEmbed();
+                     if (relID != null) {
+                         writeEmbeddedRefDiv(xhtml, slideDesc, relID);
+                     }
+                 }
+             }
+         }
+     }
+ }
+
+ /**
+  * Emits an empty {@code div} with {@code class="embedded"} whose {@code id}
+  * is the relationship id, prefixed with {@code slideDesc} when that is non-null.
+  */
+ private void writeEmbeddedRefDiv(XHTMLContentHandler xhtml, String slideDesc, String relID)
+         throws SAXException {
+     String id = (slideDesc == null) ? relID : slideDesc + relID;
+     AttributesImpl attributes = new AttributesImpl();
+     attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+     attributes.addAttribute("", "id", "id", "CDATA", id);
+     xhtml.startElement("div", attributes);
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Writes a PowerPoint table as an XHTML {@code table}: one {@code tr} per row
+  * and one {@code td} per cell holding the cell's text.
+  *
+  * @param tbl   table to render
+  * @param xhtml handler receiving the generated XHTML events
+  * @throws SAXException if the handler fails while receiving an event
+  */
+ private void extractTable(XSLFTable tbl, XHTMLContentHandler xhtml) throws SAXException {
+     xhtml.startElement("table");
+     for (XSLFTableRow row : tbl) {
+         xhtml.startElement("tr");
+         // The cell list is fetched once per row (the former unused local is gone).
+         for (XSLFTableCell c : row.getCells()) {
+             xhtml.startElement("td");
+             xhtml.characters(c.getText());
+             xhtml.endElement("td");
+         }
+         xhtml.endElement("tr");
+     }
+     xhtml.endElement("table");
+ }
+
+ /**
+  * In PowerPoint files, slides have things embedded in them,
+  * and slide drawings which have the images.
+  * <p>
+  * Returns, for every slide referenced by the presentation, the slide part
+  * followed by the internal targets of its VML-drawing relationships.
+  *
+  * @return the collected package parts, in slide order
+  * @throws TikaException if the low-level slide show cannot be obtained, a
+  *         slide part cannot be read, or a relationship target name is
+  *         invalid; the underlying exception is attached as the cause
+  */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() throws TikaException {
+     List<PackagePart> parts = new ArrayList<>();
+     XMLSlideShow slideShow = (XMLSlideShow) extractor.getDocument();
+     XSLFSlideShow document;
+     try {
+         document = slideShow._getXSLFSlideShow(); // TODO Avoid this in future
+     } catch (Exception e) {
+         // Shouldn't happen, but keep the cause so the failure stays diagnosable.
+         throw new TikaException(e.getMessage(), e);
+     }
+
+     CTSlideIdList ctSlideIdList = document.getSlideReferences();
+     if (ctSlideIdList != null) {
+         for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
+             CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
+             // Add the slide
+             PackagePart slidePart;
+             try {
+                 slidePart = document.getSlidePart(ctSlide);
+             } catch (IOException | XmlException e) {
+                 throw new TikaException("Broken OOXML file", e);
+             }
+             parts.add(slidePart);
+
+             // If it has drawings, return those too
+             try {
+                 for (PackageRelationship rel : slidePart.getRelationshipsByType(XSLFRelation.VML_DRAWING.getRelation())) {
+                     if (rel.getTargetMode() == TargetMode.INTERNAL) {
+                         PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+                         parts.add(rel.getPackage().getPart(relName));
+                     }
+                 }
+             } catch (InvalidFormatException e) {
+                 throw new TikaException("Broken OOXML file", e);
+             }
+         }
+     }
+     return parts;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackagePartName;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackagingURIHelper;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.HeaderFooter;
+import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
+import org.apache.poi.xssf.eventusermodel.XSSFReader;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
+import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xssf.model.CommentsTable;
+import org.apache.poi.xssf.model.StylesTable;
+import org.apache.poi.xssf.usermodel.XSSFComment;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xssf.usermodel.XSSFShape;
+import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
+import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+ /**
+ * Allows access to headers/footers from raw xml strings
+ */
+ private static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
+ private final XSSFEventBasedExcelExtractor extractor;
+ private final DataFormatter formatter;
+ private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
+ private Metadata metadata;
+
+ /**
+  * Creates the decorator, configures the POI extractor to report formula
+  * results (not formulas) in the given locale, and prepares a matching
+  * {@link DataFormatter}.
+  *
+  * @param context   parse context forwarded to the superclass
+  * @param extractor event-based POI extractor for the workbook
+  * @param locale    locale for cell formatting; when {@code null} the
+  *                  no-argument {@link DataFormatter} constructor is used
+  */
+ public XSSFExcelExtractorDecorator(
+         ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) {
+     super(context, extractor);
+
+     extractor.setFormulasNotResults(false);
+     extractor.setLocale(locale);
+
+     this.extractor = extractor;
+     this.formatter = (locale == null) ? new DataFormatter() : new DataFormatter(locale);
+ }
+
+ /**
+  * Remembers the metadata object, initialises its {@code PROTECTED} entry to
+  * {@code "false"}, then lets the superclass produce the XHTML.
+  */
+ @Override
+ public void getXHTML(
+         ContentHandler handler, Metadata metadata, ParseContext context)
+         throws SAXException, XmlException, IOException, TikaException {
+     // Kept in a field so processSheet can flip PROTECTED to "true" later.
+     this.metadata = metadata;
+     this.metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+     super.getXHTML(handler, metadata, context);
+ }
+
+ /**
+ * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
+ */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+ XmlException, IOException {
+ OPCPackage container = extractor.getPackage();
+
+ ReadOnlySharedStringsTable strings;
+ XSSFReader.SheetIterator iter;
+ XSSFReader xssfReader;
+ StylesTable styles;
+ try {
+ xssfReader = new XSSFReader(container);
+ styles = xssfReader.getStylesTable();
+ iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
+ strings = new ReadOnlySharedStringsTable(container);
+ } catch (InvalidFormatException e) {
+ throw new XmlException(e);
+ } catch (OpenXML4JException oe) {
+ throw new XmlException(oe);
+ }
+
+ while (iter.hasNext()) {
+ InputStream stream = iter.next();
+ sheetParts.add(iter.getSheetPart());
+
+ SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
+ CommentsTable comments = iter.getSheetComments();
+
+ // Start, and output the sheet name
+ xhtml.startElement("div");
+ xhtml.element("h1", iter.getSheetName());
+
+ // Extract the main sheet contents
+ xhtml.startElement("table");
+ xhtml.startElement("tbody");
+
+ processSheet(sheetExtractor, comments, styles, strings, stream);
+
+ xhtml.endElement("tbody");
+ xhtml.endElement("table");
+
+ // Output any headers and footers
+ // (Need to process the sheet to get them, so we can't
+ // do the headers before the contents)
+ for (String header : sheetExtractor.headers) {
+ extractHeaderFooter(header, xhtml);
+ }
+ for (String footer : sheetExtractor.footers) {
+ extractHeaderFooter(footer, xhtml);
+ }
+ processShapes(iter.getShapes(), xhtml);
+ // All done with this sheet
+ xhtml.endElement("div");
+ }
+ }
+
+ /**
+  * Converts a raw header/footer string to text via
+  * {@code ExcelExtractor._extractHeaderFooter} and writes it as a {@code p}
+  * element unless the result is empty.
+  *
+  * @param hf    raw header or footer string
+  * @param xhtml handler receiving the generated XHTML events
+  * @throws SAXException if the handler fails while receiving an event
+  */
+ private void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
+         throws SAXException {
+     HeaderFooter parsed = new HeaderFooterFromString(hf);
+     String text = ExcelExtractor._extractHeaderFooter(parsed);
+     if (!text.isEmpty()) {
+         xhtml.element("p", text);
+     }
+ }
+
+ /**
+  * Writes the non-empty text of every {@link XSSFSimpleShape} in the list as
+  * a {@code p} element; other shape types and a {@code null} list are ignored.
+  *
+  * @param shapes shapes of the current sheet, may be {@code null}
+  * @param xhtml  handler receiving the generated XHTML events
+  * @throws SAXException if the handler fails while receiving an event
+  */
+ private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
+     if (shapes == null) {
+         return;
+     }
+     for (XSSFShape candidate : shapes) {
+         if (!(candidate instanceof XSSFSimpleShape)) {
+             continue;
+         }
+         String shapeText = ((XSSFSimpleShape) candidate).getText();
+         if (shapeText == null || shapeText.length() == 0) {
+             continue;
+         }
+         xhtml.element("p", shapeText);
+     }
+ }
+
+ /**
+  * Parses one sheet's XML with a SAX parser, forwarding formatted cell events
+  * to {@code sheetContentsExtractor}, and sets the {@code PROTECTED} metadata
+  * entry to {@code "true"} when a {@code sheetProtection} element was seen.
+  * <p>
+  * The input stream is always closed, whether parsing succeeds or fails.
+  *
+  * @param sheetContentsExtractor receiver of row/cell/header-footer callbacks
+  * @param comments               comments table handed to the POI sheet handler
+  * @param styles                 styles table handed to the POI sheet handler
+  * @param strings                shared strings handed to the POI sheet handler
+  * @param sheetInputStream       raw sheet XML; closed by this method
+  * @throws IOException      if reading or closing the stream fails
+  * @throws SAXException     if the sheet XML cannot be parsed
+  * @throws RuntimeException if no SAX parser can be configured (cause attached)
+  */
+ public void processSheet(
+         SheetContentsHandler sheetContentsExtractor,
+         CommentsTable comments,
+         StylesTable styles,
+         ReadOnlySharedStringsTable strings,
+         InputStream sheetInputStream)
+         throws IOException, SAXException {
+     // NOTE(review): the parser factory is used with its default settings;
+     // consider hardening it against external entities if workbooks may come
+     // from untrusted sources.
+     SAXParserFactory saxFactory = SAXParserFactory.newInstance();
+     // try-with-resources guarantees the stream is released even when the
+     // parser throws half-way through the sheet.
+     try (InputStream sheetStream = sheetInputStream) {
+         InputSource sheetSource = new InputSource(sheetStream);
+         SAXParser saxParser = saxFactory.newSAXParser();
+         XMLReader sheetParser = saxParser.getXMLReader();
+         XSSFSheetInterestingPartsCapturer handler =
+                 new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
+                         styles, comments, strings, sheetContentsExtractor, formatter, false));
+         sheetParser.setContentHandler(handler);
+         sheetParser.parse(sheetSource);
+
+         if (handler.hasProtection) {
+             metadata.set(TikaMetadataKeys.PROTECTED, "true");
+         }
+     } catch (ParserConfigurationException e) {
+         throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
+     }
+ }
+
+ /**
+ * In Excel files, sheets have things embedded in them,
+ * and sheet drawings which have the images
+ */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() throws TikaException {
+ List<PackagePart> parts = new ArrayList<PackagePart>();
+ for (PackagePart part : sheetParts) {
+ // Add the sheet
+ parts.add(part);
+
+ // If it has drawings, return those too
+ try {
+ for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
+ if (rel.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+ parts.add(rel.getPackage().getPart(relName));
+ }
+ }
+ for (PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
+ if (rel.getTargetMode() == TargetMode.INTERNAL) {
+ PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
+ parts.add(rel.getPackage().getPart(relName));
+ }
+ }
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Broken OOXML file", e);
+ }
+ }
+
+ return parts;
+ }
+
+ /**
+  * Turns formatted sheet events into HTML: rows become {@code tr}, cells
+  * become {@code td}, and header/footer strings are collected for later use.
+  * <p>
+  * NOTE(review): the callbacks cannot declare {@link SAXException}, so any
+  * exception raised by the XHTML handler is silently dropped here.
+  */
+ protected static class SheetTextAsHTML implements SheetContentsHandler {
+     private final XHTMLContentHandler xhtml;
+     private final List<String> headers;
+     private final List<String> footers;
+
+     protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
+         this.xhtml = xhtml;
+         this.headers = new ArrayList<String>();
+         this.footers = new ArrayList<String>();
+     }
+
+     /** Opens a table row. */
+     public void startRow(int rowNum) {
+         try {
+             xhtml.startElement("tr");
+         } catch (SAXException ignored) {
+             // dropped, see class comment
+         }
+     }
+
+     /** Closes the current table row. */
+     public void endRow(int rowNum) {
+         try {
+             xhtml.endElement("tr");
+         } catch (SAXException ignored) {
+             // dropped, see class comment
+         }
+     }
+
+     /**
+      * Writes one table cell: the formatted value (if any), followed by a
+      * line break and "author: text" when the cell has a comment.
+      */
+     public void cell(String cellRef, String formattedValue, XSSFComment comment) {
+         try {
+             xhtml.startElement("td");
+
+             // Main cell contents
+             if (formattedValue != null) {
+                 xhtml.characters(formattedValue);
+             }
+
+             // Comments
+             if (comment != null) {
+                 xhtml.startElement("br");
+                 xhtml.endElement("br");
+                 xhtml.characters(comment.getAuthor());
+                 xhtml.characters(": ");
+                 xhtml.characters(comment.getString().getString());
+             }
+
+             xhtml.endElement("td");
+         } catch (SAXException ignored) {
+             // dropped, see class comment
+         }
+     }
+
+     /** Stores the raw text in the header or footer list; nothing is written yet. */
+     public void headerFooter(String text, boolean isHeader, String tagName) {
+         List<String> target = isHeader ? headers : footers;
+         target.add(text);
+     }
+ }
+
+ /**
+  * Read-only {@link HeaderFooter} view over a raw header/footer string; the
+  * three sections are extracted on demand with the shared {@code hfHelper},
+  * and all setters are no-ops.
+  */
+ protected static class HeaderFooterFromString implements HeaderFooter {
+     private final String text;
+
+     protected HeaderFooterFromString(String text) {
+         this.text = text;
+     }
+
+     public String getLeft() {
+         return hfHelper.getLeftSection(text);
+     }
+
+     public String getCenter() {
+         return hfHelper.getCenterSection(text);
+     }
+
+     public String getRight() {
+         return hfHelper.getRightSection(text);
+     }
+
+     public void setLeft(String paramString) {
+         // intentionally empty: this view is read-only
+     }
+
+     public void setCenter(String paramString) {
+         // intentionally empty: this view is read-only
+     }
+
+     public void setRight(String paramString) {
+         // intentionally empty: this view is read-only
+     }
+ }
+
+ /**
+  * Captures information on interesting tags, whilst
+  * delegating the main work to the formatting handler.
+  * <p>
+  * The only tag currently tracked is {@code sheetProtection}: once seen,
+  * {@code hasProtection} stays {@code true}. Every SAX event, including that
+  * one, is passed on to the delegate unchanged.
+  */
+ protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
+     private final ContentHandler delegate;
+     private boolean hasProtection = false;
+
+     protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
+         this.delegate = delegate;
+     }
+
+     // ---- document level ------------------------------------------------
+
+     public void setDocumentLocator(Locator locator) {
+         delegate.setDocumentLocator(locator);
+     }
+
+     public void startDocument() throws SAXException {
+         delegate.startDocument();
+     }
+
+     public void endDocument() throws SAXException {
+         delegate.endDocument();
+     }
+
+     // ---- prefix mappings -----------------------------------------------
+
+     public void startPrefixMapping(String prefix, String uri)
+             throws SAXException {
+         delegate.startPrefixMapping(prefix, uri);
+     }
+
+     public void endPrefixMapping(String prefix) throws SAXException {
+         delegate.endPrefixMapping(prefix);
+     }
+
+     // ---- elements and content ------------------------------------------
+
+     public void startElement(String uri, String localName, String qName,
+                              Attributes atts) throws SAXException {
+         // Matching is done on the qualified name only.
+         if ("sheetProtection".equals(qName)) {
+             hasProtection = true;
+         }
+         delegate.startElement(uri, localName, qName, atts);
+     }
+
+     public void endElement(String uri, String localName, String qName)
+             throws SAXException {
+         delegate.endElement(uri, localName, qName);
+     }
+
+     public void characters(char[] ch, int start, int length)
+             throws SAXException {
+         delegate.characters(ch, start, length);
+     }
+
+     public void ignorableWhitespace(char[] ch, int start, int length)
+             throws SAXException {
+         delegate.ignorableWhitespace(ch, start, length);
+     }
+
+     public void processingInstruction(String target, String data)
+             throws SAXException {
+         delegate.processingInstruction(target, data);
+     }
+
+     public void skippedEntity(String name) throws SAXException {
+         delegate.skippedEntity(name);
+     }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.xwpf.usermodel.XWPFAbstractNum;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFNum;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.parser.microsoft.AbstractListManager;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTAbstractNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTLvl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNum;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
+
+
+public class XWPFListManager extends AbstractListManager {
+ private final static boolean OVERRIDE_AVAILABLE;
+ private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
+
+ static {
+     // Probe for CTNumLvl, the schema type loadOverrideTuples relies on for
+     // level overrides, and record whether it can be loaded. Previously the
+     // probe result was discarded and the flag was unconditionally false.
+     boolean available;
+     try {
+         Class.forName("org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl");
+         available = true;
+     } catch (ClassNotFoundException ignored) {
+         // the class is simply not on the classpath
+         available = false;
+     }
+     OVERRIDE_AVAILABLE = available;
+ }
+
+ private final XWPFNumbering numbering;
+
+ //map of numId (which paragraph series is this a member of?), levelcounts
+ /**
+  * Creates a list manager that resolves list definitions through the
+  * numbering object returned by {@code document.getNumbering()}.
+  *
+  * @param document Word document whose numbering is stored for later lookups
+  */
+ public XWPFListManager(XWPFDocument document) {
+ numbering = document.getNumbering();
+ }
+
+ /**
+  * Computes the list label for a numbered paragraph and advances the
+  * counters kept for its abstract numbering definition.
+  * <p>
+  * An empty string is returned, and no counter is touched, when the
+  * paragraph is {@code null}, has no numbering id or level, the document has
+  * no numbering information, or the numbering id cannot be resolved to an
+  * abstract numbering definition.
+  *
+  * @param paragraph paragraph
+  * @return the formatted number or an empty string if something went wrong
+  */
+ public String getFormattedNumber(final XWPFParagraph paragraph) {
+     // Guard everything that is dereferenced below instead of failing with
+     // a NullPointerException on paragraphs that are not (fully) list items.
+     if (numbering == null || paragraph == null
+             || paragraph.getNumID() == null || paragraph.getNumIlvl() == null) {
+         return "";
+     }
+     int currNumId = paragraph.getNumID().intValue();
+     XWPFNum xwpfNum = numbering.getNum(paragraph.getNumID());
+     if (xwpfNum == null) {
+         return "";
+     }
+     CTNum ctNum = xwpfNum.getCTNum();
+     CTDecimalNumber abNum = ctNum.getAbstractNumId();
+     if (abNum == null || abNum.getVal() == null) {
+         return "";
+     }
+     int currAbNumId = abNum.getVal().intValue();
+
+     // Counters are cached per abstract numbering id, overrides per numbering id.
+     ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+     LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+     if (lc == null) {
+         lc = loadLevelTuples(abNum);
+     }
+     if (overrideTuples == null) {
+         overrideTuples = loadOverrideTuples(ctNum, lc.getNumberOfLevels());
+     }
+
+     String formattedString = lc.incrementLevel(paragraph.getNumIlvl().intValue(), overrideTuples);
+
+     listLevelMap.put(currAbNumId, lc);
+     overrideTupleMap.put(currNumId, overrideTuples);
+
+     return formattedString;
+ }
+
+ private LevelTuple[] loadOverrideTuples(CTNum ctNum, int length) {
+ LevelTuple[] levelTuples = new LevelTuple[length];
+ int overrideLength = ctNum.sizeOfLvlOverrideArray();
+ if (overrideLength == 0) {
+ return null;
+ }
+ for (int i = 0; i < length; i++) {
+ LevelTuple tuple;
+ if (i >= overrideLength) {
+ tuple = new LevelTuple("%"+i+".");
+ } else {
+ CTNumLvl ctNumLvl = ctNum.getLvlOverrideArray(i);
+ if (ctNumLvl != null) {
+ tuple = buildTuple(i, ctNumLvl.getLvl());
+ } else {
+ tuple = new LevelTuple("%"+i+".");
+ }
+ }
+ levelTuples[i] = tuple;
+ }
+ return levelTuples;
+ }
+
+
+ private ParagraphLevelCounter loadLevelTuples(CTDecimalNumber abNum) {
+ //Unfortunately, we need to go this far into the underlying structure
+ //to get the abstract num information for the edge case where
+ //someone skips a level and the format is not context-free, e.g. "1.B.i".
+ XWPFAbstractNum abstractNum = numbering.getAbstractNum(abNum.getVal());
+ CTAbstractNum ctAbstractNum = abstractNum.getCTAbstractNum();
+
+ LevelTuple[] levels = new LevelTuple[ctAbstractNum.sizeOfLvlArray()];
+ for (int i = 0; i < levels.length; i++) {
+ levels[i] = buildTuple(i, ctAbstractNum.getLvlArray(i));
+ }
+ return new ParagraphLevelCounter(levels);
+ }
+
+ /**
+  * Translates one level definition into a {@code LevelTuple}, substituting
+  * defaults for everything the definition leaves out: format
+  * {@code "decimal"}, restart {@code -1}, level text
+  * {@code "%" + level + "."}, and a start value derived from the format.
+  *
+  * @param level index of the level, used only for the default level text
+  * @param ctLvl level definition, may be {@code null}
+  * @return tuple describing how numbers at this level are formatted
+  */
+ private LevelTuple buildTuple(int level, CTLvl ctLvl) {
+     boolean legal = false;
+     int restartAfter = -1;
+     String text = "%" + level + ".";
+     String format = "decimal";
+     boolean explicitStart = false;
+     int startAt = 1;
+
+     if (ctLvl != null) {
+         // Presence of the element alone marks the level as legal-style.
+         legal = ctLvl.getIsLgl() != null;
+
+         if (ctLvl.getNumFmt() != null && ctLvl.getNumFmt().getVal() != null) {
+             format = ctLvl.getNumFmt().getVal().toString();
+         }
+         if (ctLvl.getLvlRestart() != null && ctLvl.getLvlRestart().getVal() != null) {
+             restartAfter = ctLvl.getLvlRestart().getVal().intValue();
+         }
+         if (ctLvl.getStart() != null && ctLvl.getStart().getVal() != null) {
+             explicitStart = true;
+             startAt = ctLvl.getStart().getVal().intValue();
+         }
+         if (ctLvl.getLvlText() != null && ctLvl.getLvlText().getVal() != null) {
+             text = ctLvl.getLvlText().getVal();
+         }
+     }
+
+     if (!explicitStart) {
+         //this is a hack. Currently, this gets the lowest possible
+         //start for a given numFmt. We should probably try to grab the
+         //restartNumberingAfterBreak value in
+         //e.g. <w:abstractNum w:abstractNumId="12" w15:restartNumberingAfterBreak="0">???
+         boolean zeroBased = "decimal".equals(format)
+                 || "ordinal".equals(format)
+                 || "decimalZero".equals(format);
+         startAt = zeroBased ? 0 : 1;
+     }
+
+     return new LevelTuple(startAt, restartAfter, text, format, legal);
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import javax.xml.namespace.QName;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
+import org.apache.poi.xwpf.usermodel.BodyType;
+import org.apache.poi.xwpf.usermodel.IBody;
+import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.ICell;
+import org.apache.poi.xwpf.usermodel.IRunElement;
+import org.apache.poi.xwpf.usermodel.ISDTContent;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFPicture;
+import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.apache.poi.xwpf.usermodel.XWPFSDT;
+import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
+import org.apache.poi.xwpf.usermodel.XWPFStyle;
+import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.poi.xwpf.usermodel.XWPFTable;
+import org.apache.poi.xwpf.usermodel.XWPFTableCell;
+import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlCursor;
+import org.apache.xmlbeans.XmlException;
+import org.apache.xmlbeans.XmlObject;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+    // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+    private static final String LIST_DELIMITER = " ";
+
+    /** Document obtained from the wrapped POI extractor; assigned once in the constructor. */
+    private final XWPFDocument document;
+
+    /** Style table of {@link #document}, used to resolve paragraph style IDs to style names. */
+    private final XWPFStyles styles;
+
+    /**
+     * Creates a decorator around the given POI Word extractor.
+     *
+     * @param context   parse context that is handed on to the superclass
+     * @param extractor POI extractor whose document is rendered as XHTML
+     */
+    public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
+        super(context, extractor);
+
+        this.document = (XWPFDocument) extractor.getDocument();
+        this.styles = document.getStyles();
+    }
+
+ /**
+ * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
+ */
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException {
+ XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
+ XWPFListManager listManager = new XWPFListManager(document);
+ // headers
+ if (hfPolicy != null) {
+ extractHeaders(xhtml, hfPolicy, listManager);
+ }
+
+ // process text in the order that it occurs in
+ extractIBodyText(document, listManager, xhtml);
+
+ // then all document tables
+ if (hfPolicy != null) {
+ extractFooters(xhtml, hfPolicy, listManager);
+ }
+ }
+
+    /**
+     * Walks the elements of a body (the document itself or a table cell) and
+     * dispatches each paragraph, table or structured document tag to its
+     * extraction method. Other element types are ignored.
+     */
+    private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
+                                  XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        for (IBodyElement child : bodyElement.getBodyElements()) {
+            if (child instanceof XWPFParagraph) {
+                extractParagraph((XWPFParagraph) child, listManager, xhtml);
+            } else if (child instanceof XWPFTable) {
+                extractTable((XWPFTable) child, listManager, xhtml);
+            } else if (child instanceof XWPFSDT) {
+                extractSDT((XWPFSDT) child, xhtml);
+            }
+        }
+    }
+
+ private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml) throws SAXException,
+ XmlException, IOException {
+ ISDTContent content = element.getContent();
+ String tag = "p";
+ xhtml.startElement(tag);
+ xhtml.characters(content.getText());
+ xhtml.endElement(tag);
+ }
+
+    /**
+     * Writes one paragraph (or heading) to the XHTML output.
+     * <p>
+     * In order, this emits: section headers when the paragraph carries section
+     * properties, the opening tag chosen from the paragraph style, the list
+     * number, placeholder {@code <div class="embedded">} elements for embedded
+     * objects, bookmark anchors, the runs with bold/italic markup, comment and
+     * footnote text, paragraphs nested in text boxes, the closing tag, and
+     * finally the section footers.
+     *
+     * @param paragraph   paragraph to render
+     * @param listManager list numbering state shared across the document
+     * @param xhtml       output handler
+     */
+    private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
+                                  XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        // If this paragraph is actually a whole new section, then
+        // it could have its own headers and footers
+        // Check and handle if so
+        XWPFHeaderFooterPolicy headerFooterPolicy = null;
+        if (paragraph.getCTP().getPPr() != null) {
+            CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
+            if (ctSectPr != null) {
+                headerFooterPolicy =
+                        new XWPFHeaderFooterPolicy(document, ctSectPr);
+                extractHeaders(xhtml, headerFooterPolicy, listManager);
+            }
+        }
+
+        // Is this a paragraph, or a heading?
+        String tag = "p";
+        String styleClass = null;
+        // styles is whatever document.getStyles() returned; guard against a
+        // document without a style table instead of failing on the lookup
+        if (styles != null && paragraph.getStyleID() != null) {
+            XWPFStyle style = styles.getStyle(
+                    paragraph.getStyleID()
+            );
+
+            if (style != null && style.getName() != null) {
+                TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+                        style.getName(), paragraph.getPartType() == BodyType.TABLECELL
+                );
+                tag = tas.getTag();
+                styleClass = tas.getStyleClass();
+            }
+        }
+
+        if (styleClass == null) {
+            xhtml.startElement(tag);
+        } else {
+            xhtml.startElement(tag, "class", styleClass);
+        }
+
+        writeParagraphNumber(paragraph, listManager, xhtml);
+        // Output placeholder for any embedded docs:
+
+        // TODO: replace w/ XPath/XQuery:
+        for (XWPFRun run : paragraph.getRuns()) {
+            XmlCursor c = run.getCTR().newCursor();
+            try {
+                c.selectPath("./*");
+                while (c.toNextSelection()) {
+                    XmlObject o = c.getObject();
+                    if (o instanceof CTObject) {
+                        XmlCursor c2 = o.newCursor();
+                        try {
+                            c2.selectPath("./*");
+                            while (c2.toNextSelection()) {
+                                XmlObject o2 = c2.getObject();
+
+                                XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
+                                // constant-first comparison: a null node value must not throw
+                                if (embedAtt != null && "Embed".equals(embedAtt.getDomNode().getNodeValue())) {
+                                    // Type is "Embed"
+                                    XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
+                                    if (relIDAtt != null) {
+                                        String relID = relIDAtt.getDomNode().getNodeValue();
+                                        AttributesImpl attributes = new AttributesImpl();
+                                        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                                        attributes.addAttribute("", "id", "id", "CDATA", relID);
+                                        xhtml.startElement("div", attributes);
+                                        xhtml.endElement("div");
+                                    }
+                                }
+                            }
+                        } finally {
+                            // release the cursor even when the handler throws
+                            c2.dispose();
+                        }
+                    }
+                }
+            } finally {
+                c.dispose();
+            }
+        }
+
+        // Attach bookmarks for the paragraph
+        // (In future, we might put them in the right place, for now
+        //  we just put them in the correct paragraph)
+        for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
+            CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
+            xhtml.startElement("a", "name", bookmark.getName());
+            xhtml.endElement("a");
+        }
+
+        TmpFormatting fmtg = new TmpFormatting(false, false);
+
+        // Do the iruns
+        for (IRunElement run : paragraph.getIRuns()) {
+            if (run instanceof XWPFSDT) {
+                fmtg = closeStyleTags(xhtml, fmtg);
+                processSDTRun((XWPFSDT) run, xhtml);
+                //for now, we're ignoring formatting in sdt
+                //if you hit an sdt reset to false
+                fmtg.setBold(false);
+                fmtg.setItalic(false);
+            } else {
+                fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
+            }
+        }
+        closeStyleTags(xhtml, fmtg);
+
+
+        // Now do any comments for the paragraph
+        XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
+        String commentText = comments.getCommentText();
+        if (commentText != null && commentText.length() > 0) {
+            xhtml.characters(commentText);
+        }
+
+        String footnameText = paragraph.getFootnoteText();
+        if (footnameText != null && footnameText.length() > 0) {
+            xhtml.characters(footnameText + "\n");
+        }
+
+        // Also extract any paragraphs embedded in text boxes:
+        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
+            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
+        }
+
+        // Finish this paragraph
+        xhtml.endElement(tag);
+
+        if (headerFooterPolicy != null) {
+            extractFooters(xhtml, headerFooterPolicy, listManager);
+        }
+    }
+
+ private void writeParagraphNumber(XWPFParagraph paragraph,
+ XWPFListManager listManager,
+ XHTMLContentHandler xhtml) throws SAXException {
+ if (paragraph.getNumIlvl() == null) {
+ return;
+ }
+ String number = listManager.getFormattedNumber(paragraph);
+ if (number != null) {
+ xhtml.characters(number);
+ }
+
+ }
+
+ private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
+ TmpFormatting fmtg) throws SAXException {
+ // Close any still open style tags
+ if (fmtg.isItalic()) {
+ xhtml.endElement("i");
+ fmtg.setItalic(false);
+ }
+ if (fmtg.isBold()) {
+ xhtml.endElement("b");
+ fmtg.setBold(false);
+ }
+ return fmtg;
+ }
+
+ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
+ XHTMLContentHandler xhtml, TmpFormatting tfmtg)
+ throws SAXException, XmlException, IOException {
+ // True if we are currently in the named style tag:
+ if (run.isBold() != tfmtg.isBold()) {
+ if (tfmtg.isItalic()) {
+ xhtml.endElement("i");
+ tfmtg.setItalic(false);
+ }
+ if (run.isBold()) {
+ xhtml.startElement("b");
+ } else {
+ xhtml.endElement("b");
+ }
+ tfmtg.setBold(run.isBold());
+ }
+
+ if (run.isItalic() != tfmtg.isItalic()) {
+ if (run.isItalic()) {
+ xhtml.startElement("i");
+ } else {
+ xhtml.endElement("i");
+ }
+ tfmtg.setItalic(run.isItalic());
+ }
+
+ boolean addedHREF = false;
+ if (run instanceof XWPFHyperlinkRun) {
+ XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
+ XWPFHyperlink link = linkRun.getHyperlink(document);
+ if (link != null && link.getURL() != null) {
+ xhtml.startElement("a", "href", link.getURL());
+ addedHREF = true;
+ } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
+ xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
+ addedHREF = true;
+ }
+ }
+
+ xhtml.characters(run.toString());
+
+ // If we have any pictures, output them
+ for (XWPFPicture picture : run.getEmbeddedPictures()) {
+ if (paragraph.getDocument() != null) {
+ XWPFPictureData data = picture.getPictureData();
+ if (data != null) {
+ AttributesImpl attr = new AttributesImpl();
+
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + data.getFileName());
+ attr.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+ }
+ }
+ }
+
+ if (addedHREF) {
+ xhtml.endElement("a");
+ }
+
+ return tfmtg;
+ }
+
+ private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException {
+ xhtml.characters(run.getContent().getText());
+ }
+
+ private void extractTable(XWPFTable table, XWPFListManager listManager,
+ XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException {
+ xhtml.startElement("table");
+ xhtml.startElement("tbody");
+ for (XWPFTableRow row : table.getRows()) {
+ xhtml.startElement("tr");
+ for (ICell cell : row.getTableICells()) {
+ xhtml.startElement("td");
+ if (cell instanceof XWPFTableCell) {
+ extractIBodyText((XWPFTableCell) cell, listManager, xhtml);
+ } else if (cell instanceof XWPFSDTCell) {
+ xhtml.characters(((XWPFSDTCell) cell).getContent().getText());
+ }
+ xhtml.endElement("td");
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("tbody");
+ xhtml.endElement("table");
+ }
+
+    /**
+     * Writes the first-page, even-page and default footers of the given
+     * policy, in that order, skipping any that are absent. Does nothing when
+     * {@code hfPolicy} is {@code null}, mirroring {@code extractHeaders}.
+     */
+    private void extractFooters(
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
+            XWPFListManager listManager)
+            throws SAXException, XmlException, IOException {
+        if (hfPolicy == null) {
+            return;
+        }
+
+        // footers
+        if (hfPolicy.getFirstPageFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager);
+        }
+        if (hfPolicy.getEvenPageFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager);
+        }
+        if (hfPolicy.getDefaultFooter() != null) {
+            extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
+        }
+    }
+
+    /**
+     * Writes the first-page, even-page and default headers of the given
+     * policy, in that order, skipping any that are absent. A {@code null}
+     * policy is accepted and produces no output.
+     */
+    private void extractHeaders(
+            XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy, XWPFListManager listManager)
+            throws SAXException, XmlException, IOException {
+        if (hfPolicy != null) {
+            if (hfPolicy.getFirstPageHeader() != null) {
+                extractHeaderText(xhtml, hfPolicy.getFirstPageHeader(), listManager);
+            }
+            if (hfPolicy.getEvenPageHeader() != null) {
+                extractHeaderText(xhtml, hfPolicy.getEvenPageHeader(), listManager);
+            }
+            if (hfPolicy.getDefaultHeader() != null) {
+                extractHeaderText(xhtml, hfPolicy.getDefaultHeader(), listManager);
+            }
+        }
+    }
+
+    /**
+     * Writes the paragraphs, tables and structured document tags of one
+     * header or footer part, in order. Other element types are ignored.
+     */
+    private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header, XWPFListManager listManager) throws SAXException, XmlException, IOException {
+        for (IBodyElement part : header.getBodyElements()) {
+            if (part instanceof XWPFParagraph) {
+                XWPFParagraph headerParagraph = (XWPFParagraph) part;
+                extractParagraph(headerParagraph, listManager, xhtml);
+            } else if (part instanceof XWPFTable) {
+                XWPFTable headerTable = (XWPFTable) part;
+                extractTable(headerTable, listManager, xhtml);
+            } else if (part instanceof XWPFSDT) {
+                XWPFSDT headerSdt = (XWPFSDT) part;
+                extractSDT(headerSdt, xhtml);
+            }
+        }
+    }
+
+ /**
+ * Word documents are simple, they only have the one
+ * main part
+ */
+ @Override
+ protected List<PackagePart> getMainDocumentParts() {
+ List<PackagePart> parts = new ArrayList<PackagePart>();
+ parts.add(document.getPackagePart());
+ return parts;
+ }
+
+    /**
+     * Mutable record of which inline style elements ({@code <b>},
+     * {@code <i>}) are currently open while the runs of a paragraph are
+     * written. Declared static because it never touches the enclosing
+     * extractor instance.
+     */
+    private static class TmpFormatting {
+        private boolean bold;
+        private boolean italic;
+
+        private TmpFormatting(boolean bold, boolean italic) {
+            this.bold = bold;
+            this.italic = italic;
+        }
+
+        /** @return whether a {@code <b>} element is currently open */
+        public boolean isBold() {
+            return bold;
+        }
+
+        public void setBold(boolean bold) {
+            this.bold = bold;
+        }
+
+        /** @return whether an {@code <i>} element is currently open */
+        public boolean isItalic() {
+            return italic;
+        }
+
+        public void setItalic(boolean italic) {
+            this.italic = italic;
+        }
+
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/NSNormalizerContentHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that:<ul>
+ * <li>Maps old OpenOffice 1.0 Namespaces to the OpenDocument ones</li>
+ * <li>Returns a fake DTD when parser requests OpenOffice DTD</li>
+ * </ul>
+ */
+public class NSNormalizerContentHandler extends ContentHandlerDecorator {
+
+    /** Common prefix of the OpenOffice 1.0 namespace URIs. */
+    private static final String OLD_NS =
+            "http://openoffice.org/2000/";
+
+    /** Common prefix of the OpenDocument namespace URIs. */
+    private static final String NEW_NS =
+            "urn:oasis:names:tc:opendocument:xmlns:";
+
+    /** Public identifier of the OpenOffice 1.0 document DTD. */
+    private static final String DTD_PUBLIC_ID =
+            "-//OpenOffice.org//DTD OfficeDocument 1.0//EN";
+
+    public NSNormalizerContentHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    /**
+     * Rewrites an OpenOffice 1.0 namespace URI to its OpenDocument form:
+     * the old prefix is replaced by the new one and {@code ":1.0"} is
+     * appended. Any other value, including {@code null}, is returned as is.
+     */
+    private String mapOldNS(String ns) {
+        if (ns != null && ns.startsWith(OLD_NS)) {
+            return NEW_NS + ns.substring(OLD_NS.length()) + ":1.0";
+        } else {
+            return ns;
+        }
+    }
+
+    /**
+     * Forwards the event with the element namespace and every attribute
+     * namespace mapped to the OpenDocument form.
+     */
+    @Override
+    public void startElement(
+            String namespaceURI, String localName, String qName,
+            Attributes atts) throws SAXException {
+        AttributesImpl natts = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            natts.addAttribute(
+                    mapOldNS(atts.getURI(i)), atts.getLocalName(i),
+                    atts.getQName(i), atts.getType(i), atts.getValue(i));
+        }
+        // forward the remapped copy; passing the original atts would
+        // discard the attribute namespace mapping built above
+        super.startElement(mapOldNS(namespaceURI), localName, qName, natts);
+    }
+
+    @Override
+    public void endElement(String namespaceURI, String localName, String qName)
+            throws SAXException {
+        super.endElement(mapOldNS(namespaceURI), localName, qName);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri)
+            throws SAXException {
+        super.startPrefixMapping(prefix, mapOldNS(uri));
+    }
+
+    /**
+     * do not load any DTDs (may be requested by parser). Fake the DTD by
+     * returning an empty string as InputSource
+     */
+    @Override
+    public InputSource resolveEntity(String publicId, String systemId)
+            throws IOException, SAXException {
+        if ((systemId != null && systemId.toLowerCase(Locale.ROOT).endsWith(".dtd"))
+                || DTD_PUBLIC_ID.equals(publicId)) {
+            return new InputSource(new StringReader(""));
+        } else {
+            return super.resolveEntity(publicId, systemId);
+        }
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentContentParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,515 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.odf;
+
+import javax.xml.XMLConstants;
+import javax.xml.namespace.QName;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+/**
+ * Parser for ODF <code>content.xml</code> files.
+ */
+public class OpenDocumentContentParser extends AbstractParser {
+ private interface Style { // marker type implemented by the TextStyle and ListStyle holders
+ }
+
+ private static class TextStyle implements Style { // formatting flags of one named text style
+ public boolean italic; // spans using this style are wrapped in <i>
+ public boolean bold; // spans using this style are wrapped in <b>
+ public boolean underlined; // spans using this style are wrapped in <u>
+ }
+
+    /** Properties of one named list style: whether its items are numbered. */
+    private static class ListStyle implements Style {
+        public boolean ordered;
+
+        /**
+         * @return the XHTML list element for this style: {@code "ol"} when
+         * ordered, {@code "ul"} otherwise
+         */
+        public String getTag() {
+            if (ordered) {
+                return "ol";
+            }
+            return "ul";
+        }
+    }
+
+ private static final class OpenDocumentElementMappingContentHandler extends
+ ElementMappingContentHandler {
+ private final ContentHandler handler; // downstream handler; headings, lists and style tags are written to it directly
+ private final BitSet textNodeStack = new BitSet(); // bit d is set when the open element at depth d is a text node
+ private int nodeDepth = 0; // number of currently open elements; also the top of textNodeStack
+ private int completelyFiltered = 0; // count of open elements whose whole content is suppressed
+ private Stack<String> headingStack = new Stack<String>(); // XHTML tag names of the open text:h elements
+ private Map<String, TextStyle> textStyleMap = new HashMap<String, TextStyle>(); // text styles by style:name
+ private Map<String, ListStyle> listStyleMap = new HashMap<String, ListStyle>(); // list styles by style:name
+ private TextStyle textStyle; // style currently being defined, or the style of the span currently open
+ private TextStyle lastTextStyle; // style of the last ended span whose tags are still open (closed lazily)
+ private Stack<ListStyle> listStyleStack = new Stack<ListStyle>(); // styles of the open text:list elements
+ private ListStyle listStyle; // list style currently being defined, null outside text:list-style
+
+ private OpenDocumentElementMappingContentHandler(ContentHandler handler,
+ Map<QName, TargetElement> mappings) {
+ super(handler, mappings);
+ this.handler = handler; // kept so that events can bypass the element mapping of the superclass
+ }
+
+        /**
+         * Forwards character data only when nothing is being completely
+         * filtered and the innermost open element is flagged as a text node.
+         * Style tags left open by a previously ended span are closed first.
+         */
+        @Override
+        public void characters(char[] ch, int start, int length)
+                throws SAXException {
+            // only forward content of tags from text:-namespace
+            final boolean insideTextNode = completelyFiltered == 0
+                    && nodeDepth > 0
+                    && textNodeStack.get(nodeDepth - 1);
+            if (!insideTextNode) {
+                return;
+            }
+            lazyEndSpan();
+            super.characters(ch, start, length);
+        }
+
+        /**
+         * Tells whether an element must be dropped together with all of its
+         * sub-tags: text elements whose local name ends in "-template" or
+         * "-style", and covered table cells.
+         */
+        private boolean needsCompleteFiltering(
+                String namespaceURI, String localName) {
+            if (!TEXT_NS.equals(namespaceURI)) {
+                return TABLE_NS.equals(namespaceURI) && "covered-table-cell".equals(localName);
+            }
+            return localName.endsWith("-template")
+                    || localName.endsWith("-style");
+        }
+
+        /**
+         * Maps the {@code text:outline-level} of a heading to an XHTML tag
+         * name between {@code h1} and {@code h6}. Levels outside that range
+         * are clamped; a missing or non-numeric level yields {@code h1}
+         * instead of aborting the parse with a NumberFormatException.
+         *
+         * @param atts attributes of the {@code text:h} element
+         * @return the XHTML heading tag name
+         */
+        private String getXHTMLHeaderTagName(Attributes atts) {
+            String depthStr = atts.getValue(TEXT_NS, "outline-level");
+            if (depthStr == null) {
+                return "h1";
+            }
+
+            int depth;
+            try {
+                depth = Integer.parseInt(depthStr.trim());
+            } catch (NumberFormatException e) {
+                // malformed level: treat like a missing one
+                return "h1";
+            }
+            if (depth >= 6) {
+                return "h6";
+            } else if (depth <= 1) {
+                return "h1";
+            } else {
+                return "h" + depth;
+            }
+        }
+
+ /**
+ * Check if a node is a text node
+ */
+ private boolean isTextNode(String namespaceURI, String localName) {
+ if (TEXT_NS.equals(namespaceURI) && !localName.equals("page-number") && !localName.equals("page-count")) {
+ return true;
+ }
+ if (SVG_NS.equals(namespaceURI)) {
+ return "title".equals(localName) ||
+ "desc".equals(localName);
+ }
+ return false;
+ }
+
+        /**
+         * Opens the XHTML list element for a {@code text:list}: {@code ol} or
+         * {@code ul} according to the named list style, {@code ul} when the
+         * name is {@code null} or unknown.
+         * <p>
+         * An entry is pushed for every list, including unnamed ones, so that
+         * {@code endList} always pops the entry belonging to the list it
+         * closes. Previously an unnamed nested list pushed nothing and its end
+         * tag consumed the style of the enclosing list.
+         *
+         * @param name value of {@code text:style-name}, may be {@code null}
+         */
+        private void startList(String name) throws SAXException {
+            ListStyle style = name != null ? listStyleMap.get(name) : null;
+            // null is a valid stack entry and stands for "plain ul"
+            listStyleStack.push(style);
+            String elementName = style != null ? style.getTag() : "ul";
+            handler.startElement(XHTML, elementName, elementName, EMPTY_ATTRIBUTES);
+        }
+
+ private void endList() throws SAXException {
+ String elementName = "ul";
+ if (!listStyleStack.isEmpty()) {
+ ListStyle style = listStyleStack.pop();
+ elementName = style != null ? style.getTag() : "ul";
+ }
+ handler.endElement(XHTML, elementName, elementName);
+ }
+
+ private void startSpan(String name) throws SAXException {
+ if (name == null) {
+ return;
+ }
+
+ TextStyle style = textStyleMap.get(name);
+ if (style == null) {
+ return;
+ }
+
+ // End tags that refer to no longer valid styles
+ if (!style.underlined && lastTextStyle != null && lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (!style.italic && lastTextStyle != null && lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (!style.bold && lastTextStyle != null && lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ // Start tags for new styles
+ if (style.bold && (lastTextStyle == null || !lastTextStyle.bold)) {
+ handler.startElement(XHTML, "b", "b", EMPTY_ATTRIBUTES);
+ }
+ if (style.italic && (lastTextStyle == null || !lastTextStyle.italic)) {
+ handler.startElement(XHTML, "i", "i", EMPTY_ATTRIBUTES);
+ }
+ if (style.underlined && (lastTextStyle == null || !lastTextStyle.underlined)) {
+ handler.startElement(XHTML, "u", "u", EMPTY_ATTRIBUTES);
+ }
+
+ textStyle = style;
+ lastTextStyle = null;
+ }
+
+ private void endSpan() throws SAXException {
+ lastTextStyle = textStyle;
+ textStyle = null;
+ }
+
+ private void lazyEndSpan() throws SAXException {
+ if (lastTextStyle == null) {
+ return;
+ }
+
+ if (lastTextStyle.underlined) {
+ handler.endElement(XHTML, "u", "u");
+ }
+ if (lastTextStyle.italic) {
+ handler.endElement(XHTML, "i", "i");
+ }
+ if (lastTextStyle.bold) {
+ handler.endElement(XHTML, "b", "b");
+ }
+
+ lastTextStyle = null;
+ }
+
+ @Override
+ public void startElement(
+ String namespaceURI, String localName, String qName,
+ Attributes attrs) throws SAXException {
+ // keep track of current node type. If it is a text node,
+ // a bit at the current depth its set in textNodeStack.
+ // characters() checks the top bit to determine, if the
+ // actual node is a text node to print out nodeDepth contains
+ // the depth of the current node and also marks top of stack.
+ assert nodeDepth >= 0;
+
+ // Set styles
+ if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+ String family = attrs.getValue(STYLE_NS, "family");
+ if ("text".equals(family)) {
+ textStyle = new TextStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ textStyleMap.put(name, textStyle);
+ }
+ } else if (TEXT_NS.equals(namespaceURI) && "list-style".equals(localName)) {
+ listStyle = new ListStyle();
+ String name = attrs.getValue(STYLE_NS, "name");
+ listStyleMap.put(name, listStyle);
+ } else if (textStyle != null && STYLE_NS.equals(namespaceURI)
+ && "text-properties".equals(localName)) {
+ String fontStyle = attrs.getValue(FORMATTING_OBJECTS_NS, "font-style");
+ if ("italic".equals(fontStyle) || "oblique".equals(fontStyle)) {
+ textStyle.italic = true;
+ }
+ String fontWeight = attrs.getValue(FORMATTING_OBJECTS_NS, "font-weight");
+ if ("bold".equals(fontWeight) || "bolder".equals(fontWeight)
+ || (fontWeight != null && Character.isDigit(fontWeight.charAt(0))
+ && Integer.valueOf(fontWeight) > 500)) {
+ textStyle.bold = true;
+ }
+ String underlineStyle = attrs.getValue(STYLE_NS, "text-underline-style");
+ if (underlineStyle != null) {
+ textStyle.underlined = true;
+ }
+ } else if (listStyle != null && TEXT_NS.equals(namespaceURI)) {
+ if ("list-level-style-bullet".equals(localName)) {
+ listStyle.ordered = false;
+ } else if ("list-level-style-number".equals(localName)) {
+ listStyle.ordered = true;
+ }
+ }
+
+ textNodeStack.set(nodeDepth++,
+ isTextNode(namespaceURI, localName));
+ // filter *all* content of some tags
+ assert completelyFiltered >= 0;
+
+ if (needsCompleteFiltering(namespaceURI, localName)) {
+ completelyFiltered++;
+ }
+ // call next handler if no filtering
+ if (completelyFiltered == 0) {
+ // special handling of text:h, that are directly passed
+ // to incoming handler
+ if (TEXT_NS.equals(namespaceURI) && "h".equals(localName)) {
+ final String el = headingStack.push(getXHTMLHeaderTagName(attrs));
+ handler.startElement(XHTMLContentHandler.XHTML, el, el, EMPTY_ATTRIBUTES);
+ } else if (TEXT_NS.equals(namespaceURI) && "list".equals(localName)) {
+ startList(attrs.getValue(TEXT_NS, "style-name"));
+ } else if (TEXT_NS.equals(namespaceURI) && "span".equals(localName)) {
+ startSpan(attrs.getValue(TEXT_NS, "style-name"));
+ } else {
+ super.startElement(namespaceURI, localName, qName, attrs);
+ }
+ }
+ }
+
+        /**
+         * Handles the end of an ODF element: leaves style-definition mode for
+         * {@code style:style} and {@code text:list-style}, translates the end
+         * event unless the element lies inside completely filtered content,
+         * emits a tab character after tab elements, and finally unwinds the
+         * filter counter and the node depth.
+         */
+        @Override
+        public void endElement(
+                String namespaceURI, String localName, String qName)
+                throws SAXException {
+            final boolean inTextNs = TEXT_NS.equals(namespaceURI);
+
+            if (STYLE_NS.equals(namespaceURI) && "style".equals(localName)) {
+                textStyle = null;
+            } else if (inTextNs && "list-style".equals(localName)) {
+                listStyle = null;
+            }
+
+            // call next handler if no filtering
+            if (completelyFiltered == 0) {
+                if (inTextNs && "h".equals(localName)) {
+                    // text:h is passed directly to the incoming handler
+                    final String headingTag = headingStack.pop();
+                    handler.endElement(XHTMLContentHandler.XHTML, headingTag, headingTag);
+                } else if (inTextNs && "list".equals(localName)) {
+                    endList();
+                } else if (inTextNs && "span".equals(localName)) {
+                    endSpan();
+                } else {
+                    if (inTextNs && "p".equals(localName)) {
+                        lazyEndSpan();
+                    }
+                    super.endElement(namespaceURI, localName, qName);
+                }
+
+                // special handling of tabulators
+                final boolean tabElement = "tab-stop".equals(localName)
+                        || "tab".equals(localName);
+                if (inTextNs && tabElement) {
+                    this.characters(TAB, 0, TAB.length);
+                }
+            }
+
+            // revert filter for *all* content of some tags
+            if (needsCompleteFiltering(namespaceURI, localName)) {
+                completelyFiltered--;
+            }
+            assert completelyFiltered >= 0;
+
+            // reduce current node depth
+            nodeDepth--;
+            assert nodeDepth >= 0;
+        }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) {
+ // intentionally empty: prefix mappings are dropped here as they should not occur in XHTML
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) {
+ // intentionally empty: prefix mappings are dropped here as they should not occur in XHTML
+ }
+ }
+
+ public static final String TEXT_NS = // namespace URIs matched against incoming content.xml elements and attributes
+ "urn:oasis:names:tc:opendocument:xmlns:text:1.0";
+
+ public static final String TABLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:table:1.0";
+
+ public static final String STYLE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:style:1.0";
+
+ public static final String FORMATTING_OBJECTS_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0";
+
+ public static final String OFFICE_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
+
+ public static final String SVG_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0";
+
+ public static final String PRESENTATION_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0";
+
+ public static final String DRAW_NS =
+ "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0";
+
+ public static final String XLINK_NS = "http://www.w3.org/1999/xlink";
+
+ protected static final char[] TAB = new char[]{'\t'}; // character data emitted when a text:tab or text:tab-stop element ends
+
+ private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl(); // shared attribute-less instance for generated XHTML elements
+
+ /**
+ * Mappings between ODF tag names and XHTML tag names
+ * (including attributes). All other tag names/attributes are ignored
+ * and left out from event stream.
+ */
+ private static final HashMap<QName, TargetElement> MAPPINGS =
+ new HashMap<QName, TargetElement>();
+
+ static {
+ // general mappings of text:-tags
+ MAPPINGS.put(
+ new QName(TEXT_NS, "p"),
+ new TargetElement(XHTML, "p"));
+ // text:h-tags are mapped specifically in startElement/endElement
+ MAPPINGS.put(
+ new QName(TEXT_NS, "line-break"),
+ new TargetElement(XHTML, "br"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "list-item"),
+ new TargetElement(XHTML, "li"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "note"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(OFFICE_NS, "annotation"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(PRESENTATION_NS, "notes"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "object"),
+ new TargetElement(XHTML, "object"));
+ MAPPINGS.put(
+ new QName(DRAW_NS, "text-box"),
+ new TargetElement(XHTML, "div"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "title"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(SVG_NS, "desc"),
+ new TargetElement(XHTML, "span"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "span"),
+ new TargetElement(XHTML, "span"));
+
+ final HashMap<QName, QName> aAttsMapping =
+ new HashMap<QName, QName>();
+ aAttsMapping.put(
+ new QName(XLINK_NS, "href"),
+ new QName("href"));
+ aAttsMapping.put(
+ new QName(XLINK_NS, "title"),
+ new QName("title"));
+ MAPPINGS.put(
+ new QName(TEXT_NS, "a"),
+ new TargetElement(XHTML, "a", aAttsMapping));
+
+ // create HTML tables from table:-tags
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table"),
+ new TargetElement(XHTML, "table"));
+ // repeating of rows is ignored; for columns, see below!
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-row"),
+ new TargetElement(XHTML, "tr"));
+ // special mapping for rowspan/colspan attributes
+ final HashMap<QName, QName> tableCellAttsMapping =
+ new HashMap<QName, QName>();
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-spanned"),
+ new QName("colspan"));
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-rows-spanned"),
+ new QName("rowspan"));
+ /* TODO: The following is not correct, the cell should be repeated not spanned!
+ * Code generates a HTML cell, spanning all repeated columns, to make the cell look correct.
+ * Problems may occur when both spanning and repeating is given, which is not allowed by spec.
+ * Cell spanning instead of repeating is not a problem, because OpenOffice uses it
+ * only for empty cells.
+ */
+ tableCellAttsMapping.put(
+ new QName(TABLE_NS, "number-columns-repeated"),
+ new QName("colspan"));
+ MAPPINGS.put(
+ new QName(TABLE_NS, "table-cell"),
+ new TargetElement(XHTML, "td", tableCellAttsMapping));
+ }
+
+ /**
+ * Reports the media types handled by this parser, which is always the
+ * empty set because this is not a top-level parser.
+ *
+ * @param context the parse context (not consulted)
+ * @return an empty set
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // not a top-level parser, so nothing is advertised
+ final Set<MediaType> noTypes = Collections.emptySet();
+ return noTypes;
+ }
+
+ /**
+ * Parses the given stream, wrapping the supplied handler in an
+ * {@link XHTMLContentHandler} and delegating the actual work to
+ * {@code parseInternal}.
+ *
+ * @param stream the XML content to read
+ * @param handler receiver of the generated events
+ * @param metadata metadata passed to the XHTML handler and to {@code parseInternal}
+ * @param context the parse context, forwarded unchanged
+ * @throws IOException propagated from {@code parseInternal}
+ * @throws SAXException propagated from {@code parseInternal}
+ * @throws TikaException propagated from {@code parseInternal}
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parseInternal(stream, xhtml, metadata, context);
+ }
+
+ void parseInternal(
+ InputStream stream, final ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ DefaultHandler dh = new OpenDocumentElementMappingContentHandler(handler, MAPPINGS);
+
+ try {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setValidating(false);
+ factory.setNamespaceAware(true);
+ try {
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ } catch (SAXNotRecognizedException e) {
+ // TIKA-329: Some XML parsers do not support the secure-processing
+ // feature, even though it's required by JAXP in Java 5. Ignoring
+ // the exception is fine here, deployments without this feature
+ // are inherently vulnerable to XML denial-of-service attacks.
+ }
+ SAXParser parser = factory.newSAXParser();
+ parser.parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ new NSNormalizerContentHandler(dh)));
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("XML parser configuration error", e);
+ }
+ }
+
+}