You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/30 20:31:15 UTC
[6/7] tika git commit: TIKA 1321 initial commit
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
new file mode 100644
index 0000000..dce36a2
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.Date;
+import java.util.Map;
+
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, etc.
+ */
+
+public class XWPFDocumentXMLBodyHandler extends DefaultHandler {
+
+
+ enum EditType {
+ NONE,
+ INSERT,
+ DELETE,
+ MOVE_TO,
+ MOVE_FROM
+ }
+
+
+ private final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+ private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+ private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+ private final static char[] TAB = new char[1];
+
+ static {
+ TAB[0] = '\t';
+ }
+
+ private final XWPFBodyContentsHandler bodyContentsHandler;
+ //private final RelationshipsManager relationshipsManager;
+ private final Map<String, String> hyperlinks;
+
+ private final StringBuilder runBuffer = new StringBuilder();
+
+ private boolean inR = false;
+ private boolean inT = false;
+ private int pDepth = 0;
+ private boolean inRPr = false;
+ private boolean inNumPr = false;
+ private boolean inDelText = false;
+ private boolean inHyperlink = false;
+
+ //alternate content can be embedded in itself.
+ //need to track depth.
+ //if in alternate, choose fallback, maybe make this configurable?
+ private int inACChoiceDepth = 0;
+ private int inACFallbackDepth = 0;
+ private EditType editType = EditType.NONE;
+ private String hyperlink = null;
+
+ private XWPFRunProperties currRunProperties = new XWPFRunProperties();
+
+ public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
+ Map<String, String> hyperlinks) {
+ this.bodyContentsHandler = bodyContentsHandler;
+ this.hyperlinks = hyperlinks;
+ }
+
+
+ @Override
+ public void startDocument() throws SAXException {
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (uri != null && uri.equals(MC_NS)) {
+ if (localName.equals("Choice")) {
+ inACChoiceDepth++;
+ } else if (localName.equals("Fallback")) {
+ inACFallbackDepth++;
+ }
+ }
+
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+
+ if (uri == null || uri.equals(W_NS)) {
+ if (localName.equals("p")) {
+ bodyContentsHandler.startParagraph();
+ pDepth++;
+ } else if (localName.equals("r")) {
+ inR = true;
+ } else if (localName.equals("t")) {
+ inT = true;
+ } else if (localName.equals("tab")) {
+ runBuffer.append("\t");
+ } else if (localName.equals("tbl")) {
+ bodyContentsHandler.startTable();
+ } else if (localName.equals("tc")) {
+ bodyContentsHandler.startTableCell();
+ } else if (localName.equals("tr")) {
+ bodyContentsHandler.startTableRow();
+ } else if (localName.equals("numPr")) {
+ inNumPr = true;
+ } else if (localName.equals("rPr")) {
+ inRPr = true;
+ } else if (inR && inRPr && localName.equals("i")) {
+ //rprs don't have to be inR; ignore those that aren't
+ currRunProperties.setItalics(true);
+ } else if (inR && inRPr && localName.equals("b")) {
+ currRunProperties.setBold(true);
+ } else if (localName.equals("delText")) {
+ inDelText = true;
+ } else if (localName.equals("ins")) {
+ startEditedSection(editType.INSERT, atts);
+ } else if (localName.equals("del")) {
+ startEditedSection(editType.DELETE, atts);
+ } else if (localName.equals("moveTo")) {
+ startEditedSection(EditType.MOVE_TO, atts);
+ } else if (localName.equals("moveFrom")) {
+ startEditedSection(editType.MOVE_FROM, atts);
+ } else if (localName.equals("hyperlink")) {
+ String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+ if (hyperlinkId != null) {
+ hyperlink = hyperlinks.get(hyperlinkId);
+ }
+ inHyperlink = true;
+ } else if (localName.equals("footnoteReference")) {
+ String id = atts.getValue(W_NS, "id");
+ bodyContentsHandler.footnoteReference(id);
+ } else if (localName.equals("endnoteReference")) {
+ String id = atts.getValue(W_NS, "id");
+ bodyContentsHandler.endnoteReference(id);
+ } /*else if (localName.equals("headerReference")) {
+ //TODO
+ } else if (localName.equals("footerReference")) {
+ //TODO
+ } else if (localName.equals("commentRangeEnd")) {
+ //TODO
+ }*/
+ }
+ }
+
+ private void startEditedSection(EditType editType, Attributes atts) {
+ String editAuthor = atts.getValue(W_NS, "author");
+ String editDateString = atts.getValue(W_NS, "date");
+ Date editDate = null;
+ if (editDateString != null) {
+ editDate = DateUtils.tryToParse(editDateString);
+ }
+ bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
+ this.editType = editType;
+ }
+
+ private int getIntVal(Attributes atts) {
+ String valString = atts.getValue(W_NS, "val");
+ if (valString != null) {
+ try {
+ return Integer.parseInt(valString);
+ } catch (NumberFormatException e) {
+ //swallow
+ }
+ }
+ return -1;
+ }
+
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (uri.equals(MC_NS)) {
+ if (localName.equals("Choice")) {
+ inACChoiceDepth--;
+ } else if (localName.equals("Fallback")) {
+ inACFallbackDepth--;
+ }
+ }
+ if (uri == null || uri.equals(W_NS)) {
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+
+
+ if (localName.equals("r") && !inHyperlink) {
+ bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+ inR = false;
+ runBuffer.setLength(0);
+ currRunProperties.setBold(false);
+ currRunProperties.setItalics(false);
+ } else if (localName.equals("p")) {
+ bodyContentsHandler.endParagraph();
+ pDepth--;
+ } else if (localName.equals("t")) {
+ inT = false;
+ } else if (localName.equals("tbl")) {
+ bodyContentsHandler.endTable();
+ } else if (localName.equals("tc")) {
+ bodyContentsHandler.endTableCell();
+ } else if (localName.equals("tr")) {
+ bodyContentsHandler.endTableRow();
+ } else if (localName.equals("rPr")) {
+ inRPr = false;
+ } else if (localName.equals("delText")) {
+ inDelText = false;
+ } else if (localName.equals("ins") || localName.equals("del") ||
+ localName.equals("moveTo") || localName.equals("moveFrom")) {
+ editType = EditType.NONE;
+ } else if (localName.equals("hyperlink")) {
+ if (hyperlink != null) {
+ bodyContentsHandler.hyperlinkRun(hyperlink, runBuffer.toString());
+ } else {
+ bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+ }
+ runBuffer.setLength(0);
+ inHyperlink = false;
+ }
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+ if (editType.equals(EditType.MOVE_FROM) && inT) {
+ if (bodyContentsHandler.getIncludeMoveFromText()) {
+ runBuffer.append(ch, start, length);
+ }
+ } else if (inT) {
+ runBuffer.append(ch, start, length);
+ } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+
+ if (inT) {
+ runBuffer.append(ch, start, length);
+ } else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+
+ public interface XWPFBodyContentsHandler {
+
+ void run(XWPFRunProperties runProperties, String contents);
+
+ void hyperlinkRun(String link, String text);
+
+ void startParagraph();
+
+ void endParagraph();
+
+ void startTable();
+
+ void endTable();
+
+ void startTableRow();
+
+ void endTableRow();
+
+ void startTableCell();
+
+ void endTableCell();
+
+ void startSDT();
+
+ void endSDT();
+
+ void startEditedSection(String editor, Date date, EditType editType);
+
+ void endEditedSection();
+
+ boolean getIncludeDeletedText();
+
+ void footnoteReference(String id);
+
+ void endnoteReference(String id);
+
+ boolean getIncludeMoveFromText();
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
new file mode 100644
index 0000000..06ef951
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+import javax.xml.parsers.ParserConfigurationException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
+import org.apache.poi.util.SAXHelper;
+import org.apache.poi.xwpf.usermodel.XWPFNumbering;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.XMLReader;
+
+//TODO: move this into POI?
+/**
+ * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
+ *
+ */
+public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
+
+ private OPCPackage container;
+ private POIXMLProperties properties;
+
+ public XWPFEventBasedWordExtractor(String path) throws XmlException, OpenXML4JException, IOException {
+ this(OPCPackage.open(path));
+ }
+
+ public XWPFEventBasedWordExtractor(OPCPackage container) throws XmlException, OpenXML4JException, IOException {
+ super((POIXMLDocument) null);
+ this.container = container;
+ this.properties = new POIXMLProperties(container);
+ }
+
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 1) {
+ System.err.println("Use:");
+ System.err.println(" XWPFEventBasedWordExtractor <filename.xlsx>");
+ System.exit(1);
+ }
+
+ XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
+ System.out.println(extractor.getText());
+ extractor.close();
+ }
+
+ public OPCPackage getPackage() {
+ return this.container;
+ }
+
+ public POIXMLProperties.CoreProperties getCoreProperties() {
+ return this.properties.getCoreProperties();
+ }
+
+ public POIXMLProperties.ExtendedProperties getExtendedProperties() {
+ return this.properties.getExtendedProperties();
+ }
+
+ public POIXMLProperties.CustomProperties getCustomProperties() {
+ return this.properties.getCustomProperties();
+ }
+
+
+ @Override
+ public String getText() {
+ StringBuilder sb = new StringBuilder();
+ //handle main document
+ List<PackagePart> pps = container.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
+ if (pps != null) {
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ try {
+ handleDocumentPart(pp, sb);
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (SAXException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ //handle glossary document
+ pps = container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
+
+ if (pps != null) {
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ try {
+ handleDocumentPart(pp, sb);
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (SAXException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ return sb.toString();
+ }
+
+
+ private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
+ //load the numbering/list manager and styles from the main document part
+ XWPFNumbering numbering = loadNumbering(documentPart);
+ XWPFListManager xwpfListManager = new XWPFListManager(numbering);
+ //TODO: XWPFStyles styles = loadStyles(documentPart);
+
+ //headers
+ try {
+ PackageRelationshipCollection headersPRC = documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
+ if (headersPRC != null) {
+ for (int i = 0; i < headersPRC.size(); i++) {
+ PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i));
+ handlePart(header, xwpfListManager, sb);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ //swallow
+ }
+
+ //main document
+ handlePart(documentPart, xwpfListManager, sb);
+
+ //for now, just dump other components at end
+ for (XWPFRelation rel : new XWPFRelation[]{
+ XWPFRelation.FOOTNOTE,
+ XWPFRelation.COMMENT,
+ XWPFRelation.FOOTER,
+ XWPFRelation.ENDNOTE
+ }) {
+ try {
+ PackageRelationshipCollection prc = documentPart.getRelationshipsByType(rel.getRelation());
+ if (prc != null) {
+ for (int i = 0; i < prc.size(); i++) {
+ PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
+ handlePart(packagePart, xwpfListManager, sb);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ //swallow
+ }
+ }
+ }
+
+ private void handlePart(PackagePart packagePart,
+ XWPFListManager xwpfListManager, StringBuilder buffer) throws IOException, SAXException {
+
+ Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
+ try (InputStream stream = packagePart.getInputStream()) {
+ XMLReader reader = SAXHelper.newXMLReader();
+ reader.setContentHandler(new XWPFDocumentXMLBodyHandler(
+ new XWPFToTextContentHandler(buffer), hyperlinks));
+ reader.parse(new InputSource(new CloseShieldInputStream(stream)));
+
+ } catch (ParserConfigurationException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
+ Map<String, String> hyperlinks = new HashMap<>();
+ try {
+ PackageRelationshipCollection prc = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
+ for (int i = 0; i < prc.size(); i++) {
+ PackageRelationship pr = prc.getRelationship(i);
+ if (pr == null) {
+ continue;
+ }
+ String id = pr.getId();
+ String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
+ if (id != null && url != null) {
+ hyperlinks.put(id, url);
+ }
+ }
+ } catch (InvalidFormatException e) {
+ }
+ return hyperlinks;
+ }
+/*
+ private XWPFStyles loadStyles(PackagePart packagePart) {
+ try {
+ PackageRelationshipCollection stylesParts =
+ packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
+ if (stylesParts.size() > 0) {
+ PackageRelationship stylesRelationShip = stylesParts.getRelationship(0);
+ if (stylesRelationShip == null) {
+ return null;
+ }
+ PackagePart stylesPart = opcPackage.getPart(stylesRelationShip);
+ if (stylesPart == null) {
+ return null;
+ }
+ return new XWPFStyles(stylesPart);
+ }
+ } catch (IOException|OpenXML4JException e) {
+ //swallow
+ }
+ return null;
+
+ }
+*/
+ private XWPFNumbering loadNumbering(PackagePart packagePart) {
+ try {
+ PackageRelationshipCollection numberingParts = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
+ if (numberingParts.size() > 0) {
+ PackageRelationship numberingRelationShip = numberingParts.getRelationship(0);
+ if (numberingRelationShip == null) {
+ return null;
+ }
+ PackagePart numberingPart = container.getPart(numberingRelationShip);
+ if (numberingPart == null) {
+ return null;
+ }
+ return new XWPFNumbering(numberingPart);
+ }
+ } catch (IOException | OpenXML4JException e) {
+ //swallow
+ }
+ return null;
+ }
+
+ private class XWPFToTextContentHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+ private final StringBuilder buffer;
+
+ public XWPFToTextContentHandler(StringBuilder buffer) {
+ this.buffer = buffer;
+ }
+
+ @Override
+ public void run(XWPFRunProperties runProperties, String contents) {
+ buffer.append(contents);
+ }
+
+ @Override
+ public void hyperlinkRun(String link, String text) {
+ buffer.append(" (").append(text).append(") ");
+ }
+
+ @Override
+ public void startParagraph() {
+ //no-op
+ }
+
+ @Override
+ public void endParagraph() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTable() {
+
+ }
+
+ @Override
+ public void endTable() {
+
+ }
+
+ @Override
+ public void startTableRow() {
+
+ }
+
+ @Override
+ public void endTableRow() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startTableCell() {
+
+ }
+
+ @Override
+ public void endTableCell() {
+ buffer.append("\t");
+ }
+
+ @Override
+ public void startSDT() {
+
+ }
+
+ @Override
+ public void endSDT() {
+ buffer.append("\n");
+ }
+
+ @Override
+ public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+
+ }
+
+ @Override
+ public void endEditedSection() {
+
+ }
+
+ @Override
+ public boolean getIncludeDeletedText() {
+ return true;
+ }
+
+ @Override
+ public void footnoteReference(String id) {
+
+ }
+
+ @Override
+ public void endnoteReference(String id) {
+
+ }
+
+ @Override
+ public boolean getIncludeMoveFromText() {
+ return false;
+ }
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
new file mode 100644
index 0000000..ad2d656
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFRunProperties.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+/**
+ * Formatting flags (bold, italics) of a single run.
+ * <p>
+ * WARNING: This class is mutable. Make a copy of it
+ * if you want persistence!
+ */
+
+class XWPFRunProperties {
+    //both flags start out unset
+    boolean italics;
+    boolean bold;
+
+    /**
+     * @return whether the run is bold
+     */
+    public boolean getBold() {
+        return this.bold;
+    }
+
+    /**
+     * @param bold whether the run is bold
+     */
+    public void setBold(boolean bold) {
+        this.bold = bold;
+    }
+
+    /**
+     * @return whether the run is in italics
+     */
+    public boolean getItalics() {
+        return this.italics;
+    }
+
+    /**
+     * @param italics whether the run is in italics
+     */
+    public void setItalics(boolean italics) {
+        this.italics = italics;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
new file mode 100644
index 0000000..2f27739
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf;
+
+
+import java.util.Date;
+
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Writes the body content reported by {@link XWPFDocumentXMLBodyHandler}
+ * to an {@link XHTMLContentHandler}.
+ * <p>
+ * The callback interface declares no checked exceptions, so every
+ * {@link SAXException} raised by the XHTML handler is dropped here.
+ */
+public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+
+    private final static char[] NEWLINE = new char[]{'\n'};
+    private final static char[] TAB = new char[]{'\t'};
+
+    private final XHTMLContentHandler xhtml;
+    private final XWPFListManager listManager;
+    private final boolean includeDeletedText;
+    private final boolean includeMoveFromText;
+
+    private int pDepth = 0; //paragraph depth
+    private boolean isItalics = false;
+    private boolean isBold = false;
+
+    /**
+     * @param xhtml        receives the generated XHTML events
+     * @param listManager  list/numbering state; stored but not read in this class
+     * @param parserConfig supplies the include-deleted and include-move-from settings
+     */
+    public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+        this.includeMoveFromText = parserConfig.getIncludeMoveFromContent();
+        this.includeDeletedText = parserConfig.getIncludeDeletedContent();
+        this.listManager = listManager;
+        this.xhtml = xhtml;
+    }
+
+    /**
+     * Writes the run text, wrapped in b and/or i elements as the properties require.
+     */
+    @Override
+    public void run(XWPFRunProperties runProperties, String contents) {
+        //TODO: smooth out bold/italics to handle only changes
+        //If two runs are bold, only add <b> at beginning and end of the run pair
+        final boolean bold = runProperties.getBold();
+        final boolean italics = runProperties.getItalics();
+        try {
+            if (bold) {
+                xhtml.startElement("b");
+            }
+            if (italics) {
+                xhtml.startElement("i");
+            }
+
+            xhtml.characters(contents);
+
+            if (italics) {
+                xhtml.endElement("i");
+            }
+            if (bold) {
+                xhtml.endElement("b");
+            }
+        } catch (SAXException ignored) {
+            //dropped, see class comment
+        }
+    }
+
+    /**
+     * Writes the text inside an anchor element; plain text if the link is null.
+     */
+    @Override
+    public void hyperlinkRun(String link, String text) {
+        final boolean hasTarget = link != null;
+        try {
+            if (hasTarget) {
+                xhtml.startElement("a", "href", link);
+            }
+            xhtml.characters(text);
+            if (hasTarget) {
+                xhtml.endElement("a");
+            }
+        } catch (SAXException ignored) {
+            //dropped, see class comment
+        }
+    }
+
+    /**
+     * Opens a p element for an outermost paragraph only.
+     */
+    @Override
+    public void startParagraph() {
+        if (pDepth == 0) {
+            startQuietly("p");
+        }
+        pDepth++;
+    }
+
+    /**
+     * Closes the p element of an outermost paragraph; at any other depth
+     * a newline is written instead.
+     */
+    @Override
+    public void endParagraph() {
+        if (pDepth == 1) {
+            endQuietly("p");
+        } else {
+            try {
+                xhtml.characters(NEWLINE, 0, 1);
+            } catch (SAXException ignored) {
+                //dropped, see class comment
+            }
+        }
+        pDepth--;
+    }
+
+    @Override
+    public void startTable() {
+        startQuietly("table");
+    }
+
+    @Override
+    public void endTable() {
+        endQuietly("table");
+    }
+
+    @Override
+    public void startTableRow() {
+        startQuietly("tr");
+    }
+
+    @Override
+    public void endTableRow() {
+        endQuietly("tr");
+    }
+
+    @Override
+    public void startTableCell() {
+        startQuietly("td");
+    }
+
+    @Override
+    public void endTableCell() {
+        endQuietly("td");
+    }
+
+    @Override
+    public void startSDT() {
+        //no-op
+    }
+
+    @Override
+    public void endSDT() {
+        //no-op
+    }
+
+    @Override
+    public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+        //no-op
+    }
+
+    @Override
+    public void endEditedSection() {
+        //no-op
+    }
+
+    @Override
+    public boolean getIncludeDeletedText() {
+        return includeDeletedText;
+    }
+
+    /**
+     * Writes the id in square brackets; nothing for a null id.
+     */
+    @Override
+    public void footnoteReference(String id) {
+        writeBracketedId(id);
+    }
+
+    /**
+     * Writes the id in square brackets; nothing for a null id.
+     */
+    @Override
+    public void endnoteReference(String id) {
+        writeBracketedId(id);
+    }
+
+    @Override
+    public boolean getIncludeMoveFromText() {
+        return includeMoveFromText;
+    }
+
+    //opens the named element, dropping any SAXException
+    private void startQuietly(String elementName) {
+        try {
+            xhtml.startElement(elementName);
+        } catch (SAXException ignored) {
+            //dropped, see class comment
+        }
+    }
+
+    //closes the named element, dropping any SAXException
+    private void endQuietly(String elementName) {
+        try {
+            xhtml.endElement(elementName);
+        } catch (SAXException ignored) {
+            //dropped, see class comment
+        }
+    }
+
+    //writes "[id]"; the first failure stops the remaining writes
+    private void writeBracketedId(String id) {
+        if (id == null) {
+            return;
+        }
+        try {
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
+        } catch (SAXException ignored) {
+            //dropped, see class comment
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
new file mode 100644
index 0000000..9aa5471
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/AbstractPartHandler.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Base class for part handlers: a SAX {@link DefaultHandler} that carries
+ * the name of the part it handles.
+ */
+abstract class AbstractPartHandler extends DefaultHandler implements PartHandler {
+
+    //set through setName(); null until then
+    private String name;
+
+    /**
+     * @return the content type this handler is responsible for
+     */
+    public abstract String getContentType();
+
+    /**
+     * @return the name set via {@link #setName(String)}, or null if none was set
+     */
+    public String getName() {
+        return this.name;
+    }
+
+    /**
+     * @param name name of the part
+     */
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Override this to flush buffers, etc if necessary.
+     * The default implementation does nothing.
+     */
+    public void endPart() throws SAXException, TikaException {
+        //no-op
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
new file mode 100644
index 0000000..4a13799
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Part handler that buffers the character content of
+ * {@code binaryData} elements (in the {@link Word2006MLDocHandler#PKG_NS}
+ * namespace), and when the part ends base64-decodes the buffer and passes
+ * the bytes to the {@link EmbeddedDocumentExtractor} obtained from the
+ * {@link ParseContext}.
+ * <p>
+ * The per-part state (text buffer and "inside binaryData" flag) is always
+ * reset at the end of a part, even when decoding or embedded parsing fails,
+ * so that a reused instance never carries data from one part into the next.
+ */
+class BinaryDataHandler extends AbstractPartHandler {
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+    private final ParseContext parseContext;
+
+    /** True while the SAX cursor is between the start and end of a binaryData element. */
+    private boolean inBinaryData = false;
+    /** Base64 text accumulated for the current part. */
+    private StringBuilder buffer = new StringBuilder();
+
+    final Base64 base64 = new Base64();
+
+
+    /**
+     * @param handler  handler that receives the output of embedded-document parsing
+     * @param metadata metadata of the containing document (stored, not read by this class)
+     * @param context  parse context used to look up the embedded document extractor
+     */
+    public BinaryDataHandler(XHTMLContentHandler handler, Metadata metadata, ParseContext context) {
+        this.handler = handler;
+        this.metadata = metadata;
+        this.parseContext = context;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // nothing to do
+    }
+
+    /**
+     * Decodes whatever was buffered for this part and sends it to the
+     * embedded document extractor. Does nothing beyond a state reset if
+     * no data was buffered.
+     *
+     * @throws SAXException  if the embedded parse reports a SAX problem
+     * @throws TikaException if an {@link IOException} occurs while handling the stream
+     */
+    @Override
+    public void endPart() throws SAXException, TikaException {
+        try {
+            if (hasData()) {
+                EmbeddedDocumentExtractor extractor =
+                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
+                Metadata embeddedMetadata = new Metadata();
+                try (TikaInputStream stream = TikaInputStream.get(getInputStream())) {
+                    extractor.parseEmbedded(stream, handler, embeddedMetadata, false);
+                } catch (IOException e) {
+                    throw new TikaException("error in finishing part", e);
+                }
+            }
+        } finally {
+            // Reset unconditionally: previously the buffer survived a failed
+            // parse and the flag was never cleared at part boundaries.
+            buffer.setLength(0);
+            inBinaryData = false;
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (isBinaryData(uri, localName)) {
+            inBinaryData = true;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        if (isBinaryData(uri, localName)) {
+            inBinaryData = false;
+        }
+    }
+
+    /** Buffers text only while inside a binaryData element; everything else is dropped. */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if (inBinaryData) {
+            buffer.append(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        // whitespace is irrelevant to the base64 payload
+    }
+
+    /**
+     * @return the empty string; this handler is not registered under a content type
+     */
+    @Override
+    public String getContentType() {
+        return "";
+    }
+
+    /**
+     * @return {@code true} if any text has been buffered for the current part
+     */
+    boolean hasData() {
+        return buffer.length() > 0;
+    }
+
+    /** True for the binaryData element of the package namespace. */
+    private static boolean isBinaryData(String uri, String localName) {
+        return uri.equals(Word2006MLDocHandler.PKG_NS) && localName.equals("binaryData");
+    }
+
+    /** Base64-decodes the buffered text and exposes the bytes as a stream. */
+    private InputStream getInputStream() {
+        byte[] bytes = base64.decode(buffer.toString());
+        return new ByteArrayInputStream(bytes);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
new file mode 100644
index 0000000..4d04b2b
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Thin adapter that lets {@link XWPFDocumentXMLBodyHandler} take part in the
+ * per-part dispatch of the inline parsing scheme by adding the
+ * {@link PartHandler} bookkeeping: a part name and a fixed content type.
+ */
+class BodyPartHandler extends XWPFDocumentXMLBodyHandler implements PartHandler {
+
+    /** Content type supplied at construction time. */
+    private final String contentType;
+    /** Name last handed to {@link #setName(String)}; {@code null} until then. */
+    private String name;
+
+    /**
+     * @param contentType          value returned by {@link #getContentType()}
+     * @param xhtml                handler wrapped in the {@link XWPFTikaBodyPartHandler}
+     *                             that is passed to the superclass
+     * @param relationshipsManager accepted but not used by this constructor
+     * @param officeParserConfig   configuration passed to the {@link XWPFTikaBodyPartHandler}
+     */
+    public BodyPartHandler(String contentType, XHTMLContentHandler xhtml,
+                           RelationshipsManager relationshipsManager,
+                           OfficeParserConfig officeParserConfig) {
+        super(new XWPFTikaBodyPartHandler(xhtml, null, officeParserConfig),
+                new HashMap<String, String>());
+        this.contentType = contentType;
+    }
+
+    /**
+     * @return the content type given to the constructor
+     */
+    @Override
+    public String getContentType() {
+        return this.contentType;
+    }
+
+    /**
+     * @return the most recently set part name, or {@code null} if none was set
+     */
+    @Override
+    public String getName() {
+        return this.name;
+    }
+
+    /**
+     * @param name part name to remember
+     */
+    @Override
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * Nothing is flushed at the end of a part.
+     */
+    @Override
+    public void endPart() throws SAXException, TikaException {
+        // intentionally empty
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
new file mode 100644
index 0000000..c746e5c
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/CorePropertiesHandler.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Part handler for the core-properties part: maps the text content of known
+ * elements (looked up by namespace URI and local name) onto Tika metadata
+ * {@link Property} keys and stores the values in the supplied {@link Metadata}.
+ * <p>
+ * Subclasses change the set of recognised elements by overriding
+ * {@link #addProperties()}, which the constructor calls.
+ */
+class CorePropertiesHandler extends AbstractPartHandler {
+
+    // Namespace keys are stored WITHOUT a trailing slash; getProperty() strips
+    // a trailing slash from incoming URIs before the lookup.
+    final static String DC_NS = "http://purl.org/dc/elements/1.1";
+    final static String DC_TERMS_NS = "http://purl.org/dc/terms";
+    final static String CP_NS = "http://schemas.openxmlformats.org/package/2006/metadata/core-properties";
+
+    private final Metadata metadata;
+
+    /** Text collected for the element currently being read. */
+    final StringBuilder buffer = new StringBuilder();
+    /** namespace URI (no trailing slash) -> local element name -> metadata property. */
+    final Map<String, Map<String, Property>> properties = new HashMap<>();
+
+    /**
+     * @param metadata metadata object that receives the extracted values
+     */
+    public CorePropertiesHandler(Metadata metadata) {
+        this.metadata = metadata;
+        addProperties();
+    }
+
+    /**
+     * Registers the element-to-property mappings. Called from the constructor,
+     * so overriding implementations must not rely on subclass instance state.
+     */
+    void addProperties() {
+        Map<String, Property> dc = mappingsFor(DC_NS);
+        dc.put("creator", TikaCoreProperties.CREATOR);
+        dc.put("title", TikaCoreProperties.TITLE);
+        dc.put("description", TikaCoreProperties.DESCRIPTION);
+
+        Map<String, Property> dcTerms = mappingsFor(DC_TERMS_NS);
+        dcTerms.put("created", TikaCoreProperties.CREATED);
+        dcTerms.put("modified", TikaCoreProperties.MODIFIED);
+
+        Map<String, Property> cp = mappingsFor(CP_NS);
+        cp.put("category", OfficeOpenXMLCore.CATEGORY);
+        cp.put("contentStatus", OfficeOpenXMLCore.CONTENT_STATUS);
+        cp.put("lastModifiedBy", TikaCoreProperties.MODIFIER);
+        cp.put("lastPrinted", OfficeOpenXMLCore.LAST_PRINTED);
+        cp.put("revision", OfficeOpenXMLCore.REVISION);
+        cp.put("subject", OfficeOpenXMLCore.SUBJECT);
+        cp.put("version", OfficeOpenXMLCore.VERSION);
+    }
+
+    /** Returns the mapping table for a namespace, creating and registering it if absent. */
+    private Map<String, Property> mappingsFor(String namespace) {
+        Map<String, Property> table = properties.get(namespace);
+        if (table == null) {
+            table = new HashMap<>();
+            properties.put(namespace, table);
+        }
+        return table;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        buffer.setLength(0);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        // prefixes are irrelevant; lookups use the namespace URI
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        // see startPrefixMapping
+    }
+
+    /**
+     * Starts a fresh value: any text seen before this start tag (typically
+     * indentation between elements) belongs to the parent and must not be
+     * prepended to this element's value.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        buffer.setLength(0);
+    }
+
+    /**
+     * If the closing element is a known property, stores the buffered text
+     * in the metadata ({@code add} for multi-valued properties, {@code set}
+     * otherwise). The buffer is cleared in every case.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        Property prop = getProperty(uri, localName);
+        if (prop != null) {
+            String value = buffer.toString();
+            if (prop.isMultiValuePermitted()) {
+                metadata.add(prop, value);
+            } else {
+                metadata.set(prop, value);
+            }
+        }
+        buffer.setLength(0);
+    }
+
+    /**
+     * Looks up the property registered for an element.
+     *
+     * @param uri       namespace URI of the element; one trailing slash is ignored
+     * @param localName local name of the element
+     * @return the mapped property, or {@code null} if the element is not recognised
+     */
+    private Property getProperty(String uri, String localName) {
+        String key = uri.endsWith("/") ? uri.substring(0, uri.length() - 1) : uri;
+        Map<String, Property> table = properties.get(key);
+        return table == null ? null : table.get(localName);
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        buffer.append(ch, start, length);
+    }
+
+    /**
+     * @return {@link ContentTypes#CORE_PROPERTIES_PART}
+     */
+    @Override
+    public String getContentType() {
+        return ContentTypes.CORE_PROPERTIES_PART;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
new file mode 100644
index 0000000..74238a6
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/ExtendedPropertiesHandler.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Variant of {@link CorePropertiesHandler} for the extended-properties part.
+ * It replaces the inherited element mappings with those of the
+ * extended-properties namespace; because {@link #addProperties()} does not
+ * call the superclass version, only the mappings listed here are registered.
+ */
+class ExtendedPropertiesHandler extends CorePropertiesHandler {
+
+    final static String EP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties";
+
+    /**
+     * @param metadata metadata object that receives the extracted values
+     */
+    public ExtendedPropertiesHandler(Metadata metadata) {
+        super(metadata);
+    }
+
+    /**
+     * Registers the extended-properties element names under {@link #EP_NS}.
+     */
+    @Override
+    void addProperties() {
+        Map<String, Property> extended = properties.get(EP_NS);
+        if (extended == null) {
+            extended = new HashMap<>();
+            properties.put(EP_NS, extended);
+        }
+
+        // application / document descriptors
+        extended.put("AppVersion", OfficeOpenXMLExtended.APP_VERSION);
+        extended.put("Application", OfficeOpenXMLExtended.APPLICATION);
+        extended.put("Comments", OfficeOpenXMLExtended.COMMENTS);
+        extended.put("Company", OfficeOpenXMLExtended.COMPANY);
+        extended.put("DocSecurity", OfficeOpenXMLExtended.DOC_SECURITY);
+        extended.put("HiddenSlides", OfficeOpenXMLExtended.HIDDEN_SLIDES);
+        extended.put("Manager", OfficeOpenXMLExtended.MANAGER);
+        extended.put("Notes", OfficeOpenXMLExtended.NOTES);
+        extended.put("PresentationFormat", OfficeOpenXMLExtended.PRESENTATION_FORMAT);
+        extended.put("Template", OfficeOpenXMLExtended.TEMPLATE);
+        extended.put("TotalTime", OfficeOpenXMLExtended.TOTAL_TIME);
+
+        // document statistics
+        extended.put("Pages", Office.PAGE_COUNT);
+        extended.put("Words", Office.WORD_COUNT);
+        extended.put("Characters", Office.CHARACTER_COUNT);
+        extended.put("CharactersWithSpaces", Office.CHARACTER_COUNT_WITH_SPACES);
+        extended.put("Paragraphs", Office.PARAGRAPH_COUNT);
+        extended.put("Lines", Office.LINE_COUNT);
+    }
+
+    /**
+     * @return the content type of the extended-properties part
+     */
+    @Override
+    public String getContentType() {
+        return "application/vnd.openxmlformats-officedocument.extended-properties+xml";
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
new file mode 100644
index 0000000..fee64de
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/PartHandler.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A SAX {@link ContentHandler} that processes one kind of part, identified
+ * by its content type, and that can be told the name of the part it is
+ * currently handling.
+ */
+interface PartHandler extends ContentHandler {
+
+    /**
+     * @return the content type this handler is responsible for
+     */
+    String getContentType();
+
+    /**
+     * @return the name most recently supplied via {@link #setName(String)}
+     */
+    String getName();
+
+    /**
+     * @param name name of the part about to be handled
+     */
+    void setName(String name);
+
+    /**
+     * Called when the part is finished; implementations flush buffers
+     * or do other post-processing here if they need to.
+     *
+     * @throws SAXException  if writing output fails
+     * @throws TikaException if post-processing of the part fails
+     */
+    void endPart() throws SAXException, TikaException;
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
new file mode 100644
index 0000000..eccb1bf
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Relationship.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Immutable value holder for a single relationship entry: its content type,
+ * its target and, optionally, its {@link TargetMode}.
+ */
+class Relationship {
+
+    private final String contentType;
+    private final String target;
+    private final TargetMode targetMode;
+
+    /**
+     * Creates a relationship whose target mode is {@code null}.
+     *
+     * @param contentType content type of the relationship
+     * @param target      relationship target
+     */
+    public Relationship(String contentType, String target) {
+        this.contentType = contentType;
+        this.target = target;
+        this.targetMode = null;
+    }
+
+    /**
+     * @param contentType content type of the relationship
+     * @param target      relationship target
+     * @param targetMode  target mode; may be {@code null}
+     */
+    public Relationship(String contentType, String target, TargetMode targetMode) {
+        this.contentType = contentType;
+        this.target = target;
+        this.targetMode = targetMode;
+    }
+
+    /**
+     * @return the content type given at construction
+     */
+    public String getContentType() {
+        return this.contentType;
+    }
+
+    /**
+     * @return the target given at construction
+     */
+    public String getTarget() {
+        return this.target;
+    }
+
+    /**
+     * @return the target mode given at construction, or {@code null} if the
+     * two-argument constructor was used
+     */
+    public TargetMode getTargetMode() {
+        return this.targetMode;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
new file mode 100644
index 0000000..670ffab
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsHandler.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import org.apache.poi.openxml4j.opc.ContentTypes;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+/**
+ * Part handler for relationships parts: every {@code Relationship} element in
+ * the relationships namespace is recorded in the shared
+ * {@link RelationshipsManager}, keyed by the name of the part being read.
+ */
+class RelationshipsHandler extends AbstractPartHandler {
+
+    final static String REL_NS = "http://schemas.openxmlformats.org/package/2006/relationships";
+
+    private final RelationshipsManager relationshipsManager;
+
+    /**
+     * @param relationshipsManager registry that receives the relationships found
+     */
+    public RelationshipsHandler(RelationshipsManager relationshipsManager) {
+        this.relationshipsManager = relationshipsManager;
+    }
+
+
+    @Override
+    public void startDocument() throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        // nothing to do
+    }
+
+    /**
+     * Registers a relationship for each {@code Relationship} element. Its
+     * {@code Id}, {@code Type}, {@code Target} and {@code TargetMode}
+     * attributes are read from the empty namespace.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        if (!uri.equals(REL_NS) || !localName.equals("Relationship")) {
+            return;
+        }
+        String id = atts.getValue("", "Id");
+        String type = atts.getValue("", "Type");
+        String target = atts.getValue("", "Target");
+        String targetModeString = atts.getValue("", "TargetMode");
+        // The attribute is written as "External" in package markup; the former
+        // exact match against "EXTERNAL" never succeeded for that spelling.
+        // A missing or different value is treated as INTERNAL.
+        TargetMode targetMode = "External".equalsIgnoreCase(targetModeString)
+                ? TargetMode.EXTERNAL
+                : TargetMode.INTERNAL;
+        relationshipsManager.addRelationship(getName(), id, type, target, targetMode);
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        // nothing to do
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        // relationships carry no text content of interest
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        // nothing to do
+    }
+
+    /**
+     * @return {@link ContentTypes#RELATIONSHIPS_PART}
+     */
+    @Override
+    public String getContentType() {
+        return ContentTypes.RELATIONSHIPS_PART;
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
new file mode 100644
index 0000000..5773fbb
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/RelationshipsManager.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.openxml4j.opc.TargetMode;
+
+/**
+ * Registry of relationships, grouped by the name of the part they belong to.
+ * The owning part name is derived from the name of the rels file in which
+ * the relationship was declared.
+ */
+class RelationshipsManager {
+
+    /** owning part name -> relationship id -> relationship. */
+    Map<String, Map<String, Relationship>> map = new HashMap<>();
+
+    /**
+     * Stores a relationship under the part that owns the given rels file.
+     * An existing relationship with the same id for that part is replaced.
+     *
+     * @param relsFileName name of the rels file declaring the relationship
+     * @param id           relationship id
+     * @param type         relationship type, stored as the relationship's content type
+     * @param target       relationship target
+     * @param targetMode   relationship target mode
+     */
+    public void addRelationship(String relsFileName, String id, String type, String target, TargetMode targetMode) {
+        String packageName = convertRelsFileNameToPackageName(relsFileName);
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        if (thisPackageRels == null) {
+            thisPackageRels = new HashMap<>();
+            map.put(packageName, thisPackageRels);
+        }
+        thisPackageRels.put(id, new Relationship(type, target, targetMode));
+    }
+
+    /**
+     * @param packageName name of the owning part
+     * @param id          relationship id
+     * @return the stored relationship, or {@code null} if the part or the id is unknown
+     */
+    public Relationship getRelationship(String packageName, String id) {
+        Map<String, Relationship> thisPackageRels = map.get(packageName);
+        return thisPackageRels == null ? null : thisPackageRels.get(id);
+    }
+
+    /**
+     * Derives the owning part name from a rels file name: the root rels file
+     * {@code /_rels/.rels} maps to {@code /}; otherwise the first
+     * {@code /_rels/} segment is collapsed to {@code /} and a trailing
+     * {@code .rels} suffix is removed.
+     *
+     * @param relsFileName rels file name; {@code null} is passed through
+     * @return the owning part name, or {@code null} for a {@code null} input
+     */
+    private String convertRelsFileNameToPackageName(String relsFileName) {
+        if (relsFileName == null) {
+            // a part without a name attribute must not abort the parse with an NPE
+            return null;
+        }
+        if ("/_rels/.rels".equals(relsFileName)) {
+            return "/";
+        }
+
+        String tmp = relsFileName.replaceFirst("/_rels/", "/");
+        // The dot is escaped so only a literal ".rels" suffix is removed;
+        // unescaped it matched any character followed by "rels".
+        return tmp.replaceFirst("\\.rels\\Z", "");
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
new file mode 100644
index 0000000..4276671
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+class Word2006MLDocHandler extends DefaultHandler {
+
+ final static String PKG_NS = "http://schemas.microsoft.com/office/2006/xmlPackage";
+
+
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+ private final ParseContext parseContext;
+
+ private final Map<String, PartHandler> partHandlers = new HashMap<>();
+ private final BinaryDataHandler binaryDataHandler;
+ private final RelationshipsManager relationshipsManager = new RelationshipsManager();
+ private PartHandler currentPartHandler = null;
+
+ public Word2006MLDocHandler(XHTMLContentHandler xhtml, Metadata metadata,
+ ParseContext context) {
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+ this.parseContext = context;
+ OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class);
+
+ addPartHandler(new RelationshipsHandler(relationshipsManager));
+
+ addPartHandler(new BodyPartHandler(
+ XWPFRelation.DOCUMENT.getContentType(),
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ XWPFRelation.FOOTNOTE.getContentType(),
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ XWPFRelation.HEADER.getContentType(),
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ XWPFRelation.FOOTER.getContentType(),
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
+ xhtml, relationshipsManager, officeParserConfig));
+
+
+ addPartHandler(new BodyPartHandler(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new BodyPartHandler(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
+ xhtml, relationshipsManager, officeParserConfig));
+
+ addPartHandler(new CorePropertiesHandler(metadata));
+ addPartHandler(new ExtendedPropertiesHandler(metadata));
+ binaryDataHandler = new BinaryDataHandler(xhtml, metadata, context);
+ }
+
+ private void addPartHandler(PartHandler partHandler) {
+ partHandlers.put(partHandler.getContentType(), partHandler);
+ }
+
+
+ @Override
+ public void startDocument() throws SAXException {
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+
+ }
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ if (uri.equals(PKG_NS) && localName.equals("part")) {
+ //start of a package
+ String name = atts.getValue(PKG_NS, "name");
+ String contentType = atts.getValue(PKG_NS, "contentType");
+ currentPartHandler = partHandlers.get(contentType);
+ //for now treat every unknown part type
+ //as if it contained binary data
+ if (currentPartHandler == null) {
+ currentPartHandler = binaryDataHandler;
+ }
+ if (currentPartHandler != null) {
+ currentPartHandler.setName(name);
+ }
+ } else if (currentPartHandler != null) {
+ currentPartHandler.startElement(uri, localName, qName, atts);
+ }
+
+ }
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ if (uri.equals(PKG_NS) && localName.equals("part")) {
+ //do post processing
+ if (currentPartHandler != null) {
+ try {
+ currentPartHandler.endPart();
+ } catch (TikaException e) {
+ throw new SAXException(e);
+ }
+ }
+ //then reset
+ currentPartHandler = null;
+ } else if (currentPartHandler != null) {
+ currentPartHandler.endElement(uri, localName, qName);
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException {
+ if (currentPartHandler != null) {
+ currentPartHandler.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (currentPartHandler != null) {
+ currentPartHandler.characters(ch, start, length);
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
new file mode 100644
index 0000000..ff8a43d
--- /dev/null
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLParser.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.AbstractOfficeParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Parser registered for the {@code application/vnd.ms-word2006ml} media type.
+ * The input is read with the SAX parser supplied by the {@link ParseContext}
+ * and the events are handed to a {@code Word2006MLDocHandler}, which writes
+ * to an {@link XHTMLContentHandler} wrapped around the caller's handler.
+ */
+public class Word2006MLParser extends AbstractOfficeParser {
+
+    protected static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(
+            MediaType.application("vnd.ms-word2006ml"));
+
+    /**
+     * @return the single media type handled by this parser
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the stream and emits XHTML to {@code handler}.
+     * The XHTML document is always closed, even when parsing fails.
+     *
+     * @throws TikaException if the SAX parse fails; the {@link SAXException}
+     *                       is kept as the cause
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        //set OfficeParserConfig if the user hasn't specified one
+        configure(context);
+
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        try {
+            //obtain the parser first, then build its arguments, same order as a nested call
+            final javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
+            //the caller owns the stream, so shield it from being closed by the parser
+            final InputStream shielded = new CloseShieldInputStream(stream);
+            final OfflineContentHandler docHandler = new OfflineContentHandler(
+                    new EmbeddedContentHandler(
+                            new Word2006MLDocHandler(xhtml, metadata, context)));
+            saxParser.parse(shielded, docHandler);
+        } catch (SAXException e) {
+            throw new TikaException("XML parse error", e);
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index f492e89..7aa2b01 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parser-modules/tika-parser-office-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -21,7 +21,7 @@ org.apache.tika.parser.microsoft.OldExcelParser
org.apache.tika.parser.microsoft.TNEFParser
org.apache.tika.parser.microsoft.MSOwnerFileParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
-org.apache.tika.parser.microsoft.ooxml.xwpf.Word2006MLParser
+org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser
org.apache.tika.parser.microsoft.xml.WordMLParser
org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser
#org.apache.tika.parser.odf.OpenDocumentContentParser
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index d924f41..ea936d8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -25,10 +25,12 @@ import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
+import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
import java.util.Arrays;
+import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -44,10 +46,12 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.WordParserTest;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
@@ -1299,6 +1303,33 @@ public class OOXMLParserTest extends TikaTest {
assertContainsAtLeast(minExpected, getRecursiveMetadata("testEXCEL_macro.xlsm"));
}
+ //@Test //use this for lightweight benchmarking to compare xwpf options
+ /**
+  * Lightweight benchmark: parses every {@code .docx} file found directly under
+  * {@code /test-documents} 100 times with the SAX docx extractor enabled and
+  * prints the elapsed milliseconds together with the number of failed parses.
+  * Not an assertion-based test; enable the annotation above to run it by hand.
+  */
+ public void testBatch() throws Exception {
+     OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+     officeParserConfig.setUseSAXDocxExtractor(true);
+
+     //list the directory once; the listing does not change between rounds
+     File testDocs = getResourceAsFile("/test-documents");
+     File[] candidates = testDocs.listFiles();
+     if (candidates == null) {
+         //listFiles() returns null if the path is not a directory or cannot be read
+         throw new IllegalStateException("can't list test documents in: " + testDocs);
+     }
+
+     long started = new Date().getTime();
+     int ex = 0;
+     for (int i = 0; i < 100; i++) {
+         for (File f : candidates) {
+             if (!f.getName().endsWith(".docx")) {
+                 continue;
+             }
+             try (InputStream is = TikaInputStream.get(f)) {
+                 ParseContext parseContext = new ParseContext();
+                 parseContext.set(OfficeParserConfig.class, officeParserConfig);
+                 //test only the extraction of the main docx content, not embedded docs
+                 parseContext.set(Parser.class, new EmptyParser());
+                 Metadata metadata = new Metadata();
+                 //only the time matters here; the extracted xml is discarded
+                 getXML(is, parser, metadata, parseContext);
+             } catch (Exception e) {
+                 //best effort: a file that fails to parse is counted, not fatal
+                 ex++;
+             }
+         }
+     }
+     System.out.println("elapsed: "+(new Date().getTime()-started) + " with " + ex + " exceptions");
+ }
+
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/32162f59/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
deleted file mode 100644
index 607e6ef..0000000
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/xwpf/Word2006MLParserTest.java
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-import static org.junit.Assert.assertEquals;
-
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.microsoft.MSOfficeParserConfig;
-import org.junit.Test;
-
-
-/**
- * Tests for the Word 2006 XML parser, driven by the
- * {@code testWORD_2006ml.xml} test document.
- */
-public class Word2006MLParserTest extends TikaTest {
-
-    /**
-     * Checks the metadata of the container document and a broad sample of
-     * the extracted XHTML content.
-     */
-    @Test
-    public void basicTest() throws Exception {
-
-        List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
-
-        assertEquals(5, metadataList.size());
-
-        //index 0 is used for the metadata and content checks below
-        Metadata m = metadataList.get(0);
-
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.CREATED));
-        assertEquals("2016-11-23T12:07:00Z", m.get(TikaCoreProperties.MODIFIED));
-        assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
-        assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
-        assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
-        assertEquals("Allison, Timothy B.", m.get(OfficeOpenXMLCore.LAST_MODIFIED_BY));
-        assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
-        assertEquals("225", m.get(Office.WORD_COUNT));
-        assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
-        assertEquals("1506", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
-        assertEquals("10", m.get(Office.LINE_COUNT));
-        assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
-
-        String content = m.get(RecursiveParserWrapper.TIKA_CONTENT);
-
-        assertContainsCountTimes("engaging title page", content, 1);
-        assertContainsCountTimes("<p>This is the Author</p>", content, 1);
-        assertContainsCountTimes("<p>This is an engaging title page</p>", content, 1);
-
-        assertContains("<p>My Document Title</p>", content);
-        assertContains("<p>My Document Subtitle</p>", content);
-
-        assertContains("<p>\tHeading1\t3</p>", content);
-
-        //TODO: integrate numbering
-        assertContains("Really basic 2.", content);
-
-        assertContainsCountTimes("This is a text box", content, 1);
-
-        assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
-
-        assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
-
-        assertContains("<p>This is 10 spaces</p>", content);
-
-        //caption
-        assertContains("<p>Table 1: Table1 Caption</p>", content);
-
-        //embedded table
-        //TODO: figure out how to handle embedded tables in html
-        assertContains("<p>Embedded table r1c1</p>", content);
-
-        //shape
-        assertContainsCountTimes("<p>This is text within a shape", content, 1);
-
-        //sdt rich text
-        assertContains("<p>Rich text content control", content);
-
-        //sdt simple text
-        assertContains("<p>Simple text content control", content);
-
-        //sdt repeating
-        assertContains("Repeating content", content);
-
-        //sdt dropdown
-        //TODO: get options for dropdown
-        assertContains("Drop down1", content);
-
-        //sdt date
-        assertContains("<p>11/16/2016</p>", content);
-
-        //test that <tab/> works
-        assertContains("tab\ttab", content);
-
-        assertContainsCountTimes("serious word art", content, 1);
-        assertContainsCountTimes("Wordartr1c1", content, 1);
-
-        //glossary document contents
-        assertContains("Click or tap to enter a date", content);
-
-        //basic formatting
-        assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
-                content);
-
-        //TODO: add chart parsing
-// assertContains("This is the chart", content);
-
-        assertContains("This is a comment", content);
-
-        assertContains("This is an endnote", content);
-
-        assertContains("this is the footnote", content);
-
-        assertContains("First page header", content);
-
-        assertContains("Even page header", content);
-
-        assertContains("Odd page header", content);
-
-        assertContains("First page footer", content);
-
-        assertContains("Even page footer", content);
-
-        assertContains("Odd page footer", content);
-
-        //test default includes deleted
-        assertContains("frog", content);
-
-        assertContains("Mattmann", content);
-
-        //TODO: extract this...Note that it is in "Backup" not "Choice"!!!
-// assertContains("This is the chart title", content);
-
-    }
-
-    /**
-     * Asserts that {@code needle} occurs exactly {@code expectedCount} times
-     * in {@code haystack}. Overlapping occurrences are each counted, because
-     * the search resumes one character after the previous match.
-     *
-     * @param needle        text to look for; must not be empty
-     * @param haystack      text to search
-     * @param expectedCount number of occurrences required
-     * @throws IllegalArgumentException if {@code needle} is empty
-     */
-    private void assertContainsCountTimes(String needle, String haystack, int expectedCount) {
-        if (needle.isEmpty()) {
-            //indexOf("", from) never returns -1, so the counting loop would not terminate
-            throw new IllegalArgumentException("needle must not be empty");
-        }
-        //count the needle that was actually passed in, not a fixed string
-        int cnt = 0;
-        for (int i = haystack.indexOf(needle); i > -1; i = haystack.indexOf(needle, i + 1)) {
-            cnt++;
-        }
-        assertEquals("found needle >"+ needle+"<"+cnt+" times instead of expected: "+expectedCount,
-                expectedCount, cnt);
-
-    }
-
-    /**
-     * With {@code includeDeletedContent} switched off, the text "frog"
-     * (asserted present by {@link #basicTest()} under the default
-     * configuration) must not appear in the output.
-     */
-    @Test
-    public void testSkipDeleted() throws Exception {
-        ParseContext pc = new ParseContext();
-        MSOfficeParserConfig msOfficeParserConfig = new MSOfficeParserConfig();
-        msOfficeParserConfig.setIncludeDeletedContent(false);
-        pc.set(MSOfficeParserConfig.class, msOfficeParserConfig);
-
-        XMLResult r = getXML("testWORD_2006ml.xml", pc);
-        assertNotContained("frog", r.xml);
-    }
-
-}