You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/20 18:16:31 UTC
[1/2] tika git commit: TIKA-2220 - refactor new sax pptx and docx to reduce code duplication.
Repository: tika
Updated Branches:
refs/heads/master ca37313a7 -> 376318fc1
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
index b655200..395602e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFStylesShim.java
@@ -25,7 +25,7 @@ import java.util.Map;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -39,8 +39,17 @@ import org.xml.sax.helpers.DefaultHandler;
*/
public class XWPFStylesShim {
+ /**
+ * Empty singleton to be used when there is no style info
+ */
+ public static XWPFStylesShim EMPTY_STYLES = new EmptyXWPFStyles();
+
private Map<String, String> styles = new HashMap<>();
+ private XWPFStylesShim() {
+
+ }
+
public XWPFStylesShim(PackagePart part, ParseContext parseContext) {
try (InputStream is = part.getInputStream()) {
onDocumentLoad(parseContext, is);
@@ -66,17 +75,25 @@ public class XWPFStylesShim {
return styles.get(styleId);
}
+ private static class EmptyXWPFStyles extends XWPFStylesShim {
+
+ @Override
+ public String getStyleName(String styleId) {
+ return null;
+ }
+ }
+
private class StylesStripper extends DefaultHandler {
String currentStyleId = null;
@Override
public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- if (uri == null || AbstractDocumentXMLBodyHandler.W_NS.equals(uri)) {
+ if (uri == null || OOXMLWordAndPowerPointTextHandler.W_NS.equals(uri)) {
if ("style".equals(localName)) {
- currentStyleId = atts.getValue(AbstractDocumentXMLBodyHandler.W_NS, "styleId");
+ currentStyleId = atts.getValue(OOXMLWordAndPowerPointTextHandler.W_NS, "styleId");
} else if ("name".equals(localName)) {
- String name = atts.getValue(AbstractDocumentXMLBodyHandler.W_NS, "val");
+ String name = atts.getValue(OOXMLWordAndPowerPointTextHandler.W_NS, "val");
if (currentStyleId != null && name != null) {
styles.put(currentStyleId, name);
}
@@ -86,7 +103,7 @@ public class XWPFStylesShim {
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
- if (uri == null || AbstractDocumentXMLBodyHandler.W_NS.equals(uri)) {
+ if (uri == null || OOXMLWordAndPowerPointTextHandler.W_NS.equals(uri)) {
if ("style".equals(localName)) {
currentStyleId = null;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
deleted file mode 100644
index f11cf33..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFTikaBodyPartHandler.java
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.math.BigInteger;
-import java.util.Date;
-
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.parser.microsoft.WordExtractor;
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-public class XWPFTikaBodyPartHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
-
- private final static String P = "p";
-
- private final static char[] NEWLINE = new char[]{'\n'};
- private final static char[] TAB = new char[]{'\t'};
-
- private final XHTMLContentHandler xhtml;
- private final XWPFListManager listManager;
- private final boolean includeDeletedText;
- private final boolean includeMoveFromText;
- private final XWPFStylesShim styles;
-
- private int pDepth = 0; //paragraph depth
- private int tableDepth = 0;//table depth
- private int sdtDepth = 0;//
- private boolean isItalics = false;
- private boolean isBold = false;
- private boolean wroteHyperlinkStart = false;
-
- //will need to replace this with a stack
- //if we're marking more that the first level <p/> element
- private String paragraphTag = null;
-
- public XWPFTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
- this.xhtml = xhtml;
- this.styles = styles;
- this.listManager = listManager;
- this.includeDeletedText = parserConfig.getIncludeDeletedContent();
- this.includeMoveFromText = parserConfig.getIncludeMoveFromContent();
- }
-
- @Override
- public void run(RunProperties runProperties, String contents) {
- try {
- // True if we are currently in the named style tag:
- if (runProperties.getBold() != isBold) {
- if (isItalics) {
- xhtml.endElement("i");
- isItalics = false;
- }
- if (runProperties.getBold()) {
- xhtml.startElement("b");
- isBold = true;
- } else {
- xhtml.endElement("b");
- isBold = false;
- }
- }
-
- if (runProperties.getItalics() != isItalics) {
- if (runProperties.getItalics()) {
- xhtml.startElement("i");
- isItalics = true;
- } else {
- xhtml.endElement("i");
- isItalics = false;
- }
- }
-
- xhtml.characters(contents);
-
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void hyperlinkStart(String link) {
- try {
- if (link != null) {
- xhtml.startElement("a", "href", link);
- wroteHyperlinkStart = true;
- }
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void hyperlinkEnd() {
- try {
- if (wroteHyperlinkStart) {
- closeStyleTags();
- wroteHyperlinkStart = false;
- xhtml.endElement("a");
- }
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startParagraph(ParagraphProperties paragraphProperties) {
- if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
- paragraphTag = P;
- String styleClass = null;
- //TIKA-2144 check that styles is not null
- if (paragraphProperties.getStyleID() != null && styles != null) {
- String styleName = styles.getStyleName(
- paragraphProperties.getStyleID()
- );
- if (styleName != null) {
- WordExtractor.TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
- styleName, false);
- paragraphTag = tas.getTag();
- styleClass = tas.getStyleClass();
- }
- }
-
-
- try {
- if (styleClass == null) {
- xhtml.startElement(paragraphTag);
- } else {
- xhtml.startElement(paragraphTag, "class", styleClass);
- }
- } catch (SAXException e) {
-
- }
- }
-
- try {
- writeParagraphNumber(paragraphProperties.getNumId(),
- paragraphProperties.getIlvl(), listManager, xhtml);
- } catch (SAXException e) {
-
- }
- pDepth++;
- }
-
- @Override
- public void endParagraph() {
- try {
- closeStyleTags();
- if (pDepth == 1 && tableDepth == 0 && sdtDepth == 0) {
- xhtml.endElement(paragraphTag);
- paragraphTag = null;
- } else {
- xhtml.characters(NEWLINE, 0, 1);
- }
- } catch (SAXException e) {
-
- }
- pDepth--;
- }
-
- @Override
- public void startTable() {
- try {
- xhtml.startElement("table");
- tableDepth++;
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endTable() {
- try {
- xhtml.endElement("table");
- tableDepth--;
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startTableRow() {
- try {
- xhtml.startElement("tr");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endTableRow() {
- try {
- xhtml.endElement("tr");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startTableCell() {
- try {
- xhtml.startElement("td");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endTableCell() {
- try {
- xhtml.endElement("td");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startSDT() {
- try {
- closeStyleTags();
- sdtDepth++;
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endSDT() {
- sdtDepth--;
- }
-
- @Override
- public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
- //no-op
- }
-
- @Override
- public void endEditedSection() {
- //no-op
- }
-
- @Override
- public boolean getIncludeDeletedText() {
- return includeDeletedText;
- }
-
- @Override
- public void footnoteReference(String id) {
- if (id != null) {
- try {
- xhtml.characters("[");
- xhtml.characters(id);
- xhtml.characters("]");
- } catch (SAXException e) {
-
- }
- }
- }
-
- @Override
- public void endnoteReference(String id) {
- if (id != null) {
- try {
- xhtml.characters("[");
- xhtml.characters(id);
- xhtml.characters("]");
- } catch (SAXException e) {
-
- }
- }
- }
-
- @Override
- public boolean getIncludeMoveFromText() {
- return includeMoveFromText;
- }
-
- @Override
- public void embeddedOLERef(String relId) {
- if (relId == null) {
- return;
- }
- try {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", relId);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void embeddedPicRef(String picFileName, String picDescription) {
-
- try {
- AttributesImpl attr = new AttributesImpl();
- if (picFileName != null) {
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
- }
- if (picDescription != null) {
- attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
- }
-
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
-
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startBookmark(String id, String name) {
- //skip bookmarks within hyperlinks
- if (name != null && ! wroteHyperlinkStart) {
- try {
- xhtml.startElement("a", "name", name);
- xhtml.endElement("a");
- } catch (SAXException e) {
-
- }
- }
- }
-
- @Override
- public void endBookmark(String id) {
- //no-op
- }
-
- private void closeStyleTags() throws SAXException {
- if (isItalics) {
- xhtml.endElement("i");
- isItalics = false;
- }
- if (isBold) {
- xhtml.endElement("b");
- isBold = false;
- }
- }
-
- private void writeParagraphNumber(int numId, int ilvl,
- XWPFListManager listManager,
- XHTMLContentHandler xhtml) throws SAXException {
-
- if (ilvl < 0 || numId < 0 || listManager == null) {
- return;
- }
- String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
- if (number != null) {
- xhtml.characters(number);
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
deleted file mode 100644
index 1b25683..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BodyPartHandler.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
-
-import java.util.HashMap;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-
-
-/**
- * Simple wrapper/extension of XWPFDocumentXMLBodyHandler to fit
- * into the inline parsing scheme.
- */
-class BodyPartHandler extends XWPFDocumentXMLBodyHandler implements PartHandler {
-
- private final String contentType;
- private String name;
- public BodyPartHandler(String contentType, XHTMLContentHandler xhtml,
- RelationshipsManager relationshipsManager,
- OfficeParserConfig officeParserConfig) {
- super(new XWPFTikaBodyPartHandler(xhtml, null, null, officeParserConfig),
- new HashMap<String, String>());
- this.contentType = contentType;
- }
-
- @Override
- public void setName(String name) {
- this.name = name;
- }
-
- @Override
- public String getName() {
- return name;
- }
-
- @Override
- public String getContentType() {
- return contentType;
- }
-
- @Override
- public void endPart() throws SAXException, TikaException {
- //no-op
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
index 4276671..5b4853f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/Word2006MLDocHandler.java
@@ -54,36 +54,36 @@ class Word2006MLDocHandler extends DefaultHandler {
addPartHandler(new RelationshipsHandler(relationshipsManager));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.DOCUMENT.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.FOOTNOTE.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.HEADER.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
XWPFRelation.FOOTER.getContentType(),
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
xhtml, relationshipsManager, officeParserConfig));
- addPartHandler(new BodyPartHandler(
+ addPartHandler(new WordAndPowerPointTextPartHandler(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml",
xhtml, relationshipsManager, officeParserConfig));
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
new file mode 100644
index 0000000..3ffdb42
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/WordAndPowerPointTextPartHandler.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
+
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLTikaBodyPartHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Simple wrapper/extension of OOXMLWordAndPowerPointTextHandler to fit
+ * into the inline parsing scheme.
+ */
+class WordAndPowerPointTextPartHandler extends OOXMLWordAndPowerPointTextHandler implements PartHandler {
+
+ private final String contentType;
+ private String name;
+ public WordAndPowerPointTextPartHandler(String contentType, XHTMLContentHandler xhtml,
+ RelationshipsManager relationshipsManager,
+ OfficeParserConfig officeParserConfig) {
+ super(new OOXMLTikaBodyPartHandler(xhtml, null, null, officeParserConfig),
+ new HashMap<String, String>());
+ this.contentType = contentType;
+ }
+
+ @Override
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public String getContentType() {
+ return contentType;
+ }
+
+ @Override
+ public void endPart() throws SAXException, TikaException {
+ //no-op
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index d107756..635d0c9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -231,7 +231,7 @@ public class SXWPFExtractorTest extends TikaTest {
assertTrue(xml.contains("<h1 class=\"title\">"));
// Regular headings
- assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+ assertContains("<h1>Heading Level 1</h1>", xml);
assertTrue(xml.contains("<h2>Heading Level 2</h2>"));
// Headings with anchor tags in them
//TODO: still not getting bookmarks
[2/2] tika git commit: TIKA-2220 - refactor new sax pptx and docx to reduce code duplication.
Posted by ta...@apache.org.
TIKA-2220 - refactor new sax pptx and docx to reduce code duplication.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/376318fc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/376318fc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/376318fc
Branch: refs/heads/master
Commit: 376318fc1b34014ec31d5fbfdfa962183ea8c717
Parents: ca37313
Author: tballison <ta...@mitre.org>
Authored: Tue Dec 20 13:16:20 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Tue Dec 20 13:16:20 2016 -0500
----------------------------------------------------------------------
.../ooxml/AbstractDocumentXMLBodyHandler.java | 99 ----
.../ooxml/OOXMLTikaBodyPartHandler.java | 397 +++++++++++++++
.../OOXMLWordAndPowerPointTextHandler.java | 497 +++++++++++++++++++
.../SXSLFPowerPointExtractorDecorator.java | 30 +-
.../ooxml/SXWPFWordExtractorDecorator.java | 6 +-
.../parser/microsoft/ooxml/XWPFListManager.java | 23 +
.../ooxml/xslf/XSLFDocumentXMLBodyHandler.java | 330 ------------
.../xslf/XSLFEventBasedPowerPointExtractor.java | 53 +-
.../ooxml/xslf/XSLFTikaBodyPartHandler.java | 262 ----------
.../ooxml/xwpf/XWPFDocumentXMLBodyHandler.java | 388 ---------------
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 7 +-
.../microsoft/ooxml/xwpf/XWPFStylesShim.java | 27 +-
.../ooxml/xwpf/XWPFTikaBodyPartHandler.java | 376 --------------
.../ooxml/xwpf/ml2006/BodyPartHandler.java | 64 ---
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 16 +-
.../WordAndPowerPointTextPartHandler.java | 64 +++
.../microsoft/ooxml/SXWPFExtractorTest.java | 2 +-
17 files changed, 1084 insertions(+), 1557 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
deleted file mode 100644
index 5037fd2..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml;
-
-
-import org.xml.sax.helpers.DefaultHandler;
-
-public class AbstractDocumentXMLBodyHandler extends DefaultHandler {
-
- protected final static String R = "r";
- protected final static String FLD = "fld";
- protected final static String RPR = "rPr";
- protected final static String P = "p";
- protected static String P_STYLE = "pStyle";
- protected final static String PPR = "pPr";
- protected static String T = "t";
- protected final static String TAB = "tab";
- protected final static String B = "b";
- protected final static String ILVL = "ilvl";
- protected final static String NUM_ID = "numId";
- protected final static String TC = "tc";
- protected final static String TR = "tr";
- protected final static String I = "i";
- protected final static String NUM_PR = "numPr";
- protected final static String BR = "br";
- protected final static String HYPERLINK = "hyperlink";
- protected final static String TBL = "tbl";
- protected final static String PIC = "pic";
- protected final static String PICT = "pict";
- protected final static String IMAGEDATA = "imagedata";
- protected final static String BLIP = "blip";
- protected final static String CHOICE = "Choice";
- protected final static String FALLBACK = "Fallback";
- protected final static String OLE_OBJECT = "OLEObject";
- protected final static String CR = "cr";
-
- public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
- protected final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
- protected final static String O_NS = "urn:schemas-microsoft-com:office:office";
- protected final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
- protected final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
- protected final static String V_NS = "urn:schemas-microsoft-com:vml";
-
- protected final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
-
- protected final static char[] TAB_CHAR = new char[1];
- protected final static char NEWLINE = '\n';
-
- static {
- TAB_CHAR[0] = '\t';
- }
-
- protected boolean inR = false;//in run or in field
- protected boolean inT = false;
- protected boolean inRPr = false;
- protected boolean inNumPr = false;
-
- protected boolean inPic = false;
- boolean inPict = false;
- protected String picDescription = null;
- protected String picRId = null;
- String picFilename = null;
-
- //mechanism used to determine when to
- //signal the start of the p, and still
- //handle p with pPr and those without
- protected boolean lastStartElementWasP = false;
- //have we signaled the start of a p?
- //pPr can happen multiple times within a p
- //<p><pPr/><r><t>text</t></r><pPr></p>
- protected boolean pStarted = false;
-
- //alternate content can be embedded in itself.
- //need to track depth.
- //if in alternate, choose fallback, maybe make this configurable?
- protected int inACChoiceDepth = 0;
- protected int inACFallbackDepth = 0;
-
- protected RunProperties currRunProperties = new RunProperties();
- protected ParagraphProperties currPProperties = new ParagraphProperties();
-
- protected final StringBuilder runBuffer = new StringBuilder();
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
new file mode 100644
index 0000000..ef3b3dc
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -0,0 +1,397 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import java.math.BigInteger;
+import java.util.Date;
+
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
+import org.apache.tika.parser.microsoft.WordExtractor;
+import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
+
+ private final static String P = "p";
+
+ private final static char[] NEWLINE = new char[]{'\n'};
+
+ private final XHTMLContentHandler xhtml;
+ private final XWPFListManager listManager;
+ private final boolean includeDeletedText;
+ private final boolean includeMoveFromText;
+ private final XWPFStylesShim styles;
+
+ private int pDepth = 0; //paragraph depth
+ private int tableDepth = 0;//table depth
+ private int sdtDepth = 0;//
+ private boolean isItalics = false;
+ private boolean isBold = false;
+ private boolean wroteHyperlinkStart = false;
+
+ //TODO: fix this
+ //pWithinCell should be an array/stack of given cell depths
+ //so that when you get to the end of an embedded table, e.g.,
+ //you know what your paragraph count was in the parent cell.
+ //<tc><p/><p/><table><tr><tc></p></p></tc></tr></table>...
+ private int tableCellDepth = 0;
+ private int pWithinCell = 0;
+
+ //will need to replace this with a stack
+ //if we're marking more than the first level <p/> element
+ private String paragraphTag = null;
+
+ public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml) {
+ this.xhtml = xhtml;
+ this.styles = XWPFStylesShim.EMPTY_STYLES;
+ this.listManager = XWPFListManager.EMPTY_LIST;
+ this.includeDeletedText = false;
+ this.includeMoveFromText = false;
+ }
+
+    /**
+     * Creates a handler that uses the supplied style and list information.
+     *
+     * @param xhtml        handler that receives the generated XHTML events
+     * @param styles       lookup from paragraph style id to style name
+     * @param listManager  source of formatted list numbers
+     * @param parserConfig supplies the include-deleted and include-move-from flags;
+     *                     if null, both flags are false, matching the
+     *                     single-argument constructor
+     */
+    public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, XWPFStylesShim styles, XWPFListManager listManager, OfficeParserConfig parserConfig) {
+        this.xhtml = xhtml;
+        this.styles = styles;
+        this.listManager = listManager;
+        //callers may pass the result of a ParseContext lookup directly, which
+        //may be null; default to "exclude" instead of throwing a NullPointerException
+        this.includeDeletedText = parserConfig != null && parserConfig.getIncludeDeletedContent();
+        this.includeMoveFromText = parserConfig != null && parserConfig.getIncludeMoveFromContent();
+    }
+
+ @Override
+ public void run(RunProperties runProperties, String contents) { //writes one run of text, opening/closing <b>/<i> as needed
+ try {
+ // isBold/isItalics record which style tags are currently open in the output
+ if (runProperties.getBold() != isBold) {
+ if (isItalics) { //<i> is written inside <b>, so close it before <b> changes
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (runProperties.getBold()) {
+ xhtml.startElement("b");
+ isBold = true;
+ } else {
+ xhtml.endElement("b");
+ isBold = false;
+ }
+ }
+
+ if (runProperties.getItalics() != isItalics) { //also re-opens <i> if it was closed above
+ if (runProperties.getItalics()) {
+ xhtml.startElement("i");
+ isItalics = true;
+ } else {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ }
+
+ xhtml.characters(contents);
+
+ } catch (SAXException e) {
+ //swallowed: the XWPFBodyContentsHandler callbacks declare no checked exceptions
+ }
+ }
+
+ @Override
+ public void hyperlinkStart(String link) {
+ try {
+ if (link != null) {
+ xhtml.startElement("a", "href", link);
+ wroteHyperlinkStart = true;
+ }
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void hyperlinkEnd() {
+ try {
+ if (wroteHyperlinkStart) {
+ closeStyleTags();
+ wroteHyperlinkStart = false;
+ xhtml.endElement("a");
+ }
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void startParagraph(ParagraphProperties paragraphProperties) {
+ if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
+ paragraphTag = P;
+ String styleClass = null;
+ //TIKA-2144 check that styles is not null
+ if (paragraphProperties.getStyleID() != null && styles != null) {
+ String styleName = styles.getStyleName(
+ paragraphProperties.getStyleID()
+ );
+ if (styleName != null) {
+ WordExtractor.TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(
+ styleName, false);
+ paragraphTag = tas.getTag();
+ styleClass = tas.getStyleClass();
+ }
+ }
+
+
+ try {
+ if (styleClass == null) {
+ xhtml.startElement(paragraphTag);
+ } else {
+ xhtml.startElement(paragraphTag, "class", styleClass);
+ }
+ } catch (SAXException e) {
+
+ }
+ }
+
+ try {
+ writeParagraphNumber(paragraphProperties.getNumId(),
+ paragraphProperties.getIlvl(), listManager, xhtml);
+ } catch (SAXException e) {
+
+ }
+ pDepth++;
+ }
+
+
+    /**
+     * Ends the current paragraph. Open bold/italic tags are closed first; then
+     * either the paragraph element is closed or, for a paragraph that did not
+     * get its own element, a newline is written as a separator.
+     */
+    @Override
+    public void endParagraph() {
+        try {
+            closeStyleTags();
+            //mirror the condition under which startParagraph opens an element
+            //(pDepth has been incremented since): top level, outside tables and SDTs.
+            //Without the sdtDepth check, a paragraph wrapped in an SDT would emit
+            //an end tag that has no matching start tag.
+            if (pDepth == 1 && tableDepth == 0 && sdtDepth == 0) {
+                xhtml.endElement(paragraphTag);
+            } else if (tableCellDepth == 0 || (tableCellDepth > 0 && pWithinCell > 0)) {
+                //outside a cell, or after a paragraph that is not the first in its cell
+                xhtml.characters(NEWLINE, 0, 1);
+            }
+        } catch (SAXException e) {
+            //swallowed: the XWPFBodyContentsHandler callbacks declare no checked exceptions
+        }
+        if (tableCellDepth > 0) {
+            pWithinCell++;
+        }
+        pDepth--;
+    }
+
+ @Override
+ public void startTable() {
+ try {
+ xhtml.startElement("table");
+ tableDepth++;
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void endTable() {
+ try {
+ xhtml.endElement("table");
+ tableDepth--;
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void startTableRow() {
+ try {
+ xhtml.startElement("tr");
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void endTableRow() {
+ try {
+ xhtml.endElement("tr");
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void startTableCell() {
+ try {
+ xhtml.startElement("td");
+ } catch (SAXException e) {
+
+ }
+ tableCellDepth++;
+ }
+
+ @Override
+ public void endTableCell() {
+ try {
+ xhtml.endElement("td");
+ } catch (SAXException e) {
+
+ }
+ pWithinCell = 0;
+ tableCellDepth--;
+ }
+
+ @Override
+ public void startSDT() {
+ try {
+ closeStyleTags();
+ sdtDepth++;
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void endSDT() {
+ sdtDepth--;
+ }
+
+ @Override
+ public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+ //no-op
+ }
+
+ @Override
+ public void endEditedSection() {
+ //no-op
+ }
+
+ @Override
+ public boolean getIncludeDeletedText() {
+ return includeDeletedText;
+ }
+
+ @Override
+ public void footnoteReference(String id) {
+ if (id != null) {
+ try {
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
+ } catch (SAXException e) {
+
+ }
+ }
+ }
+
+ @Override
+ public void endnoteReference(String id) {
+ if (id != null) {
+ try {
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
+ } catch (SAXException e) {
+
+ }
+ }
+ }
+
+ @Override
+ public boolean getIncludeMoveFromText() {
+ return includeMoveFromText;
+ }
+
+ @Override
+ public void embeddedOLERef(String relId) {
+ if (relId == null) {
+ return;
+ }
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", relId);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void embeddedPicRef(String picFileName, String picDescription) {
+
+ try {
+ AttributesImpl attr = new AttributesImpl();
+ if (picFileName != null) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
+ }
+ if (picDescription != null) {
+ attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+ }
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
+
+ } catch (SAXException e) {
+
+ }
+ }
+
+ @Override
+ public void startBookmark(String id, String name) {
+ //skip bookmarks within hyperlinks
+ if (name != null && ! wroteHyperlinkStart) {
+ try {
+ xhtml.startElement("a", "name", name);
+ xhtml.endElement("a");
+ } catch (SAXException e) {
+
+ }
+ }
+ }
+
+ @Override
+ public void endBookmark(String id) {
+ //no-op
+ }
+
+ private void closeStyleTags() throws SAXException {
+ if (isItalics) {
+ xhtml.endElement("i");
+ isItalics = false;
+ }
+ if (isBold) {
+ xhtml.endElement("b");
+ isBold = false;
+ }
+ }
+
+ private void writeParagraphNumber(int numId, int ilvl,
+ XWPFListManager listManager,
+ XHTMLContentHandler xhtml) throws SAXException {
+
+ if (ilvl < 0 || numId < 0 || listManager == null) {
+ return;
+ }
+ String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
+ if (number != null) {
+ xhtml.characters(number);
+ }
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
new file mode 100644
index 0000000..8cd84d9
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft.ooxml;
+
+
+import java.util.Date;
+import java.util.Map;
+
+import org.apache.tika.utils.DateUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * This class is intended to handle anything that might contain IBodyElements:
+ * main document, headers, footers, notes, slides, etc.
+ *
+ * <p/>
+ *
+ * This class does not check for namespaces, and it can be applied
+ * to PPTX and DOCX for text extraction.
+ *
+ * <p/>
+ * This does not work with .xlsx or .vsdx.
+ *
+ * TODO: move this into POI?
+ *
+ */
+
+public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
+
+
+ public enum EditType {
+ NONE,
+ INSERT,
+ DELETE,
+ MOVE_TO,
+ MOVE_FROM
+ }
+
+ private final static String R = "r";
+ private final static String FLD = "fld";
+ private final static String RPR = "rPr";
+ private final static String P = "p";
+ private final static String P_STYLE = "pStyle";
+ private final static String PPR = "pPr";
+ private final static String T = "t";
+ private final static String TAB = "tab";
+ private final static String B = "b";
+ private final static String ILVL = "ilvl";
+ private final static String NUM_ID = "numId";
+ private final static String TC = "tc";
+ private final static String TR = "tr";
+ private final static String I = "i";
+ private final static String NUM_PR = "numPr";
+ private final static String BR = "br";
+ private final static String HYPERLINK = "hyperlink";
+ private final static String HLINK_CLICK = "hlinkClick"; //pptx hlink
+ private final static String TBL = "tbl";
+ private final static String PIC = "pic";
+ private final static String PICT = "pict";
+ private final static String IMAGEDATA = "imagedata";
+ private final static String BLIP = "blip";
+ private final static String CHOICE = "Choice";
+ private final static String FALLBACK = "Fallback";
+ private final static String OLE_OBJECT = "OLEObject";
+ private final static String CR = "cr";
+
+ public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
+ private final static String MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006";
+ private final static String O_NS = "urn:schemas-microsoft-com:office:office";
+ private final static String PIC_NS = "http://schemas.openxmlformats.org/drawingml/2006/picture";
+ private final static String DRAWING_MAIN_NS = "http://schemas.openxmlformats.org/drawingml/2006/main";
+ private final static String V_NS = "urn:schemas-microsoft-com:vml";
+
+ private final static String OFFICE_DOC_RELATIONSHIP_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships";
+
+ private final static char[] TAB_CHAR = new char[]{'\t'};
+ private final static char NEWLINE = '\n';
+
+ private final static String BOOKMARK_START = "bookmarkStart";
+ private final static String BOOKMARK_END = "bookmarkEnd";
+ private final static String FOOTNOTE_REFERENCE = "footnoteReference";
+ private final static String INS = "ins";
+ private final static String DEL = "del";
+ private final static String DEL_TEXT = "delText";
+ private final static String MOVE_FROM = "moveFrom";
+ private final static String MOVE_TO = "moveTo";
+ private final static String ENDNOTE_REFERENCE = "endnoteReference";
+
+ private final XWPFBodyContentsHandler bodyContentsHandler;
+
+ private final Map<String, String> linkedRelationships;
+
+ private boolean inR = false;//in run or in field
+ private boolean inT = false;
+ private boolean inRPr = false;
+ private boolean inNumPr = false;
+
+ private boolean inPic = false;
+ private boolean inPict = false;
+ private String picDescription = null;
+ private String picRId = null;
+ private String picFilename = null;
+
+ //mechanism used to determine when to
+ //signal the start of the p, and still
+ //handle p with pPr and those without
+ private boolean lastStartElementWasP = false;
+ //have we signaled the start of a p?
+ //pPr can happen multiple times within a p
+ //<p><pPr/><r><t>text</t></r><pPr></p>
+ private boolean pStarted = false;
+
+ //alternate content can be embedded in itself.
+ //need to track depth.
+ //if in alternate, choose fallback, maybe make this configurable?
+ private int inACChoiceDepth = 0;
+ private int inACFallbackDepth = 0;
+
+ private final RunProperties currRunProperties = new RunProperties();
+ private final ParagraphProperties currPProperties = new ParagraphProperties();
+
+ private final StringBuilder runBuffer = new StringBuilder();
+
+
+ private boolean inDelText = false;
+ private boolean inHlinkClick = false;
+
+ private OOXMLWordAndPowerPointTextHandler.EditType editType = OOXMLWordAndPowerPointTextHandler.EditType.NONE;
+
+
+ public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler bodyContentsHandler,
+ Map<String, String> hyperlinks) {
+ this.bodyContentsHandler = bodyContentsHandler;
+ this.linkedRelationships = hyperlinks;
+ }
+
+
+ @Override
+ public void startDocument() throws SAXException {
+ }
+
+ @Override
+ public void endDocument() throws SAXException {
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ }
+
+ @Override
+ public void endPrefixMapping(String prefix) throws SAXException {
+ }
+
+    /**
+     * Dispatches on the element's local name (namespaces are only checked for
+     * the markup-compatibility Choice/Fallback elements) and either updates the
+     * parser state or notifies the body contents handler.
+     * <p>
+     * The start of a paragraph is signalled lazily: on the first child element
+     * that is not a pPr, or otherwise when the pPr ends (see endElement), so
+     * that the paragraph properties are known when the handler is called.
+     * <p>
+     * Everything inside a Choice element is skipped.
+     */
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
+
+        if (lastStartElementWasP && !PPR.equals(localName)) {
+            bodyContentsHandler.startParagraph(currPProperties);
+        }
+
+        lastStartElementWasP = false;
+
+        if (MC_NS.equals(uri)) {
+            if (CHOICE.equals(localName)) {
+                inACChoiceDepth++;
+            } else if (FALLBACK.equals(localName)) {
+                inACFallbackDepth++;
+            }
+        }
+
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+        //these are sorted descending by frequency within docx files
+        //in our regression corpus.
+        //yes, I know, likely premature optimization...
+        if (RPR.equals(localName)) {
+            inRPr = true;
+        } else if (R.equals(localName)) {
+            inR = true;
+        } else if (T.equals(localName)) {
+            inT = true;
+        } else if (TAB.equals(localName)) {
+            runBuffer.append(TAB_CHAR);
+        } else if (P.equals(localName)) {
+            lastStartElementWasP = true;
+        } else if (B.equals(localName)) { //TODO: add bCs
+            if (inR && inRPr) {
+                currRunProperties.setBold(true);
+            }
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.startTableCell();
+        } else if (P_STYLE.equals(localName)) {
+            String styleId = atts.getValue(W_NS, "val");
+            currPProperties.setStyleID(styleId);
+        } else if (I.equals(localName)) { //TODO: add iCs
+            //rprs don't have to be inR; ignore those that aren't
+            if (inR && inRPr) {
+                currRunProperties.setItalics(true);
+            }
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.startTableRow();
+        } else if (NUM_PR.equals(localName)) {
+            inNumPr = true;
+        } else if (ILVL.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setIlvl(getIntVal(atts));
+            }
+        } else if (NUM_ID.equals(localName)) {
+            if (inNumPr) {
+                currPProperties.setNumId(getIntVal(atts));
+            }
+        } else if (BR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        } else if (BOOKMARK_START.equals(localName)) {
+            String name = atts.getValue(W_NS, "name");
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.startBookmark(id, name);
+        } else if (BOOKMARK_END.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.endBookmark(id);
+        } else if (HYPERLINK.equals(localName)) { //docx hyperlink
+            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            if (hyperlinkId != null) {
+                //external link: resolve the relationship id; the result may be null
+                bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
+            } else {
+                //internal link: point at the anchor, if there is one
+                String anchor = atts.getValue(W_NS, "anchor");
+                if (anchor != null) {
+                    anchor = "#" + anchor;
+                }
+                bodyContentsHandler.hyperlinkStart(anchor);
+            }
+        } else if (HLINK_CLICK.equals(localName)) { //pptx hyperlink
+            String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            if (hyperlinkId != null) {
+                bodyContentsHandler.hyperlinkStart(linkedRelationships.get(hyperlinkId));
+                //the link is closed at the end of the enclosing run
+                inHlinkClick = true;
+            }
+        } else if (TBL.equals(localName)) {
+            bodyContentsHandler.startTable();
+        } else if (BLIP.equals(localName)) { //check for DRAWING_NS
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
+        } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
+            picDescription = atts.getValue("", "descr");
+        } else if (PIC.equals(localName)) {
+            inPic = true; //check for PIC_NS?
+        } //TODO: add sdt, sdtPr, sdtContent goes here statistically
+        else if (FOOTNOTE_REFERENCE.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.footnoteReference(id);
+        } else if (IMAGEDATA.equals(localName)) {
+            picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
+            picDescription = atts.getValue(O_NS, "title");
+        } else if (INS.equals(localName)) {
+            startEditedSection(EditType.INSERT, atts);
+        } else if (DEL_TEXT.equals(localName)) {
+            inDelText = true;
+        } else if (DEL.equals(localName)) {
+            startEditedSection(EditType.DELETE, atts);
+        } else if (MOVE_TO.equals(localName)) {
+            startEditedSection(EditType.MOVE_TO, atts);
+        } else if (MOVE_FROM.equals(localName)) {
+            startEditedSection(EditType.MOVE_FROM, atts);
+        } else if (OLE_OBJECT.equals(localName)) { //check for O_NS?
+            String type = null;
+            String refId = null;
+            //TODO: clean this up and ...want to get ProgID?
+            for (int i = 0; i < atts.getLength(); i++) {
+                String attLocalName = atts.getLocalName(i);
+                String attValue = atts.getValue(i);
+                if ("Type".equals(attLocalName)) {
+                    type = attValue;
+                } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && "id".equals(attLocalName)) {
+                    refId = attValue;
+                }
+            }
+            if ("Embed".equals(type)) {
+                bodyContentsHandler.embeddedOLERef(refId);
+            }
+        } else if (CR.equals(localName)) {
+            runBuffer.append(NEWLINE);
+        } else if (ENDNOTE_REFERENCE.equals(localName)) {
+            String id = atts.getValue(W_NS, "id");
+            bodyContentsHandler.endnoteReference(id);
+        }
+
+    }
+
+ private void startEditedSection(EditType editType, Attributes atts) {
+ String editAuthor = atts.getValue(W_NS, "author");
+ String editDateString = atts.getValue(W_NS, "date");
+ Date editDate = null;
+ if (editDateString != null) {
+ editDate = DateUtils.tryToParse(editDateString);
+ }
+ bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
+ this.editType = editType;
+ }
+
+ private int getIntVal(Attributes atts) {
+ String valString = atts.getValue(W_NS, "val");
+ if (valString != null) {
+ try {
+ return Integer.parseInt(valString);
+ } catch (NumberFormatException e) {
+ //swallow
+ }
+ }
+ return -1;
+ }
+
+
+    /**
+     * Dispatches on the closing element's local name: leaves the matching
+     * parser state, flushes buffered run text, and notifies the body contents
+     * handler of the end of paragraphs, tables, rows, cells, hyperlinks and
+     * pictures.
+     * <p>
+     * Everything inside a Choice element is skipped.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+
+        //startElement only counts Choice/Fallback in the markup-compatibility
+        //namespace; apply the same test here so the counters stay balanced
+        if (MC_NS.equals(uri)) {
+            if (CHOICE.equals(localName)) {
+                inACChoiceDepth--;
+            } else if (FALLBACK.equals(localName)) {
+                inACFallbackDepth--;
+            }
+        }
+        if (inACChoiceDepth > 0) {
+            return;
+        }
+
+        if (PIC.equals(localName)) { //PIC_NS
+            //handlePict also clears inPic
+            handlePict();
+        } else if (RPR.equals(localName)) {
+            inRPr = false;
+        } else if (R.equals(localName)) {
+            handleEndOfRun();
+        } else if (T.equals(localName)) {
+            inT = false;
+        } else if (PPR.equals(localName)) {
+            //the paragraph properties are complete; signal the paragraph start
+            //unless an earlier pPr in this paragraph already did
+            if (!pStarted) {
+                bodyContentsHandler.startParagraph(currPProperties);
+                pStarted = true;
+            }
+            currPProperties.reset();
+        } else if (P.equals(localName)) {
+            if (runBuffer.length() > 0) {
+                //<p><tab></p>...this will treat that as if it were
+                //a run...TODO: should we swallow whitespace that doesn't occur in a run?
+                bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+                runBuffer.setLength(0);
+            }
+            pStarted = false;
+            bodyContentsHandler.endParagraph();
+        } else if (TC.equals(localName)) {
+            bodyContentsHandler.endTableCell();
+        } else if (TR.equals(localName)) {
+            bodyContentsHandler.endTableRow();
+        } else if (TBL.equals(localName)) {
+            bodyContentsHandler.endTable();
+        } else if (FLD.equals(localName)) {
+            handleEndOfRun();
+        } else if (DEL_TEXT.equals(localName)) {
+            inDelText = false;
+        } else if (INS.equals(localName) || DEL.equals(localName) ||
+                MOVE_TO.equals(localName) || MOVE_FROM.equals(localName)) {
+            //NOTE(review): startEditedSection is reported to the handler but
+            //endEditedSection is not called here -- confirm whether that is intended
+            editType = EditType.NONE;
+        } else if (HYPERLINK.equals(localName)) {
+            bodyContentsHandler.hyperlinkEnd();
+        } else if (PICT.equals(localName)) {
+            handlePict();
+        }
+    }
+
+ private void handleEndOfRun() {
+ bodyContentsHandler.run(currRunProperties, runBuffer.toString());
+ if (inHlinkClick) {
+ bodyContentsHandler.hyperlinkEnd();
+ inHlinkClick = false;
+ }
+ inR = false;
+ runBuffer.setLength(0);
+ currRunProperties.setBold(false);
+ currRunProperties.setItalics(false);
+ }
+
+ private void handlePict() {
+ String picFileName = null;
+ if (picRId != null) {
+ picFileName = linkedRelationships.get(picRId);
+ }
+ bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
+ picDescription = null;
+ picRId = null;
+ inPic = false;
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length) throws SAXException { //buffers text for the current run
+
+ if (inACChoiceDepth > 0) { //content inside a Choice element is skipped
+ return;
+ }
+ if (editType.equals(EditType.MOVE_FROM) && inT) { //moved-from text is kept only if the handler asks for it
+ if (bodyContentsHandler.getIncludeMoveFromText()) {
+ runBuffer.append(ch, start, length);
+ }
+ } else if (inT) {
+ runBuffer.append(ch, start, length);
+ } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) { //NOTE(review): ignorableWhitespace tests inDelText here instead -- confirm which is intended
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+ if (inACChoiceDepth > 0) {
+ return;
+ }
+
+ if (inT) {
+ runBuffer.append(ch, start, length);
+ } else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
+ runBuffer.append(ch, start, length);
+ }
+ }
+
+
+ public interface XWPFBodyContentsHandler {
+
+ void run(RunProperties runProperties, String contents);
+
+ /**
+ * @param link the link; can be null
+ */
+ void hyperlinkStart(String link);
+
+ void hyperlinkEnd();
+
+ void startParagraph(ParagraphProperties paragraphProperties);
+
+ void endParagraph();
+
+ void startTable();
+
+ void endTable();
+
+ void startTableRow();
+
+ void endTableRow();
+
+ void startTableCell();
+
+ void endTableCell();
+
+ void startSDT();
+
+ void endSDT();
+
+ void startEditedSection(String editor, Date date, EditType editType);
+
+ void endEditedSection();
+
+ boolean getIncludeDeletedText();
+
+ void footnoteReference(String id);
+
+ void endnoteReference(String id);
+
+ boolean getIncludeMoveFromText();
+
+ void embeddedOLERef(String refId);
+
+ void embeddedPicRef(String picFileName, String picDescription);
+
+ void startBookmark(String id, String name);
+
+ void endBookmark(String id);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index 21577c4..a7de780 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -36,9 +36,7 @@ import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFDocumentXMLBodyHandler;
import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor;
-import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFTikaBodyPartHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -110,14 +108,14 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
handleBasicRelatedParts(XSLFRelation.SLIDE_MASTER.getRelation(),
"slide-master",
mainDocument,
- new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
+ new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())));
handleBasicRelatedParts(HANDOUT_MASTER,
"slide-handout-master",
mainDocument,
- new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), new HashMap<String, String>())
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>())
);
}
@@ -162,8 +160,8 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
- new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))));
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))));
} catch (TikaException e) {
//do something with this
@@ -174,19 +172,19 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
handleBasicRelatedParts(XSLFRelation.SLIDE_LAYOUT.getRelation(),
"slide-master-content", slidePart,
- new PlaceHolderSkipper(new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), linkedRelationships))
+ new PlaceHolderSkipper(new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships))
);
handleBasicRelatedParts(XSLFRelation.NOTES.getRelation(),
"slide-notes", slidePart,
- new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
handleBasicRelatedParts(XSLFRelation.NOTES_MASTER.getRelation(),
"slide-notes-master", slidePart,
- new XSLFDocumentXMLBodyHandler(
- new XSLFTikaBodyPartHandler(xhtml), linkedRelationships));
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml), linkedRelationships));
handleBasicRelatedParts(XSLFRelation.COMMENTS.getRelation(),
null, slidePart,
@@ -387,9 +385,9 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
private static class PlaceHolderSkipper extends DefaultHandler {
- private final XSLFDocumentXMLBodyHandler wrappedHandler;
+ private final ContentHandler wrappedHandler;
- PlaceHolderSkipper(XSLFDocumentXMLBodyHandler wrappedHandler) {
+ PlaceHolderSkipper(ContentHandler wrappedHandler) {
this.wrappedHandler = wrappedHandler;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index d60b274..8f9fbf5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -34,11 +34,9 @@ import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFDocumentXMLBodyHandler;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
-import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFTikaBodyPartHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -166,8 +164,8 @@ public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
context.getSAXParser().parse(
new CloseShieldInputStream(stream),
new OfflineContentHandler(new EmbeddedContentHandler(
- new XWPFDocumentXMLBodyHandler(
- new XWPFTikaBodyPartHandler(xhtml, styles, listManager,
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml, styles, listManager,
context.get(OfficeParserConfig.class)), linkedRelationships))));
} catch (TikaException e) {
//swallow
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
index 2a99126..c8bcdc7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFListManager.java
@@ -31,6 +31,12 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTNumLvl;
public class XWPFListManager extends AbstractListManager {
+
+ /**
+ * Empty singleton to be used when there is no list manager.
+ * Always returns empty string.
+ */
+ public final static XWPFListManager EMPTY_LIST = new EmptyListManager();
private final static boolean OVERRIDE_AVAILABLE;
private final static String SKIP_FORMAT = Character.toString((char) 61623);//if this shows up as the lvlText, don't show a number
@@ -175,4 +181,21 @@ public class XWPFListManager extends AbstractListManager {
return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
}
+
+ private static class EmptyListManager extends XWPFListManager {
+ EmptyListManager() {
+ super(null);
+ }
+
+ @Override
+ public String getFormattedNumber(XWPFParagraph paragraph) {
+ return "";
+ }
+
+ @Override
+ public String getFormattedNumber(BigInteger numId, int iLvl) {
+ return "";
+ }
+
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
deleted file mode 100644
index b5aa449..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xslf;
-
-
-import java.util.Map;
-
-import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-public class XSLFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
-
-
- private final XSLFBodyContentsHandler bodyContentsHandler;
- //private final RelationshipsManager relationshipsManager;
-
-
- //alternate content can be embedded in itself.
- //need to track depth.
- //if in alternate, choose fallback, maybe make this configurable?
- private int inACChoiceDepth = 0;
- private int inACFallbackDepth = 0;
-
- private boolean inHyperlink = false;
-
- private final Map<String, String> linkedRelationships;
-
- public XSLFDocumentXMLBodyHandler(XSLFBodyContentsHandler bodyContentsHandler,
- Map<String, String> linkedRelationships) {
- this.bodyContentsHandler = bodyContentsHandler;
- this.linkedRelationships = linkedRelationships;
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
-
- if (lastStartElementWasP && ! PPR.equals(localName)) {
- bodyContentsHandler.startParagraph(currPProperties);
- pStarted = true;
- }
-
- lastStartElementWasP = false;
-
- if (uri != null && uri.equals(MC_NS)) {
- if (CHOICE.equals(localName)) {
- inACChoiceDepth++;
- } else if (FALLBACK.equals(localName)) {
- inACFallbackDepth++;
- }
- }
-
- if (inACChoiceDepth > 0) {
- return;
- }
- //these are sorted descending by frequency
- //in our regression corpus
- if (RPR.equals(localName)) {
- inRPr = true;
- } else if (R.equals(localName)) {
- inR = true;
- } else if (T.equals(localName)) {
- inT = true;
- } else if (TAB.equals(localName)) {
- runBuffer.append(TAB_CHAR);
- } else if (P.equals(localName)) {
- lastStartElementWasP = true;
- } else if (B.equals(localName)) { //TODO: add bCs
- if(inR && inRPr) {
- currRunProperties.setBold(true);
- }
- } else if (TC.equals(localName)) {
- bodyContentsHandler.startTableCell();
- } else if (P_STYLE.equals(localName)) {
- String styleId = atts.getValue(W_NS, "val");
- currPProperties.setStyleID(styleId);
- } else if (I.equals(localName)) { //TODO: add iCs
- //rprs don't have to be inR; ignore those that aren't
- if (inR && inRPr) {
- currRunProperties.setItalics(true);
- }
- } else if (FLD.equals(localName)) {
- inR = true;
- } else if (TR.equals(localName)) {
- bodyContentsHandler.startTableRow();
- } else if (NUM_PR.equals(localName)) {
- inNumPr = true;
- } else if (ILVL.equals(localName)) {
- if (inNumPr) {
- currPProperties.setIlvl(getIntVal(atts));
- }
- } else if (NUM_ID.equals(localName)) {
- if (inNumPr) {
- currPProperties.setNumId(getIntVal(atts));
- }
- } else if(BR.equals(localName)) {
- runBuffer.append(NEWLINE);
- } else if ("hlinkClick".equals(localName)) {
- String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
- String hyperlink = null;
- if (hyperlinkId != null) {
- hyperlink = linkedRelationships.get(hyperlinkId);
- bodyContentsHandler.hyperlinkStart(hyperlink);
- inHyperlink = true;
- }/* else {
- String anchor = atts.getValue(W_NS, "anchor");
- if (anchor != null) {
- anchor = "#" + anchor;
- }
- bodyContentsHandler.hyperlinkStart(anchor);
- inHyperlink = true;
- }*/
- } else if(TBL.equals(localName)) {
- bodyContentsHandler.startTable();
- } else if (BLIP.equals(localName)) { //check for DRAWING_NS
- picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
- } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
- picDescription = atts.getValue("", "descr");
- } else if (PIC.equals(localName)) {
- inPic = true; //check for PIC_NS?
- } else if (IMAGEDATA.equals(localName)) {
- picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
- picDescription = atts.getValue(O_NS, "title");
- } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
- String type = null;
- String refId = null;
- //TODO: clean this up and ...want to get ProgID?
- for (int i = 0; i < atts.getLength(); i++) {
- String attLocalName = atts.getLocalName(i);
- String attValue = atts.getValue(i);
- if (attLocalName.equals("Type")) {
- type = attValue;
- } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
- refId = attValue;
- }
- }
- if ("Embed".equals(type)) {
- bodyContentsHandler.embeddedOLERef(refId);
- }
- } else if(CR.equals(localName)) {
- runBuffer.append(NEWLINE);
- }
-
- }
-
-
- private int getIntVal(Attributes atts) {
- String valString = atts.getValue(W_NS, "val");
- if (valString != null) {
- try {
- return Integer.parseInt(valString);
- } catch (NumberFormatException e) {
- //swallow
- }
- }
- return -1;
- }
-
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
-
- if (CHOICE.equals(localName)) {
- inACChoiceDepth--;
- } else if (FALLBACK.equals(localName)) {
- inACFallbackDepth--;
- }
- if (inACChoiceDepth > 0) {
- return;
- }
-
- if (PIC.equals(localName)) { //PIC_NS
- handlePict();
- inPic = false;
- return;
- } else if (RPR.equals(localName)) {
- inRPr = false;
- } else if (R.equals(localName)) {
- handleEndOfRun();
- } else if (T.equals(localName)) {
- inT = false;
- } else if (PPR.equals(localName)) {
- if (!pStarted) {
- bodyContentsHandler.startParagraph(currPProperties);
- pStarted = true;
- }
- currPProperties.reset();
- } else if (P.equals(localName)) {
- if (runBuffer.length() > 0) {
- //<p><tab></p>...this will treat that as if it were
- //a run...TODO: should we swallow whitespace that doesn't occur in a run?
- bodyContentsHandler.run(currRunProperties, runBuffer.toString());
- runBuffer.setLength(0);
- }
- pStarted = false;
- bodyContentsHandler.endParagraph();
- } else if (TC.equals(localName)) {
- bodyContentsHandler.endTableCell();
- } else if (TR.equals(localName)) {
- bodyContentsHandler.endTableRow();
- } else if (TBL.equals(localName)) {
- bodyContentsHandler.endTable();
- } else if (FLD.equals(localName)) {
- handleEndOfRun();
- } else if (HYPERLINK.equals(localName)) {
- bodyContentsHandler.hyperlinkEnd();
- } else if (PICT.equals(localName)) {
- handlePict();
- }
- }
-
- private void handleEndOfRun() {
- bodyContentsHandler.run(currRunProperties, runBuffer.toString());
- if (inHyperlink) {
- bodyContentsHandler.hyperlinkEnd();
- inHyperlink = false;
- }
- inR = false;
- runBuffer.setLength(0);
- currRunProperties.setBold(false);
- currRunProperties.setItalics(false);
- }
-
- private void handlePict() {
- String picFileName = null;
- if (picRId != null) {
- picFileName = "picId";//TODO: linkedRelationships.get(picRId);
- }
- bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
- picDescription = null;
- picRId = null;
- inPic = false;
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
-
- if (inACChoiceDepth > 0) {
- return;
- }
- if (inT) {
- runBuffer.append(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- if (inACChoiceDepth > 0) {
- return;
- }
-
- if (inT) {
- runBuffer.append(ch, start, length);
- }
- }
-
-
- public interface XSLFBodyContentsHandler {
-
- void run(RunProperties runProperties, String contents);
-
- /**
- * @param link the link; can be null
- */
- void hyperlinkStart(String link);
-
- void hyperlinkEnd();
-
- void startParagraph(ParagraphProperties paragraphProperties);
-
- void endParagraph();
-
- void startTable();
-
- void endTable();
-
- void startTableRow();
-
- void endTableRow();
-
- void startTableCell();
-
- void endTableCell();
-
- void embeddedOLERef(String refId);
-
- void embeddedPicRef(String picFileName, String picDescription);
-
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
index 15bbd6a..3e98203 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFEventBasedPowerPointExtractor.java
@@ -18,12 +18,14 @@
package org.apache.tika.parser.microsoft.ooxml.xslf;
import java.io.IOException;
+import java.util.Date;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLProperties;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
@@ -83,7 +85,7 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
- private class XSLFToTextContentHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
+ private class XSLFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
private final StringBuilder buffer;
public XSLFToTextContentHandler(StringBuilder buffer) {
@@ -145,6 +147,45 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
buffer.append("\t");
}
+ @Override
+ public void startSDT() {
+
+ }
+
+ @Override
+ public void endSDT() {
+
+ }
+
+ @Override
+ public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
+
+ }
+
+ @Override
+ public void endEditedSection() {
+
+ }
+
+ @Override
+ public boolean getIncludeDeletedText() {
+ return false;
+ }
+
+ @Override
+ public void footnoteReference(String id) {
+
+ }
+
+ @Override
+ public void endnoteReference(String id) {
+
+ }
+
+ @Override
+ public boolean getIncludeMoveFromText() {
+ return false;
+ }
@Override
@@ -157,5 +198,15 @@ public class XSLFEventBasedPowerPointExtractor extends POIXMLTextExtractor {
//no-op
}
+ @Override
+ public void startBookmark(String id, String name) {
+
+ }
+
+ @Override
+ public void endBookmark(String id) {
+
+ }
+
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
deleted file mode 100644
index ff587f7..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xslf/XSLFTikaBodyPartHandler.java
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xslf;
-
-
-import java.math.BigInteger;
-
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-public class XSLFTikaBodyPartHandler implements XSLFDocumentXMLBodyHandler.XSLFBodyContentsHandler {
-
- private final static String P = "p";
-
- private final static char[] NEWLINE = new char[]{'\n'};
- private final static char[] TAB = new char[]{'\t'};
-
- private final XHTMLContentHandler xhtml;
-
- private int pDepth = 0; //paragraph depth
- private int tableDepth = 0;//table depth
- private int pWithinCell = 0;//paragraph count within a cell
- private boolean isItalics = false;
- private boolean isBold = false;
- private boolean wroteHyperlinkStart = false;
- private boolean inTableCell = false;
-
- public XSLFTikaBodyPartHandler(XHTMLContentHandler xhtml) {
- this.xhtml = xhtml;
- }
-
- @Override
- public void run(RunProperties runProperties, String contents) {
- try {
- // True if we are currently in the named style tag:
- if (runProperties.getBold() != isBold) {
- if (isItalics) {
- xhtml.endElement("i");
- isItalics = false;
- }
- if (runProperties.getBold()) {
- xhtml.startElement("b");
- isBold = true;
- } else {
- xhtml.endElement("b");
- isBold = false;
- }
- }
-
- if (runProperties.getItalics() != isItalics) {
- if (runProperties.getItalics()) {
- xhtml.startElement("i");
- isItalics = true;
- } else {
- xhtml.endElement("i");
- isItalics = false;
- }
- }
-
- xhtml.characters(contents);
-
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void hyperlinkStart(String link) {
- try {
- if (link != null) {
- xhtml.startElement("a", "href", link);
- wroteHyperlinkStart = true;
- }
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void hyperlinkEnd() {
- try {
- if (wroteHyperlinkStart) {
- closeStyleTags();
- wroteHyperlinkStart = false;
- xhtml.endElement("a");
- }
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startParagraph(ParagraphProperties paragraphProperties) {
- if (pDepth == 0 && tableDepth == 0) {
- try {
- xhtml.startElement(P);
- } catch (SAXException e) {
-
- }
- }
- pDepth++;
- }
-
- @Override
- public void endParagraph() {
- try {
- closeStyleTags();
- if (pDepth == 1 && tableDepth == 0) {
- xhtml.endElement(P);
- } else if (pWithinCell > 0){
- xhtml.characters(NEWLINE, 0, 1);
- }
- } catch (SAXException e) {
-
- }
- if (inTableCell) {
- pWithinCell++;
- }
- pDepth--;
- }
-
- @Override
- public void startTable() {
- try {
- xhtml.startElement("table");
- tableDepth++;
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endTable() {
- try {
- xhtml.endElement("table");
- tableDepth--;
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startTableRow() {
- try {
- xhtml.startElement("tr");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void endTableRow() {
- try {
- xhtml.endElement("tr");
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void startTableCell() {
- try {
- xhtml.startElement("td");
- } catch (SAXException e) {
-
- }
- inTableCell = true;
- }
-
- @Override
- public void endTableCell() {
- try {
- xhtml.endElement("td");
- } catch (SAXException e) {
-
- }
- inTableCell = false;
- pWithinCell = 0;
- }
-
-
- @Override
- public void embeddedOLERef(String relId) {
- if (relId == null) {
- return;
- }
- try {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", relId);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- } catch (SAXException e) {
-
- }
- }
-
- @Override
- public void embeddedPicRef(String picFileName, String picDescription) {
-
- try {
- AttributesImpl attr = new AttributesImpl();
- if (picFileName != null) {
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + picFileName);
- }
- if (picDescription != null) {
- attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
- }
-
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
-
- } catch (SAXException e) {
-
- }
- }
-
- private void closeStyleTags() throws SAXException {
- if (isItalics) {
- xhtml.endElement("i");
- isItalics = false;
- }
- if (isBold) {
- xhtml.endElement("b");
- isBold = false;
- }
- }
-
- private void writeParagraphNumber(int numId, int ilvl,
- XWPFListManager listManager,
- XHTMLContentHandler xhtml) throws SAXException {
-
- if (ilvl < 0 || numId < 0 || listManager == null) {
- return;
- }
- String number = listManager.getFormattedNumber(BigInteger.valueOf(numId), ilvl);
- if (number != null) {
- xhtml.characters(number);
- }
-
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
deleted file mode 100644
index d08fb07..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFDocumentXMLBodyHandler.java
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.microsoft.ooxml.xwpf;
-
-
-import java.util.Date;
-import java.util.Map;
-
-import org.apache.tika.parser.microsoft.ooxml.AbstractDocumentXMLBodyHandler;
-import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
-import org.apache.tika.parser.microsoft.ooxml.RunProperties;
-import org.apache.tika.utils.DateUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-
-/**
- * This class is intended to handle anything that might contain IBodyElements:
- * main document, headers, footers, notes, etc.
- */
-
-public class XWPFDocumentXMLBodyHandler extends AbstractDocumentXMLBodyHandler {
-
-
- enum EditType {
- NONE,
- INSERT,
- DELETE,
- MOVE_TO,
- MOVE_FROM
- }
-
-
- private final static String BOOKMARK_START = "bookmarkStart";
- private final static String BOOKMARK_END = "bookmarkEnd";
- private final static String FOOTNOTE_REFERENCE = "footnoteReference";
- private final static String INS = "ins";
- private final static String DEL = "del";
- private final static String DEL_TEXT = "delText";
- private final static String MOVE_FROM = "moveFrom";
- private final static String MOVE_TO = "moveTo";
- private final static String ENDNOTE_REFERENCE = "endnoteReference";
-
- private final XWPFBodyContentsHandler bodyContentsHandler;
- //private final RelationshipsManager relationshipsManager;
- private final Map<String, String> linkedRelationships;
-
- private boolean inDelText = false;
-
- private XWPFDocumentXMLBodyHandler.EditType editType = XWPFDocumentXMLBodyHandler.EditType.NONE;
-
-
- public XWPFDocumentXMLBodyHandler(XWPFBodyContentsHandler bodyContentsHandler,
- Map<String, String> hyperlinks) {
- this.bodyContentsHandler = bodyContentsHandler;
- this.linkedRelationships = hyperlinks;
- }
-
-
- @Override
- public void startDocument() throws SAXException {
- }
-
- @Override
- public void endDocument() throws SAXException {
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- }
-
- @Override
- public void endPrefixMapping(String prefix) throws SAXException {
- }
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- //TODO: checkBox, textBox, sym, headerReference, footerReference, commentRangeEnd
-
- if (lastStartElementWasP && ! PPR.equals(localName)) {
- bodyContentsHandler.startParagraph(currPProperties);
- }
-
- lastStartElementWasP = false;
-
- if (uri != null && uri.equals(MC_NS)) {
- if (CHOICE.equals(localName)) {
- inACChoiceDepth++;
- } else if (FALLBACK.equals(localName)) {
- inACFallbackDepth++;
- }
- }
-
- if (inACChoiceDepth > 0) {
- return;
- }
- //these are sorted descending by frequency
- //in our regression corpus
- if (RPR.equals(localName)) {
- inRPr = true;
- } else if (R.equals(localName)) {
- inR = true;
- } else if (T.equals(localName)) {
- inT = true;
- } else if (TAB.equals(localName)) {
- runBuffer.append(TAB_CHAR);
- } else if (P.equals(localName)) {
- lastStartElementWasP = true;
- } else if (B.equals(localName)) { //TODO: add bCs
- if(inR && inRPr) {
- currRunProperties.setBold(true);
- }
- } else if (TC.equals(localName)) {
- bodyContentsHandler.startTableCell();
- } else if (P_STYLE.equals(localName)) {
- String styleId = atts.getValue(W_NS, "val");
- currPProperties.setStyleID(styleId);
- } else if (I.equals(localName)) { //TODO: add iCs
- //rprs don't have to be inR; ignore those that aren't
- if (inR && inRPr) {
- currRunProperties.setItalics(true);
- }
- } else if (TR.equals(localName)) {
- bodyContentsHandler.startTableRow();
- } else if (NUM_PR.equals(localName)) {
- inNumPr = true;
- } else if (ILVL.equals(localName)) {
- if (inNumPr) {
- currPProperties.setIlvl(getIntVal(atts));
- }
- } else if (NUM_ID.equals(localName)) {
- if (inNumPr) {
- currPProperties.setNumId(getIntVal(atts));
- }
- } else if(BR.equals(localName)) {
- runBuffer.append(NEWLINE);
- } else if (BOOKMARK_START.equals(localName)) {
- String name = atts.getValue(W_NS, "name");
- String id = atts.getValue(W_NS, "id");
- bodyContentsHandler.startBookmark(id, name);
- } else if (BOOKMARK_END.equals(localName)) {
- String id = atts.getValue(W_NS, "id");
- bodyContentsHandler.endBookmark(id);
- } else if (HYPERLINK.equals(localName)) {
- String hyperlinkId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
- String hyperlink = null;
- if (hyperlinkId != null) {
- hyperlink = linkedRelationships.get(hyperlinkId);
- bodyContentsHandler.hyperlinkStart(hyperlink);
- } else {
- String anchor = atts.getValue(W_NS, "anchor");
- if (anchor != null) {
- anchor = "#" + anchor;
- }
- bodyContentsHandler.hyperlinkStart(anchor);
- }
- } else if(TBL.equals(localName)) {
- bodyContentsHandler.startTable();
- } else if (BLIP.equals(localName)) { //check for DRAWING_NS
- picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "embed");
- } else if ("cNvPr".equals(localName)) { //check for PIC_NS?
- picDescription = atts.getValue("", "descr");
- } else if (PIC.equals(localName)) {
- inPic = true; //check for PIC_NS?
- } //TODO: add sdt, sdtPr, sdtContent goes here statistically
- else if (FOOTNOTE_REFERENCE.equals(localName)) {
- String id = atts.getValue(W_NS, "id");
- bodyContentsHandler.footnoteReference(id);
- } else if (IMAGEDATA.equals(localName)) {
- picRId = atts.getValue(OFFICE_DOC_RELATIONSHIP_NS, "id");
- picDescription = atts.getValue(O_NS, "title");
- } else if (INS.equals(localName)) {
- startEditedSection(editType.INSERT, atts);
- } else if (DEL_TEXT.equals(localName)) {
- inDelText = true;
- } else if (DEL.equals(localName)) {
- startEditedSection(editType.DELETE, atts);
- } else if (MOVE_TO.equals(localName)) {
- startEditedSection(EditType.MOVE_TO, atts);
- } else if (MOVE_FROM.equals(localName)) {
- startEditedSection(editType.MOVE_FROM, atts);
- } else if (OLE_OBJECT.equals(localName)){ //check for O_NS?
- String type = null;
- String refId = null;
- //TODO: clean this up and ...want to get ProgID?
- for (int i = 0; i < atts.getLength(); i++) {
- String attLocalName = atts.getLocalName(i);
- String attValue = atts.getValue(i);
- if (attLocalName.equals("Type")) {
- type = attValue;
- } else if (OFFICE_DOC_RELATIONSHIP_NS.equals(atts.getURI(i)) && attLocalName.equals("id")) {
- refId = attValue;
- }
- }
- if ("Embed".equals(type)) {
- bodyContentsHandler.embeddedOLERef(refId);
- }
- } else if(CR.equals(localName)) {
- runBuffer.append(NEWLINE);
- } else if (ENDNOTE_REFERENCE.equals(localName)) {
- String id = atts.getValue(W_NS, "id");
- bodyContentsHandler.endnoteReference(id);
- }
-
- }
-
- private void startEditedSection(EditType editType, Attributes atts) {
- String editAuthor = atts.getValue(W_NS, "author");
- String editDateString = atts.getValue(W_NS, "date");
- Date editDate = null;
- if (editDateString != null) {
- editDate = DateUtils.tryToParse(editDateString);
- }
- bodyContentsHandler.startEditedSection(editAuthor, editDate, editType);
- this.editType = editType;
- }
-
- private int getIntVal(Attributes atts) {
- String valString = atts.getValue(W_NS, "val");
- if (valString != null) {
- try {
- return Integer.parseInt(valString);
- } catch (NumberFormatException e) {
- //swallow
- }
- }
- return -1;
- }
-
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
-
- if (CHOICE.equals(localName)) {
- inACChoiceDepth--;
- } else if (FALLBACK.equals(localName)) {
- inACFallbackDepth--;
- }
- if (inACChoiceDepth > 0) {
- return;
- }
-
- if (PIC.equals(localName)) { //PIC_NS
- handlePict();
- inPic = false;
- return;
- } else if (RPR.equals(localName)) {
- inRPr = false;
- } else if (R.equals(localName)) {
- bodyContentsHandler.run(currRunProperties, runBuffer.toString());
- inR = false;
- runBuffer.setLength(0);
- currRunProperties.setBold(false);
- currRunProperties.setItalics(false);
- } else if (T.equals(localName)) {
- inT = false;
- } else if (PPR.equals(localName)) {
- bodyContentsHandler.startParagraph(currPProperties);
- currPProperties.reset();
- } else if (P.equals(localName)) {
- bodyContentsHandler.endParagraph();
- } else if (TC.equals(localName)) {
- bodyContentsHandler.endTableCell();
- } else if (TR.equals(localName)) {
- bodyContentsHandler.endTableRow();
- } else if (TBL.equals(localName)) {
- bodyContentsHandler.endTable();
- } else if (HYPERLINK.equals(localName)) {
- bodyContentsHandler.hyperlinkEnd();
- } else if (DEL_TEXT.equals(localName)) {
- inDelText = false;
- } else if (INS.equals(localName) || DEL.equals(localName) ||
- MOVE_TO.equals(localName) || MOVE_FROM.equals(localName)) {
- editType = EditType.NONE;
- } else if (PICT.equals(localName)) {
- handlePict();
-
- }
- }
-
- private void handlePict() {
- String picFileName = null;
- if (picRId != null) {
- picFileName = linkedRelationships.get(picRId);
- }
- bodyContentsHandler.embeddedPicRef(picFileName, picDescription);
- picDescription = null;
- picRId = null;
- inPic = false;
- }
-
- @Override
- public void characters(char[] ch, int start, int length) throws SAXException {
-
- if (inACChoiceDepth > 0) {
- return;
- }
- if (editType.equals(EditType.MOVE_FROM) && inT) {
- if (bodyContentsHandler.getIncludeMoveFromText()) {
- runBuffer.append(ch, start, length);
- }
- } else if (inT) {
- runBuffer.append(ch, start, length);
- } else if (bodyContentsHandler.getIncludeDeletedText() && editType.equals(EditType.DELETE)) {
- runBuffer.append(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
- if (inACChoiceDepth > 0) {
- return;
- }
-
- if (inT) {
- runBuffer.append(ch, start, length);
- } else if (bodyContentsHandler.getIncludeDeletedText() && inDelText) {
- runBuffer.append(ch, start, length);
- }
- }
-
-
- public interface XWPFBodyContentsHandler {
-
- void run(RunProperties runProperties, String contents);
-
- /**
- * @param link the link; can be null
- */
- void hyperlinkStart(String link);
-
- void hyperlinkEnd();
-
- void startParagraph(ParagraphProperties paragraphProperties);
-
- void endParagraph();
-
- void startTable();
-
- void endTable();
-
- void startTableRow();
-
- void endTableRow();
-
- void startTableCell();
-
- void endTableCell();
-
- void startSDT();
-
- void endSDT();
-
- void startEditedSection(String editor, Date date, EditType editType);
-
- void endEditedSection();
-
- boolean getIncludeDeletedText();
-
- void footnoteReference(String id);
-
- void endnoteReference(String id);
-
- boolean getIncludeMoveFromText();
-
- void embeddedOLERef(String refId);
-
- void embeddedPicRef(String picFileName, String picDescription);
-
- void startBookmark(String id, String name);
-
- void endBookmark(String id);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/376318fc/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index f61fa56..7466d09 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -38,6 +38,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
import org.apache.tika.parser.microsoft.ooxml.RunProperties;
import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
@@ -182,7 +183,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
try (InputStream stream = packagePart.getInputStream()) {
XMLReader reader = SAXHelper.newXMLReader();
- reader.setContentHandler(new XWPFDocumentXMLBodyHandler(
+ reader.setContentHandler(new OOXMLWordAndPowerPointTextHandler(
new XWPFToTextContentHandler(buffer), hyperlinks));
reader.parse(new InputSource(new CloseShieldInputStream(stream)));
@@ -232,7 +233,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
return null;
}
- private class XWPFToTextContentHandler implements XWPFDocumentXMLBodyHandler.XWPFBodyContentsHandler {
+ private class XWPFToTextContentHandler implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
private final StringBuilder buffer;
public XWPFToTextContentHandler(StringBuilder buffer) {
@@ -305,7 +306,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
}
@Override
- public void startEditedSection(String editor, Date date, XWPFDocumentXMLBodyHandler.EditType editType) {
+ public void startEditedSection(String editor, Date date, OOXMLWordAndPowerPointTextHandler.EditType editType) {
}