You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [11/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,711 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
+import org.apache.poi.hwpf.model.FieldsDocumentPart;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.StyleDescription;
+import org.apache.poi.hwpf.usermodel.CharacterRun;
+import org.apache.poi.hwpf.usermodel.Field;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Picture;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.hwpf.usermodel.Table;
+import org.apache.poi.hwpf.usermodel.TableCell;
+import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class WordExtractor extends AbstractPOIFSExtractor {
+
+ private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011'; // substituted for char 30 in run text by handleCharacterRun
+ private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b'; // substituted for char 31 in run text by handleCharacterRun
+ // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3, Part 3, Step 3
+ private static final String LIST_DELIMITER = " ";
+ private static final Map<String, TagAndStyle> fixedParagraphStyles = new HashMap<String, TagAndStyle>(); // exact style name -> fixed tag/class
+ private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null); // plain <p> with no class attribute
+
+ static { // these names are matched exactly, and before any other rule, by buildParagraphTagAndStyle
+ fixedParagraphStyles.put("Default", defaultParagraphStyle);
+ fixedParagraphStyles.put("Normal", defaultParagraphStyle);
+ fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
+ fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
+ fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
+ fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
+ fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
+ }
+
+ // True if we are currently in the named style tag (s, b or i); handleParagraph closes and resets all three at the end of each paragraph:
+ private boolean curStrikeThrough;
+ private boolean curBold;
+ private boolean curItalic;
+
+ public WordExtractor(ParseContext context) { // the context is passed straight through to AbstractPOIFSExtractor
+ super(context);
+ }
+
+ /**
+  * Adds up the paragraph counts of the given ranges, ignoring null entries.
+  *
+  * @param ranges ranges to inspect; individual entries may be null
+  * @return total number of paragraphs across all non-null ranges
+  */
+ private static int countParagraphs(Range... ranges) {
+     int total = 0;
+     for (int idx = 0; idx < ranges.length; idx++) {
+         Range candidate = ranges[idx];
+         if (candidate == null) {
+             continue;
+         }
+         total += candidate.numParagraphs();
+     }
+     return total;
+ }
+
+ /**
+  * Given a style name, return what tag should be used, and
+  * what style should be applied to it.
+  * <p>
+  * Names registered in the fixed style table win. "Table Contents" inside a
+  * table maps to the default paragraph. Names starting with "heading" or
+  * "Heading" become h1 - h6 according to their trailing number. Anything else
+  * becomes a p whose class is the style name with spaces replaced by
+  * underscores and the first character lower-cased.
+  *
+  * @param styleName the Word style name; null or empty yields the default paragraph
+  * @param isTable   true if the paragraph being styled sits inside a table
+  * @return the tag and (possibly null) style class to emit
+  */
+ public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
+     // A missing or empty name previously failed with an exception; it carries
+     // no styling information, so treat it as a plain paragraph instead
+     if (styleName == null || styleName.isEmpty()) {
+         return defaultParagraphStyle;
+     }
+
+     TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
+     if (tagAndStyle != null) {
+         return tagAndStyle;
+     }
+
+     if (styleName.equals("Table Contents") && isTable) {
+         return defaultParagraphStyle;
+     }
+
+     String tag = "p";
+     String styleClass = null;
+
+     if (styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+         // "Heading 3" or "Heading2" or "heading 4": read ALL trailing digits, so
+         // that "Heading 10" is level 10 rather than level 0
+         int digitsStart = styleName.length();
+         while (digitsStart > 0 && Character.isDigit(styleName.charAt(digitsStart - 1))) {
+             digitsStart--;
+         }
+         int num = 1;
+         if (digitsStart < styleName.length()) {
+             try {
+                 num = Integer.parseInt(styleName.substring(digitsStart));
+             } catch (NumberFormatException e) {
+                 // Number too large to parse: fall back to the top heading level
+                 num = 1;
+             }
+         }
+         // Turn it into a H1 - H6 (H0 and H7+ aren't valid!)
+         tag = "h" + Math.max(1, Math.min(num, 6));
+     } else {
+         styleClass = styleName.replace(' ', '_');
+         styleClass = styleClass.substring(0, 1).toLowerCase(Locale.ROOT) +
+                 styleClass.substring(1);
+     }
+
+     return new TagAndStyle(tag, styleClass);
+ }
+
+ /**
+  * Parses the Word document stored in the given filesystem by delegating to
+  * the directory-based variant with the filesystem's root node.
+  */
+ protected void parse(
+         NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     DirectoryNode rootNode = filesystem.getRoot();
+     parse(rootNode, xhtml);
+ }
+
+ /**
+  * Emits the document rooted at the given directory as XHTML. Output order is:
+  * headers, main body paragraphs, text boxes, footnotes, comments, endnotes,
+  * footers, any pictures not yet output, and finally embedded documents found
+  * under the "ObjectPool" entry. Files that POI rejects as an old Word format
+  * are handed to parseWord6 instead.
+  */
+ protected void parse(
+         DirectoryNode root, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     HWPFDocument document;
+     try {
+         document = new HWPFDocument(root);
+     } catch (OldWordFileFormatException e) {
+         parseWord6(root, xhtml);
+         return;
+     }
+     org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
+             new org.apache.poi.hwpf.extractor.WordExtractor(document);
+     HeaderStories headerFooter = new HeaderStories(document);
+
+     // Grab the list of pictures. As far as we can tell,
+     // the pictures should be in order, and may be directly
+     // placed or referenced from an anchor
+     PicturesTable pictureTable = document.getPicturesTable();
+     PicturesSource pictures = new PicturesSource(document);
+
+     // Headers first, if there are any
+     Range[] headers = {
+             headerFooter.getFirstHeaderSubrange(),
+             headerFooter.getEvenHeaderSubrange(),
+             headerFooter.getOddHeaderSubrange()
+     };
+     handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);
+
+     // Main body: handleParagraph reports how many extra paragraphs it consumed
+     Range body = document.getRange();
+     ListManager listManager = new ListManager(document);
+     int paragraphIndex = 0;
+     while (paragraphIndex < body.numParagraphs()) {
+         Paragraph paragraph = body.getParagraph(paragraphIndex);
+         int consumed = handleParagraph(paragraph, 0, body, document,
+                 FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
+         paragraphIndex += consumed + 1;
+     }
+
+     // Remaining text stories, each paragraph as a plain <p>
+     for (String textboxParagraph : wordExtractor.getMainTextboxText()) {
+         xhtml.element("p", textboxParagraph);
+     }
+     for (String footnoteParagraph : wordExtractor.getFootnoteText()) {
+         xhtml.element("p", footnoteParagraph);
+     }
+     for (String commentParagraph : wordExtractor.getCommentsText()) {
+         xhtml.element("p", commentParagraph);
+     }
+     for (String endnoteParagraph : wordExtractor.getEndnoteText()) {
+         xhtml.element("p", endnoteParagraph);
+     }
+
+     // Footers, if there are any
+     Range[] footers = {
+             headerFooter.getFirstFooterSubrange(),
+             headerFooter.getEvenFooterSubrange(),
+             headerFooter.getOddFooterSubrange()
+     };
+     handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);
+
+     // Output whatever pictures nobody has claimed so far
+     Picture leftover = pictures.nextUnclaimed();
+     while (leftover != null) {
+         handlePictureCharacterRun(null, leftover, pictures, xhtml);
+         leftover = pictures.nextUnclaimed();
+     }
+
+     // Embedded office documents live in "_"-prefixed directories of the ObjectPool
+     try {
+         DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
+         for (Entry child : objectPool) {
+             if (!(child.getName().startsWith("_") && child instanceof DirectoryEntry)) {
+                 continue;
+             }
+             handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
+         }
+     } catch (FileNotFoundException e) {
+     }
+ }
+
+ /**
+  * Outputs the paragraphs of the given header or footer ranges inside a single
+  * {@code <div>} whose class attribute is the given type. Nothing is emitted
+  * when the ranges contain no paragraphs at all.
+  */
+ private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
+         PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
+         throws SAXException, IOException, TikaException {
+     if (countParagraphs(ranges) <= 0) {
+         return;
+     }
+
+     xhtml.startElement("div", "class", type);
+     ListManager listManager = new ListManager(document);
+     for (Range section : ranges) {
+         if (section == null) {
+             continue;
+         }
+         int pos = 0;
+         while (pos < section.numParagraphs()) {
+             Paragraph paragraph = section.getParagraph(pos);
+             // handleParagraph returns the number of extra paragraphs it consumed
+             int consumed = handleParagraph(paragraph, 0, section, document,
+                     FieldsDocumentPart.HEADER, pictures, pictureTable, listManager, xhtml);
+             pos += consumed + 1;
+         }
+     }
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Outputs a single paragraph or, when the paragraph opens a top-level table,
+  * the whole table with one td per cell.
+  *
+  * @param p                the paragraph to output
+  * @param parentTableLevel table level of the caller; 0 when not inside a table
+  * @param r                the range containing p, used to look up tables and field separator runs
+  * @param document         the document, consulted for styles and fields
+  * @param docPart          which part of the document's fields to search
+  * @param pictures         picture lookup and bookkeeping
+  * @param pictureTable     used to detect runs that carry an inline picture
+  * @param listManager      supplies the formatted number for list paragraphs
+  * @param xhtml            where the output goes
+  * @return the number of additional paragraphs consumed beyond p; non-zero only
+  *         when a table was output
+  */
+ private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document,
+         FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager,
+         XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
+     // Note - a poi bug means we can't currently properly recurse
+     // into nested tables, so currently we don't
+     if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
+         Table t = r.getTable(p);
+         xhtml.startElement("table");
+         xhtml.startElement("tbody");
+         for (int rn = 0; rn < t.numRows(); rn++) {
+             TableRow row = t.getRow(rn);
+             xhtml.startElement("tr");
+             for (int cn = 0; cn < row.numCells(); cn++) {
+                 TableCell cell = row.getCell(cn);
+                 xhtml.startElement("td");
+
+                 for (int pn = 0; pn < cell.numParagraphs(); pn++) {
+                     Paragraph cellP = cell.getParagraph(pn);
+                     handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
+                 }
+                 xhtml.endElement("td");
+             }
+             xhtml.endElement("tr");
+         }
+         xhtml.endElement("tbody");
+         xhtml.endElement("table");
+         // The caller already counts p itself, so report only the rest of the table
+         return (t.numParagraphs() - 1);
+     }
+
+     String text = p.text();
+     if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
+         // Skip empty paragraphs
+         return 0;
+     }
+
+     TagAndStyle tas;
+     String numbering = null;
+
+     if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
+         StyleDescription style =
+                 document.getStyleSheet().getStyleDescription(p.getStyleIndex());
+         if (style != null && style.getName() != null && style.getName().length() > 0) {
+             if (p.isInList()) {
+                 numbering = listManager.getFormattedNumber(p);
+             }
+             tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
+         } else {
+             tas = new TagAndStyle("p", null);
+         }
+     } else {
+         tas = new TagAndStyle("p", null);
+     }
+
+     if (tas.getStyleClass() != null) {
+         xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
+     } else {
+         xhtml.startElement(tas.getTag());
+     }
+
+     if (numbering != null) {
+         xhtml.characters(numbering);
+     }
+
+     for (int j = 0; j < p.numCharacterRuns(); j++) {
+         CharacterRun cr = p.getCharacterRun(j);
+         String runText = cr.text();
+
+         // FIELD_BEGIN_MARK: tested with startsWith rather than by indexing the
+         // first encoded byte, which threw on a run with empty text
+         if (runText.startsWith("\u0013")) {
+             Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
+             // 58 is an embedded document
+             // 56 is a document link
+             if (field != null && (field.getType() == 58 || field.getType() == 56)) {
+                 // Embedded Object: add a <div
+                 // class="embedded" id="_X"/> so consumer can see where
+                 // in the main text each embedded document
+                 // occurred:
+                 String id = "_" + field.getMarkSeparatorCharacterRun(r).getPicOffset();
+                 AttributesImpl attributes = new AttributesImpl();
+                 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+                 attributes.addAttribute("", "id", "id", "CDATA", id);
+                 xhtml.startElement("div", attributes);
+                 xhtml.endElement("div");
+             }
+         }
+
+         if (runText.equals("\u0013")) {
+             // The helper reports how many runs of the field it consumed
+             j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
+         } else if (runText.startsWith("\u0008")) {
+             // Floating Picture(s)
+             for (int pn = 0; pn < runText.length(); pn++) {
+                 // Assume they're in the order from the unclaimed list...
+                 Picture picture = pictures.nextUnclaimed();
+
+                 // Output
+                 handlePictureCharacterRun(cr, picture, pictures, xhtml);
+             }
+         } else if (pictureTable.hasPicture(cr)) {
+             // Inline Picture
+             Picture picture = pictures.getFor(cr);
+             handlePictureCharacterRun(cr, picture, pictures, xhtml);
+         } else {
+             handleCharacterRun(cr, tas.isHeading(), xhtml);
+         }
+     }
+
+     // Close any still open style tags, innermost first
+     if (curStrikeThrough) {
+         xhtml.endElement("s");
+         curStrikeThrough = false;
+     }
+     if (curItalic) {
+         xhtml.endElement("i");
+         curItalic = false;
+     }
+     if (curBold) {
+         xhtml.endElement("b");
+         curBold = false;
+     }
+
+     xhtml.endElement(tas.getTag());
+
+     return 0;
+ }
+
+ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLContentHandler xhtml)
+ throws SAXException {
+ // Skip trailing newlines
+ if (!isRendered(cr) || cr.text().equals("\r"))
+ return;
+
+ if (!skipStyling) {
+ if (cr.isBold() != curBold) {
+ // Enforce nesting -- must close s and i tags
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (cr.isBold()) {
+ xhtml.startElement("b");
+ } else {
+ xhtml.endElement("b");
+ }
+ curBold = cr.isBold();
+ }
+
+ if (cr.isItalic() != curItalic) {
+ // Enforce nesting -- must close s tag
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (cr.isItalic()) {
+ xhtml.startElement("i");
+ } else {
+ xhtml.endElement("i");
+ }
+ curItalic = cr.isItalic();
+ }
+
+ if (cr.isStrikeThrough() != curStrikeThrough) {
+ if (cr.isStrikeThrough()) {
+ xhtml.startElement("s");
+ } else {
+ xhtml.endElement("s");
+ }
+ curStrikeThrough = cr.isStrikeThrough();
+ }
+ }
+
+ // Clean up the text
+ String text = cr.text();
+ text = text.replace('\r', '\n');
+ if (text.endsWith("\u0007")) {
+ // Strip the table cell end marker
+ text = text.substring(0, text.length() - 1);
+ }
+
+ // Copied from POI's org/apache/poi/hwpf/converter/AbstractWordConverter.processCharacters:
+
+ // Non-breaking hyphens are returned as char 30
+ text = text.replace((char) 30, UNICODECHAR_NONBREAKING_HYPHEN);
+
+ // Non-required hyphens to zero-width space
+ text = text.replace((char) 31, UNICODECHAR_ZERO_WIDTH_SPACE);
+
+ // Control characters as line break
+ text = text.replaceAll("[\u0000-\u001f]", "\n");
+ xhtml.characters(text);
+ }
+
+ /**
+ * Outputs the runs of one field, starting just after the run at the given index.
+ * Can be \13..text..\15 or \13..control..\14..text..\15 .
+ * Nesting is allowed
+ * <p>
+ * Runs before the 14 mark are collected as "controls", runs after it as "texts";
+ * when no 14 mark is seen before the closing 15, everything collected is treated
+ * as text. If the joined control text starts with HYPERLINK (optionally preceded
+ * by one space) and contains a double quote, the texts are wrapped in an a
+ * element whose href is the quoted value; otherwise the texts are output as they
+ * are, with pictures handled as pictures.
+ *
+ * @param p the paragraph holding the runs
+ * @param index index within p from which to work; scanning starts at index + 1.
+ * handleParagraph passes the index of the run holding the 13 mark
+ * @param skipStyling passed through to handleCharacterRun
+ * @param pictures picture lookup and bookkeeping
+ * @param xhtml where the output goes
+ * @return how many runs the caller should skip, i.e. the index at which scanning
+ * stopped minus the given index
+ */
+ private int handleSpecialCharacterRuns(Paragraph p, int index, boolean skipStyling,
+ PicturesSource pictures, XHTMLContentHandler xhtml) throws SAXException, TikaException, IOException {
+ List<CharacterRun> controls = new ArrayList<CharacterRun>();
+ List<CharacterRun> texts = new ArrayList<CharacterRun>();
+ boolean has14 = false;
+
+ // Split it into before and after the 14 (declared outside the loop because the final position is the basis of the return value)
+ int i;
+ for (i = index + 1; i < p.numCharacterRuns(); i++) {
+ CharacterRun cr = p.getCharacterRun(i);
+ if (cr.text().equals("\u0013")) {
+ // Nested, oh joy... NOTE(review): this passes i + 1 whereas handleParagraph passes the index of the 13 run itself, so the nested scan starts at i + 2 and this loop then lands on the nested 15 - looks like an off-by-one, confirm before relying on nested output
+ int increment = handleSpecialCharacterRuns(p, i + 1, skipStyling, pictures, xhtml);
+ i += increment;
+ } else if (cr.text().equals("\u0014")) {
+ has14 = true;
+ } else if (cr.text().equals("\u0015")) {
+ if (!has14) {
+ texts = controls;
+ controls = new ArrayList<CharacterRun>();
+ }
+ break;
+ } else {
+ if (has14) {
+ texts.add(cr);
+ } else {
+ controls.add(cr);
+ }
+ }
+ }
+
+ // Do we need to do something special with this?
+ if (controls.size() > 0) {
+ String text = controls.get(0).text();
+ for (int j = 1; j < controls.size(); j++) {
+ text += controls.get(j).text();
+ }
+
+ if ((text.startsWith("HYPERLINK") || text.startsWith(" HYPERLINK"))
+ && text.indexOf('"') > -1) {
+ int start = text.indexOf('"') + 1;
+ int end = findHyperlinkEnd(text, start);
+ String url = "";
+ if (start >= 0 && start < end && end <= text.length()) {
+ url = text.substring(start, end);
+ }
+
+ xhtml.startElement("a", "href", url);
+ for (CharacterRun cr : texts) {
+ handleCharacterRun(cr, skipStyling, xhtml);
+ }
+ xhtml.endElement("a");
+ } else {
+ // Just output the text ones
+ for (CharacterRun cr : texts) {
+ if (pictures.hasPicture(cr)) {
+ Picture picture = pictures.getFor(cr);
+ handlePictureCharacterRun(cr, picture, pictures, xhtml);
+ } else {
+ handleCharacterRun(cr, skipStyling, xhtml);
+ }
+ }
+ }
+ } else {
+ // We only had text
+ // Output as-is
+ for (CharacterRun cr : texts) {
+ handleCharacterRun(cr, skipStyling, xhtml);
+ }
+ }
+
+ // Tell them how many to skip over (handleParagraph adds this to its own run index)
+ return i - index;
+ }
+
+ //temporary work around for TIKA-1512
+ private int findHyperlinkEnd(String text, int start) {
+ int end = text.lastIndexOf('"');
+ if (end > start) {
+ return end;
+ }
+ end = text.lastIndexOf('\u201D');//smart right double quote
+ if (end > start) {
+ return end;
+ }
+ end = text.lastIndexOf('\r');
+ if (end > start) {
+ return end;
+ }
+ //if nothing so far, take the full length of the string
+ //If the full string is > 256 characters, it appears
+ //that the url is truncated in the .doc file. This
+ //will return the value as it is in the file, which
+ //may be incorrect; but it is the same behavior as opening
+ //the link in MSWord.
+ //This code does not currently check that length is actually >= 256.
+ //we might want to add that?
+ return text.length();
+ }
+
+ /**
+  * Outputs an img tag for the given picture and, the first time a particular
+  * picture is seen, hands its content on as an embedded resource.
+  *
+  * @param cr       the run referencing the picture; may be null
+  * @param picture  the picture to output; null means there is nothing to output
+  * @param pictures picture numbering and already-output bookkeeping
+  * @param xhtml    where the output goes
+  */
+ private void handlePictureCharacterRun(CharacterRun cr, Picture picture, PicturesSource pictures, XHTMLContentHandler xhtml)
+         throws SAXException, IOException, TikaException {
+     boolean usable = isRendered(cr) && picture != null;
+     if (!usable) {
+         // Oh dear, we've run out...
+         // Probably caused by multiple \u0008 images referencing
+         // the same real image
+         return;
+     }
+
+     // The file holds no name for the picture, so make one up from its
+     // position; both the img tag and the embedded resource refer to it
+     String extension = picture.suggestFileExtension();
+     int pictureNumber = pictures.pictureNumber(picture);
+     StringBuilder nameBuilder = new StringBuilder("image").append(pictureNumber);
+     if (extension.length() > 0) {
+         nameBuilder.append(".").append(extension);
+     }
+     String filename = nameBuilder.toString();
+
+     String mimeType = picture.getMimeType();
+
+     // The img tag is written for every reference
+     AttributesImpl imgAttributes = new AttributesImpl();
+     imgAttributes.addAttribute("", "src", "src", "CDATA", "embedded:" + filename);
+     imgAttributes.addAttribute("", "alt", "alt", "CDATA", filename);
+     xhtml.startElement("img", imgAttributes);
+     xhtml.endElement("img");
+
+     // ... but each individual image is only exposed once
+     if (pictures.hasOutput(picture)) {
+         return;
+     }
+     TikaInputStream stream = TikaInputStream.get(picture.getContent());
+     handleEmbeddedResource(stream, filename, null, mimeType, xhtml, false);
+     pictures.recordOutput(picture);
+ }
+
+ /**
+  * Writes the given text as a single paragraph wrapped in a div of the given
+  * class. Null or empty text produces no output at all.
+  *
+  * @param xhtml   XHTML content handler
+  * @param section the class of the &lt;div/&gt; section emitted
+  * @param text    text to be emitted, if any
+  * @throws SAXException if an error occurs
+  */
+ private void addTextIfAny(
+         XHTMLContentHandler xhtml, String section, String text)
+         throws SAXException {
+     if (text == null || text.length() == 0) {
+         return;
+     }
+     xhtml.startElement("div", "class", section);
+     xhtml.element("p", text);
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Parses an old-format Word document stored in the given filesystem by
+  * delegating to the directory-based variant with the filesystem's root node.
+  */
+ protected void parseWord6(
+         NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     DirectoryNode rootNode = filesystem.getRoot();
+     parseWord6(rootNode, xhtml);
+ }
+
+ /**
+  * Outputs every paragraph that POI's Word6Extractor finds in the old-format
+  * document rooted at the given directory, each as a plain p element.
+  */
+ protected void parseWord6(
+         DirectoryNode root, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     HWPFOldDocument oldDocument = new HWPFOldDocument(root);
+     Word6Extractor word6 = new Word6Extractor(oldDocument);
+
+     for (String paragraphText : word6.getParagraphText()) {
+         xhtml.element("p", paragraphText);
+     }
+ }
+
+ /**
+  * Decides whether a character run belongs in the extracted output.
+  *
+  * @param cr character run; null counts as rendered.
+  * @return true unless the run is marked as deleted.
+  */
+ private boolean isRendered(final CharacterRun cr) {
+     if (cr == null) {
+         return true;
+     }
+     return !cr.isMarkedDeleted();
+ }
+
+ /**
+  * Pairs the XHTML tag to emit for a paragraph with the optional value of
+  * its class attribute.
+  */
+ public static class TagAndStyle {
+     private final String tag;
+     private final String styleClass;
+
+     public TagAndStyle(String tag, String styleClass) {
+         this.tag = tag;
+         this.styleClass = styleClass;
+     }
+
+     /** The element name, for example "p" or "h2". */
+     public String getTag() {
+         return this.tag;
+     }
+
+     /** The class attribute value, or null when none should be written. */
+     public String getStyleClass() {
+         return this.styleClass;
+     }
+
+     /** True for any two-character tag that starts with "h". */
+     public boolean isHeading() {
+         if (tag.length() != 2) {
+             return false;
+         }
+         return tag.charAt(0) == 'h';
+     }
+ }
+
+ /**
+  * Provides access to the pictures both by offset, iteration
+  * over the un-claimed, and peeking forward
+  */
+ private static class PicturesSource {
+     private final PicturesTable picturesTable;
+     // Pictures whose content has already been handed on as an embedded resource
+     private final Set<Picture> output = new HashSet<Picture>();
+     // Picture start offset -> picture
+     private final Map<Integer, Picture> lookup;
+     // Same order as "all"; entries claimed by a run of the main range are null
+     private final List<Picture> nonU1based;
+     private final List<Picture> all;
+     // Read position of nextUnclaimed() within nonU1based
+     private int pn = 0;
+
+     private PicturesSource(HWPFDocument doc) {
+         picturesTable = doc.getPicturesTable();
+         all = picturesTable.getAllPictures();
+
+         // Build the Offset-Picture lookup map
+         lookup = new HashMap<Integer, Picture>();
+         for (Picture p : all) {
+             lookup.put(p.getStartOffset(), p);
+         }
+
+         // Work out which Pictures aren't referenced by
+         // a \u0001 in the main text
+         // These are \u0008 escher floating ones, ones
+         // found outside the normal text, and who
+         // knows what else...
+         nonU1based = new ArrayList<Picture>();
+         nonU1based.addAll(all);
+         Range r = doc.getRange();
+         for (int i = 0; i < r.numCharacterRuns(); i++) {
+             CharacterRun cr = r.getCharacterRun(i);
+             if (picturesTable.hasPicture(cr)) {
+                 Picture p = getFor(cr);
+                 int at = nonU1based.indexOf(p);
+                 // -1 when a second run references an already-claimed picture, or
+                 // when the run's offset has no entry in the lookup: nothing to
+                 // claim then, and set(-1, ...) would throw
+                 if (at >= 0) {
+                     nonU1based.set(at, null);
+                 }
+             }
+         }
+     }
+
+     /** True if the pictures table reports a picture for the given run. */
+     private boolean hasPicture(CharacterRun cr) {
+         return picturesTable.hasPicture(cr);
+     }
+
+     /** Remembers that the picture's content has been output. */
+     private void recordOutput(Picture picture) {
+         output.add(picture);
+     }
+
+     /** True if recordOutput has been called for the picture. */
+     private boolean hasOutput(Picture picture) {
+         return output.contains(picture);
+     }
+
+     /** 1-based position of the picture in the full list; 0 if it is not in it. */
+     private int pictureNumber(Picture picture) {
+         return all.indexOf(picture) + 1;
+     }
+
+     /** The picture starting at the run's picture offset, or null if there is none. */
+     private Picture getFor(CharacterRun cr) {
+         return lookup.get(cr.getPicOffset());
+     }
+
+     /**
+      * Return the next unclaimed one, used towards
+      * the end
+      *
+      * @return the next picture not claimed by a run of the main range, or
+      *         null once the list is exhausted
+      */
+     private Picture nextUnclaimed() {
+         while (pn < nonU1based.size()) {
+             Picture p = nonU1based.get(pn);
+             pn++;
+             if (p != null) {
+                 return p;
+             }
+         }
+         return null;
+     }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,312 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.List;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
+import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Base class for all Tika OOXML extractors.
+ * <p/>
+ * Tika extractors decorate POI extractors so that the parsed content of
+ * documents is returned as a sequence of XHTML SAX events. Subclasses must
+ * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
+ * populates the {@link XHTMLContentHandler} object received as parameter.
+ */
+public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+ static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio"; // relationship types that handleEmbeddedParts treats as embedded content
+ static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
+ static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
+ static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
+
+ private static final String TYPE_OLE_OBJECT =
+ "application/vnd.openxmlformats-officedocument.oleObject"; // content type an oleObject target must have to be opened as OLE2
+ private final EmbeddedDocumentExtractor embeddedExtractor; // taken from the ParseContext, or a ParsingEmbeddedDocumentExtractor when the context has none
+ protected POIXMLTextExtractor extractor; // the POI extractor this class wraps
+
+ /**
+  * Wraps the given POI extractor. Embedded documents are handled by the
+  * EmbeddedDocumentExtractor registered in the context, falling back to a
+  * ParsingEmbeddedDocumentExtractor when the context provides none.
+  */
+ public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) {
+     this.extractor = extractor;
+
+     EmbeddedDocumentExtractor configured = context.get(EmbeddedDocumentExtractor.class);
+     this.embeddedExtractor = (configured != null)
+             ? configured
+             : new ParsingEmbeddedDocumentExtractor(context);
+ }
+
+ /**
+ * Returns the document held by the wrapped POI extractor.
+ *
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
+ */
+ public POIXMLDocument getDocument() {
+ return extractor.getDocument();
+ }
+
+ /**
+ * Returns a new MetadataExtractor built around the wrapped POI extractor;
+ * a fresh instance is created on every call.
+ *
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
+ */
+ public MetadataExtractor getMetadataExtractor() {
+ return new MetadataExtractor(extractor);
+ }
+
+ /**
+ * Writes the document to the given handler as XHTML: the subclass's
+ * buildXHTML output first, then the embedded parts, then the thumbnail.
+ * The embedded parts and the thumbnail are written to the raw handler
+ * rather than to the XHTMLContentHandler wrapper. The context parameter
+ * is not used by this method.
+ * <p>
+ * NOTE(review): the reference below lists two parameters while this method
+ * takes three - presumably the interface method also takes a ParseContext;
+ * confirm against OOXMLExtractor.
+ *
+ * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
+ * org.apache.tika.metadata.Metadata)
+ */
+ public void getXHTML(
+ ContentHandler handler, Metadata metadata, ParseContext context)
+ throws SAXException, XmlException, IOException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ buildXHTML(xhtml);
+
+ // Now do any embedded parts
+ handleEmbeddedParts(handler);
+
+ // thumbnail
+ handleThumbnail(handler);
+
+ xhtml.endDocument();
+ }
+
+ /**
+  * Reduces a path to its bare file name: everything up to and including the
+  * last '/' is dropped, and so is everything from the last '.' onwards.
+  *
+  * @param desc the path to reduce
+  * @return the file name without directories or extension
+  */
+ protected String getJustFileName(String desc) {
+     int lastSlash = desc.lastIndexOf('/');
+     String fileName = (lastSlash == -1) ? desc : desc.substring(lastSlash + 1);
+
+     int lastDot = fileName.lastIndexOf('.');
+     return (lastDot == -1) ? fileName : fileName.substring(0, lastDot);
+ }
+
+ /**
+  * Outputs every thumbnail of the package: an empty div placeholder with
+  * class "embedded" whose id is the thumbnail's part name, followed by the
+  * thumbnail content parsed as an embedded document if the embedded
+  * extractor wants it.
+  * <p>
+  * This is best-effort: any failure stops thumbnail processing but is not
+  * reported to the caller.
+  *
+  * @param handler where the output goes
+  */
+ private void handleThumbnail(ContentHandler handler) {
+     try {
+         OPCPackage opcPackage = extractor.getPackage();
+         for (PackageRelationship rel : opcPackage.getRelationshipsByType(PackageRelationshipTypes.THUMBNAIL)) {
+             PackagePart tPart = opcPackage.getPart(rel);
+             // try-with-resources so the part's stream is released even when
+             // writing the placeholder or parsing the thumbnail fails
+             try (InputStream tStream = tPart.getInputStream()) {
+                 Metadata thumbnailMetadata = new Metadata();
+                 String thumbName = tPart.getPartName().getName();
+                 thumbnailMetadata.set(Metadata.RESOURCE_NAME_KEY, thumbName);
+
+                 AttributesImpl attributes = new AttributesImpl();
+                 attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded");
+                 attributes.addAttribute(XHTML, "id", "id", "CDATA", thumbName);
+                 handler.startElement(XHTML, "div", "div", attributes);
+                 handler.endElement(XHTML, "div", "div");
+
+                 thumbnailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, thumbName);
+                 thumbnailMetadata.set(Metadata.CONTENT_TYPE, tPart.getContentType());
+                 thumbnailMetadata.set(TikaCoreProperties.TITLE, thumbName);
+
+                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
+                     embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream), new EmbeddedContentHandler(handler), thumbnailMetadata, false);
+                 }
+             }
+         }
+     } catch (Exception ex) {
+         // Deliberately ignored: a missing or broken thumbnail must not
+         // fail the extraction of the document itself
+     }
+ }
+
+ /**
+  * Walks the relationships of every main document part and outputs the
+  * internal targets that are embedded content: OLE objects with the OLE
+  * content type are opened as OLE, while audio, image, package and other
+  * oleObject targets are handed on as plain embedded files. For sources
+  * whose file name starts with "slide", the relationship id is prefixed
+  * with that name and an underscore.
+  */
+ private void handleEmbeddedParts(ContentHandler handler)
+         throws TikaException, IOException, SAXException {
+     try {
+         for (PackagePart source : getMainDocumentParts()) {
+             for (PackageRelationship rel : source.getRelationships()) {
+
+                 // Only slide sources contribute a prefix
+                 String sourceDesc = "";
+                 URI sourceURI = rel.getSourceURI();
+                 if (sourceURI != null) {
+                     String sourceName = getJustFileName(sourceURI.getPath());
+                     if (sourceName.startsWith("slide")) {
+                         sourceDesc = sourceName + "_";
+                     }
+                 }
+
+                 if (rel.getTargetMode() != TargetMode.INTERNAL) {
+                     continue;
+                 }
+
+                 PackagePart target;
+                 try {
+                     target = source.getRelatedPart(rel);
+                 } catch (IllegalArgumentException ex) {
+                     continue;
+                 }
+
+                 String type = rel.getRelationshipType();
+                 boolean oleRelation = RELATION_OLE_OBJECT.equals(type);
+                 if (oleRelation && TYPE_OLE_OBJECT.equals(target.getContentType())) {
+                     handleEmbeddedOLE(target, handler, sourceDesc + rel.getId());
+                 } else if (oleRelation
+                         || RELATION_AUDIO.equals(type)
+                         || RELATION_IMAGE.equals(type)
+                         || RELATION_PACKAGE.equals(type)) {
+                     handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+                 }
+             }
+         }
+     } catch (InvalidFormatException e) {
+         throw new TikaException("Broken OOXML file", e);
+     }
+ }
+
+ /**
+  * Handles an embedded OLE object in the document
+  * <p>
+  * Parts smaller than three 512-byte blocks are skipped. Otherwise the part
+  * is opened as an OLE2 filesystem and its payload is handed to the embedded
+  * extractor: the CONTENTS stream for OLE 2.0 non-Office objects, the native
+  * data buffer for OLE 1.0 objects, and the whole part for anything else.
+  *
+  * @param part    the package part holding the OLE object
+  * @param handler where the output goes
+  * @param rel     the relationship id recorded in the embedded metadata
+  */
+ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel)
+         throws IOException, SAXException {
+     // A POIFSFileSystem needs to be at least 3 blocks big to be valid
+     if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
+         // Too small, skip
+         return;
+     }
+
+     // Open the POIFS (OLE2) structure and process
+     POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
+     TikaInputStream stream = null;
+     try {
+         Metadata metadata = new Metadata();
+         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
+
+         DirectoryNode root = fs.getRoot();
+         POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+         if (root.hasEntry("CONTENTS")
+                 && root.hasEntry("\u0001Ole")
+                 && root.hasEntry("\u0001CompObj")
+                 && root.hasEntry("\u0003ObjInfo")) {
+             // TIKA-704: OLE 2.0 embedded non-Office document?
+             stream = TikaInputStream.get(
+                     fs.createDocumentInputStream("CONTENTS"));
+             if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                 embeddedExtractor.parseEmbedded(
+                         stream, new EmbeddedContentHandler(handler),
+                         metadata, false);
+             }
+         } else if (POIFSDocumentType.OLE10_NATIVE == type) {
+             // TIKA-704: OLE 1.0 embedded document
+             Ole10Native ole =
+                     Ole10Native.createFromEmbeddedOleObject(fs);
+             if (ole.getLabel() != null) {
+                 metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
+             }
+             byte[] data = ole.getDataBuffer();
+             if (data != null) {
+                 stream = TikaInputStream.get(data);
+             }
+
+             if (stream != null
+                     && embeddedExtractor.shouldParseEmbedded(metadata)) {
+                 embeddedExtractor.parseEmbedded(
+                         stream, new EmbeddedContentHandler(handler),
+                         metadata, false);
+             }
+         } else {
+             handleEmbeddedFile(part, handler, rel);
+         }
+     } catch (FileNotFoundException e) {
+         // There was no CONTENTS entry, so skip this part
+     } catch (Ole10NativeException e) {
+         // Could not process an OLE 1.0 entry, so skip this part
+     } finally {
+         // Release the payload stream whether or not parsing succeeded
+         if (stream != null) {
+             stream.close();
+         }
+     }
+ }
+
+ /**
+  * Handles an embedded file in the document
+  * <p>
+  * The embedded metadata carries the relationship id, the last path segment
+  * of the part name as resource name, and the part's content type. The part
+  * is only opened if the embedded extractor wants to parse it.
+  *
+  * @param part    the package part holding the embedded file
+  * @param handler where the output goes
+  * @param rel     the relationship id recorded in the embedded metadata
+  */
+ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler, String rel)
+         throws SAXException, IOException {
+     Metadata metadata = new Metadata();
+     metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
+
+     // Get the name
+     String name = part.getPartName().getName();
+     metadata.set(
+             Metadata.RESOURCE_NAME_KEY,
+             name.substring(name.lastIndexOf('/') + 1));
+
+     // Get the content type
+     metadata.set(
+             Metadata.CONTENT_TYPE, part.getContentType());
+
+     // Call the recursing handler
+     if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+         // try-with-resources so the part's stream is released even when
+         // parsing the embedded file fails
+         try (TikaInputStream stream = TikaInputStream.get(part.getInputStream())) {
+             embeddedExtractor.parseEmbedded(
+                     stream,
+                     new EmbeddedContentHandler(handler),
+                     metadata, false);
+         }
+     }
+ }
+
+ /**
+ * Populates the {@link XHTMLContentHandler} object received as parameter.
+ * Called by getXHTML after the document has been started and before the
+ * embedded parts and the thumbnail are written.
+ */
+ protected abstract void buildXHTML(XHTMLContentHandler xhtml)
+ throws SAXException, XmlException, IOException;
+
+ /**
+ * Return a list of the main parts of the document, used
+ * when searching for embedded resources.
+ * This should be all the parts of the document that end
+ * up with things embedded into them.
+ * The relationships of each returned part are what handleEmbeddedParts
+ * inspects.
+ */
+ protected abstract List<PackagePart> getMainDocumentParts()
+ throws TikaException;
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.math.BigDecimal;
+import java.util.Date;
+
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.POIXMLProperties.CustomProperties;
+import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.internal.PackagePropertiesPart;
+import org.apache.poi.openxml4j.util.Nullable;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.SummaryExtractor;
+import org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperty;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+
+/**
+ * OOXML metadata extractor.
+ * <p/>
+ * Currently POI doesn't support metadata extraction for OOXML.
+ *
+ * @see OOXMLExtractor#getMetadataExtractor()
+ */
+public class MetadataExtractor {
+
+ private final POIXMLTextExtractor extractor;
+
+ public MetadataExtractor(POIXMLTextExtractor extractor) {
+ this.extractor = extractor;
+ }
+
+ public void extract(Metadata metadata) throws TikaException {
+ if (extractor.getDocument() != null ||
+ (extractor instanceof XSSFEventBasedExcelExtractor &&
+ extractor.getPackage() != null)) {
+ extractMetadata(extractor.getCoreProperties(), metadata);
+ extractMetadata(extractor.getExtendedProperties(), metadata);
+ extractMetadata(extractor.getCustomProperties(), metadata);
+ }
+ }
+
+ private void extractMetadata(CoreProperties properties, Metadata metadata) {
+ PackagePropertiesPart propsHolder = properties
+ .getUnderlyingProperties();
+
+ addProperty(metadata, OfficeOpenXMLCore.CATEGORY, propsHolder.getCategoryProperty());
+ addProperty(metadata, OfficeOpenXMLCore.CONTENT_STATUS, propsHolder
+ .getContentStatusProperty());
+ addProperty(metadata, TikaCoreProperties.CREATED, propsHolder
+ .getCreatedProperty());
+ addMultiProperty(metadata, TikaCoreProperties.CREATOR, propsHolder
+ .getCreatorProperty());
+ addProperty(metadata, TikaCoreProperties.DESCRIPTION, propsHolder
+ .getDescriptionProperty());
+ addProperty(metadata, TikaCoreProperties.IDENTIFIER, propsHolder
+ .getIdentifierProperty());
+ addProperty(metadata, TikaCoreProperties.KEYWORDS, propsHolder
+ .getKeywordsProperty());
+ addProperty(metadata, TikaCoreProperties.LANGUAGE, propsHolder
+ .getLanguageProperty());
+ addProperty(metadata, TikaCoreProperties.MODIFIER, propsHolder
+ .getLastModifiedByProperty());
+ addProperty(metadata, TikaCoreProperties.PRINT_DATE, propsHolder
+ .getLastPrintedProperty());
+ addProperty(metadata, Metadata.LAST_MODIFIED, propsHolder
+ .getModifiedProperty());
+ addProperty(metadata, TikaCoreProperties.MODIFIED, propsHolder
+ .getModifiedProperty());
+ addProperty(metadata, OfficeOpenXMLCore.REVISION, propsHolder
+ .getRevisionProperty());
+ // TODO: Move to OO subject in Tika 2.0
+ addProperty(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT,
+ propsHolder.getSubjectProperty());
+ addProperty(metadata, TikaCoreProperties.TITLE, propsHolder.getTitleProperty());
+ addProperty(metadata, OfficeOpenXMLCore.VERSION, propsHolder.getVersionProperty());
+
+ // Legacy Tika-1.0 style stats
+ // TODO Remove these in Tika 2.0
+ addProperty(metadata, Metadata.CATEGORY, propsHolder.getCategoryProperty());
+ addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
+ .getContentStatusProperty());
+ addProperty(metadata, Metadata.REVISION_NUMBER, propsHolder
+ .getRevisionProperty());
+ addProperty(metadata, Metadata.VERSION, propsHolder.getVersionProperty());
+ }
+
+ private void extractMetadata(ExtendedProperties properties,
+ Metadata metadata) {
+ CTProperties propsHolder = properties.getUnderlyingProperties();
+
+ addProperty(metadata, OfficeOpenXMLExtended.APPLICATION, propsHolder.getApplication());
+ addProperty(metadata, OfficeOpenXMLExtended.APP_VERSION, propsHolder.getAppVersion());
+ addProperty(metadata, TikaCoreProperties.PUBLISHER, propsHolder.getCompany());
+ addProperty(metadata, OfficeOpenXMLExtended.COMPANY, propsHolder.getCompany());
+ SummaryExtractor.addMulti(metadata, OfficeOpenXMLExtended.MANAGER, propsHolder.getManager());
+ addProperty(metadata, OfficeOpenXMLExtended.NOTES, propsHolder.getNotes());
+ addProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
+ addProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
+ addProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, propsHolder.getTotalTime());
+
+ if (propsHolder.getPages() > 0) {
+ metadata.set(PagedText.N_PAGES, propsHolder.getPages());
+ } else if (propsHolder.getSlides() > 0) {
+ metadata.set(PagedText.N_PAGES, propsHolder.getSlides());
+ }
+
+ // Process the document statistics
+ addProperty(metadata, Office.PAGE_COUNT, propsHolder.getPages());
+ addProperty(metadata, Office.SLIDE_COUNT, propsHolder.getSlides());
+ addProperty(metadata, Office.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+ addProperty(metadata, Office.LINE_COUNT, propsHolder.getLines());
+ addProperty(metadata, Office.WORD_COUNT, propsHolder.getWords());
+ addProperty(metadata, Office.CHARACTER_COUNT, propsHolder.getCharacters());
+ addProperty(metadata, Office.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
+
+ // Legacy Tika-1.0 style stats
+ // TODO Remove these in Tika 2.0
+ addProperty(metadata, Metadata.APPLICATION_NAME, propsHolder.getApplication());
+ addProperty(metadata, Metadata.APPLICATION_VERSION, propsHolder.getAppVersion());
+ addProperty(metadata, Metadata.MANAGER, propsHolder.getManager());
+ addProperty(metadata, Metadata.NOTES, propsHolder.getNotes());
+ addProperty(metadata, Metadata.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
+ addProperty(metadata, Metadata.TEMPLATE, propsHolder.getTemplate());
+ addProperty(metadata, Metadata.TOTAL_TIME, propsHolder.getTotalTime());
+ addProperty(metadata, MSOffice.PAGE_COUNT, propsHolder.getPages());
+ addProperty(metadata, MSOffice.SLIDE_COUNT, propsHolder.getSlides());
+ addProperty(metadata, MSOffice.PARAGRAPH_COUNT, propsHolder.getParagraphs());
+ addProperty(metadata, MSOffice.LINE_COUNT, propsHolder.getLines());
+ addProperty(metadata, MSOffice.WORD_COUNT, propsHolder.getWords());
+ addProperty(metadata, MSOffice.CHARACTER_COUNT, propsHolder.getCharacters());
+ addProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
+ }
+
+ private void extractMetadata(CustomProperties properties,
+ Metadata metadata) {
+ org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
+ props = properties.getUnderlyingProperties();
+ for (int i = 0; i < props.sizeOfPropertyArray(); i++) {
+ CTProperty property = props.getPropertyArray(i);
+ String val = null;
+ Date date = null;
+
+ if (property.isSetLpwstr()) {
+ val = property.getLpwstr();
+ } else if (property.isSetLpstr()) {
+ val = property.getLpstr();
+ } else if (property.isSetDate()) {
+ date = property.getDate().getTime();
+ } else if (property.isSetFiletime()) {
+ date = property.getFiletime().getTime();
+ } else if (property.isSetBool()) {
+ val = Boolean.toString(property.getBool());
+ }
+
+ // Integers
+ else if (property.isSetI1()) {
+ val = Integer.toString(property.getI1());
+ } else if (property.isSetI2()) {
+ val = Integer.toString(property.getI2());
+ } else if (property.isSetI4()) {
+ val = Integer.toString(property.getI4());
+ } else if (property.isSetI8()) {
+ val = Long.toString(property.getI8());
+ } else if (property.isSetInt()) {
+ val = Integer.toString(property.getInt());
+ }
+
+ // Unsigned Integers
+ else if (property.isSetUi1()) {
+ val = Integer.toString(property.getUi1());
+ } else if (property.isSetUi2()) {
+ val = Integer.toString(property.getUi2());
+ } else if (property.isSetUi4()) {
+ val = Long.toString(property.getUi4());
+ } else if (property.isSetUi8()) {
+ val = property.getUi8().toString();
+ } else if (property.isSetUint()) {
+ val = Long.toString(property.getUint());
+ }
+
+ // Reals
+ else if (property.isSetR4()) {
+ val = Float.toString(property.getR4());
+ } else if (property.isSetR8()) {
+ val = Double.toString(property.getR8());
+ } else if (property.isSetDecimal()) {
+ BigDecimal d = property.getDecimal();
+ if (d == null) {
+ val = null;
+ } else {
+ val = d.toPlainString();
+ }
+ } else if (property.isSetArray()) {
+ // TODO Fetch the array values and output
+ } else if (property.isSetVector()) {
+ // TODO Fetch the vector values and output
+ } else if (property.isSetBlob() || property.isSetOblob()) {
+ // TODO Decode, if possible
+ } else if (property.isSetStream() || property.isSetOstream() ||
+ property.isSetVstream()) {
+ // TODO Decode, if possible
+ } else if (property.isSetStorage() || property.isSetOstorage()) {
+ // TODO Decode, if possible
+ } else {
+ // This type isn't currently supported yet, skip the property
+ }
+
+ String propName = "custom:" + property.getName();
+ if (date != null) {
+ Property tikaProp = Property.externalDate(propName);
+ metadata.set(tikaProp, date);
+ } else if (val != null) {
+ metadata.set(propName, val);
+ }
+ }
+ }
+
+ private <T> void addProperty(Metadata metadata, Property property, Nullable<T> nullableValue) {
+ T value = nullableValue.getValue();
+ if (value != null) {
+ if (value instanceof Date) {
+ metadata.set(property, (Date) value);
+ } else if (value instanceof String) {
+ metadata.set(property, (String) value);
+ } else if (value instanceof Integer) {
+ metadata.set(property, (Integer) value);
+ } else if (value instanceof Double) {
+ metadata.set(property, (Double) value);
+ }
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, Nullable<?> value) {
+ if (value.getValue() != null) {
+ addProperty(metadata, name, value.getValue().toString());
+ }
+ }
+
+ private void addProperty(Metadata metadata, Property property, String value) {
+ if (value != null) {
+ metadata.set(property, value);
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, String value) {
+ if (value != null) {
+ metadata.set(name, value);
+ }
+ }
+
+ private void addProperty(Metadata metadata, Property property, int value) {
+ if (value > 0) {
+ metadata.set(property, value);
+ }
+ }
+
+ private void addProperty(Metadata metadata, String name, int value) {
+ if (value > 0) {
+ metadata.set(name, Integer.toString(value));
+ }
+ }
+
+ private void addMultiProperty(Metadata metadata, Property property, Nullable<String> value) {
+ if (value == null) {
+ return;
+ }
+ SummaryExtractor.addMulti(metadata, property, value.getValue());
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Interface implemented by all Tika OOXML extractors.
+ * <p>
+ * An implementation exposes the underlying POI document, a
+ * {@link MetadataExtractor} for it, and a method that emits the document
+ * content as XHTML SAX events.
+ *
+ * @see org.apache.poi.POIXMLTextExtractor
+ */
+public interface OOXMLExtractor {
+
+ /**
+ * Returns the opened document.
+ *
+ * @return the opened POI document
+ * @see POIXMLTextExtractor#getDocument()
+ */
+ POIXMLDocument getDocument();
+
+ /**
+ * Returns the Tika-side metadata extractor for the document.
+ * It is provided here because
+ * {@link POIXMLTextExtractor#getMetadataTextExtractor()} is not yet
+ * supported for OOXML by POI.
+ *
+ * @return the metadata extractor for the document
+ */
+ MetadataExtractor getMetadataExtractor();
+
+ /**
+ * Parses the document into a sequence of XHTML SAX events sent to the
+ * given content handler.
+ *
+ * @param handler the content handler that receives the XHTML SAX events
+ * @param metadata the metadata associated with the document
+ * @param context the parse context of the current parse
+ */
+ void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
+ throws SAXException, XmlException, IOException, TikaException;
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Locale;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
+import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
+import org.apache.poi.openxml4j.opc.OPCPackage;
+import org.apache.poi.openxml4j.opc.PackageAccess;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Figures out the correct {@link OOXMLExtractor} for the supplied document and
+ * returns it.
+ */
+public class OOXMLExtractorFactory {
+
+ public static void parse(
+ InputStream stream, ContentHandler baseHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ ExtractorFactory.setThreadPrefersEventExtractors(true);
+
+ try {
+ OOXMLExtractor extractor;
+ OPCPackage pkg;
+
+ // Locate or Open the OPCPackage for the file
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
+ pkg = (OPCPackage) tis.getOpenContainer();
+ } else if (tis != null && tis.hasFile()) {
+ pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
+ tis.setOpenContainer(pkg);
+ } else {
+ InputStream shield = new CloseShieldInputStream(stream);
+ pkg = OPCPackage.open(shield);
+ }
+
+ // Get the type, and ensure it's one we handle
+ MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+ if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
+ // Not a supported type, delegate to Empty Parser
+ EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
+ return;
+ }
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+ // Have the appropriate OOXML text extractor picked
+ POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg);
+
+ POIXMLDocument document = poiExtractor.getDocument();
+ if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
+ extractor = new XSSFExcelExtractorDecorator(
+ context, (XSSFEventBasedExcelExtractor) poiExtractor, locale);
+ } else if (document == null) {
+ throw new TikaException(
+ "Expecting UserModel based POI OOXML extractor with a document, but none found. " +
+ "The extractor returned was a " + poiExtractor
+ );
+ } else if (document instanceof XMLSlideShow) {
+ extractor = new XSLFPowerPointExtractorDecorator(
+ context, (XSLFPowerPointExtractor) poiExtractor);
+ } else if (document instanceof XWPFDocument) {
+ extractor = new XWPFWordExtractorDecorator(
+ context, (XWPFWordExtractor) poiExtractor);
+ } else {
+ extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
+ }
+
+ // Get the bulk of the metadata first, so that it's accessible during
+ // parsing if desired by the client (see TIKA-1109)
+ extractor.getMetadataExtractor().extract(metadata);
+
+ // Extract the text, along with any in-document metadata
+ extractor.getXHTML(baseHandler, metadata, context);
+ } catch (IllegalArgumentException e) {
+ if (e.getMessage() != null &&
+ e.getMessage().startsWith("No supported documents found")) {
+ throw new TikaException(
+ "TIKA-418: RuntimeException while getting content"
+ + " for thmx and xps file types", e);
+ } else {
+ throw new TikaException("Error creating OOXML extractor", e);
+ }
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (OpenXML4JException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+ } catch (XmlException e) {
+ throw new TikaException("Error creating OOXML extractor", e);
+
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.openxml4j.util.ZipSecureFile;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Office Open XML (OOXML) parser.
+ * <p>
+ * The parsing itself is delegated to {@link OOXMLExtractorFactory}.
+ */
+public class OOXMLParser extends AbstractParser {
+    static {
+        //turn off POI's zip bomb detection because we have our own
+        ZipSecureFile.setMinInflateRatio(-1.0d);
+    }
+
+    /**
+     * The media types this parser reports as supported.
+     */
+    protected static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("x-tika-ooxml"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
+                    MediaType.application("vnd.ms-powerpoint.presentation.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"),
+                    MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"),
+                    MediaType.application("vnd.ms-powerpoint.slideshow.macroenabled.12"),
+                    MediaType.application("vnd.ms-powerpoint.addin.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+                    MediaType.application("vnd.ms-excel.sheet.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.template"),
+                    MediaType.application("vnd.ms-excel.template.macroenabled.12"),
+                    MediaType.application("vnd.ms-excel.addin.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"),
+                    MediaType.application("vnd.ms-word.document.macroenabled.12"),
+                    MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template"),
+                    MediaType.application("vnd.ms-word.template.macroenabled.12"))));
+    /**
+     * We claim to support all OOXML files, but we actually don't support a small
+     * number of them.
+     * This list is used to decline certain formats that are not yet supported
+     * by Tika and/or POI.
+     */
+    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
+                    MediaType.application("vnd.ms-xpsdocument")
+            )));
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 6535995710857776481L;
+
+    /**
+     * Returns the fixed set of supported media types; the context is not consulted.
+     *
+     * @param context the parse context (unused)
+     * @return {@link #SUPPORTED_TYPES}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the OOXML document by handing all arguments to
+     * {@link OOXMLExtractorFactory#parse}.
+     *
+     * @param stream   the document to parse
+     * @param handler  the handler that receives the XHTML SAX events
+     * @param metadata the metadata to populate
+     * @param context  the parse context
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Have the OOXML file processed
+        OOXMLExtractorFactory.parse(stream, handler, metadata, context);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.openxml4j.opc.PackagePart;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
+
+ public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) {
+ super(context, extractor);
+ }
+
+ @Override
+ protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
+ // extract document content as a single string (not structured)
+ xhtml.element("p", extractor.getText());
+ }
+
+ @Override
+ protected List<PackagePart> getMainDocumentParts() {
+ return new ArrayList<PackagePart>();
+ }
+}