You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [9/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-mo...
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.CryptCodecProvider;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.DatabaseBuilder;
+import com.healthmarketscience.jackcess.util.LinkResolver;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser that handles Microsoft Access files via
+ * <a href="http://jackcess.sourceforge.net/>Jackcess</a>
+ * <p>
+ * Many, many thanks to LexisNexis®/Health Market Science (HMS), Brian O'Neill,
+ * and James Ahlborn for relicensing Jackcess to Apache v2.0!
+ */
+public class JackcessParser extends AbstractParser {
+
+ public static final String SUMMARY_PROPERTY_PREFIX = "MDB_SUMMARY_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static String MDB_PROPERTY_PREFIX = "MDB_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static String USER_DEFINED_PROPERTY_PREFIX = "MDB_USER_PROP" + Metadata.NAMESPACE_PREFIX_DELIMITER;
+ public static Property MDB_PW = Property.externalText("Password");
+ private final static LinkResolver IGNORE_LINK_RESOLVER = new IgnoreLinkResolver();
+
+ //TODO: figure out how to get this info
+ // public static Property LINKED_DATABASES = Property.externalTextBag("LinkedDatabases");
+
+ private static final long serialVersionUID = -752276948656079347L;
+
+ private static final MediaType MEDIA_TYPE = MediaType.application("x-msaccess");
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+ private Locale locale = Locale.ROOT;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+ TikaInputStream tis = TikaInputStream.get(stream);
+ Database db = null;
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ String password = null;
+ PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+ if (passwordProvider != null) {
+ password = passwordProvider.getPassword(metadata);
+ }
+ try {
+ if (password == null) {
+ //do this to ensure encryption/wrong password exception vs. more generic
+ //"need right codec" error message.
+ db = new DatabaseBuilder(tis.getFile())
+ .setCodecProvider(new CryptCodecProvider())
+ .setReadOnly(true).open();
+ } else {
+ db = new DatabaseBuilder(tis.getFile())
+ .setCodecProvider(new CryptCodecProvider(password))
+ .setReadOnly(true).open();
+ }
+ db.setLinkResolver(IGNORE_LINK_RESOLVER);//just in case
+ JackcessExtractor ex = new JackcessExtractor(context, locale);
+ ex.parse(db, xhtml, metadata);
+ } catch (IllegalStateException e) {
+ if (e.getMessage() != null && e.getMessage().contains("Incorrect password")) {
+ throw new EncryptedDocumentException(e);
+ }
+ throw e;
+ } finally {
+ if (db != null) {
+ try {
+ db.close();
+ } catch (IOException e) {
+ //swallow = silent close
+ }
+ }
+ }
+ xhtml.endDocument();
+ }
+
+ private static final class IgnoreLinkResolver implements LinkResolver {
+ //If links are resolved, Jackcess might try to open and process
+ //any file on the current system that is specified as a linked db.
+ //This could be a nasty security issue.
+ @Override
+ public Database resolveLinkedDatabase(Database database, String s) throws IOException {
+ throw new AssertionError("DO NOT ALLOW RESOLVING OF LINKS!!!");
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/LinkedCell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Linked cell. This class decorates another content cell with a hyperlink.
+ */
+public class LinkedCell extends CellDecorator {
+
+ private final String link;
+
+ public LinkedCell(Cell cell, String link) {
+ super(cell);
+ assert link != null;
+ this.link = link;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.startElement("a", "href", link);
+ super.render(handler);
+ handler.endElement("a");
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ListManager.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.NoSuchElementException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.ListData;
+import org.apache.poi.hwpf.model.ListFormatOverrideLevel;
+import org.apache.poi.hwpf.model.ListLevel;
+import org.apache.poi.hwpf.model.ListTables;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+
+/**
+ * Computes the number text which goes at the beginning of each list paragraph
+ * <p/>
+ * <p><em>Note:</em> This class only handles the raw number text and does not apply any further formatting as described in [MS-DOC], v20140721, 2.4.6.3, Part 3 to it.<p>
+ * <p><em>Note 2:</em> The {@code tplc}, a visual override for the appearance of list levels, as defined in [MS-DOC], v20140721, 2.9.328 is not taken care of in this class.</p>
+ * <p>Further, this class does not yet handle overrides</p>
+ */
+public class ListManager extends AbstractListManager {
+
+ private static final Log logger = LogFactory.getLog(ListManager.class);
+ private final ListTables listTables;
+
+ /**
+ * Ordinary constructor for a new list reader
+ *
+ * @param document Document to process
+ */
+ public ListManager(final HWPFDocument document) {
+ this.listTables = document.getListTables();
+ }
+
+ /**
+ * Get the formatted number for a given paragraph
+ * <p/>
+ * <p><em>Note:</em> This only works correctly if called subsequently for <em>all</em> paragraphs in a valid selection (main document, text field, ...) which are part of a list.</p>
+ *
+ * @param paragraph list paragraph to process
+ * @return String which represents the numbering of this list paragraph; never {@code null}, can be empty string, though,
+ * if something goes wrong in getList()
+ * @throws IllegalArgumentException If the given paragraph is {@code null} or is not part of a list
+ */
+ public String getFormattedNumber(final Paragraph paragraph) {
+ if (paragraph == null) throw new IllegalArgumentException("Given paragraph cannot be null.");
+ if (!paragraph.isInList()) throw new IllegalArgumentException("Can only process list paragraphs.");
+ //lsid is equivalent to docx's abnum
+ //ilfo is equivalent to docx's num
+ int currAbNumId = -1;
+ try{
+ currAbNumId = paragraph.getList().getLsid();
+ } catch (NoSuchElementException e) {
+ //somewhat frequent exception when initializing HWPFList
+ return "";
+ } catch (IllegalArgumentException e) {
+ return "";
+ } catch (NullPointerException e) {
+ return "";
+ }
+
+ int currNumId = paragraph.getIlfo();
+ ParagraphLevelCounter lc = listLevelMap.get(currAbNumId);
+ LevelTuple[] overrideTuples = overrideTupleMap.get(currNumId);
+
+ if (lc == null) {
+ ListData listData = listTables.getListData(paragraph.getList().getLsid());
+ LevelTuple[] levelTuples = new LevelTuple[listData.getLevels().length];
+ for (int i = 0; i < listData.getLevels().length; i++) {
+ levelTuples[i] = buildTuple(i, listData.getLevels()[i]);
+ }
+ lc = new ParagraphLevelCounter(levelTuples);
+ }
+ if (overrideTuples == null) {
+ overrideTuples = buildOverrideTuples(paragraph, lc.getNumberOfLevels());
+ }
+ String formattedString = lc.incrementLevel(paragraph.getIlvl(), overrideTuples);
+
+ listLevelMap.put(currAbNumId, lc);
+ overrideTupleMap.put(currNumId, overrideTuples);
+ return formattedString;
+ }
+
+ private LevelTuple buildTuple(int i, ListLevel listLevel) {
+ boolean isLegal = false;
+ int start = 1;
+ int restart = -1;
+ String lvlText = "%" + i + ".";
+ String numFmt = "decimal";
+
+ start = listLevel.getStartAt();
+ restart = listLevel.getRestart();
+ isLegal = listLevel.isLegalNumbering();
+ numFmt = convertToNewNumFormat(listLevel.getNumberFormat());
+ lvlText = convertToNewNumberText(listLevel.getNumberText(), listLevel.getLevelNumberingPlaceholderOffsets());
+ return new LevelTuple(start, restart, lvlText, numFmt, isLegal);
+ }
+
+ private LevelTuple[] buildOverrideTuples(Paragraph par, int length) {
+ ListFormatOverrideLevel overrideLevel;
+ // find the override for this level
+ if (listTables.getLfoData(par.getIlfo()).getRgLfoLvl().length == 0) {
+ return null;
+ }
+ overrideLevel = listTables.getLfoData(par.getIlfo()).getRgLfoLvl()[0];
+ if (overrideLevel == null) {
+ return null;
+ }
+ LevelTuple[] levelTuples = new LevelTuple[length];
+ ListLevel listLevel = overrideLevel.getLevel();
+ if (listLevel == null) {
+ return null;
+ }
+ for (int i = 0; i < length; i++) {
+ levelTuples[i] = buildTuple(i, listLevel);
+ }
+
+ return levelTuples;
+
+ }
+
+ private String convertToNewNumberText(String numberText, byte[] numberOffsets) {
+
+ StringBuilder sb = new StringBuilder();
+ int last = 0;
+ for (int i = 0; i < numberOffsets.length; i++) {
+ int offset = (int) numberOffsets[i];
+
+ if (offset == 0) {
+ break;
+ }
+ sb.append(numberText.substring(last, offset - 1));
+ //need to add one because newer format
+ //adds one. In .doc, this was the array index;
+ //but in .docx, this is the level number
+ int lvlNum = (int) numberText.charAt(offset - 1) + 1;
+ sb.append("%" + lvlNum);
+ last = offset;
+ }
+ if (last < numberText.length()) {
+ sb.append(numberText.substring(last));
+ }
+ return sb.toString();
+ }
+
+ private String convertToNewNumFormat(int numberFormat) {
+ switch (numberFormat) {
+ case -1:
+ return "none";
+ case 0:
+ return "decimal";
+ case 1:
+ return "upperRoman";
+ case 2:
+ return "lowerRoman";
+ case 3:
+ return "upperLetter";
+ case 4:
+ return "lowerLetter";
+ case 5:
+ return "ordinal";
+ case 22:
+ return "decimalZero";
+ case 23:
+ return "bullet";
+ case 47:
+ return "none";
+ default:
+ //do we really want to silently swallow these uncovered cases?
+ //throw new RuntimeException("NOT COVERED: " + numberFormat);
+ return "decimal";
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.text.NumberFormat;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Number cell.
+ */
+public class NumberCell implements Cell {
+
+ private final double number;
+
+ private final NumberFormat format;
+
+ public NumberCell(double number, NumberFormat format) {
+ this.number = number;
+ this.format = format;
+ }
+
+ public void render(XHTMLContentHandler handler) throws SAXException {
+ handler.characters(format.format(number));
+ }
+
+ public String toString() {
+ return "Numeric Cell: " + format.format(number);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.GeneralSecurityException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.poi.hdgf.extractor.VisioTextExtractor;
+import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
+import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.crypt.EncryptionInfo;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Defines a Microsoft document content extractor.
+ */
+public class OfficeParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7393462244028653479L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ POIFSDocumentType.WORKBOOK.type,
+ POIFSDocumentType.OLE10_NATIVE.type,
+ POIFSDocumentType.WORDDOCUMENT.type,
+ POIFSDocumentType.UNKNOWN.type,
+ POIFSDocumentType.ENCRYPTED.type,
+ POIFSDocumentType.POWERPOINT.type,
+ POIFSDocumentType.PUBLISHER.type,
+ POIFSDocumentType.PROJECT.type,
+ POIFSDocumentType.VISIO.type,
+ // Works isn't supported
+ POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
+ POIFSDocumentType.OUTLOOK.type,
+ POIFSDocumentType.SOLIDWORKS_PART.type,
+ POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
+ POIFSDocumentType.SOLIDWORKS_DRAWING.type
+ )));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ final DirectoryNode root;
+ TikaInputStream tstream = TikaInputStream.cast(stream);
+ if (tstream == null) {
+ root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
+ } else {
+ final Object container = tstream.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ root = ((NPOIFSFileSystem) container).getRoot();
+ } else if (container instanceof DirectoryNode) {
+ root = (DirectoryNode) container;
+ } else {
+ NPOIFSFileSystem fs;
+ if (tstream.hasFile()) {
+ fs = new NPOIFSFileSystem(tstream.getFile(), true);
+ } else {
+ fs = new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ }
+ tstream.setOpenContainer(fs);
+ root = fs.getRoot();
+ }
+ }
+ parse(root, context, metadata, xhtml);
+ xhtml.endDocument();
+ }
+
+ protected void parse(
+ DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+
+ // Parse summary entries first, to make metadata available early
+ new SummaryExtractor(metadata).parseSummaries(root);
+
+ // Parse remaining document entries
+ POIFSDocumentType type = POIFSDocumentType.detectType(root);
+
+ if (type != POIFSDocumentType.UNKNOWN) {
+ setType(metadata, type.getType());
+ }
+
+ switch (type) {
+ case SOLIDWORKS_PART:
+ case SOLIDWORKS_ASSEMBLY:
+ case SOLIDWORKS_DRAWING:
+ break;
+ case PUBLISHER:
+ PublisherTextExtractor publisherTextExtractor =
+ new PublisherTextExtractor(root);
+ xhtml.element("p", publisherTextExtractor.getText());
+ break;
+ case WORDDOCUMENT:
+ new WordExtractor(context).parse(root, xhtml);
+ break;
+ case POWERPOINT:
+ new HSLFExtractor(context).parse(root, xhtml);
+ break;
+ case WORKBOOK:
+ case XLR:
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
+ break;
+ case PROJECT:
+ // We currently can't do anything beyond the metadata
+ break;
+ case VISIO:
+ VisioTextExtractor visioTextExtractor =
+ new VisioTextExtractor(root);
+ for (String text : visioTextExtractor.getAllText()) {
+ xhtml.element("p", text);
+ }
+ break;
+ case OUTLOOK:
+ OutlookExtractor extractor =
+ new OutlookExtractor(root, context);
+
+ extractor.parse(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ EncryptionInfo info = new EncryptionInfo(root);
+ Decryptor d = Decryptor.getInstance(info);
+
+ try {
+ // By default, use the default Office Password
+ String password = Decryptor.DEFAULT_PASSWORD;
+
+ // If they supplied a Password Provider, ask that for the password,
+ // and use the provider given one if available (stick with default if not)
+ PasswordProvider passwordProvider = context.get(PasswordProvider.class);
+ if (passwordProvider != null) {
+ String suppliedPassword = passwordProvider.getPassword(metadata);
+ if (suppliedPassword != null) {
+ password = suppliedPassword;
+ }
+ }
+
+ // Check if we've the right password or not
+ if (!d.verifyPassword(password)) {
+ throw new EncryptedDocumentException();
+ }
+
+ // Decrypt the OLE2 stream, and delegate the resulting OOXML
+ // file to the regular OOXML parser for normal handling
+ OOXMLParser parser = new OOXMLParser();
+
+ parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ metadata, context);
+ } catch (GeneralSecurityException ex) {
+ throw new EncryptedDocumentException(ex);
+ }
+ default:
+ // For unsupported / unhandled types, just the metadata
+ // is extracted, which happened above
+ break;
+ }
+ }
+
+ private void setType(Metadata metadata, MediaType type) {
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ }
+
+ public enum POIFSDocumentType {
+ WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
+ OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE),
+ COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
+ WORDDOCUMENT("doc", MediaType.application("msword")),
+ UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+ ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
+ POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
+ PUBLISHER("pub", MediaType.application("x-mspublisher")),
+ PROJECT("mpp", MediaType.application("vnd.ms-project")),
+ VISIO("vsd", MediaType.application("vnd.visio")),
+ WORKS("wps", MediaType.application("vnd.ms-works")),
+ XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
+ OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+ SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+ SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+ SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
+
+ private final String extension;
+ private final MediaType type;
+
+ POIFSDocumentType(String extension, MediaType type) {
+ this.extension = extension;
+ this.type = type;
+ }
+
+ public static POIFSDocumentType detectType(POIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(NPOIFSFileSystem fs) {
+ return detectType(fs.getRoot());
+ }
+
+ public static POIFSDocumentType detectType(DirectoryEntry node) {
+ Set<String> names = new HashSet<String>();
+ for (Entry entry : node) {
+ names.add(entry.getName());
+ }
+ MediaType type = POIFSContainerDetector.detect(names, node);
+ for (POIFSDocumentType poifsType : values()) {
+ if (type.equals(poifsType.type)) {
+ return poifsType;
+ }
+ }
+ return UNKNOWN;
+ }
+
+ public String getExtension() {
+ return extension;
+ }
+
+ public MediaType getType() {
+ return type;
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A POI-powered Tika Parser for very old versions of Excel, from
+ * pre-OLE2 days, such as Excel 4.
+ */
+public class OldExcelParser extends AbstractParser {
+ private static final long serialVersionUID = 4611820730372823452L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("vnd.ms-excel.sheet.4"),
+ MediaType.application("vnd.ms-excel.workspace.4"),
+ MediaType.application("vnd.ms-excel.sheet.3"),
+ MediaType.application("vnd.ms-excel.workspace.3"),
+ MediaType.application("vnd.ms-excel.sheet.2")
+ )));
+
+ protected static void parse(OldExcelExtractor extractor,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
+ // Get the whole text, as a single string
+ String text = extractor.getText();
+
+ // Split and output
+ xhtml.startDocument();
+
+ String line;
+ BufferedReader reader = new BufferedReader(new StringReader(text));
+ while ((line = reader.readLine()) != null) {
+ xhtml.startElement("p");
+ xhtml.characters(line);
+ xhtml.endElement("p");
+ }
+
+ xhtml.endDocument();
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Extracts properties and text from an MS Document input stream
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Open the POI provided extractor
+ OldExcelExtractor extractor = new OldExcelExtractor(stream);
+
+ // We can't do anything about metadata, as these old formats
+ // didn't have any stored with them
+
+ // Set the content type
+ // TODO Get the version and type, to set as the Content Type
+
+ // Have the text extracted and given to our Content Handler
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parse(extractor, xhtml);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
+import java.text.ParseException;
+import java.util.Date;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
+import org.apache.poi.hsmf.datatypes.ByteChunk;
+import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.datatypes.Types;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Outlook Message Parser.
+ */
+public class OutlookExtractor extends AbstractPOIFSExtractor {
+ private static final Metadata EMPTY_METADATA = new Metadata();
+ HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
+ private final MAPIMessage msg;
+
+ public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ this(filesystem.getRoot(), context);
+ }
+
+ public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
+ super(context);
+
+ try {
+ this.msg = new MAPIMessage(root);
+ } catch (IOException e) {
+ throw new TikaException("Failed to parse Outlook message", e);
+ }
+ }
+
+ public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+ throws TikaException, SAXException, IOException {
+ try {
+ msg.setReturnNullOnMissingChunk(true);
+
+ // If the message contains strings that aren't stored
+ // as Unicode, try to sort out an encoding for them
+ if (msg.has7BitEncodingStrings()) {
+ guess7BitEncoding(msg);
+ }
+
+ // Start with the metadata
+ String subject = msg.getSubject();
+ String from = msg.getDisplayFrom();
+
+ metadata.set(TikaCoreProperties.CREATOR, from);
+ metadata.set(Metadata.MESSAGE_FROM, from);
+ metadata.set(Metadata.MESSAGE_TO, msg.getDisplayTo());
+ metadata.set(Metadata.MESSAGE_CC, msg.getDisplayCC());
+ metadata.set(Metadata.MESSAGE_BCC, msg.getDisplayBCC());
+
+ metadata.set(TikaCoreProperties.TITLE, subject);
+ // TODO: Move to description in Tika 2.0
+ metadata.set(TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ msg.getConversationTopic());
+
+ try {
+ for (String recipientAddress : msg.getRecipientEmailAddressList()) {
+ if (recipientAddress != null)
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, recipientAddress);
+ }
+ } catch (ChunkNotFoundException he) {
+ } // Will be fixed in POI 3.7 Final
+
+ // Date - try two ways to find it
+ // First try via the proper chunk
+ if (msg.getMessageDate() != null) {
+ metadata.set(TikaCoreProperties.CREATED, msg.getMessageDate().getTime());
+ metadata.set(TikaCoreProperties.MODIFIED, msg.getMessageDate().getTime());
+ } else {
+ try {
+ // Failing that try via the raw headers
+ String[] headers = msg.getHeaders();
+ if (headers != null && headers.length > 0) {
+ for (String header : headers) {
+ if (header.toLowerCase(Locale.ROOT).startsWith("date:")) {
+ String date = header.substring(header.indexOf(':') + 1).trim();
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MboxParser.parseDate(date);
+ metadata.set(TikaCoreProperties.CREATED, d);
+ metadata.set(TikaCoreProperties.MODIFIED, d);
+ } catch (ParseException e) {
+ // Store it as-is, and hope for the best...
+ metadata.set(TikaCoreProperties.CREATED, date);
+ metadata.set(TikaCoreProperties.MODIFIED, date);
+ }
+ break;
+ }
+ }
+ }
+ } catch (ChunkNotFoundException he) {
+ // We can't find the date, sorry...
+ }
+ }
+
+
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ }
+ xhtml.endElement("dl");
+
+ // Get the message body. Preference order is: html, rtf, text
+ Chunk htmlChunk = null;
+ Chunk rtfChunk = null;
+ Chunk textChunk = null;
+ for (Chunk chunk : msg.getMainChunks().getChunks()) {
+ if (chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+ htmlChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+ rtfChunk = chunk;
+ }
+ if (chunk.getChunkId() == MAPIProperty.BODY.id) {
+ textChunk = chunk;
+ }
+ }
+
+ boolean doneBody = false;
+ xhtml.startElement("div", "class", "message-body");
+ if (htmlChunk != null) {
+ byte[] data = null;
+ if (htmlChunk instanceof ByteChunk) {
+ data = ((ByteChunk) htmlChunk).getValue();
+ } else if (htmlChunk instanceof StringChunk) {
+ data = ((StringChunk) htmlChunk).getRawValue();
+ }
+ if (data != null) {
+ HtmlParser htmlParser = new HtmlParser();
+ htmlParser.parse(
+ new ByteArrayInputStream(data),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext()
+ );
+ doneBody = true;
+ }
+ }
+ if (rtfChunk != null && !doneBody) {
+ ByteChunk chunk = (ByteChunk) rtfChunk;
+ MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+ MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+ );
+ RTFParser rtfParser = new RTFParser();
+ rtfParser.parse(
+ new ByteArrayInputStream(rtf.getData()),
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ new Metadata(), new ParseContext());
+ doneBody = true;
+ }
+ if (textChunk != null && !doneBody) {
+ xhtml.element("p", ((StringChunk) textChunk).getValue());
+ }
+ xhtml.endElement("div");
+
+ // Process the attachments
+ for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
+ xhtml.startElement("div", "class", "attachment-entry");
+
+ String filename = null;
+ if (attachment.attachLongFileName != null) {
+ filename = attachment.attachLongFileName.getValue();
+ } else if (attachment.attachFileName != null) {
+ filename = attachment.attachFileName.getValue();
+ }
+ if (filename != null && filename.length() > 0) {
+ xhtml.element("h1", filename);
+ }
+
+ if (attachment.attachData != null) {
+ handleEmbeddedResource(
+ TikaInputStream.get(attachment.attachData.getValue()),
+ filename, null,
+ null, xhtml, true
+ );
+ }
+ if (attachment.attachmentDirectory != null) {
+ handleEmbeddedOfficeDoc(
+ attachment.attachmentDirectory.getDirectory(),
+ xhtml
+ );
+ }
+
+ xhtml.endElement("div");
+ }
+ } catch (ChunkNotFoundException e) {
+ throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk", e);
+ }
+ }
+
+ private void header(XHTMLContentHandler xhtml, String key, String value)
+ throws SAXException {
+ if (value != null && value.length() > 0) {
+ xhtml.element("dt", key);
+ xhtml.element("dd", value);
+ }
+ }
+
+ /**
+ * Tries to identify the correct encoding for 7-bit (non-unicode)
+ * strings in the file.
+ * <p>Many messages store their strings as unicode, which is
+ * nice and easy. Some use one-byte encodings for their
+ * strings, but don't always store the encoding anywhere
+ * helpful in the file.</p>
+ * <p>This method checks for codepage properties, and failing that
+ * looks at the headers for the message, and uses these to
+ * guess the correct encoding for your file.</p>
+ * <p>Bug #49441 has more on why this is needed</p>
+ * <p>This is taken verbatim from POI (TIKA-1238)
+ * as a temporary workaround to prevent unsupported encoding exceptions</p>
+ */
+ private void guess7BitEncoding(MAPIMessage msg) {
+ Chunks mainChunks = msg.getMainChunks();
+ //sanity check
+ if (mainChunks == null) {
+ return;
+ }
+
+ Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
+ if (props != null) {
+ // First choice is a codepage property
+ for (MAPIProperty prop : new MAPIProperty[]{
+ MAPIProperty.MESSAGE_CODEPAGE,
+ MAPIProperty.INTERNET_CPID
+ }) {
+ List<PropertyValue> val = props.get(prop);
+ if (val != null && val.size() > 0) {
+ int codepage = ((PropertyValue.LongPropertyValue) val.get(0)).getValue();
+ String encoding = null;
+ try {
+ encoding = CodePageUtil.codepageToEncoding(codepage, true);
+ } catch (UnsupportedEncodingException e) {
+ //swallow
+ }
+ if (tryToSet7BitEncoding(msg, encoding)) {
+ return;
+ }
+ }
+ }
+ }
+
+ // Second choice is a charset on a content type header
+ try {
+ String[] headers = msg.getHeaders();
+ if(headers != null && headers.length > 0) {
+ // Look for a content type with a charset
+ Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
+
+ for(String header : headers) {
+ if(header.startsWith("Content-Type")) {
+ Matcher m = p.matcher(header);
+ if(m.matches()) {
+ // Found it! Tell all the string chunks
+ String charset = m.group(1);
+ if (tryToSet7BitEncoding(msg, charset)) {
+ return;
+ }
+ }
+ }
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+
+ // Nothing suitable in the headers, try HTML
+ // TODO: do we need to replicate this in Tika? If we wind up
+ // parsing the html version of the email, this is duplicative??
+ // Or do we need to reset the header strings based on the html
+ // meta header if there is no other information?
+ try {
+ String html = msg.getHtmlBody();
+ if(html != null && html.length() > 0) {
+ Charset charset = null;
+ try {
+ charset = detector.detect(new ByteArrayInputStream(
+ html.getBytes(UTF_8)), EMPTY_METADATA);
+ } catch (IOException e) {
+ //swallow
+ }
+ if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
+ return;
+ }
+ }
+ } catch(ChunkNotFoundException e) {}
+
+ //absolute last resort, try charset detector
+ StringChunk text = mainChunks.textBodyChunk;
+ if (text != null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText(text.getRawValue());
+ CharsetMatch match = detector.detect();
+ if (match != null && match.getConfidence() > 35 &&
+ tryToSet7BitEncoding(msg, match.getName())) {
+ return;
+ }
+ }
+ }
+
+ private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
+ if (charsetName == null) {
+ return false;
+ }
+
+ if (charsetName.equalsIgnoreCase("utf-8")) {
+ return false;
+ }
+ try {
+ if (Charset.isSupported(charsetName)) {
+ msg.set7BitEncoding(charsetName);
+ return true;
+ }
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+ //swallow
+ }
+ return false;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.mime.MediaType.application;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.DocumentNode;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
/**
 * A detector that works on a POIFS OLE2 document
 * to figure out exactly what the file is.
 * This should work for all OLE2 documents, whether
 * they are ones supported by POI or not.
 */
public class POIFSContainerDetector implements Detector {

    /**
     * The OLE base file format
     */
    public static final MediaType OLE = application("x-tika-msoffice");
    /**
     * The protected OOXML base file format
     */
    public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
    /**
     * General embedded document type within an OLE2 container
     */
    public static final MediaType GENERAL_EMBEDDED = application("x-tika-msoffice-embedded");
    /**
     * An OLE10 Native embedded document within another OLE2 document
     */
    public static final MediaType OLE10_NATIVE =
            new MediaType(GENERAL_EMBEDDED, "format", "ole10_native");
    /**
     * Some other kind of embedded document, in a CompObj container within another OLE2 document
     */
    public static final MediaType COMP_OBJ =
            new MediaType(GENERAL_EMBEDDED, "format", "comp_obj");
    /**
     * Microsoft Excel
     */
    public static final MediaType XLS = application("vnd.ms-excel");
    /**
     * Microsoft Word
     */
    public static final MediaType DOC = application("msword");
    /**
     * Microsoft PowerPoint
     */
    public static final MediaType PPT = application("vnd.ms-powerpoint");
    /**
     * Microsoft Publisher
     */
    public static final MediaType PUB = application("x-mspublisher");
    /**
     * Microsoft Visio
     */
    public static final MediaType VSD = application("vnd.visio");
    /**
     * Microsoft Works
     */
    public static final MediaType WPS = application("vnd.ms-works");
    /**
     * Microsoft Works Spreadsheet 7.0
     */
    public static final MediaType XLR = application("x-tika-msworks-spreadsheet");
    /**
     * Microsoft Outlook
     */
    public static final MediaType MSG = application("vnd.ms-outlook");
    /**
     * Microsoft Project
     */
    public static final MediaType MPP = application("vnd.ms-project");
    /**
     * StarOffice Calc
     */
    public static final MediaType SDC = application("vnd.stardivision.calc");
    /**
     * StarOffice Draw
     */
    public static final MediaType SDA = application("vnd.stardivision.draw");
    /**
     * StarOffice Impress
     */
    public static final MediaType SDD = application("vnd.stardivision.impress");
    /**
     * StarOffice Writer
     */
    public static final MediaType SDW = application("vnd.stardivision.writer");
    /**
     * SolidWorks CAD file
     */
    public static final MediaType SLDWORKS = application("sldworks");
    /**
     * Hangul Word Processor (Korean)
     */
    public static final MediaType HWP = application("x-hwp-v5");
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -3028021741663605293L;
    /**
     * An ASCII String "StarImpress"
     */
    private static final byte[] STAR_IMPRESS = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x49, 0x6d, 0x70, 0x72, 0x65, 0x73, 0x73
    };
    /**
     * An ASCII String "StarDraw"
     */
    private static final byte[] STAR_DRAW = new byte[]{
            0x53, 0x74, 0x61, 0x72, 0x44, 0x72, 0x61, 0x77
    };
    /**
     * An ASCII String "Quill96" for Works Files
     */
    private static final byte[] WORKS_QUILL96 = new byte[]{
            0x51, 0x75, 0x69, 0x6c, 0x6c, 0x39, 0x36
    };
    /**
     * Regexp for matching the MPP Project Data stream
     */
    private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top level streams within the file.
     *
     * @deprecated Use {@link #detect(Set, DirectoryEntry)} and pass the root
     * entry of the filesystem whose type is to be detected, as a
     * second argument.
     */
    protected static MediaType detect(Set<String> names) {
        return detect(names, null);
    }

    /**
     * Internal detection of the specific kind of OLE2 document, based on the
     * names of the top-level streams within the file. In some cases the
     * detection may need access to the root {@link DirectoryEntry} of that file
     * for best results. The entry can be given as a second, optional argument.
     *
     * NOTE: the order of the checks below matters - several formats share
     * stream names (e.g. Works spreadsheets also contain "Workbook"), so
     * do not reorder branches without checking the inline comments.
     *
     * @param names the set of top-level entry names of the POIFS filesystem,
     *              may be null
     * @param root  the root directory entry of the filesystem, or null if it
     *              is not available ("legacy mode")
     * @return the most specific media type that could be determined, or
     *         {@link #OLE} as the generic fallback
     */
    protected static MediaType detect(Set<String> names, DirectoryEntry root) {
        if (names != null) {
            if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
                return SLDWORKS;
            } else if (names.contains("StarCalcDocument")) {
                // Star Office Calc
                return SDC;
            } else if (names.contains("StarWriterDocument")) {
                return SDW;
            } else if (names.contains("StarDrawDocument3")) {
                if (root == null) {
                    /*
                     * This is either StarOfficeDraw or StarOfficeImpress, we have
                     * to consult the CompObj to distinguish them, if this method is
                     * called in "legacy mode", without the root, just return
                     * x-tika-msoffice. The one-argument method is only for backward
                     * compatibility, if someone calls old API he/she can get the
                     * old result.
                     */
                    return OLE;
                } else {
                    return processCompObjFormatType(root);
                }
            } else if (names.contains("\u0005HwpSummaryInformation")) {
                // Hangul Word Processor v5+ (previous aren't OLE2-based)
                return HWP;
            } else if (names.contains("WksSSWorkBook")) {
                // This check has to be before names.contains("Workbook")
                // Works 7.0 spreadsheet files contain both
                // we want to avoid classifying this as Excel
                return XLR;
            } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
                return XLS;
            } else if (names.contains("Book")) {
                // Excel 95 or older, we won't be able to parse this....
                return XLS;
            } else if (names.contains("EncryptedPackage") &&
                    names.contains("EncryptionInfo") &&
                    names.contains("\u0006DataSpaces")) {
                // This is a protected OOXML document, which is an OLE2 file
                // with an Encrypted Stream which holds the OOXML data
                // Without decrypting the stream, we can't tell what kind of
                // OOXML file we have. Return a general OOXML Protected type,
                // and hope the name based detection can guess the rest!
                return OOXML_PROTECTED;
            } else if (names.contains("EncryptedPackage")) {
                // EncryptedPackage without the other encryption streams:
                // can't say more than "some OLE2 container"
                return OLE;
            } else if (names.contains("WordDocument")) {
                return DOC;
            } else if (names.contains("Quill")) {
                return PUB;
            } else if (names.contains("PowerPoint Document")) {
                return PPT;
            } else if (names.contains("VisioDocument")) {
                return VSD;
            } else if (names.contains("\u0001Ole10Native")) {
                return OLE10_NATIVE;
            } else if (names.contains("MatOST")) {
                // this occurs on older Works Word Processor files (versions 3.0 and 4.0)
                return WPS;
            } else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
                // Newer Works files
                return WPS;
            } else if (names.contains("Contents") && names.contains("\u0003ObjInfo")) {
                return COMP_OBJ;
            } else if (names.contains("CONTENTS") && names.contains("\u0001CompObj")) {
                // CompObj is a general kind of OLE2 embedding, but this may be an old Works file
                // If we have the Directory, check
                if (root != null) {
                    MediaType type = processCompObjFormatType(root);
                    if (type == WPS) {
                        return WPS;
                    } else {
                        // Assume it's a general CompObj embedded resource
                        return COMP_OBJ;
                    }
                } else {
                    // Assume it's a general CompObj embedded resource
                    return COMP_OBJ;
                }
            } else if (names.contains("CONTENTS")) {
                // CONTENTS without SPELLING nor CompObj normally means some sort
                // of embedded non-office file inside an OLE2 document
                // This is most commonly triggered on nested directories
                return OLE;
            } else if (names.contains("\u0001CompObj") &&
                    (names.contains("Props") || names.contains("Props9") || names.contains("Props12"))) {
                // Could be Project, look for common name patterns
                for (String name : names) {
                    if (mppDataMatch.matcher(name).matches()) {
                        return MPP;
                    }
                }
                // no MPP data stream matched: falls through to the generic
                // OLE return below
            } else if (names.contains("PerfectOffice_MAIN")) {
                if (names.contains("SlideShow")) {
                    return MediaType.application("x-corelpresentations"); // .shw
                } else if (names.contains("PerfectOffice_OBJECTS")) {
                    return MediaType.application("x-quattro-pro"); // .wb?
                }
            } else if (names.contains("NativeContent_MAIN")) {
                return MediaType.application("x-quattro-pro"); // .qpw
            } else {
                // Outlook .msg files have per-property streams named
                // "__substg1.0_<tag>" rather than one fixed stream name
                for (String name : names) {
                    if (name.startsWith("__substg1.0_")) {
                        return MSG;
                    }
                }
            }
        }

        // Couldn't detect a more specific type
        return OLE;
    }

    /**
     * Is this one of the kinds of formats which uses CompObj to
     * store all of their data, eg Star Draw, Star Impress or
     * (older) Works?
     * If not, it's likely an embedded resource
     */
    private static MediaType processCompObjFormatType(DirectoryEntry root) {
        try {
            Entry e = root.getEntry("\u0001CompObj");
            if (e != null && e.isDocumentEntry()) {
                DocumentNode dn = (DocumentNode) e;
                // NOTE(review): this stream is not explicitly closed;
                // DocumentInputStream reads from in-memory POIFS data, but
                // confirm no resource is pinned here
                DocumentInputStream stream = new DocumentInputStream(dn);
                byte[] bytes = IOUtils.toByteArray(stream);
                /*
                 * This array contains a string with a normal ASCII name of the
                 * application used to create this file. We want to search for that
                 * name.
                 */
                if (arrayContains(bytes, STAR_DRAW)) {
                    return SDA;
                } else if (arrayContains(bytes, STAR_IMPRESS)) {
                    return SDD;
                } else if (arrayContains(bytes, WORKS_QUILL96)) {
                    return WPS;
                }
            }
        } catch (Exception e) {
            /*
             * "root.getEntry" can throw FileNotFoundException. The code inside
             * "if" can throw IOExceptions. Theoretically. Practically no
             * exceptions will likely ever appear.
             *
             * Swallow all of them. If any occur, we just assume that we can't
             * distinguish between Draw and Impress and return something safe:
             * x-tika-msoffice
             */
        }
        return OLE;
    }

    // poor man's search for byte arrays, replace with some library call if
    // you know one without adding new dependencies
    // NOTE: assumes "smaller" is non-empty (all callers pass fixed
    // non-empty signatures); an empty needle would throw AIOOBE
    private static boolean arrayContains(byte[] larger, byte[] smaller) {
        int largerCounter = 0;
        int smallerCounter = 0;
        while (largerCounter < larger.length) {
            if (larger[largerCounter] == smaller[smallerCounter]) {
                largerCounter++;
                smallerCounter++;
                if (smallerCounter == smaller.length) {
                    return true;
                }
            } else {
                // mismatch: restart one position past where this attempt began
                largerCounter = largerCounter - smallerCounter + 1;
                smallerCounter = 0;
            }
        }
        return false;
    }

    /**
     * Opens the stream as a POIFS filesystem (spooling it to a file first)
     * and returns its top-level entry names. On any POI failure an empty
     * set is returned, meaning "type unknown".
     */
    private static Set<String> getTopLevelNames(TikaInputStream stream)
            throws IOException {
        // Force the document stream to a (possibly temporary) file
        // so we don't modify the current position of the stream
        File file = stream.getFile();

        try {
            NPOIFSFileSystem fs = new NPOIFSFileSystem(file, true);

            // Optimize a possible later parsing process by keeping
            // a reference to the already opened POI file system
            stream.setOpenContainer(fs);

            return getTopLevelNames(fs.getRoot());
        } catch (IOException e) {
            // Parse error in POI, so we don't know the file type
            return Collections.emptySet();
        } catch (RuntimeException e) {
            // Another problem in POI
            return Collections.emptySet();
        }
    }

    /**
     * Collects the names of the direct children of the given directory node.
     */
    private static Set<String> getTopLevelNames(DirectoryNode root) {
        Set<String> names = new HashSet<String>();
        for (Entry entry : root) {
            names.add(entry.getName());
        }
        return names;
    }

    /**
     * Detects the exact OLE2-based media type of the stream. When given a
     * {@link TikaInputStream} with an already-opened POIFS container, the
     * names are read from it directly; otherwise the OLE2 magic bytes are
     * checked and, for TikaInputStreams, the container is opened to read the
     * top-level names. Plain InputStreams can only be identified as generic
     * {@link #OLE} or octet-stream.
     */
    public MediaType detect(InputStream input, Metadata metadata)
            throws IOException {
        // Check if we have access to the document
        if (input == null) {
            return MediaType.OCTET_STREAM;
        }

        // If this is a TikaInputStream wrapping an already
        // parsed NPOIFileSystem/DirectoryNode, just get the
        // names from the root:
        TikaInputStream tis = TikaInputStream.cast(input);
        Set<String> names = null;
        if (tis != null) {
            Object container = tis.getOpenContainer();
            if (container instanceof NPOIFSFileSystem) {
                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
            } else if (container instanceof DirectoryNode) {
                names = getTopLevelNames((DirectoryNode) container);
            }
        }

        if (names == null) {
            // Check if the document starts with the OLE header
            // (D0 CF 11 E0 A1 B1 1A E1)
            input.mark(8);
            try {
                if (input.read() != 0xd0 || input.read() != 0xcf
                        || input.read() != 0x11 || input.read() != 0xe0
                        || input.read() != 0xa1 || input.read() != 0xb1
                        || input.read() != 0x1a || input.read() != 0xe1) {
                    return MediaType.OCTET_STREAM;
                }
            } finally {
                input.reset();
            }
        }

        // We can only detect the exact type when given a TikaInputStream
        if (names == null && tis != null) {
            // Look for known top level entry names to detect the document type
            names = getTopLevelNames(tis);
        }

        // Detect based on the names (as available)
        if (tis != null &&
                tis.getOpenContainer() != null &&
                tis.getOpenContainer() instanceof NPOIFSFileSystem) {
            return detect(names, ((NPOIFSFileSystem) tis.getOpenContainer()).getRoot());
        } else {
            return detect(names, null);
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+
+/**
+ * Extractor for common OLE2 (HPSF) metadata.
+ * <p>
+ * Reads the {@code \005SummaryInformation} and
+ * {@code \005DocumentSummaryInformation} streams of an OLE2 container and
+ * copies their properties into a Tika {@link Metadata} object. Each value is
+ * written under both the new-style (Office/OOXML/TikaCoreProperties) keys and
+ * the legacy Tika 1.0 keys, which are scheduled for removal in Tika 2.0.
+ */
+public class SummaryExtractor {
+    // Log under this class (was AbstractPOIFSExtractor.class — a copy/paste
+    // slip that mis-attributed warnings emitted from parseSummaryEntryIfExists).
+    private static final Log logger = LogFactory.getLog(SummaryExtractor.class);
+
+    /** Name of the OLE2 stream that holds the SummaryInformation property set. */
+    private static final String SUMMARY_INFORMATION =
+            SummaryInformation.DEFAULT_STREAM_NAME;
+
+    /** Name of the OLE2 stream that holds the DocumentSummaryInformation property set. */
+    private static final String DOCUMENT_SUMMARY_INFORMATION =
+            DocumentSummaryInformation.DEFAULT_STREAM_NAME;
+
+    /** Target for every extracted property; supplied by the caller. */
+    private final Metadata metadata;
+
+    /**
+     * @param metadata metadata collection that extracted properties are added to
+     */
+    public SummaryExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    /**
+     * Parses both HPSF summary streams from the root of the given filesystem.
+     *
+     * @param filesystem open OLE2 filesystem to read from
+     * @throws IOException on stream read failure
+     * @throws TikaException on malformed HPSF property sets
+     */
+    public void parseSummaries(NPOIFSFileSystem filesystem)
+            throws IOException, TikaException {
+        parseSummaries(filesystem.getRoot());
+    }
+
+    /**
+     * Parses both HPSF summary streams found directly under the given node.
+     *
+     * @param root directory node to look up the summary entries in
+     * @throws IOException on stream read failure
+     * @throws TikaException on malformed HPSF property sets
+     */
+    public void parseSummaries(DirectoryNode root)
+            throws IOException, TikaException {
+        parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+        parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
+    }
+
+    /**
+     * Reads one summary entry if present, dispatching to the matching
+     * {@code parse(...)} overload. A missing entry or a non-property stream is
+     * silently skipped; any other unexpected failure is logged and swallowed so
+     * that broken metadata never aborts text extraction.
+     */
+    private void parseSummaryEntryIfExists(
+            DirectoryNode root, String entryName)
+            throws IOException, TikaException {
+        try {
+            DocumentEntry entry =
+                    (DocumentEntry) root.getEntry(entryName);
+            PropertySet properties =
+                    new PropertySet(new DocumentInputStream(entry));
+            if (properties.isSummaryInformation()) {
+                parse(new SummaryInformation(properties));
+            }
+            if (properties.isDocumentSummaryInformation()) {
+                parse(new DocumentSummaryInformation(properties));
+            }
+        } catch (FileNotFoundException e) {
+            // entry does not exist, just skip it
+        } catch (NoPropertySetStreamException e) {
+            // no property stream, just skip it
+        } catch (UnexpectedPropertySetTypeException e) {
+            throw new TikaException("Unexpected HPSF document", e);
+        } catch (MarkUnsupportedException e) {
+            throw new TikaException("Invalid DocumentInputStream", e);
+        } catch (Exception e) {
+            // Deliberate best-effort: corrupt summary data is common in the
+            // wild and must not fail the overall parse.
+            logger.warn("Ignoring unexpected exception while parsing summary entry " + entryName, e);
+        }
+    }
+
+    /** Copies SummaryInformation properties (title, author, dates, counts, ...). */
+    private void parse(SummaryInformation summary) {
+        set(TikaCoreProperties.TITLE, summary.getTitle());
+        // Author may be a semicolon-delimited list; split into multiple values
+        addMulti(metadata, TikaCoreProperties.CREATOR, summary.getAuthor());
+        set(TikaCoreProperties.KEYWORDS, summary.getKeywords());
+        // TODO Move to OO subject in Tika 2.0
+        set(TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, summary.getSubject());
+        set(TikaCoreProperties.MODIFIER, summary.getLastAuthor());
+        set(TikaCoreProperties.COMMENTS, summary.getComments());
+        set(OfficeOpenXMLExtended.TEMPLATE, summary.getTemplate());
+        set(OfficeOpenXMLExtended.APPLICATION, summary.getApplicationName());
+        set(OfficeOpenXMLCore.REVISION, summary.getRevNumber());
+        set(TikaCoreProperties.CREATED, summary.getCreateDateTime());
+        set(TikaCoreProperties.MODIFIED, summary.getLastSaveDateTime());
+        set(TikaCoreProperties.PRINT_DATE, summary.getLastPrinted());
+        set(Metadata.EDIT_TIME, summary.getEditTime());
+        set(OfficeOpenXMLExtended.DOC_SECURITY, summary.getSecurity());
+
+        // New style counts
+        set(Office.WORD_COUNT, summary.getWordCount());
+        set(Office.CHARACTER_COUNT, summary.getCharCount());
+        set(Office.PAGE_COUNT, summary.getPageCount());
+        if (summary.getPageCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getPageCount());
+        }
+
+        // Old style, Tika 1.0 properties
+        // TODO Remove these in Tika 2.0
+        set(Metadata.TEMPLATE, summary.getTemplate());
+        set(Metadata.APPLICATION_NAME, summary.getApplicationName());
+        set(Metadata.REVISION_NUMBER, summary.getRevNumber());
+        set(Metadata.SECURITY, summary.getSecurity());
+        set(MSOffice.WORD_COUNT, summary.getWordCount());
+        set(MSOffice.CHARACTER_COUNT, summary.getCharCount());
+        set(MSOffice.PAGE_COUNT, summary.getPageCount());
+    }
+
+    /** Copies DocumentSummaryInformation properties (company, language, custom props). */
+    private void parse(DocumentSummaryInformation summary) {
+        set(OfficeOpenXMLExtended.COMPANY, summary.getCompany());
+        // Manager may be a semicolon-delimited list; split into multiple values
+        addMulti(metadata, OfficeOpenXMLExtended.MANAGER, summary.getManager());
+        set(TikaCoreProperties.LANGUAGE, getLanguage(summary));
+        set(OfficeOpenXMLCore.CATEGORY, summary.getCategory());
+
+        // New style counts
+        set(Office.SLIDE_COUNT, summary.getSlideCount());
+        if (summary.getSlideCount() > 0) {
+            metadata.set(PagedText.N_PAGES, summary.getSlideCount());
+        }
+        // Old style, Tika 1.0 counts
+        // TODO Remove these in Tika 2.0
+        set(Metadata.COMPANY, summary.getCompany());
+        set(Metadata.MANAGER, summary.getManager());
+        set(MSOffice.SLIDE_COUNT, summary.getSlideCount());
+        set(Metadata.CATEGORY, summary.getCategory());
+
+        parse(summary.getCustomProperties());
+    }
+
+    /**
+     * Returns the document language if recorded as a custom "Language"
+     * property, or {@code null} when absent or not a String.
+     */
+    private String getLanguage(DocumentSummaryInformation summary) {
+        CustomProperties customProperties = summary.getCustomProperties();
+        if (customProperties != null) {
+            Object value = customProperties.get("Language");
+            if (value instanceof String) {
+                return (String) value;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Attempt to parse custom document properties and add to the collection of metadata.
+     * Each property key is prefixed with
+     * {@link Metadata#USER_DEFINED_METADATA_NAME_PREFIX}; values of unsupported
+     * types are silently dropped.
+     *
+     * @param customProperties custom HPSF properties, may be {@code null}
+     */
+    private void parse(CustomProperties customProperties) {
+        if (customProperties != null) {
+            for (String name : customProperties.nameSet()) {
+                // Apply the custom prefix
+                String key = Metadata.USER_DEFINED_METADATA_NAME_PREFIX + name;
+
+                // Get, convert and save property value
+                Object value = customProperties.get(name);
+                if (value instanceof String) {
+                    set(key, (String) value);
+                } else if (value instanceof Date) {
+                    Property prop = Property.externalDate(key);
+                    metadata.set(prop, (Date) value);
+                } else if (value instanceof Boolean) {
+                    Property prop = Property.externalBoolean(key);
+                    metadata.set(prop, value.toString());
+                } else if (value instanceof Long) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Long) value).intValue());
+                } else if (value instanceof Double) {
+                    Property prop = Property.externalReal(key);
+                    metadata.set(prop, (Double) value);
+                } else if (value instanceof Integer) {
+                    Property prop = Property.externalInteger(key);
+                    metadata.set(prop, ((Integer) value).intValue());
+                }
+            }
+        }
+    }
+
+    /** Sets a string keyed metadata value, ignoring {@code null}. */
+    private void set(String name, String value) {
+        if (value != null) {
+            metadata.set(name, value);
+        }
+    }
+
+    /** Sets a string property value, ignoring {@code null}. */
+    private void set(Property property, String value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets a date property value, ignoring {@code null}. */
+    private void set(Property property, Date value) {
+        if (value != null) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets an int property value; zero/negative means "absent" and is skipped. */
+    private void set(Property property, int value) {
+        if (value > 0) {
+            metadata.set(property, value);
+        }
+    }
+
+    /** Sets a long value as a string; zero/negative means "absent" and is skipped. */
+    private void set(String name, long value) {
+        if (value > 0) {
+            metadata.set(name, Long.toString(value));
+        }
+    }
+
+    //MS stores values that should be multiple values (e.g. dc:creator)
+    //as a semicolon-delimited list. We need to split
+    //on semicolon to add each value.
+    public static void addMulti(Metadata metadata, Property property, String string) {
+        if (string == null) {
+            return;
+        }
+        String[] parts = string.split(";");
+        // Track existing values so repeated calls never add duplicates
+        String[] current = metadata.getValues(property);
+        Set<String> seen = new HashSet<>();
+        if (current != null) {
+            for (String val : current) {
+                seen.add(val);
+            }
+        }
+        for (String part : parts) {
+            if (! seen.contains(part)) {
+                metadata.add(property, part);
+                seen.add(part);
+            }
+        }
+    }
+
+}