You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [8/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-mo...
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+/**
+ * Format-agnostic bookkeeping for numbered and bulleted paragraphs.
+ * <p>
+ * Subclasses populate {@link #listLevelMap} and {@link #overrideTupleMap}
+ * and call {@link ParagraphLevelCounter#incrementLevel(int, LevelTuple[])}
+ * once per list paragraph, in document order, to obtain the text of the
+ * list marker (for example <code>"1.2. "</code> or a bullet).
+ */
+public abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+
+    /**
+     * Matches the <code>%N</code> place holders inside a level text.
+     * Compiled once and shared by every counter instance.
+     */
+    private static final Pattern LEVEL_INTERPOLATOR_PATTERN = Pattern.compile("%(\\d+)");
+
+    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+    //helper class that is docx/doc format agnostic
+    protected class ParagraphLevelCounter {
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private final Integer NOT_SEEN_YET = -1;
+        private final Integer FIRST_SKIPPED = -2;
+        private final LevelTuple[] levelTuples;
+        //kept as a package-visible instance field for compatibility;
+        //it now simply points at the shared, pre-compiled pattern
+        Pattern LEVEL_INTERPOLATOR = LEVEL_INTERPOLATOR_PATTERN;
+        //current counter per level; index == level number
+        private List<Integer> counts = new ArrayList<Integer>();
+        private int lastLevel = -1;
+
+        /**
+         * @param levelTuples the base definition of every level of this list
+         */
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        /**
+         * @return the number of levels in the base definition
+         */
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber         level number that is being incremented
+         * @param overrideLevelTuples per-level overrides that take precedence over the
+         *                            base definition, or <code>null</code> if there are none
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            //make sure every level between the last one seen and the requested
+            //one has a counter, so that e.g. "1.1.1" can be rendered even if
+            //the intermediate levels never appeared in the document
+            for (int i = lastLevel + 1; i < levelNumber; i++) {
+                if (i >= counts.size()) {
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                //level already has a counter: reset the deeper levels,
+                //then either start or advance this one
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+            } else {
+                //first time this level is seen
+                counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            }
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * Returns the override for a level, or <code>null</code> when no usable
+         * override exists (no override array, array too short, or empty slot).
+         */
+        private LevelTuple overrideAt(LevelTuple[] overrideLevelTuples, int level) {
+            if (overrideLevelTuples == null || level < 0 || level >= overrideLevelTuples.length) {
+                return null;
+            }
+            return overrideLevelTuples[level];
+        }
+
+        /**
+         * @param level               which level to format
+         * @param overrideLevelTuples per-level overrides, may be <code>null</code>
+         * @return the string that represents the number and the surrounding text for this paragraph
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            LevelTuple override = overrideAt(overrideLevelTuples, level);
+            boolean isLegal = (override != null) ? override.isLegal : levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET + " ";
+            }
+
+            String lvlText = (override == null || override.lvlText == null) ?
+                    levelTuples[level].lvlText : override.lvlText;
+            if (lvlText == null) {
+                //nothing to interpolate; same result as an empty level text
+                return "";
+            }
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText.substring(last, m.start()));
+                String lvlString = m.group(1);
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(lvlString);
+                } catch (NumberFormatException e) {
+                    //digits that overflow an int: fall through with -1, which
+                    //formatNum treats as an unknown level
+                }
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                sb.append(formatNum(lvlNum - 1, isLegal, overrideLevelTuples));
+                last = m.end();
+            }
+            sb.append(lvlText.substring(last));
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        //actual level number; can return empty string if numberformatter fails
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                //flag value (not seen / skipped): render as the first item
+                count = 1;
+            }
+            if ("bullet".equals(numFmt) || "none".equals(numFmt)) {
+                return "";
+            }
+            //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+            if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            }
+
+            //style codes understood by NumberFormatter.getNumber;
+            //decimal, decimalZero and anything unknown use 0
+            int numFmtStyle = 0;
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            }
+            try {
+                String number = NumberFormatter.getNumber(count, numFmtStyle);
+                if ("decimalZero".equals(numFmt) && count < 10) {
+                    //decimalZero only pads single digits: 01, 02, ... 09, 10, 11
+                    return "0" + number;
+                }
+                return number;
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
+        }
+
+        /**
+         * Appends the English ordinal suffix to a number (1st, 2nd, 3rd, 4th,
+         * 11th, 12th, 13th, 21st, ...).
+         */
+        private String ordinalize(int count) {
+            //this is only good for locale == English
+            String countString = Integer.toString(count);
+            int lastTwoDigits = Math.abs(count % 100);
+            if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
+                //the teens are irregular: 11th, 12th, 13th, 111th, ...
+                return countString + "th";
+            }
+            switch (Math.abs(count % 10)) {
+                case 1:
+                    return countString + "st";
+                case 2:
+                    return countString + "nd";
+                case 3:
+                    return countString + "rd";
+                default:
+                    return countString + "th";
+            }
+        }
+
+        /**
+         * Resolves the number format name for a level, preferring the
+         * override when it specifies one.
+         */
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            LevelTuple override = overrideAt(overrideLevelTuples, lvlNum);
+            return (override == null || override.numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : override.numFmt;
+        }
+
+        /**
+         * @return the current counter of a level, or 1 if the level has no counter yet
+         */
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        /**
+         * Marks the counters of all levels deeper than the given one as
+         * "not seen yet", subject to each level's restart setting.
+         */
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET) {
+                    //do nothing
+                } else if (cnt == FIRST_SKIPPED) {
+                    //do nothing
+                } else if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    LevelTuple override = overrideAt(overrideLevelTuples, levelNumber);
+                    int restart = (override == null || override.restart < 0) ?
+                            levelTuples[levelNumber].restart : override.restart;
+                    if (restart == 0) {
+                        //NOTE(review): this stops processing of all deeper levels
+                        //as well, not just this one -- confirm that is intended
+                        return;
+                    } else if (restart == -1 ||
+                            startlevelNumber <= restart - 1) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    } else {
+                        //do nothing/don't reset
+                    }
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        /**
+         * @return the first number to use for a level; 1 for levels beyond the base definition
+         */
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            }
+            LevelTuple override = overrideAt(overrideLevelTuples, levelNumber);
+            return (override == null || override.start < 0) ?
+                    levelTuples[levelNumber].start : override.start;
+        }
+    }
+
+    /**
+     * Immutable description of a single list level: where counting starts,
+     * when it restarts, the level text template and the number format.
+     */
+    protected class LevelTuple {
+        private final int start;
+        private final int restart;
+        private final String lvlText;
+        private final String numFmt;
+        private final boolean isLegal;
+
+        /**
+         * Creates a decimal level that starts at 1, uses restart value -1
+         * and is not "legal" numbered.
+         *
+         * @param lvlText level text template, e.g. <code>"%1."</code>
+         */
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        /**
+         * @param start   first number of the level; a negative value in an
+         *                override means "use the base definition"
+         * @param restart restart setting; 0 means never reset, -1 means always reset
+         * @param lvlText level text template, e.g. <code>"%1.%2."</code>
+         * @param numFmt  number format name, e.g. <code>"decimal"</code> or <code>"bullet"</code>
+         * @param isLegal if true, every level is rendered as decimal
+         */
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Shared plumbing for the extractors that read POIFS (OLE2) containers:
+ * lazy access to the Tika configuration, detector and mime repository,
+ * password lookup, and hand-off of embedded resources and embedded
+ * office documents to the {@link EmbeddedDocumentExtractor}.
+ */
+abstract class AbstractPOIFSExtractor {
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+    private final EmbeddedDocumentExtractor extractor;
+    private PasswordProvider passwordProvider;
+    private TikaConfig tikaConfig;
+    private MimeTypes mimeTypes;
+    private Detector detector;
+    private Metadata metadata;
+
+    /**
+     * Creates an extractor without document metadata.
+     *
+     * @param context parse context to pull collaborators from
+     */
+    protected AbstractPOIFSExtractor(ParseContext context) {
+        this(context, null);
+    }
+
+    /**
+     * Pulls the collaborators out of the parse context. If the context
+     * carries no {@link EmbeddedDocumentExtractor}, a
+     * {@link ParsingEmbeddedDocumentExtractor} is created for it.
+     *
+     * @param context  parse context to pull collaborators from
+     * @param metadata metadata of the document being parsed (handed to the
+     *                 password provider), may be <code>null</code>
+     */
+    protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex == null) {
+            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            this.extractor = ex;
+        }
+
+        this.passwordProvider = context.get(PasswordProvider.class);
+        this.tikaConfig = context.get(TikaConfig.class);
+        this.mimeTypes = context.get(MimeTypes.class);
+        this.detector = context.get(Detector.class);
+        this.metadata = metadata;
+    }
+
+    // Note - these cache, but avoid creating the default TikaConfig if not needed
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    /**
+     * @return the detector from the parse context, or the one of the
+     * Tika configuration if the context had none
+     */
+    protected Detector getDetector() {
+        if (detector == null) {
+            detector = getTikaConfig().getDetector();
+        }
+        return detector;
+    }
+
+    /**
+     * @return the mime types from the parse context, or the repository of
+     * the Tika configuration if the context had none
+     */
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes == null) {
+            mimeTypes = getTikaConfig().getMimeRepository();
+        }
+        return mimeTypes;
+    }
+
+    /**
+     * Returns the password to be used for this file, or null
+     * if no / default password should be used
+     */
+    protected String getPassword() {
+        if (passwordProvider != null) {
+            return passwordProvider.getPassword(metadata);
+        }
+        return null;
+    }
+
+    /**
+     * Describes an embedded resource in a fresh {@link Metadata} object and,
+     * if the embedded document extractor wants it, has it parsed. The
+     * resource stream is always closed before this method returns.
+     *
+     * @param resource       stream of the embedded resource; closed by this method
+     * @param filename       name of the resource, or <code>null</code> if unknown
+     * @param relationshipID relationship id of the resource, or <code>null</code>
+     * @param mediaType      content type of the resource, or <code>null</code> if unknown
+     * @param xhtml          handler that receives the output
+     * @param outputHtml     passed through to the embedded document extractor
+     */
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
+            throws IOException, SAXException, TikaException {
+        try {
+            Metadata embeddedMetadata = new Metadata();
+            if (filename != null) {
+                embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            }
+            if (relationshipID != null) {
+                embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+            }
+            if (mediaType != null) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
+            }
+
+            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
+                extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
+            }
+        } finally {
+            resource.close();
+        }
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     *
+     * @param dir   directory entry that holds the embedded document
+     * @param xhtml handler that receives the output
+     * @throws TikaException if the contents of a COMP_OBJ entry cannot be read
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+
+        // Is it an embedded OLE2 document, or an embedded OOXML document?
+
+        if (dir.hasEntry("Package")) {
+            // It's OOXML (has a ZipFile):
+            Entry ooxml = dir.getEntry("Package");
+
+            // handleEmbeddedResource closes the stream as well; the
+            // try-with-resources covers a failure during detection
+            try (TikaInputStream stream = TikaInputStream.get(
+                    new DocumentInputStream((DocumentEntry) ooxml))) {
+                ZipContainerDetector zipDetector = new ZipContainerDetector();
+                MediaType type = zipDetector.detect(stream, new Metadata());
+                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
+                return;
+            }
+        }
+
+        // It's regular OLE2:
+
+        // What kind of document is it?
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
+        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+        TikaInputStream embedded = null;
+
+        try {
+            if (type == POIFSDocumentType.OLE10_NATIVE) {
+                try {
+                    // Try to un-wrap the OLE10Native record:
+                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+                    if (ole.getLabel() != null) {
+                        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                    }
+                    byte[] data = ole.getDataBuffer();
+                    embedded = TikaInputStream.get(data);
+                } catch (Ole10NativeException ex) {
+                    // Not a valid OLE10Native record, skip it
+                } catch (Exception e) {
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                }
+            } else if (type == POIFSDocumentType.COMP_OBJ) {
+                try {
+                    // Grab the contents and process
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+                    }
+                    byte[] contents = new byte[contentsEntry.getSize()];
+                    // close the entry stream once it has been copied into memory
+                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+                        inp.readFully(contents);
+                    }
+                    embedded = TikaInputStream.get(contents);
+
+                    // Try to work out what it is
+                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
+                    String extension = type.getExtension();
+                    try {
+                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                        extension = mimeType.getExtension();
+                    } catch (MimeTypeException mte) {
+                        // No details on this type are known
+                    }
+
+                    // Record what we can do about it
+                    // NOTE(review): getType() on a MediaType presumably yields only
+                    // the primary type (e.g. "application"), unlike the
+                    // mediaType.toString() used for the lookup above -- confirm
+                    // whether the full type was intended here
+                    embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
+                    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                } catch (Exception e) {
+                    throw new TikaException("Invalid embedded resource", e);
+                }
+            } else {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+            }
+
+            // Should we parse it?
+            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
+                if (embedded == null) {
+                    // Make a TikaInputStream that just
+                    // passes the root directory of the
+                    // embedded document, and is otherwise
+                    // empty (byte[0]):
+                    embedded = TikaInputStream.get(new byte[0]);
+                    embedded.setOpenContainer(dir);
+                }
+                extractor.parseEmbedded(embedded, xhtml, embeddedMetadata, true);
+            }
+        } finally {
+            if (embedded != null) {
+                embedded.close();
+            }
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+ /**
+ * Renders the content to the given XHTML SAX event stream.
+ *
+ * @param handler the XHTML SAX event stream to render the content to
+ * @throws SAXException if rendering to the handler fails
+ */
+ void render(XHTMLContentHandler handler) throws SAXException;
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell decorator. Wraps another {@link Cell} and, by default, forwards
+ * rendering to it unchanged.
+ */
+public class CellDecorator implements Cell {
+
+    /** The wrapped cell that rendering is forwarded to. */
+    private final Cell cell;
+
+    /**
+     * @param cell the cell to wrap
+     */
+    public CellDecorator(Cell cell) {
+        this.cell = cell;
+    }
+
+    /**
+     * Renders the wrapped cell to the given XHTML SAX event stream.
+     *
+     * @param handler the XHTML SAX event stream to render the content to
+     * @throws SAXException if the wrapped cell fails to render
+     */
+    @Override
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        cell.render(handler);
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.awt.*;
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.poi.ddf.EscherBSERecord;
+import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.DrawingGroupRecord;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FooterRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HeaderRecord;
+import org.apache.poi.hssf.record.HyperlinkRecord;
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.hssf.record.TextObjectRecord;
+import org.apache.poi.hssf.record.chart.SeriesTextRecord;
+import org.apache.poi.hssf.record.common.UnicodeString;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFPictureData;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p/>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p/>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelExtractor extends AbstractPOIFSExtractor {
+
+ private static final String WORKBOOK_ENTRY = "Workbook";
+ private static final String BOOK_ENTRY = "Book";
+ /**
+ * <code>true</code> if the HSSFListener should be registered
+ * to listen for all records or <code>false</code> (the default)
+ * if the listener should be configured to only receive specified
+ * records.
+ */
+ private boolean listenForAllRecords = false;
+
+ /**
+ * Creates an extractor; both arguments are passed straight through
+ * to the {@link AbstractPOIFSExtractor} constructor.
+ *
+ * @param context parse context handed to the superclass
+ * @param metadata metadata handed to the superclass
+ */
+ public ExcelExtractor(ParseContext context, Metadata metadata) {
+ super(context, metadata);
+ }
+
+ /**
+ * Returns <code>true</code> if this parser is configured to listen
+ * for all records instead of just the specified few.
+ *
+ * @return the current value of the listen-for-all-records flag
+ */
+ public boolean isListenForAllRecords() {
+ return listenForAllRecords;
+ }
+
+ /**
+ * Specifies whether this parser should listen for all
+ * records or just for the specified few.
+ * <p/>
+ * <strong>Note:</strong> Under normal operation this setting should
+ * be <code>false</code> (the default), but you can experiment with
+ * this setting for testing and debugging purposes.
+ *
+ * @param listenForAllRecords <code>true</code> if the HSSFListener
+ * should be registered to listen for all records or <code>false</code>
+ * if the listener should be configured to only receive specified records.
+ */
+ public void setListenForAllRecords(boolean listenForAllRecords) {
+ this.listenForAllRecords = listenForAllRecords;
+ }
+
+ /**
+ * Extracts text from an Excel Workbook, writing the extracted content
+ * to the given XHTML content handler. Delegates to the
+ * {@link DirectoryNode} overload with the root of the file system.
+ *
+ * @param filesystem POI file system
+ * @param xhtml handler that receives the extracted content
+ * @param locale locale passed on to the {@link DirectoryNode} overload
+ * @throws IOException if an error occurs processing the workbook
+ * or writing the extracted content
+ */
+ protected void parse(
+ NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+ Locale locale) throws IOException, SAXException, TikaException {
+ parse(filesystem.getRoot(), xhtml, locale);
+ }
+
+ /**
+  * Extracts the content of the workbook stored below the given directory.
+  * <p/>
+  * A "Workbook" entry is processed in event mode; a "Book" entry
+  * (Excel 5 / Excel 95) is handed to the old-format parser; if neither is
+  * present nothing is extracted. After a "Workbook" has been processed,
+  * every directory entry whose name starts with "MBD" is treated as an
+  * embedded office document.
+  *
+  * @param root directory that holds the workbook
+  * @param xhtml handler that receives the extracted content
+  * @param locale locale handed to the record listener
+  * @throws IOException if an error occurs processing the workbook
+  * or writing the extracted content
+  */
+ protected void parse(
+         DirectoryNode root, XHTMLContentHandler xhtml,
+         Locale locale) throws IOException, SAXException, TikaException {
+     if (!root.hasEntry(WORKBOOK_ENTRY)) {
+         if (root.hasEntry(BOOK_ENTRY)) {
+             // Excel 5 / Excel 95 file
+             // Records are in a different structure so needs a
+             // different parser to process them
+             OldExcelExtractor extractor = new OldExcelExtractor(root);
+             OldExcelParser.parse(extractor, xhtml);
+         }
+         // Otherwise: corrupt file / very old file, just skip text extraction
+         return;
+     }
+
+     // If a password was supplied, use it, otherwise the default
+     Biff8EncryptionKey.setCurrentUserPassword(getPassword());
+     try {
+         // Have the file processed in event mode
+         TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
+         listener.processFile(root, isListenForAllRecords());
+         listener.throwStoredException();
+     } finally {
+         // The password is kept per thread by POI; clear it so that it
+         // does not linger once this workbook has been read
+         Biff8EncryptionKey.setCurrentUserPassword(null);
+     }
+
+     for (Entry entry : root) {
+         if (entry.getName().startsWith("MBD")
+                 && entry instanceof DirectoryEntry) {
+             try {
+                 handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+             } catch (TikaException ignored) {
+                 // ignore parse errors from embedded documents
+             }
+         }
+     }
+ }
+
+ // ======================================================================
+
+ /**
+ * HSSF Listener implementation which processes the HSSF records.
+ */
+ private static class TikaHSSFListener implements HSSFListener {
+
+ /**
+ * XHTML content handler to which the document content is rendered.
+ */
+ private final XHTMLContentHandler handler;
+
+ /**
+ * The POIFS Extractor, used for embedded resources.
+ */
+ private final AbstractPOIFSExtractor extractor;
+ /**
+ * Format for rendering numbers in the worksheet. Currently we just
+ * use the platform default formatting.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+ */
+ private final NumberFormat format;
+ /**
+ * Potential exception thrown by the content handler. When set to
+ * non-<code>null</code>, causes all subsequent HSSF records to be
+ * ignored and the stored exception to be thrown when
+ * {@link #throwStoredException()} is invoked.
+ */
+ private Exception exception = null;
+ private SSTRecord sstRecord;
+ private FormulaRecord stringFormulaRecord;
+ private short previousSid;
+ /**
+ * Internal <code>FormatTrackingHSSFListener</code> to handle cell
+ * formatting within the extraction.
+ */
+ private FormatTrackingHSSFListener formatListener;
+ /**
+ * List of worksheet names.
+ */
+ private List<String> sheetNames = new ArrayList<String>();
+ /**
+ * Index of the current worksheet within the workbook.
+ * Used to find the worksheet name in the {@link #sheetNames} list.
+ */
+ private short currentSheetIndex;
+ /**
+ * Content of the current worksheet, or <code>null</code> if no
+ * worksheet is currently active.
+ */
+ private SortedMap<Point, Cell> currentSheet = null;
+ /**
+ * Extra text or cells that crop up, typically as part of a
+ * worksheet but not always.
+ */
+ private List<Cell> extraTextCells = new ArrayList<Cell>();
+ /**
+ * These aren't complete when we first see them, as they
+ * depend on continue records that aren't always
+ * contiguous. Collect them for later processing.
+ */
+ private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
+
+ /**
+  * Creates a listener that renders parsed workbook data to the
+  * specified XHTML content handler.
+  *
+  * @param handler   destination to write the parsed output to
+  * @param locale    locale used to create the number format and the
+  *                  internal format-tracking listener
+  * @param extractor POIFS extractor kept for handling embedded resources
+  */
+ private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
+     this.extractor = extractor;
+     this.handler = handler;
+     // The format-tracking listener is built around this listener; it is the
+     // object that processFile registers with the HSSF request.
+     this.formatListener = new FormatTrackingHSSFListener(this, locale);
+     this.format = NumberFormat.getInstance(locale);
+ }
+
+ /**
+  * Entry point to the listener that starts processing of a file. Delegates
+  * to {@link #processFile(DirectoryNode, boolean)} using the root directory
+  * of the given file system.
+  *
+  * @param filesystem          POI file system.
+  * @param listenForAllRecords whether the listener should be registered for
+  *                            all record types rather than a fixed subset.
+  * @throws IOException   on any IO errors.
+  * @throws SAXException  on any SAX parsing errors.
+  * @throws TikaException if the delegate reports one (e.g. an encrypted document).
+  */
+ public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
+         throws IOException, SAXException, TikaException {
+     DirectoryNode rootDirectory = filesystem.getRoot();
+     processFile(rootDirectory, listenForAllRecords);
+ }
+
+ public void processFile(DirectoryNode root, boolean listenForAllRecords)
+ throws IOException, SAXException, TikaException {
+
+ // Set up listener and register the records we want to process
+ HSSFRequest hssfRequest = new HSSFRequest();
+ if (listenForAllRecords) {
+ hssfRequest.addListenerForAllRecords(formatListener);
+ } else {
+ hssfRequest.addListener(formatListener, BOFRecord.sid);
+ hssfRequest.addListener(formatListener, EOFRecord.sid);
+ hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
+ hssfRequest.addListener(formatListener, CountryRecord.sid);
+ hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
+ hssfRequest.addListener(formatListener, SSTRecord.sid);
+ hssfRequest.addListener(formatListener, FormulaRecord.sid);
+ hssfRequest.addListener(formatListener, LabelRecord.sid);
+ hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
+ hssfRequest.addListener(formatListener, NumberRecord.sid);
+ hssfRequest.addListener(formatListener, RKRecord.sid);
+ hssfRequest.addListener(formatListener, StringRecord.sid);
+ hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
+ hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+ hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
+ hssfRequest.addListener(formatListener, FormatRecord.sid);
+ hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+ hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+ hssfRequest.addListener(formatListener, HeaderRecord.sid);
+ hssfRequest.addListener(formatListener, FooterRecord.sid);
+ }
+
+ // Create event factory and process Workbook (fire events)
+ DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
+ HSSFEventFactory eventFactory = new HSSFEventFactory();
+ try {
+ eventFactory.processEvents(hssfRequest, documentInputStream);
+ } catch (org.apache.poi.EncryptedDocumentException e) {
+ throw new EncryptedDocumentException(e);
+ }
+
+ // Output any extra text that came after all the sheets
+ processExtraText();
+
+ // Look for embeded images, now that the drawing records
+ // have been fully matched with their continue data
+ for (DrawingGroupRecord dgr : drawingGroups) {
+ dgr.decode();
+ findPictures(dgr.getEscherRecords());
+ }
+ }
+
+ /**
+  * Process a HSSF record. Once an exception has been stored, all further
+  * records are ignored; the first failure is kept in {@link #exception}
+  * instead of being thrown from this callback.
+  *
+  * @param record HSSF Record
+  */
+ public void processRecord(Record record) {
+     if (exception != null) {
+         // An earlier record already failed; skip everything that follows.
+         return;
+     }
+     try {
+         internalProcessRecord(record);
+     } catch (TikaException | IOException | SAXException failure) {
+         exception = failure;
+     }
+ }
+
+ /**
+  * Rethrows the exception stored by {@link #processRecord(Record)}, if any.
+  * Does nothing when no exception has been stored.
+  *
+  * @throws IOException   if the stored exception is an {@code IOException}
+  * @throws SAXException  if the stored exception is a {@code SAXException}
+  * @throws TikaException if the stored exception is a {@code TikaException},
+  *                       or wraps any other stored exception as its cause
+  */
+ public void throwStoredException() throws TikaException, SAXException, IOException {
+     if (exception == null) {
+         return;
+     }
+     if (exception instanceof IOException) {
+         throw (IOException) exception;
+     }
+     if (exception instanceof SAXException) {
+         throw (SAXException) exception;
+     }
+     if (exception instanceof TikaException) {
+         throw (TikaException) exception;
+     }
+     // Keep the original exception as the cause so its stack trace is not lost.
+     throw new TikaException(exception.getMessage(), exception);
+ }
+
+ /**
+  * Dispatches a single HSSF record on its sid and updates the listener
+  * state accordingly (current sheet, sheet names, shared string table,
+  * pending string formula, collected drawing groups).
+  * <p>
+  * After the dispatch the sid of the record is remembered in
+  * {@link #previousSid}, and {@link #stringFormulaRecord} is cleared unless
+  * it was set by this very record.
+  *
+  * @param record HSSF record to process
+  * @throws SAXException  if writing to the content handler fails
+  * @throws TikaException declared for the benefit of {@link #processRecord(Record)}
+  * @throws IOException   declared for the benefit of {@link #processRecord(Record)}
+  */
+ private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
+     switch (record.getSid()) {
+         case BOFRecord.sid: // start of workbook, worksheet etc. records
+             BOFRecord bof = (BOFRecord) record;
+             if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+                 currentSheetIndex = -1;
+             } else if (bof.getType() == BOFRecord.TYPE_CHART) {
+                 if (previousSid == EOFRecord.sid) {
+                     // This is a sheet which contains only a chart
+                     newSheet();
+                 } else {
+                     // This is a chart within a normal sheet
+                     // Handling of this is a bit hacky...
+                     if (currentSheet != null) {
+                         processSheet();
+                         // newSheet() increments the index again, so the
+                         // chart keeps the index of the sheet it belongs to
+                         currentSheetIndex--;
+                         newSheet();
+                     }
+                 }
+             } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+                 newSheet();
+             }
+             break;
+
+         case EOFRecord.sid: // end of workbook, worksheet etc. records
+             if (currentSheet != null) {
+                 processSheet();
+             }
+             currentSheet = null;
+             break;
+
+         case BoundSheetRecord.sid: // Worksheet index record
+             BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+             sheetNames.add(boundSheetRecord.getSheetname());
+             break;
+
+         case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+             sstRecord = (SSTRecord) record;
+             break;
+
+         case FormulaRecord.sid: // Cell value from a formula
+             FormulaRecord formula = (FormulaRecord) record;
+             if (formula.hasCachedResultString()) {
+                 // The String itself should be the next record
+                 stringFormulaRecord = formula;
+             } else {
+                 addTextCell(record, formatListener.formatNumberDateCell(formula));
+             }
+             break;
+
+         case StringRecord.sid:
+             if (previousSid == FormulaRecord.sid) {
+                 // Cached string value of a string formula
+                 StringRecord sr = (StringRecord) record;
+                 addTextCell(stringFormulaRecord, sr.getString());
+             } else {
+                 // Some other string not associated with a cell, skip
+             }
+             break;
+
+         case LabelRecord.sid: // strings stored directly in the cell
+             LabelRecord label = (LabelRecord) record;
+             addTextCell(record, label.getValue());
+             break;
+
+         case LabelSSTRecord.sid: // Ref. a string in the shared string table
+             // If no SST record has been seen there is no table to resolve
+             // the index against; skip the cell rather than fail with a
+             // NullPointerException that processRecord would not capture.
+             if (sstRecord != null) {
+                 LabelSSTRecord sst = (LabelSSTRecord) record;
+                 UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+                 addTextCell(record, unicode.getString());
+             }
+             break;
+
+         case NumberRecord.sid: // Contains a numeric cell value
+             NumberRecord number = (NumberRecord) record;
+             addTextCell(record, formatListener.formatNumberDateCell(number));
+             break;
+
+         case RKRecord.sid: // Excel internal number record
+             RKRecord rk = (RKRecord) record;
+             addCell(record, new NumberCell(rk.getRKNumber(), format));
+             break;
+
+         case HyperlinkRecord.sid: // holds a URL associated with a cell
+             if (currentSheet != null) {
+                 HyperlinkRecord link = (HyperlinkRecord) record;
+                 Point point =
+                         new Point(link.getFirstColumn(), link.getFirstRow());
+                 Cell cell = currentSheet.get(point);
+                 if (cell != null) {
+                     String address = link.getAddress();
+                     if (address != null) {
+                         addCell(record, new LinkedCell(cell, address));
+                     } else {
+                         addCell(record, cell);
+                     }
+                 }
+             }
+             break;
+
+         case TextObjectRecord.sid:
+             TextObjectRecord tor = (TextObjectRecord) record;
+             addTextCell(record, tor.getStr().getString());
+             break;
+
+         case SeriesTextRecord.sid: // Chart label or title
+             SeriesTextRecord str = (SeriesTextRecord) record;
+             addTextCell(record, str.getText());
+             break;
+
+         case DrawingGroupRecord.sid:
+             // Collect this now, we'll process later when all
+             // the continue records are in
+             drawingGroups.add((DrawingGroupRecord) record);
+             break;
+
+         case HeaderRecord.sid:
+             HeaderRecord headerRecord = (HeaderRecord) record;
+             addTextCell(record, headerRecord.getText());
+             break;
+
+         case FooterRecord.sid:
+             FooterRecord footerRecord = (FooterRecord) record;
+             addTextCell(record, footerRecord.getText());
+             break;
+
+     }
+
+     previousSid = record.getSid();
+
+     // A pending string formula is only valid for the record that directly
+     // follows it; drop it unless this record is the one that set it.
+     if (stringFormulaRecord != record) {
+         stringFormulaRecord = null;
+     }
+ }
+
+ /**
+  * Renders every cell collected outside the worksheets as its own
+  * <code>div class="outside"</code> element and then empties the list.
+  *
+  * @throws SAXException if the content handler fails
+  */
+ private void processExtraText() throws SAXException {
+     if (extraTextCells.isEmpty()) {
+         return;
+     }
+     for (Cell extra : extraTextCells) {
+         handler.startElement("div", "class", "outside");
+         extra.render(handler);
+         handler.endElement("div");
+     }
+     // Reset so the same cells are not written again later
+     extraTextCells.clear();
+ }
+
+ /**
+  * Stores the given cell. A <code>null</code> cell is ignored. When a
+  * worksheet is active and the record carries a cell position, the cell is
+  * put into the current worksheet at that position; otherwise it is kept
+  * in the list of extra text cells.
+  *
+  * @param record record that holds the cell value
+  * @param cell   cell value (or <code>null</code>)
+  */
+ private void addCell(Record record, Cell cell) throws SAXException {
+     if (cell == null) {
+         // Ignore empty cells
+         return;
+     }
+     boolean insideSheet = currentSheet != null
+             && record instanceof CellValueRecordInterface;
+     if (!insideSheet) {
+         // Cell outside the worksheets
+         extraTextCells.add(cell);
+         return;
+     }
+     // Normal cell inside a worksheet
+     CellValueRecordInterface positioned = (CellValueRecordInterface) record;
+     currentSheet.put(new Point(positioned.getColumn(), positioned.getRow()), cell);
+ }
+
+ /**
+  * Adds a text cell holding the given text. The text is trimmed first;
+  * <code>null</code> text and text that is empty after trimming are ignored.
+  *
+  * @param record record that holds the text value
+  * @param text   text content, may be <code>null</code>
+  * @throws SAXException declared by {@link #addCell(Record, Cell)}
+  */
+ private void addTextCell(Record record, String text) throws SAXException {
+     if (text == null) {
+         return;
+     }
+     String trimmed = text.trim();
+     if (trimmed.length() == 0) {
+         return;
+     }
+     addCell(record, new TextCell(trimmed));
+ }
+
+ /**
+  * Starts a new, empty worksheet (cells ordered by {@link PointComparator})
+  * and advances the current sheet index by one.
+  */
+ private void newSheet() {
+     currentSheet = new TreeMap<Point, Cell>(new PointComparator());
+     currentSheetIndex++;
+ }
+
+ /**
+  * Writes the current worksheet as a <code>div class="page"</code>
+  * containing an optional <code>h1</code> with the sheet name and a table
+  * of the collected cells, followed by any extra text cells.
+  *
+  * @throws SAXException if an error occurs
+  */
+ private void processSheet() throws SAXException {
+     // Sheet Start
+     handler.startElement("div", "class", "page");
+     if (currentSheetIndex < sheetNames.size()) {
+         String sheetName = sheetNames.get(currentSheetIndex);
+         handler.element("h1", sheetName);
+     }
+     handler.startElement("table");
+     handler.startElement("tbody");
+
+     // Process Rows: the first row and cell are opened up front, later ones
+     // are opened while catching up with the position of each stored cell.
+     handler.startElement("tr");
+     handler.startElement("td");
+     int row = 0;
+     int column = 0;
+     for (Map.Entry<Point, Cell> positionedCell : currentSheet.entrySet()) {
+         Point position = positionedCell.getKey();
+
+         // Close the open row and start new ones until the cell's row is reached
+         for (; row < position.y; row++) {
+             handler.endElement("td");
+             handler.endElement("tr");
+             handler.startElement("tr");
+             handler.startElement("td");
+             column = 0;
+         }
+
+         // Emit empty cells until the cell's column is reached
+         for (; column < position.x; column++) {
+             handler.endElement("td");
+             handler.startElement("td");
+         }
+
+         positionedCell.getValue().render(handler);
+     }
+     handler.endElement("td");
+     handler.endElement("tr");
+
+     // Sheet End
+     handler.endElement("tbody");
+     handler.endElement("table");
+
+     // Finish up
+     processExtraText();
+     handler.endElement("div");
+ }
+
+ /**
+  * Walks the given Escher records and their children; every BSE record
+  * that has a blip record is passed to the extractor as an embedded
+  * resource.
+  *
+  * @param records Escher records to inspect
+  */
+ private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
+     for (EscherRecord current : records) {
+         if (current instanceof EscherBSERecord) {
+             EscherBSERecord bse = (EscherBSERecord) current;
+             EscherBlipRecord blip = bse.getBlipRecord();
+             if (blip != null) {
+                 HSSFPictureData picture = new HSSFPictureData(blip);
+                 String mimeType = picture.getMimeType();
+                 TikaInputStream pictureStream = TikaInputStream.get(picture.getData());
+
+                 // Handle the embedded resource
+                 extractor.handleEmbeddedResource(
+                         pictureStream, null, null, mimeType,
+                         handler, true
+                 );
+             }
+         }
+
+         // Descend into the children of this record.
+         findPictures(current.getChildRecords());
+     }
+ }
+ }
+
+ /**
+  * Utility comparator for points. Orders by <code>y</code> first and by
+  * <code>x</code> second, i.e. row by row and left to right within a row.
+  */
+ private static class PointComparator implements Comparator<Point> {
+
+     /**
+      * Compares two points by <code>y</code>, then by <code>x</code>.
+      * Uses {@link Integer#compare(int, int)} rather than subtraction so
+      * that coordinates far apart cannot overflow and flip the sign.
+      *
+      * @param a first point
+      * @param b second point
+      * @return a negative value, zero or a positive value if <code>a</code>
+      *         sorts before, equal to or after <code>b</code>
+      */
+     public int compare(Point a, Point b) {
+         int diff = Integer.compare(a.y, b.y);
+         if (diff == 0) {
+             diff = Integer.compare(a.x, b.x);
+         }
+         return diff;
+     }
+
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.poi.hslf.model.Comment;
+import org.apache.poi.hslf.model.HeadersFooters;
+import org.apache.poi.hslf.model.OLEShape;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextRun;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class HSLFExtractor extends AbstractPOIFSExtractor {
+ /**
+  * Creates an extractor that passes the given parse context on to
+  * {@link AbstractPOIFSExtractor}.
+  *
+  * @param context parse context handed to the superclass constructor
+  */
+ public HSLFExtractor(ParseContext context) {
+     super(context);
+ }
+
+ /**
+  * Parses the slide show stored in the given file system by delegating to
+  * {@link #parse(DirectoryNode, XHTMLContentHandler)} with its root node.
+  *
+  * @param filesystem POI file system holding the presentation
+  * @param xhtml      handler that receives the extracted content
+  */
+ protected void parse(
+         NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     DirectoryNode rootDirectory = filesystem.getRoot();
+     parse(rootDirectory, xhtml);
+ }
+
+ /**
+  * Parses the slide show found under the given directory node.
+  * <p>
+  * Output consists of a <code>div class="slideShow"</code> with one
+  * <code>div class="slide"</code> per slide (header, master content, slide
+  * text, tables, footer, comments and embedded objects), followed by a
+  * <code>div class="slide-notes"</code> that holds the notes of all slides
+  * (each notes sheet once) and the embedded pictures of the slide show.
+  *
+  * @param root  directory node holding the presentation
+  * @param xhtml handler that receives the extracted content
+  */
+ protected void parse(
+         DirectoryNode root, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+     HSLFSlideShow slideShow = new HSLFSlideShow(root);
+     List<HSLFSlide> slides = slideShow.getSlides();
+
+     xhtml.startElement("div", "class", "slideShow");
+
+     /* Iterate over slides and extract text */
+     for (HSLFSlide slide : slides) {
+         xhtml.startElement("div", "class", "slide");
+
+         // Slide header, if present
+         HeadersFooters slideHf = slide.getHeadersFooters();
+         if (slideHf != null && slideHf.isHeaderVisible() && slideHf.getHeaderText() != null) {
+             xhtml.startElement("p", "class", "slide-header");
+             xhtml.characters(slideHf.getHeaderText());
+             xhtml.endElement("p");
+         }
+
+         // Slide master, if present
+         extractMaster(xhtml, slide.getMasterSheet());
+
+         // Slide text
+         xhtml.startElement("div", "class", "slide-content");
+         textRunsToText(xhtml, slide.getTextParagraphs());
+         xhtml.endElement("div");
+
+         // Table text
+         for (HSLFShape shape : slide.getShapes()) {
+             if (shape instanceof HSLFTable) {
+                 extractTableText(xhtml, (HSLFTable) shape);
+             }
+         }
+
+         // Slide footer, if present
+         if (slideHf != null && slideHf.isFooterVisible() && slideHf.getFooterText() != null) {
+             xhtml.startElement("p", "class", "slide-footer");
+             xhtml.characters(slideHf.getFooterText());
+             xhtml.endElement("p");
+         }
+
+         // Comments, if present: "Author (Initials) - " in bold, then the text
+         for (Comment comment : slide.getComments()) {
+             StringBuilder authorLabel = new StringBuilder();
+             xhtml.startElement("p", "class", "slide-comment");
+
+             if (comment.getAuthor() != null) {
+                 authorLabel.append(comment.getAuthor());
+             }
+             if (comment.getAuthorInitials() != null) {
+                 if (authorLabel.length() > 0) {
+                     authorLabel.append(" ");
+                 }
+                 authorLabel.append("(").append(comment.getAuthorInitials()).append(")");
+             }
+             if (authorLabel.length() > 0) {
+                 if (comment.getText() != null) {
+                     authorLabel.append(" - ");
+                 }
+                 xhtml.startElement("b");
+                 xhtml.characters(authorLabel.toString());
+                 xhtml.endElement("b");
+             }
+             if (comment.getText() != null) {
+                 xhtml.characters(comment.getText());
+             }
+             xhtml.endElement("p");
+         }
+
+         // Now any embedded resources
+         handleSlideEmbeddedResources(slide, xhtml);
+
+         // TODO Find the Notes for this slide and extract inline
+
+         // Slide complete
+         xhtml.endElement("div");
+     }
+
+     // All slides done
+     xhtml.endElement("div");
+
+     /* notes */
+     xhtml.startElement("div", "class", "slide-notes");
+     HashSet<Integer> seenNotes = new HashSet<>();
+     HeadersFooters notesHf = slideShow.getNotesHeadersFooters();
+
+     for (HSLFSlide slide : slides) {
+         HSLFNotes notes = slide.getNotes();
+         if (notes == null) {
+             continue;
+         }
+         // Output each notes sheet once, identified by its sheet number
+         Integer sheetNumber = notes._getSheetNumber();
+         if (!seenNotes.add(sheetNumber)) {
+             continue;
+         }
+
+         // Repeat the Notes header, if set
+         if (notesHf != null && notesHf.isHeaderVisible() && notesHf.getHeaderText() != null) {
+             xhtml.startElement("p", "class", "slide-note-header");
+             xhtml.characters(notesHf.getHeaderText());
+             xhtml.endElement("p");
+         }
+
+         // Notes text
+         textRunsToText(xhtml, notes.getTextParagraphs());
+
+         // Repeat the notes footer, if set
+         if (notesHf != null && notesHf.isFooterVisible() && notesHf.getFooterText() != null) {
+             xhtml.startElement("p", "class", "slide-note-footer");
+             xhtml.characters(notesHf.getFooterText());
+             xhtml.endElement("p");
+         }
+     }
+
+     handleSlideEmbeddedPictures(slideShow, xhtml);
+
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Writes the text of all non-placeholder text shapes of the given master
+  * sheet into a <code>div class="slide-master-content"</code>. Nothing is
+  * written when the master is <code>null</code> or has no shapes.
+  *
+  * @param xhtml  handler that receives the output
+  * @param master master sheet of a slide, may be <code>null</code>
+  */
+ private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
+     if (master == null) {
+         return;
+     }
+     List<HSLFShape> masterShapes = master.getShapes();
+     if (masterShapes == null || masterShapes.isEmpty()) {
+         return;
+     }
+
+     xhtml.startElement("div", "class", "slide-master-content");
+     for (HSLFShape candidate : masterShapes) {
+         if (candidate == null
+                 || HSLFMasterSheet.isPlaceholder(candidate)
+                 || !(candidate instanceof HSLFTextShape)) {
+             continue;
+         }
+         String text = ((HSLFTextShape) candidate).getText();
+         if (text != null) {
+             xhtml.element("p", text);
+         }
+     }
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Writes the given table shape as a <code>table</code> element with one
+  * <code>tr</code> per row and one <code>td</code> per column.
+  *
+  * @param xhtml handler that receives the output
+  * @param shape table to write
+  */
+ private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
+     xhtml.startElement("table");
+     int rowIndex = 0;
+     while (rowIndex < shape.getNumberOfRows()) {
+         xhtml.startElement("tr");
+         int colIndex = 0;
+         while (colIndex < shape.getNumberOfColumns()) {
+             HSLFTableCell cell = shape.getCell(rowIndex, colIndex);
+             // a missing cell is written as an empty string
+             String cellText = (cell != null) ? cell.getText() : "";
+             xhtml.element("td", cellText);
+             colIndex++;
+         }
+         xhtml.endElement("tr");
+         rowIndex++;
+     }
+     xhtml.endElement("table");
+ }
+
+ /**
+  * Writes the given paragraphs as XHTML. Consecutive bullet paragraphs are
+  * wrapped in a <code>ul</code>; each paragraph becomes an <code>li</code>
+  * (bullet with content) or a <code>p</code>. Vertical tabs inside a text
+  * run are written as <code>br</code> elements.
+  *
+  * @param xhtml          handler that receives the output
+  * @param paragraphsList paragraphs grouped per text shape, may be
+  *                       <code>null</code> in which case nothing is written
+  */
+ private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+     if (paragraphsList == null) {
+         return;
+     }
+
+     for (List<HSLFTextParagraph> run : paragraphsList) {
+         // Leaving in wisdom from TIKA-712 for easy revert.
+         // Avoid boiler-plate text on the master slide (0
+         // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+         //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+
+         boolean isBullet = false;
+         for (HSLFTextParagraph htp : run) {
+             boolean nextBullet = htp.isBullet();
+             // TODO: identify bullet/list type
+             if (isBullet != nextBullet) {
+                 isBullet = nextBullet;
+                 if (isBullet) {
+                     xhtml.startElement("ul");
+                 } else {
+                     xhtml.endElement("ul");
+                 }
+             }
+
+             List<HSLFTextRun> textRuns = htp.getTextRuns();
+             // The loop below already treats a null raw text as possible, so
+             // guard the look at the first run the same way; a paragraph
+             // without runs or with a null first run counts as empty.
+             String firstRaw = textRuns.isEmpty() ? null : textRuns.get(0).getRawText();
+             String firstLine = (firstRaw == null) ? "" : removePBreak(firstRaw);
+             boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
+             String paraTag = showBullet ? "li" : "p";
+
+             xhtml.startElement(paraTag);
+             for (HSLFTextRun htr : textRuns) {
+                 String line = htr.getRawText();
+                 if (line != null) {
+                     boolean isfirst = true;
+                     for (String fragment : line.split("\\u000b")) {
+                         if (!isfirst) {
+                             xhtml.startElement("br");
+                             xhtml.endElement("br");
+                         }
+                         isfirst = false;
+                         xhtml.characters(removePBreak(fragment));
+                     }
+                     // split() drops trailing empty fragments, so a trailing
+                     // line break has to be written explicitly
+                     if (line.endsWith("\u000b")) {
+                         xhtml.startElement("br");
+                         xhtml.endElement("br");
+                     }
+                 }
+             }
+             xhtml.endElement(paraTag);
+         }
+         if (isBullet) {
+             xhtml.endElement("ul");
+         }
+     }
+ }
+
+ // Strips a trailing paragraph break from the given text fragment.
+ private static String removePBreak(String fragment) {
+     // The paragraph break (\r) sits in the last text run of a text paragraph;
+     // line breaks (\\u000b) can happen more often and are not touched here.
+     java.util.regex.Pattern trailingBreak = java.util.regex.Pattern.compile("\\r$");
+     return trailingBreak.matcher(fragment).replaceFirst("");
+ }
+
+ /**
+  * Passes every picture of the slide show to
+  * <code>handleEmbeddedResource</code>. EMF, WMF and DIB pictures get a
+  * fixed media type; for all other picture types the content type
+  * reported by the picture itself is used.
+  *
+  * @param slideshow slide show whose pictures are extracted
+  * @param xhtml     handler that receives the output
+  */
+ private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
+         throws TikaException, SAXException, IOException {
+     for (HSLFPictureData pic : slideshow.getPictureData()) {
+         String mediaType;
+
+         switch (pic.getType()) {
+             case EMF:
+                 mediaType = "application/x-emf";
+                 break;
+             case WMF:
+                 mediaType = "application/x-msmetafile";
+                 break;
+             case DIB:
+                 mediaType = "image/bmp";
+                 break;
+             default:
+                 mediaType = pic.getContentType();
+                 break;
+         }
+
+         // Same resource handling as in handleSlideEmbeddedResources: the
+         // stream is closed here even if handling the picture fails.
+         try (TikaInputStream stream = TikaInputStream.get(pic.getData())) {
+             handleEmbeddedResource(
+                     stream, null, null,
+                     mediaType, xhtml, false);
+         }
+     }
+ }
+
+ /**
+  * Handles the OLE objects embedded in the given slide. For every OLE
+  * shape with object data an empty <code>div class="embedded"</code>
+  * carrying the object id is written, then the data is passed to
+  * <code>handleEmbeddedResource</code>.
+  *
+  * @param slide slide to inspect
+  * @param xhtml handler that receives the output
+  */
+ private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
+         throws TikaException, SAXException, IOException {
+     List<HSLFShape> slideShapes;
+     try {
+         slideShapes = slide.getShapes();
+     } catch (NullPointerException e) {
+         // Sometimes HSLF hits problems
+         // Please open POI bugs for any you come across!
+         return;
+     }
+
+     for (HSLFShape candidate : slideShapes) {
+         if (!(candidate instanceof OLEShape)) {
+             continue;
+         }
+         OLEShape oleShape = (OLEShape) candidate;
+
+         HSLFObjectData objectData = null;
+         try {
+             objectData = oleShape.getObjectData();
+         } catch (NullPointerException e) {
+             /* getObjectData throws NPE some times. */
+         }
+         if (objectData == null) {
+             continue;
+         }
+
+         String objID = Integer.toString(oleShape.getObjectID());
+
+         // Embedded Object: add a <div
+         // class="embedded" id="X"/> so consumer can see where
+         // in the main text each embedded document
+         // occurred:
+         AttributesImpl marker = new AttributesImpl();
+         marker.addAttribute("", "class", "class", "CDATA", "embedded");
+         marker.addAttribute("", "id", "id", "CDATA", objID);
+         xhtml.startElement("div", marker);
+         xhtml.endElement("div");
+
+         try (TikaInputStream stream = TikaInputStream.get(objectData.getData())) {
+             // Only embedded Excel charts get an explicit media type
+             String mediaType = "Excel.Chart.8".equals(oleShape.getProgID())
+                     ? "application/vnd.ms-excel"
+                     : null;
+             handleEmbeddedResource(
+                     stream, objID, objID,
+                     mediaType, xhtml, false);
+         }
+     }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.text.DateFormat;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.Column;
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.PropertyMap;
+import com.healthmarketscience.jackcess.Row;
+import com.healthmarketscience.jackcess.Table;
+import com.healthmarketscience.jackcess.query.Query;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Internal class. Needs to be instantiated for each parse because of
+ * the lack of thread safety of the {@code shortDateTimeFormatter}.
+ */
+class JackcessExtractor extends AbstractPOIFSExtractor {
+
+ final static String TITLE_PROP_KEY = "Title";
+ final static String AUTHOR_PROP_KEY = "Author";
+ final static String COMPANY_PROP_KEY = "Company";
+
+ final static String TEXT_FORMAT_KEY = "TextFormat";
+ final static String CURRENCY_FORMAT_KEY = "Format";
+ final static byte TEXT_FORMAT = 0;
+ final static byte RICH_TEXT_FORMAT = 1;
+ final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+ final NumberFormat currencyFormatter;
+ final DateFormat shortDateTimeFormatter;
+
+ final HtmlParser htmlParser = new HtmlParser();
+
+ /**
+  * Creates an extractor whose currency and short-date formatters are
+  * built for the given locale.
+  *
+  * @param context parse context handed to the superclass constructor
+  * @param locale  locale used for the currency and date formatters
+  */
+ protected JackcessExtractor(ParseContext context, Locale locale) {
+     super(context);
+     shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+     currencyFormatter = NumberFormat.getCurrencyInstance(locale);
+ }
+
+ /**
+  * Extracts metadata and content from the given database.
+  * <p>
+  * The database password (if any), the database properties, the user
+  * defined properties and the summary properties are added to the
+  * metadata; title, author and company of the summary properties are
+  * mapped to the corresponding Tika properties. Every non-linked,
+  * non-system table is then written as a <code>table</code> element and
+  * every query as a <code>div type="sqlQuery"</code> holding its SQL.
+  *
+  * @param db       database to read
+  * @param xhtml    handler that receives the extracted content
+  * @param metadata metadata object that receives the properties
+  */
+ public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
+
+     String pw = db.getDatabasePassword();
+     if (pw != null) {
+         metadata.set(JackcessParser.MDB_PW, pw);
+     }
+
+     PropertyMap dbp = db.getDatabaseProperties();
+     for (PropertyMap.Property p : dbp) {
+         metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(),
+                 toString(p.getValue(), p.getType()));
+     }
+
+     PropertyMap up = db.getUserDefinedProperties();
+     for (PropertyMap.Property p : up) {
+         metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX + p.getName(),
+                 toString(p.getValue(), p.getType()));
+     }
+
+     // Names of the summary properties that were mapped to core properties
+     Set<String> found = new HashSet<>();
+     PropertyMap summaryProperties = db.getSummaryProperties();
+     if (summaryProperties != null) {
+         //try to get core properties
+         PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY);
+         if (title != null) {
+             metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType()));
+             found.add(title.getName());
+         }
+         PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
+         if (author != null && author.getValue() != null) {
+             String authorString = toString(author.getValue(), author.getType());
+             SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString);
+             found.add(author.getName());
+         }
+         PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);
+         if (company != null) {
+             metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType()));
+             found.add(company.getName());
+         }
+
+         // Remaining summary properties: iterate the map that was already
+         // fetched and null-checked above instead of asking the database again.
+         for (PropertyMap.Property p : summaryProperties) {
+             if (!found.contains(p.getName())) {
+                 metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+                         toString(p.getValue(), p.getType()));
+             }
+         }
+
+     }
+
+     Iterator<Table> it = db.newIterable().
+             setIncludeLinkedTables(false).
+             setIncludeSystemTables(false).iterator();
+
+     while (it.hasNext()) {
+         Table table = it.next();
+         String tableName = table.getName();
+         List<? extends Column> columns = table.getColumns();
+         xhtml.startElement("table", "name", tableName);
+         addHeaders(columns, xhtml);
+         xhtml.startElement("tbody");
+
+         Row r = table.getNextRow();
+
+         while (r != null) {
+             xhtml.startElement("tr");
+             for (Column c : columns) {
+                 handleCell(r, c, xhtml);
+             }
+             xhtml.endElement("tr");
+             r = table.getNextRow();
+         }
+         xhtml.endElement("tbody");
+         xhtml.endElement("table");
+     }
+
+     for (Query q : db.getQueries()) {
+         xhtml.startElement("div", "type", "sqlQuery");
+         xhtml.characters(q.toSQLString());
+         xhtml.endElement("div");
+     }
+ }
+
+ /**
+  * Writes a <code>thead</code> with a single row that holds one
+  * <code>th</code> per column, containing the column name.
+  *
+  * @param columns columns of the table being written
+  * @param xhtml   handler that receives the output
+  */
+ private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml) throws SAXException {
+     xhtml.startElement("thead");
+     xhtml.startElement("tr");
+     for (Column column : columns) {
+         String columnName = column.getName();
+         xhtml.startElement("th");
+         xhtml.characters(columnName);
+         xhtml.endElement("th");
+     }
+     xhtml.endElement("tr");
+     xhtml.endElement("thead");
+ }
+
+ /**
+  * Writes one table cell (<code>td</code>) for the given row and column.
+  * OLE columns are passed to {@link #handleOLE}, binary columns are
+  * handled as embedded resources, and all other columns are written as
+  * text. Rich text columns are run through the HTML parser first; if that
+  * fails with a SAXException the raw value is written instead.
+  *
+  * @param r       row that holds the value
+  * @param c       column to read from the row
+  * @param handler handler that receives the output
+  */
+ private void handleCell(Row r, Column c, XHTMLContentHandler handler)
+         throws SAXException, IOException, TikaException {
+
+     handler.startElement("td");
+     DataType columnType = c.getType();
+     if (columnType.equals(DataType.OLE)) {
+         handleOLE(r, c.getName(), handler);
+     } else if (columnType.equals(DataType.BINARY)) {
+         Object raw = r.get(c.getName());
+         if (raw != null) {
+             byte[] bytes = (byte[]) raw;
+             handleEmbeddedResource(
+                     TikaInputStream.get(bytes),
+                     null,//filename
+                     null,//relationshipId
+                     null,//mediatype
+                     handler, false);
+         }
+     } else {
+         Object raw = r.get(c.getName());
+         String text = toString(raw, c.getType());
+         if (!isRichText(c)) {
+             handler.characters(text);
+         } else {
+             BodyContentHandler body = new BodyContentHandler();
+             Metadata htmlMetadata = new Metadata();
+             htmlMetadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+             try {
+                 htmlParser.parse(new ByteArrayInputStream(text.getBytes(UTF_8)),
+                         body,
+                         htmlMetadata, EMPTY_PARSE_CONTEXT);
+                 handler.characters(body.toString());
+             } catch (SAXException e) {
+                 //if something went wrong in htmlparser, just append the characters
+                 handler.characters(text);
+             }
+         }
+     }
+     handler.endElement("td");
+ }
+
+ /**
+  * Tells whether the given column is a MEMO column whose
+  * <code>TextFormat</code> property is a byte equal to
+  * {@link #RICH_TEXT_FORMAT}.
+  *
+  * @param c column to check, may be <code>null</code>
+  * @return <code>true</code> only if all of the above holds
+  * @throws IOException if the column properties cannot be read
+  */
+ private boolean isRichText(Column c) throws IOException {
+     if (c == null) {
+         return false;
+     }
+
+     PropertyMap properties = c.getProperties();
+     if (properties == null) {
+         return false;
+     }
+
+     DataType columnType = c.getType();
+     if (!DataType.MEMO.equals(columnType)) {
+         // covers a null type as well
+         return false;
+     }
+
+     Object textFormat = properties.getValue(TEXT_FORMAT_KEY);
+     return textFormat instanceof Byte
+             && ((Byte) textFormat).byteValue() == RICH_TEXT_FORMAT;
+ }
+
+ /**
+  * Converts a column or property value to its string form according to
+  * the given data type.
+  *
+  * @param value value to convert, may be <code>null</code>
+  * @param type  data type of the value, may be <code>null</code>
+  * @return the string form; an empty string for a <code>null</code> value
+  *         and for data types that are not handled here
+  */
+ private String toString(Object value, DataType type) {
+     if (value == null) {
+         return "";
+     }
+     if (type == null) {
+         //this shouldn't happen
+         return value.toString();
+     }
+     switch (type) {
+         case TEXT:
+         case MEMO:
+             return (String) value;
+         case NUMERIC:
+         case GUID:
+             return value.toString();
+         case BOOLEAN:
+             return Boolean.toString((Boolean) value);
+         case BYTE:
+             return Byte.toString((Byte) value);
+         case INT:
+             return Short.toString((Short) value);
+         case LONG:
+             return Integer.toString((Integer) value);
+         case FLOAT:
+             return Float.toString((Float) value);
+         case DOUBLE:
+             return Double.toString((Double) value);
+         case MONEY:
+             //TODO: consider getting parsing "Format" field from
+             //field properties.
+             return formatCurrency(((BigDecimal) value).doubleValue(), type);
+         case SHORT_DATE_TIME:
+             return formatShortDateTime((Date) value);
+         default:
+             // COMPLEX_TYPE, UNKNOWN_0D, UNKNOWN_11, UNSUPPORTED_FIXEDLEN,
+             // UNSUPPORTED_VARLEN and anything else: skip all these
+             return "";
+     }
+ }
+
+ /**
+  * Handles the OLE blob stored in column {@code cName} of {@code row}.
+  * Nothing is written when the blob or its content is {@code null}. Otherwise
+  * the content type decides the handling:
+  * <ul>
+  * <li>LINK: the link path is written as text.</li>
+  * <li>SIMPLE_PACKAGE: the stream is passed to {@code handleEmbeddedResource}
+  * with the package's file name and type name.</li>
+  * <li>OTHER: the stream is passed to {@code handleEmbeddedResource} with the
+  * type name and no file name.</li>
+  * <li>COMPOUND_STORAGE: delegated to {@code handleCompoundContent}.</li>
+  * </ul>
+  * Any other content type produces no output.
+  *
+  * @param row row to read the blob from
+  * @param cName name of the OLE column
+  * @param xhtml handler that receives any resulting output
+  * @throws IOException if the blob cannot be read or closed
+  * @throws SAXException if the output handler fails
+  * @throws TikaException if parsing embedded content fails
+  */
+ private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+     OleBlob blob = row.getBlob(cName);
+     //lifted shamelessly from Jackcess's OleBlobTest
+     if (blob == null) {
+         return;
+     }
+     // OleBlob is Closeable and is expected to be closed by the caller once
+     // its content has been consumed; release it on every exit path.
+     try {
+         OleBlob.Content content = blob.getContent();
+         if (content == null) {
+             return;
+         }
+
+         switch (content.getType()) {
+             case LINK:
+                 xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
+                 break;
+             case SIMPLE_PACKAGE:
+                 OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
+                 handleEmbeddedResource(
+                         TikaInputStream.get(spc.getStream()),
+                         spc.getFileName(),//filename
+                         null,//relationshipId
+                         spc.getTypeName(),//mediatype
+                         xhtml, false);
+                 break;
+             case OTHER:
+                 OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
+                 handleEmbeddedResource(
+                         TikaInputStream.get(oc.getStream()),
+                         null,//filename
+                         null,//relationshipId
+                         oc.getTypeName(),//mediatype
+                         xhtml, false);
+                 break;
+             case COMPOUND_STORAGE:
+                 OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
+                 handleCompoundContent(cc, xhtml);
+                 break;
+             default:
+                 //nothing to extract for other content types
+                 break;
+         }
+     } finally {
+         blob.close();
+     }
+ }
+
+ /**
+  * Opens the compound-storage content as a POI file system and hands its
+  * root directory to {@code handleEmbeddedOfficeDoc}.
+  *
+  * @param cc compound OLE content whose stream is read
+  * @param xhtml handler that receives the output of the embedded document
+  * @throws IOException if the stream cannot be read or the file system cannot be closed
+  * @throws SAXException if the output handler fails
+  * @throws TikaException if parsing the embedded document fails
+  */
+ private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+     NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream());
+     // The file system was never closed before; release it once the embedded
+     // document has been handled, whether or not handling succeeded.
+     try {
+         handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+     } finally {
+         nfs.close();
+     }
+ }
+
+ /**
+  * Formats an amount with {@code currencyFormatter}.
+  *
+  * @param d amount to format; may be {@code null}
+  * @param type column type; not used by this method
+  * @return the formatted amount, or the empty string when {@code d} is {@code null}
+  */
+ String formatCurrency(Double d, DataType type) {
+     return (d == null) ? "" : currencyFormatter.format(d);
+ }
+
+ /**
+  * Formats a date with {@code shortDateTimeFormatter}.
+  *
+  * @param d date to format; may be {@code null}
+  * @return the formatted date, or the empty string when {@code d} is {@code null}
+  */
+ String formatShortDateTime(Date d) {
+     return (d == null) ? "" : shortDateTimeFormatter.format(d);
+ }
+}
+