You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [8/28] - in /tika/branches/2.x: tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-parser-module/ tika-parser-modules/tika-advanced-parser-mo...

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+/**
+ * Base class for list managers that turn list-numbering definitions into the
+ * literal prefix text (a formatted number or a bullet) of each list paragraph.
+ * The nested helper classes are docx/doc format agnostic.
+ */
+public abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+
+    //NOTE(review): the Integer keys are presumably list/numbering ids assigned by
+    //the concrete subclass -- confirm against the subclasses
+    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+    /**
+     * Tracks the running count of every level of a single list and renders the
+     * number string for each paragraph.  This helper is docx/doc format agnostic.
+     */
+    protected class ParagraphLevelCounter {
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private static final int NOT_SEEN_YET = -1;
+        private static final int FIRST_SKIPPED = -2;
+        private final LevelTuple[] levelTuples;
+        //matches the "%1", "%2", ... placeholders inside a level's lvlText
+        Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+        //current count per level, indexed by level number
+        private List<Integer> counts = new ArrayList<Integer>();
+        private int lastLevel = -1;
+
+        /**
+         * @param levelTuples the default definition of each level, indexed by level number
+         */
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        /**
+         * @return the number of levels this list defines
+         */
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber         level number that is being incremented
+         * @param overrideLevelTuples per-level overrides of the default definitions,
+         *                            or null if there are none
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            //make sure that every level between the last one seen and this one
+            //has a count, so that interpolation of parent levels works
+            for (int i = lastLevel + 1; i < levelNumber; i++) {
+                if (i >= counts.size()) {
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                //this level has been seen before: reset deeper levels, then
+                //either restart or increment this one
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+                lastLevel = levelNumber;
+                return format(levelNumber, overrideLevelTuples);
+            }
+
+            //first time that this level is seen
+            counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * @param level               which level to format
+         * @param overrideLevelTuples per-level overrides, or null if there are none
+         * @return the string that represents the number and the surrounding text for this paragraph;
+         * the empty string if the level is out of range or there is no level text
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal : levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET + " ";
+            }
+
+            String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ?
+                    levelTuples[level].lvlText : overrideLevelTuples[level].lvlText;
+            if (lvlText == null) {
+                //no level text at all: nothing to interpolate, and the
+                //matcher below would throw a NullPointerException
+                return "";
+            }
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText.substring(last, m.start()));
+                String lvlString = m.group(1);
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(lvlString);
+                } catch (NumberFormatException e) {
+                    //the pattern only matches digits, so this can only be an
+                    //overflow; lvlNum stays -1 and formatNum falls back to its defaults
+                }
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                String numString = formatNum(lvlNum - 1, isLegal, overrideLevelTuples);
+
+                sb.append(numString);
+                last = m.end();
+            }
+            sb.append(lvlText.substring(last));
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        /**
+         * Renders the current count of one level in that level's number format.
+         *
+         * @param lvlNum              zero-based level whose count should be rendered
+         * @param isLegal             if true, the level is rendered as decimal no matter its own format
+         * @param overrideLevelTuples per-level overrides, or null if there are none
+         * @return actual level number; can return empty string if numberformatter fails
+         */
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+
+            int numFmtStyle = 0;
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                //a flag value rather than a real count
+                count = 1;
+            }
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("decimal".equals(numFmt)) {
+                numFmtStyle = 0;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            } else if ("bullet".equals(numFmt)) {
+                return "";
+                //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+            } else if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            } else if ("decimalZero".equals(numFmt)) {
+                //only single digit numbers get the leading zero: 01 ... 09, 10, 11, ...
+                String decimal = NumberFormatter.getNumber(count, 0);
+                return (count < 10) ? "0" + decimal : decimal;
+            } else if ("none".equals(numFmt)) {
+                return "";
+            }
+            try {
+                return NumberFormatter.getNumber(count, numFmtStyle);
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
+        }
+
+        /**
+         * @param count the number to render
+         * @return the number followed by its English ordinal suffix, e.g. 1st, 2nd, 3rd, 4th, 11th
+         */
+        private String ordinalize(int count) {
+            //this is only good for locale == English
+            String countString = Integer.toString(count);
+            //11, 12 and 13 (and 111, 212, ...) end in 1/2/3 but take "th"
+            int lastTwoDigits = Math.abs(count % 100);
+            if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
+                return countString + "th";
+            }
+            if (countString.endsWith("1")) {
+                return countString + "st";
+            } else if (countString.endsWith("2")) {
+                return countString + "nd";
+            } else if (countString.endsWith("3")) {
+                return countString + "rd";
+            }
+            return countString + "th";
+        }
+
+        /**
+         * @return the number format name for the level: "decimal" if the level is out
+         * of range or isLegal is set, otherwise the override's format if there is one,
+         * otherwise the default definition's format
+         */
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+        }
+
+        /**
+         * @return the stored count for the level (which may be a negative flag value),
+         * or 1 if the level has no stored count
+         */
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        /**
+         * Marks the levels deeper than startlevelNumber as not seen yet, subject to
+         * each level's restart setting, so that they start over the next time they are used.
+         */
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET) {
+                    //do nothing
+                } else if (cnt == FIRST_SKIPPED) {
+                    //do nothing
+                } else if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ?
+                            levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart;
+                    if (restart == 0) {
+                        //NOTE(review): this leaves the whole loop, so all deeper levels
+                        //are left untouched as well -- confirm that this is intended
+                        //rather than skipping only this level
+                        return;
+                    } else if (restart == -1 ||
+                            startlevelNumber <= restart - 1) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    } else {
+                        //do nothing/don't reset
+                    }
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        /**
+         * @return the first number to use for the level: 1 if the level has no
+         * definition, otherwise the override's start unless that is negative,
+         * otherwise the default definition's start
+         */
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            } else {
+                return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ?
+                        levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start;
+            }
+        }
+    }
+
+    /**
+     * Immutable definition of a single list level.  When used as an override, a
+     * negative start or restart, or a null lvlText or numFmt, means that the
+     * default definition's value is used instead.
+     */
+    protected class LevelTuple {
+        //first number of the level
+        private final int start;
+        //0: never reset this level; -1: reset whenever a shallower level is incremented;
+        //otherwise: reset only if the incremented level is <= restart - 1
+        private final int restart;
+        //template text in which "%N" stands for the number of the N-th (one-based) level
+        private final String lvlText;
+        //name of the number format, e.g. "decimal", "lowerRoman", "bullet"
+        private final String numFmt;
+        //if true, numbers in this level's text are rendered as decimal
+        private final boolean isLegal;
+
+        /**
+         * Creates a decimal level that starts at 1 and is always reset.
+         *
+         * @param lvlText template text of the level
+         */
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        /**
+         * @param start   first number of the level
+         * @param restart restart setting of the level
+         * @param lvlText template text of the level
+         * @param numFmt  name of the number format
+         * @param isLegal whether numbers in this level's text are forced to decimal
+         */
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Base class for extractors that work on POIFS (OLE2) containers.  It takes the
+ * embedded document extractor, password provider, config, mime repository and
+ * detector from the {@link ParseContext} and offers helpers that hand embedded
+ * resources and embedded office documents to the embedded document extractor.
+ */
+abstract class AbstractPOIFSExtractor {
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+    private final EmbeddedDocumentExtractor extractor;
+    private PasswordProvider passwordProvider;
+    private TikaConfig tikaConfig;
+    private MimeTypes mimeTypes;
+    private Detector detector;
+    private Metadata metadata;
+
+    /**
+     * Creates an extractor without container metadata.
+     *
+     * @param context the parse context to take the collaborators from
+     */
+    protected AbstractPOIFSExtractor(ParseContext context) {
+        this(context, null);
+    }
+
+    /**
+     * @param context  the parse context to take the collaborators from; if it holds no
+     *                 {@link EmbeddedDocumentExtractor}, a {@link ParsingEmbeddedDocumentExtractor}
+     *                 is used
+     * @param metadata the metadata that is passed to the password provider, may be null
+     */
+    protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex == null) {
+            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            this.extractor = ex;
+        }
+
+        this.passwordProvider = context.get(PasswordProvider.class);
+        this.tikaConfig = context.get(TikaConfig.class);
+        this.mimeTypes = context.get(MimeTypes.class);
+        this.detector = context.get(Detector.class);
+        this.metadata = metadata;
+    }
+
+    // Note - these cache, but avoid creating the default TikaConfig if not needed
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    /**
+     * Returns the detector from the parse context, or else the
+     * one of the config; the result is cached
+     */
+    protected Detector getDetector() {
+        if (detector == null) {
+            detector = getTikaConfig().getDetector();
+        }
+        return detector;
+    }
+
+    /**
+     * Returns the mime repository from the parse context, or else
+     * the one of the config; the result is cached
+     */
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes == null) {
+            mimeTypes = getTikaConfig().getMimeRepository();
+        }
+        return mimeTypes;
+    }
+
+    /**
+     * Returns the password to be used for this file, or null
+     * if no / default password should be used
+     */
+    protected String getPassword() {
+        if (passwordProvider != null) {
+            return passwordProvider.getPassword(metadata);
+        }
+        return null;
+    }
+
+    /**
+     * Hands an embedded resource to the embedded document extractor, if the
+     * extractor wants to parse it.  The resource is always closed.
+     *
+     * @param resource       the content of the embedded resource; closed by this method
+     * @param filename       name of the resource, or null if it is not known
+     * @param relationshipID relationship id of the resource, or null if it is not known
+     * @param mediaType      content type of the resource, or null if it is not known
+     * @param xhtml          the handler that receives the output
+     * @param outputHtml     passed on to the embedded document extractor
+     */
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
+            throws IOException, SAXException, TikaException {
+        try {
+            Metadata embeddedMetadata = new Metadata();
+            if (filename != null) {
+                embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            }
+            if (relationshipID != null) {
+                embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+            }
+            if (mediaType != null) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
+            }
+
+            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
+                extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
+            }
+        } finally {
+            resource.close();
+        }
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     *
+     * @param dir   the directory entry that holds the embedded document
+     * @param xhtml the handler that receives the output
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+
+        // Is it an embedded OLE2 document, or an embedded OOXML document?
+
+        if (dir.hasEntry("Package")) {
+            // It's OOXML (has a ZipFile):
+            Entry ooxml = dir.getEntry("Package");
+
+            try (TikaInputStream stream = TikaInputStream.get(
+                    new DocumentInputStream((DocumentEntry) ooxml))) {
+                ZipContainerDetector zipDetector = new ZipContainerDetector();
+                MediaType packageType = zipDetector.detect(stream, new Metadata());
+                handleEmbeddedResource(stream, null, dir.getName(), packageType.toString(), xhtml, true);
+                return;
+            }
+        }
+
+        // It's regular OLE2:
+
+        // What kind of document is it?
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
+        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+        TikaInputStream embedded = null;
+
+        try {
+            if (type == POIFSDocumentType.OLE10_NATIVE) {
+                try {
+                    // Try to un-wrap the OLE10Native record:
+                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+                    if (ole.getLabel() != null) {
+                        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                    }
+                    byte[] data = ole.getDataBuffer();
+                    embedded = TikaInputStream.get(data);
+                } catch (Ole10NativeException ex) {
+                    // Not a valid OLE10Native record, skip it
+                } catch (Exception e) {
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                }
+            } else if (type == POIFSDocumentType.COMP_OBJ) {
+                try {
+                    // Grab the contents and process
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+                    }
+                    byte[] contents = new byte[contentsEntry.getSize()];
+                    // Close the entry stream as soon as it has been read
+                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+                        inp.readFully(contents);
+                    }
+                    embedded = TikaInputStream.get(contents);
+
+                    // Try to work out what it is
+                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
+                    // Fallback, dotted like the extension of the generic case below
+                    String extension = '.' + type.getExtension();
+                    try {
+                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                        extension = mimeType.getExtension();
+                    } catch (MimeTypeException mte) {
+                        // No details on this type are known
+                    }
+
+                    // Record what we can do about it; use the full "type/subtype"
+                    // string, as the Package case above does, not just the primary type
+                    embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
+                    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                } catch (Exception e) {
+                    throw new TikaException("Invalid embedded resource", e);
+                }
+            } else {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+            }
+
+            // Should we parse it?
+            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
+                if (embedded == null) {
+                    // Make a TikaInputStream that just
+                    // passes the root directory of the
+                    // embedded document, and is otherwise
+                    // empty (byte[0]):
+                    embedded = TikaInputStream.get(new byte[0]);
+                    embedded.setOpenContainer(dir);
+                }
+                extractor.parseEmbedded(embedded, xhtml, embeddedMetadata, true);
+            }
+        } finally {
+            if (embedded != null) {
+                embedded.close();
+            }
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+    /**
+     * Renders the content to the given XHTML SAX event stream.
+     *
+     * @param handler the XHTML content handler that receives the content of this cell
+     * @throws SAXException if rendering to the handler fails
+     */
+    void render(XHTMLContentHandler handler) throws SAXException;
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Pass-through {@link Cell} wrapper: it holds another cell and
+ * forwards rendering to that cell unchanged.
+ */
+public class CellDecorator implements Cell {
+
+    /** The wrapped cell that does the actual rendering. */
+    private final Cell delegate;
+
+    /**
+     * @param cell the cell to wrap
+     */
+    public CellDecorator(Cell cell) {
+        delegate = cell;
+    }
+
+    /**
+     * Renders the wrapped cell to the given XHTML SAX event stream.
+     *
+     * @param handler the handler that is passed on to the wrapped cell
+     * @throws SAXException if the wrapped cell throws it
+     */
+    @Override
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        delegate.render(handler);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.awt.*;
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.poi.ddf.EscherBSERecord;
+import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherRecord;
+import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
+import org.apache.poi.hssf.eventusermodel.HSSFListener;
+import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.record.BOFRecord;
+import org.apache.poi.hssf.record.BoundSheetRecord;
+import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.CountryRecord;
+import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.DrawingGroupRecord;
+import org.apache.poi.hssf.record.EOFRecord;
+import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FooterRecord;
+import org.apache.poi.hssf.record.FormatRecord;
+import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HeaderRecord;
+import org.apache.poi.hssf.record.HyperlinkRecord;
+import org.apache.poi.hssf.record.LabelRecord;
+import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.RKRecord;
+import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.StringRecord;
+import org.apache.poi.hssf.record.TextObjectRecord;
+import org.apache.poi.hssf.record.chart.SeriesTextRecord;
+import org.apache.poi.hssf.record.common.UnicodeString;
+import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey;
+import org.apache.poi.hssf.usermodel.HSSFPictureData;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Excel parser implementation which uses POI's Event API
+ * to handle the contents of a Workbook.
+ * <p/>
+ * The Event API uses a much smaller memory footprint than
+ * <code>HSSFWorkbook</code> when processing excel files
+ * but at the cost of more complexity.
+ * <p/>
+ * With the Event API a <i>listener</i> is registered for
+ * specific record types and those records are created,
+ * fired off to the listener and then discarded as the stream
+ * is being processed.
+ *
+ * @see org.apache.poi.hssf.eventusermodel.HSSFListener
+ * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
+ * POI Event API How To</a>
+ */
+public class ExcelExtractor extends AbstractPOIFSExtractor {
+
+    private static final String WORKBOOK_ENTRY = "Workbook";
+    private static final String BOOK_ENTRY = "Book";
+    /**
+     * <code>true</code> if the HSSFListener should be registered
+     * to listen for all records or <code>false</code> (the default)
+     * if the listener should be configured to only receive specified
+     * records.
+     */
+    private boolean listenForAllRecords = false;
+
+    /**
+     * Creates an extractor; both arguments are passed straight to the
+     * {@link AbstractPOIFSExtractor} constructor.
+     *
+     * @param context  parse context
+     * @param metadata document metadata
+     */
+    public ExcelExtractor(ParseContext context, Metadata metadata) {
+        super(context, metadata);
+    }
+
+    /**
+     * Returns <code>true</code> if this parser is configured to listen
+     * for all records instead of just the specified few.
+     *
+     * @return the current value of the listen-for-all-records flag
+     *         (<code>false</code> unless changed via
+     *         {@link #setListenForAllRecords(boolean)})
+     */
+    public boolean isListenForAllRecords() {
+        return listenForAllRecords;
+    }
+
+    /**
+     * Specifies whether this parser should listen for all
+     * records or just for the specified few.
+     * <p/>
+     * <strong>Note:</strong> Under normal operation this setting should
+     * be <code>false</code> (the default), but you can experiment with
+     * this setting for testing and debugging purposes.
+     *
+     * @param listenForAllRecords <code>true</code> if the HSSFListener
+     *                            should be registered to listen for all records or <code>false</code>
+     *                            if the listener should be configured to only receive specified records.
+     */
+    public void setListenForAllRecords(boolean listenForAllRecords) {
+        this.listenForAllRecords = listenForAllRecords;
+    }
+
+    /**
+     * Extracts text from an Excel Workbook, writing the extracted content
+     * to the given XHTML content handler. Delegates to
+     * {@link #parse(DirectoryNode, XHTMLContentHandler, Locale)} with the
+     * root directory of the file system.
+     *
+     * @param filesystem POI file system
+     * @param xhtml      handler that receives the extracted content
+     * @param locale     locale passed on to the directory-based parse
+     * @throws IOException   if an error occurs processing the workbook
+     *                       or writing the extracted content
+     * @throws SAXException  if the directory-based parse throws it
+     * @throws TikaException if the directory-based parse throws it
+     */
+    protected void parse(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
+            Locale locale) throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml, locale);
+    }
+
+    /**
+     * Extracts the content of the workbook stored under the given directory.
+     * <p/>
+     * If the directory has no "Workbook" entry but has a "Book" entry, the
+     * file is handed to the old-format extractor; if it has neither, nothing
+     * is extracted. Otherwise the workbook is processed in event mode and
+     * any directory entries whose name starts with "MBD" are then handled
+     * as embedded office documents.
+     *
+     * @param root   directory that holds the workbook entries
+     * @param xhtml  handler that receives the extracted content
+     * @param locale locale handed to the record listener
+     * @throws IOException   if reading the workbook fails
+     * @throws SAXException  if the content handler fails
+     * @throws TikaException if the workbook cannot be processed
+     */
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml,
+            Locale locale) throws IOException, SAXException, TikaException {
+        if (!root.hasEntry(WORKBOOK_ENTRY)) {
+            if (root.hasEntry(BOOK_ENTRY)) {
+                // Excel 5 / Excel 95 file
+                // Records are in a different structure so needs a
+                //  different parser to process them
+                OldExcelExtractor extractor = new OldExcelExtractor(root);
+                OldExcelParser.parse(extractor, xhtml);
+            }
+            // Otherwise: corrupt file / very old file, just skip text extraction
+            return;
+        }
+
+        // If a password was supplied, use it, otherwise the default
+        Biff8EncryptionKey.setCurrentUserPassword(getPassword());
+        try {
+            // Have the file processed in event mode
+            TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
+            listener.processFile(root, isListenForAllRecords());
+            listener.throwStoredException();
+        } finally {
+            // Don't leave this document's password registered with POI
+            // once the workbook has been processed (or has failed)
+            Biff8EncryptionKey.setCurrentUserPassword(null);
+        }
+
+        for (Entry entry : root) {
+            if (entry.getName().startsWith("MBD")
+                    && entry instanceof DirectoryEntry) {
+                try {
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
+                } catch (TikaException e) {
+                    // ignore parse errors from embedded documents
+                }
+            }
+        }
+    }
+
+    // ======================================================================
+
+    /**
+     * HSSF Listener implementation which processes the HSSF records.
+     */
+    private static class TikaHSSFListener implements HSSFListener {
+
+        /**
+         * XHTML content handler to which the document content is rendered.
+         */
+        private final XHTMLContentHandler handler;
+
+        /**
+         * The POIFS Extractor, used for embedded resources.
+         */
+        private final AbstractPOIFSExtractor extractor;
+        /**
+         * Format for rendering numbers in the worksheet. Currently we just
+         * use the platform default formatting.
+         *
+         * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
+         */
+        private final NumberFormat format;
+        /**
+         * Potential exception thrown by the content handler. When set to
+         * non-<code>null</code>, causes all subsequent HSSF records to be
+         * ignored and the stored exception to be thrown when
+         * {@link #throwStoredException()} is invoked.
+         */
+        private Exception exception = null;
+        private SSTRecord sstRecord;
+        private FormulaRecord stringFormulaRecord;
+        private short previousSid;
+        /**
+         * Internal <code>FormatTrackingHSSFListener</code> to handle cell
+         * formatting within the extraction.
+         */
+        private FormatTrackingHSSFListener formatListener;
+        /**
+         * List of worksheet names.
+         */
+        private List<String> sheetNames = new ArrayList<String>();
+        /**
+         * Index of the current worksheet within the workbook.
+         * Used to find the worksheet name in the {@link #sheetNames} list.
+         */
+        private short currentSheetIndex;
+        /**
+         * Content of the current worksheet, or <code>null</code> if no
+         * worksheet is currently active.
+         */
+        private SortedMap<Point, Cell> currentSheet = null;
+        /**
+         * Extra text or cells that crops up, typically as part of a
+         * worksheet but not always.
+         */
+        private List<Cell> extraTextCells = new ArrayList<Cell>();
+        /**
+         * These aren't complete when we first see them, as they
+         * depend on continue records that aren't always
+         * contiguous. Collect them for later processing.
+         */
+        private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
+
+        /**
+         * Construct a new listener instance outputting parsed data to
+         * the specified XHTML content handler.
+         *
+         * @param handler   Destination to write the parsed output to
+         * @param locale    Locale used to create both the number format and
+         *                  the internal format-tracking listener
+         * @param extractor Extractor that embedded resources are handed to
+         */
+        private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
+            this.handler = handler;
+            this.extractor = extractor;
+            this.format = NumberFormat.getInstance(locale);
+            // Records are routed through the format listener, which wraps this instance
+            this.formatListener = new FormatTrackingHSSFListener(this, locale);
+        }
+
+        /**
+         * Entry point to listener to start the processing of a file.
+         * Delegates to {@link #processFile(DirectoryNode, boolean)} with the
+         * root directory of the file system.
+         *
+         * @param filesystem          POI file system.
+         * @param listenForAllRecords sets whether the listener is configured to listen
+         *                            for all records types or not.
+         * @throws IOException   on any IO errors.
+         * @throws SAXException  on any SAX parsing errors.
+         * @throws TikaException if the directory-based processing throws it.
+         */
+        public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
+                throws IOException, SAXException, TikaException {
+            processFile(filesystem.getRoot(), listenForAllRecords);
+        }
+
+        /**
+         * Processes the "Workbook" stream found under the given directory.
+         * <p/>
+         * Registers the format-tracking listener (for all records, or only
+         * for the record types handled by this class), fires the record
+         * events, then outputs any remaining extra text and the pictures
+         * found in the collected drawing group records.
+         *
+         * @param root                directory that contains the workbook stream.
+         * @param listenForAllRecords sets whether the listener is configured to listen
+         *                            for all records types or not.
+         * @throws IOException   on any IO errors.
+         * @throws SAXException  on any SAX parsing errors.
+         * @throws TikaException if the document is encrypted (POI's exception
+         *                       is rethrown as Tika's
+         *                       {@link EncryptedDocumentException}) or an
+         *                       embedded picture cannot be handled.
+         */
+        public void processFile(DirectoryNode root, boolean listenForAllRecords)
+                throws IOException, SAXException, TikaException {
+
+            // Set up listener and register the records we want to process
+            HSSFRequest hssfRequest = new HSSFRequest();
+            if (listenForAllRecords) {
+                hssfRequest.addListenerForAllRecords(formatListener);
+            } else {
+                hssfRequest.addListener(formatListener, BOFRecord.sid);
+                hssfRequest.addListener(formatListener, EOFRecord.sid);
+                hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
+                hssfRequest.addListener(formatListener, CountryRecord.sid);
+                hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
+                hssfRequest.addListener(formatListener, SSTRecord.sid);
+                hssfRequest.addListener(formatListener, FormulaRecord.sid);
+                hssfRequest.addListener(formatListener, LabelRecord.sid);
+                hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
+                hssfRequest.addListener(formatListener, NumberRecord.sid);
+                hssfRequest.addListener(formatListener, RKRecord.sid);
+                hssfRequest.addListener(formatListener, StringRecord.sid);
+                hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
+                hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+                hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
+                hssfRequest.addListener(formatListener, FormatRecord.sid);
+                hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+                hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+                hssfRequest.addListener(formatListener, HeaderRecord.sid);
+                hssfRequest.addListener(formatListener, FooterRecord.sid);
+            }
+
+            // Create event factory and process Workbook (fire events)
+            DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
+            try {
+                HSSFEventFactory eventFactory = new HSSFEventFactory();
+                eventFactory.processEvents(hssfRequest, documentInputStream);
+            } catch (org.apache.poi.EncryptedDocumentException e) {
+                throw new EncryptedDocumentException(e);
+            } finally {
+                // Release the workbook stream whether or not processing succeeded
+                documentInputStream.close();
+            }
+
+            // Output any extra text that came after all the sheets
+            processExtraText();
+
+            // Look for embedded images, now that the drawing records
+            //  have been fully matched with their continue data
+            for (DrawingGroupRecord dgr : drawingGroups) {
+                dgr.decode();
+                findPictures(dgr.getEscherRecords());
+            }
+        }
+
+        /**
+         * Listener callback for a single HSSF record.
+         * <p/>
+         * Checked exceptions raised while handling the record are stored
+         * rather than thrown; once one has been stored, every later record
+         * is ignored.
+         *
+         * @param record HSSF Record
+         */
+        public void processRecord(Record record) {
+            if (exception != null) {
+                // A failure was already recorded; ignore everything after it
+                return;
+            }
+            try {
+                internalProcessRecord(record);
+            } catch (TikaException failure) {
+                exception = failure;
+            } catch (IOException failure) {
+                exception = failure;
+            } catch (SAXException failure) {
+                exception = failure;
+            }
+        }
+
+        /**
+         * Rethrows the exception stored while processing records, if any.
+         * Does nothing when no exception has been stored.
+         *
+         * @throws IOException   if the stored exception is an IOException
+         * @throws SAXException  if the stored exception is a SAXException
+         * @throws TikaException if the stored exception is a TikaException,
+         *                       or wrapping any other stored exception
+         */
+        public void throwStoredException() throws TikaException, SAXException, IOException {
+            if (exception == null) {
+                return;
+            }
+            if (exception instanceof IOException) {
+                throw (IOException) exception;
+            }
+            if (exception instanceof SAXException) {
+                throw (SAXException) exception;
+            }
+            if (exception instanceof TikaException) {
+                throw (TikaException) exception;
+            }
+            // Keep the original exception as the cause so its stack trace survives
+            throw new TikaException(exception.getMessage(), exception);
+        }
+
+        /**
+         * Handles a single HSSF record according to its sid: tracks sheet
+         * boundaries and sheet names, collects cell and extra text, and
+         * queues drawing group records for later picture extraction.
+         * Records with any other sid only update the bookkeeping at the end.
+         *
+         * @param record HSSF Record
+         * @throws SAXException  if writing to the content handler fails
+         * @throws TikaException declared for the record handling
+         * @throws IOException   declared for the record handling
+         */
+        private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
+            switch (record.getSid()) {
+                case BOFRecord.sid: // start of workbook, worksheet etc. records
+                    BOFRecord bof = (BOFRecord) record;
+                    if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
+                        currentSheetIndex = -1;
+                    } else if (bof.getType() == BOFRecord.TYPE_CHART) {
+                        if (previousSid == EOFRecord.sid) {
+                            // This is a sheet which contains only a chart
+                            newSheet();
+                        } else {
+                            // This is a chart within a normal sheet
+                            // Handling of this is a bit hacky...
+                            if (currentSheet != null) {
+                                processSheet();
+                                currentSheetIndex--;
+                                newSheet();
+                            }
+                        }
+                    } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
+                        newSheet();
+                    }
+                    break;
+
+                case EOFRecord.sid: // end of workbook, worksheet etc. records
+                    if (currentSheet != null) {
+                        processSheet();
+                    }
+                    currentSheet = null;
+                    break;
+
+                case BoundSheetRecord.sid: // Worksheet index record
+                    BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+                    sheetNames.add(boundSheetRecord.getSheetname());
+                    break;
+
+                case SSTRecord.sid: // holds all the strings for LabelSSTRecords
+                    sstRecord = (SSTRecord) record;
+                    break;
+
+                case FormulaRecord.sid: // Cell value from a formula
+                    FormulaRecord formula = (FormulaRecord) record;
+                    if (formula.hasCachedResultString()) {
+                        // The String itself should be the next record
+                        stringFormulaRecord = formula;
+                    } else {
+                        addTextCell(record, formatListener.formatNumberDateCell(formula));
+                    }
+                    break;
+
+                case StringRecord.sid:
+                    if (previousSid == FormulaRecord.sid) {
+                        // Cached string value of a string formula
+                        StringRecord sr = (StringRecord) record;
+                        addTextCell(stringFormulaRecord, sr.getString());
+                    } else {
+                        // Some other string not associated with a cell, skip
+                    }
+                    break;
+
+                case LabelRecord.sid: // strings stored directly in the cell
+                    LabelRecord label = (LabelRecord) record;
+                    addTextCell(record, label.getValue());
+                    break;
+
+                case LabelSSTRecord.sid: // Ref. a string in the shared string table
+                    LabelSSTRecord sst = (LabelSSTRecord) record;
+                    if (sstRecord != null) {
+                        UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
+                        addTextCell(record, unicode.getString());
+                    }
+                    // Without a shared string table the text cannot be
+                    //  resolved, so the cell is skipped rather than failing
+                    break;
+
+                case NumberRecord.sid: // Contains a numeric cell value
+                    NumberRecord number = (NumberRecord) record;
+                    addTextCell(record, formatListener.formatNumberDateCell(number));
+                    break;
+
+                case RKRecord.sid: // Excel internal number record
+                    RKRecord rk = (RKRecord) record;
+                    addCell(record, new NumberCell(rk.getRKNumber(), format));
+                    break;
+
+                case HyperlinkRecord.sid: // holds a URL associated with a cell
+                    if (currentSheet != null) {
+                        HyperlinkRecord link = (HyperlinkRecord) record;
+                        Point point =
+                                new Point(link.getFirstColumn(), link.getFirstRow());
+                        Cell cell = currentSheet.get(point);
+                        if (cell != null) {
+                            String address = link.getAddress();
+                            if (address != null) {
+                                addCell(record, new LinkedCell(cell, address));
+                            } else {
+                                addCell(record, cell);
+                            }
+                        }
+                    }
+                    break;
+
+                case TextObjectRecord.sid:
+                    TextObjectRecord tor = (TextObjectRecord) record;
+                    addTextCell(record, tor.getStr().getString());
+                    break;
+
+                case SeriesTextRecord.sid: // Chart label or title
+                    SeriesTextRecord str = (SeriesTextRecord) record;
+                    addTextCell(record, str.getText());
+                    break;
+
+                case DrawingGroupRecord.sid:
+                    // Collect this now, we'll process later when all
+                    //  the continue records are in
+                    drawingGroups.add((DrawingGroupRecord) record);
+                    break;
+
+                case HeaderRecord.sid:
+                    HeaderRecord headerRecord = (HeaderRecord) record;
+                    addTextCell(record, headerRecord.getText());
+                    break;
+
+                case FooterRecord.sid:
+                    FooterRecord footerRecord = (FooterRecord) record;
+                    addTextCell(record, footerRecord.getText());
+                    break;
+
+            }
+
+            // Remembered so the StringRecord case can tell whether it
+            //  directly follows a formula record
+            previousSid = record.getSid();
+
+            // A pending string formula only survives the record that set it
+            if (stringFormulaRecord != record) {
+                stringFormulaRecord = null;
+            }
+        }
+
+        /**
+         * Writes every collected extra cell inside its own
+         * <code>div class="outside"</code> element and then empties
+         * the collection.
+         *
+         * @throws SAXException if writing to the content handler fails
+         */
+        private void processExtraText() throws SAXException {
+            if (extraTextCells.isEmpty()) {
+                return;
+            }
+
+            for (Cell extra : extraTextCells) {
+                handler.startElement("div", "class", "outside");
+                extra.render(handler);
+                handler.endElement("div");
+            }
+
+            // Start collecting afresh
+            extraTextCells.clear();
+        }
+
+        /**
+         * Stores the given cell. A <code>null</code> cell is ignored. When a
+         * worksheet is active and the record carries a cell position, the
+         * cell is placed at that position in the worksheet; otherwise it is
+         * kept with the extra text cells.
+         *
+         * @param record record that holds the cell value
+         * @param cell   cell value (or <code>null</code>)
+         */
+        private void addCell(Record record, Cell cell) throws SAXException {
+            if (cell == null) {
+                // Nothing to store for an empty cell
+                return;
+            }
+
+            boolean insideSheet = currentSheet != null
+                    && record instanceof CellValueRecordInterface;
+            if (!insideSheet) {
+                // Cell outside the worksheets
+                extraTextCells.add(cell);
+                return;
+            }
+
+            // Normal cell inside a worksheet: key it by (column, row)
+            CellValueRecordInterface positioned = (CellValueRecordInterface) record;
+            currentSheet.put(
+                    new Point(positioned.getColumn(), positioned.getRow()), cell);
+        }
+
+        /**
+         * Adds a text cell holding the given text. The text is trimmed
+         * first; <code>null</code> text, or text that is empty after
+         * trimming, adds nothing.
+         *
+         * @param record record that holds the text value
+         * @param text   text content, may be <code>null</code>
+         * @throws SAXException
+         */
+        private void addTextCell(Record record, String text) throws SAXException {
+            if (text == null) {
+                return;
+            }
+            String trimmed = text.trim();
+            if (trimmed.length() > 0) {
+                addCell(record, new TextCell(trimmed));
+            }
+        }
+
+        /**
+         * Starts a new worksheet: advances the current sheet index and
+         * replaces the current sheet with an empty map whose cells are
+         * ordered by {@link PointComparator}.
+         */
+        private void newSheet() {
+            currentSheetIndex++;
+            currentSheet = new TreeMap<Point, Cell>(new PointComparator());
+        }
+
+        /**
+         * Writes the current worksheet to the content handler as a
+         * <code>div class="page"</code> that holds an optional
+         * <code>h1</code> with the sheet name, a table with the cells and,
+         * after the table, any extra text collected so far.
+         *
+         * @throws SAXException if an error occurs
+         */
+        private void processSheet() throws SAXException {
+            // Open the sheet, with its name as heading when one is known
+            handler.startElement("div", "class", "page");
+            if (currentSheetIndex < sheetNames.size()) {
+                handler.element("h1", sheetNames.get(currentSheetIndex));
+            }
+            handler.startElement("table");
+            handler.startElement("tbody");
+
+            // The first row and cell are opened up front; the loop below
+            //  only closes and reopens elements to move to a cell's position
+            handler.startElement("tr");
+            handler.startElement("td");
+            int row = 0;
+            int column = 0;
+            for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
+                Point position = entry.getKey();
+
+                // Advance row by row; each new row starts again at column 0
+                for (; row < position.y; row++) {
+                    handler.endElement("td");
+                    handler.endElement("tr");
+                    handler.startElement("tr");
+                    handler.startElement("td");
+                    column = 0;
+                }
+
+                // Advance cell by cell within the row
+                for (; column < position.x; column++) {
+                    handler.endElement("td");
+                    handler.startElement("td");
+                }
+
+                entry.getValue().render(handler);
+            }
+            handler.endElement("td");
+            handler.endElement("tr");
+
+            // Close the table
+            handler.endElement("tbody");
+            handler.endElement("table");
+
+            // Extra text goes after the table, still inside the sheet div
+            processExtraText();
+            handler.endElement("div");
+        }
+
+        /**
+         * Walks the given Escher records and all their child records,
+         * handing the picture data of every BSE record that has a blip
+         * to the extractor as an embedded resource.
+         *
+         * @param records Escher records to search
+         * @throws IOException   if handling an embedded resource throws it
+         * @throws SAXException  if handling an embedded resource throws it
+         * @throws TikaException if handling an embedded resource throws it
+         */
+        private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
+            for (EscherRecord current : records) {
+                EscherBlipRecord blip = null;
+                if (current instanceof EscherBSERecord) {
+                    blip = ((EscherBSERecord) current).getBlipRecord();
+                }
+
+                if (blip != null) {
+                    HSSFPictureData picture = new HSSFPictureData(blip);
+                    String mimeType = picture.getMimeType();
+                    TikaInputStream stream = TikaInputStream.get(picture.getData());
+
+                    // Handle the embedded resource
+                    extractor.handleEmbeddedResource(
+                            stream, null, null, mimeType,
+                            handler, true
+                    );
+                }
+
+                // Descend into the children of every record
+                findPictures(current.getChildRecords());
+            }
+        }
+    }
+
+    /**
+     * Utility comparator for points: orders by <code>y</code> first and
+     * by <code>x</code> when the <code>y</code> values are equal.
+     * <p/>
+     * The coordinates are compared directly rather than subtracted, so
+     * the result is correct even for values whose difference would
+     * overflow an <code>int</code>.
+     */
+    private static class PointComparator implements Comparator<Point> {
+
+        /**
+         * @param a first point
+         * @param b second point
+         * @return a negative value, zero or a positive value as
+         *         <code>a</code> sorts before, equal to or after <code>b</code>
+         */
+        public int compare(Point a, Point b) {
+            if (a.y != b.y) {
+                return a.y < b.y ? -1 : 1;
+            }
+            if (a.x != b.x) {
+                return a.x < b.x ? -1 : 1;
+            }
+            return 0;
+        }
+
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.poi.hslf.model.Comment;
+import org.apache.poi.hslf.model.HeadersFooters;
+import org.apache.poi.hslf.model.OLEShape;
+import org.apache.poi.hslf.usermodel.HSLFMasterSheet;
+import org.apache.poi.hslf.usermodel.HSLFNotes;
+import org.apache.poi.hslf.usermodel.HSLFObjectData;
+import org.apache.poi.hslf.usermodel.HSLFPictureData;
+import org.apache.poi.hslf.usermodel.HSLFShape;
+import org.apache.poi.hslf.usermodel.HSLFSlide;
+import org.apache.poi.hslf.usermodel.HSLFSlideShow;
+import org.apache.poi.hslf.usermodel.HSLFTable;
+import org.apache.poi.hslf.usermodel.HSLFTableCell;
+import org.apache.poi.hslf.usermodel.HSLFTextParagraph;
+import org.apache.poi.hslf.usermodel.HSLFTextRun;
+import org.apache.poi.hslf.usermodel.HSLFTextShape;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+public class HSLFExtractor extends AbstractPOIFSExtractor {
+    /**
+     * Creates an extractor that passes the given parse context on to the
+     * {@link AbstractPOIFSExtractor} constructor.
+     *
+     * @param context the parse context forwarded to the superclass
+     */
+    public HSLFExtractor(ParseContext context) {
+        super(context);
+    }
+
+    /**
+     * Parses the slide show held by the given file system by delegating to
+     * {@link #parse(DirectoryNode, XHTMLContentHandler)} with the file system's
+     * root directory.
+     *
+     * @param filesystem the file system whose root is parsed
+     * @param xhtml      handler receiving the generated XHTML events
+     * @throws IOException   propagated from the delegate
+     * @throws SAXException  propagated from the delegate
+     * @throws TikaException propagated from the delegate
+     */
+    protected void parse(
+            NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml);
+    }
+
+    /**
+     * Extracts the slide show stored under the given directory node as XHTML.
+     * <p>
+     * Emits a {@code div class="slideShow"} holding one {@code div class="slide"}
+     * per slide (header, master content, slide text, tables, footer, comments and
+     * embedded-object markers), followed by a {@code div class="slide-notes"}
+     * holding the notes text; the pictures of the slide show are handed over as
+     * embedded resources before that second div is closed.
+     *
+     * @param root  directory node the slide show is read from
+     * @param xhtml handler receiving the generated XHTML events
+     * @throws IOException   if reading the slide show or an embedded resource fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if an embedded resource cannot be processed
+     */
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        HSLFSlideShow ss = new HSLFSlideShow(root);
+        List<HSLFSlide> _slides = ss.getSlides();
+
+        xhtml.startElement("div", "class", "slideShow");
+
+        /* Iterate over slides and extract text */
+        for (HSLFSlide slide : _slides) {
+            extractSlide(slide, xhtml);
+        }
+
+        // All slides done
+        xhtml.endElement("div");
+
+        /* notes */
+        xhtml.startElement("div", "class", "slide-notes");
+        extractNotes(ss, _slides, xhtml);
+
+        // Pictures are reported inside the notes div, after the notes text
+        handleSlideEmbeddedPictures(ss, xhtml);
+
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Emits one {@code div class="slide"} with everything extracted from the slide.
+     */
+    private void extractSlide(HSLFSlide slide, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        xhtml.startElement("div", "class", "slide");
+
+        // Slide header, if present
+        HeadersFooters hf = slide.getHeadersFooters();
+        if (hf != null && hf.isHeaderVisible()) {
+            writeClassedParagraph(xhtml, "slide-header", hf.getHeaderText());
+        }
+
+        // Slide master, if present
+        extractMaster(xhtml, slide.getMasterSheet());
+
+        // Slide text
+        xhtml.startElement("div", "class", "slide-content");
+        textRunsToText(xhtml, slide.getTextParagraphs());
+        xhtml.endElement("div");
+
+        // Table text
+        for (HSLFShape shape : slide.getShapes()) {
+            if (shape instanceof HSLFTable) {
+                extractTableText(xhtml, (HSLFTable) shape);
+            }
+        }
+
+        // Slide footer, if present
+        if (hf != null && hf.isFooterVisible()) {
+            writeClassedParagraph(xhtml, "slide-footer", hf.getFooterText());
+        }
+
+        // Comments, if present
+        extractComments(slide, xhtml);
+
+        // Now any embedded resources
+        handleSlideEmbeddedResources(slide, xhtml);
+
+        // TODO Find the Notes for this slide and extract inline
+
+        // Slide complete
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes {@code text} as a {@code p} element carrying the given class;
+     * a null text writes nothing.
+     */
+    private static void writeClassedParagraph(
+            XHTMLContentHandler xhtml, String cssClass, String text) throws SAXException {
+        if (text == null) {
+            return;
+        }
+        xhtml.startElement("p", "class", cssClass);
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Writes each comment of the slide as a {@code p class="slide-comment"}:
+     * the author and initials in bold, when present, followed by the comment text.
+     */
+    private static void extractComments(HSLFSlide slide, XHTMLContentHandler xhtml)
+            throws SAXException {
+        StringBuilder authorStringBuilder = new StringBuilder();
+        for (Comment comment : slide.getComments()) {
+            authorStringBuilder.setLength(0);
+            xhtml.startElement("p", "class", "slide-comment");
+
+            String author = comment.getAuthor();
+            String initials = comment.getAuthorInitials();
+            String text = comment.getText();
+
+            if (author != null) {
+                authorStringBuilder.append(author);
+            }
+            if (initials != null) {
+                if (authorStringBuilder.length() > 0) {
+                    authorStringBuilder.append(" ");
+                }
+                authorStringBuilder.append('(').append(initials).append(')');
+            }
+            if (authorStringBuilder.length() > 0) {
+                // The separator is only needed when comment text follows
+                if (text != null) {
+                    authorStringBuilder.append(" - ");
+                }
+                xhtml.startElement("b");
+                xhtml.characters(authorStringBuilder.toString());
+                xhtml.endElement("b");
+            }
+            if (text != null) {
+                xhtml.characters(text);
+            }
+            xhtml.endElement("p");
+        }
+    }
+
+    /**
+     * Writes the notes of the given slides, each notes sheet once, wrapped in
+     * the notes header and footer when those are visible and set.
+     */
+    private void extractNotes(
+            HSLFSlideShow ss, List<HSLFSlide> slides, XHTMLContentHandler xhtml)
+            throws SAXException {
+        HashSet<Integer> seenNotes = new HashSet<>();
+        HeadersFooters hf = ss.getNotesHeadersFooters();
+
+        for (HSLFSlide slide : slides) {
+            HSLFNotes notes = slide.getNotes();
+            if (notes == null) {
+                continue;
+            }
+            // add() returns false when this notes sheet number was already recorded
+            if (!seenNotes.add(notes._getSheetNumber())) {
+                continue;
+            }
+
+            // Repeat the Notes header, if set
+            if (hf != null && hf.isHeaderVisible()) {
+                writeClassedParagraph(xhtml, "slide-note-header", hf.getHeaderText());
+            }
+
+            // Notes text
+            textRunsToText(xhtml, notes.getTextParagraphs());
+
+            // Repeat the notes footer, if set
+            if (hf != null && hf.isFooterVisible()) {
+                writeClassedParagraph(xhtml, "slide-note-footer", hf.getFooterText());
+            }
+        }
+    }
+
+    /**
+     * Writes the text of the master sheet's non-placeholder text shapes inside a
+     * {@code div class="slide-master-content"}, passing each non-null text to
+     * {@code xhtml.element("p", ...)}. Nothing is written when the master is
+     * null or has no shapes.
+     *
+     * @param xhtml  handler receiving the generated XHTML events
+     * @param master master sheet of the current slide, may be null
+     * @throws SAXException if the handler rejects an event
+     */
+    private void extractMaster(XHTMLContentHandler xhtml, HSLFMasterSheet master) throws SAXException {
+        List<HSLFShape> masterShapes = (master == null) ? null : master.getShapes();
+        if (masterShapes == null || masterShapes.isEmpty()) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "slide-master-content");
+        for (HSLFShape candidate : masterShapes) {
+            // Null shapes and placeholders are filtered before the type check
+            if (candidate == null
+                    || HSLFMasterSheet.isPlaceholder(candidate)
+                    || !(candidate instanceof HSLFTextShape)) {
+                continue;
+            }
+            String shapeText = ((HSLFTextShape) candidate).getText();
+            if (shapeText != null) {
+                xhtml.element("p", shapeText);
+            }
+        }
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Writes the given table as a {@code table} element with one {@code tr} per
+     * row, passing each cell's text to {@code xhtml.element("td", ...)}.
+     *
+     * @param xhtml handler receiving the generated XHTML events
+     * @param shape the table to write
+     * @throws SAXException if the handler rejects an event
+     */
+    private void extractTableText(XHTMLContentHandler xhtml, HSLFTable shape) throws SAXException {
+        xhtml.startElement("table");
+        int rowIndex = 0;
+        while (rowIndex < shape.getNumberOfRows()) {
+            xhtml.startElement("tr");
+            int colIndex = 0;
+            while (colIndex < shape.getNumberOfColumns()) {
+                HSLFTableCell tableCell = shape.getCell(rowIndex, colIndex);
+                // a missing (null) cell contributes the empty string
+                String cellText = (tableCell == null) ? "" : tableCell.getText();
+                xhtml.element("td", cellText);
+                colIndex++;
+            }
+            xhtml.endElement("tr");
+            rowIndex++;
+        }
+        xhtml.endElement("table");
+    }
+
+    /**
+     * Writes the given text paragraphs as XHTML.
+     * <p>
+     * Each paragraph becomes a {@code p} element, or an {@code li} element when
+     * it is bulleted and is not just a single empty run; a {@code ul} is opened
+     * when bulleting starts and closed when it stops. Vertical tabs inside a run
+     * are written as {@code br} elements and a trailing paragraph break is
+     * dropped. A paragraph without text runs, or a first run without raw text,
+     * is treated as empty instead of raising an exception.
+     *
+     * @param xhtml          handler receiving the generated XHTML events
+     * @param paragraphsList the paragraph groups to write; null writes nothing
+     * @throws SAXException if the handler rejects an event
+     */
+    private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
+        if (paragraphsList == null) {
+            return;
+        }
+
+        for (List<HSLFTextParagraph> run : paragraphsList) {
+            // Leaving in wisdom from TIKA-712 for easy revert.
+            // Avoid boiler-plate text on the master slide (0
+            // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
+            //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
+
+            boolean isBullet = false;
+            for (HSLFTextParagraph htp : run) {
+                boolean nextBullet = htp.isBullet();
+                // TODO: identify bullet/list type
+                if (isBullet != nextBullet) {
+                    isBullet = nextBullet;
+                    if (isBullet) {
+                        xhtml.startElement("ul");
+                    } else {
+                        xhtml.endElement("ul");
+                    }
+                }
+
+                List<HSLFTextRun> textRuns = htp.getTextRuns();
+                boolean hasRuns = textRuns != null && !textRuns.isEmpty();
+
+                // Guard the look-ahead at the first run: no runs, or a run
+                // without raw text, counts as an empty first line.
+                String firstRaw = hasRuns ? textRuns.get(0).getRawText() : null;
+                String firstLine = (firstRaw == null) ? "" : removePBreak(firstRaw);
+                boolean showBullet = isBullet && hasRuns
+                        && (textRuns.size() > 1 || !"".equals(firstLine));
+                String paraTag = showBullet ? "li" : "p";
+
+                xhtml.startElement(paraTag);
+                if (hasRuns) {
+                    for (HSLFTextRun htr : textRuns) {
+                        String line = htr.getRawText();
+                        if (line != null) {
+                            boolean isfirst = true;
+                            for (String fragment : line.split("\\u000b")) {
+                                if (!isfirst) {
+                                    xhtml.startElement("br");
+                                    xhtml.endElement("br");
+                                }
+                                isfirst = false;
+                                xhtml.characters(removePBreak(fragment));
+                            }
+                            // split() drops trailing empty strings, so a closing
+                            // line break has to be written separately
+                            if (line.endsWith("\u000b")) {
+                                xhtml.startElement("br");
+                                xhtml.endElement("br");
+                            }
+                        }
+                    }
+                }
+                xhtml.endElement(paraTag);
+            }
+            if (isBullet) {
+                xhtml.endElement("ul");
+            }
+        }
+    }
+
+    /**
+     * Removes the trailing paragraph break from a text fragment by deleting the
+     * first match of the regular expression {@code \r$}.
+     *
+     * @param fragment the text to clean; must not be null
+     * @return the fragment without the matched carriage return, or the
+     *         fragment unchanged when the expression does not match
+     */
+    private static String removePBreak(String fragment) {
+        // the last text run of a text paragraph contains the paragraph break (\r)
+        // line breaks (\\u000b) can happen more often
+        java.util.regex.Matcher trailingBreak =
+                java.util.regex.Pattern.compile("\\r$").matcher(fragment);
+        return trailingBreak.replaceFirst("");
+    }
+
+    /**
+     * Hands every picture of the slide show to
+     * {@code handleEmbeddedResource} as an embedded resource.
+     * <p>
+     * EMF, WMF and DIB pictures get a fixed media type; for every other
+     * picture type the content type reported by the picture is used.
+     *
+     * @param slideshow the slide show whose picture data is processed
+     * @param xhtml     handler receiving the generated XHTML events
+     * @throws TikaException if an embedded resource cannot be processed
+     * @throws SAXException  if the handler rejects an event
+     * @throws IOException   if handling or closing a picture stream fails
+     */
+    private void handleSlideEmbeddedPictures(HSLFSlideShow slideshow, XHTMLContentHandler xhtml)
+            throws TikaException, SAXException, IOException {
+        for (HSLFPictureData pic : slideshow.getPictureData()) {
+            String mediaType;
+
+            switch (pic.getType()) {
+                case EMF:
+                    mediaType = "application/x-emf";
+                    break;
+                case WMF:
+                    mediaType = "application/x-msmetafile";
+                    break;
+                case DIB:
+                    mediaType = "image/bmp";
+                    break;
+                default:
+                    mediaType = pic.getContentType();
+                    break;
+            }
+
+            // Close the stream once the picture has been handled, as
+            // handleSlideEmbeddedResources does for OLE objects.
+            try (TikaInputStream stream = TikaInputStream.get(pic.getData())) {
+                handleEmbeddedResource(
+                        stream, null, null,
+                        mediaType, xhtml, false);
+            }
+        }
+    }
+
+    /**
+     * Processes the OLE shapes of a slide: for each shape that yields object
+     * data, writes an empty {@code div class="embedded"} marker carrying the
+     * object id and then hands the data to {@code handleEmbeddedResource}.
+     * <p>
+     * A {@link NullPointerException} raised while fetching the shapes ends the
+     * method quietly; one raised while fetching a shape's object data skips
+     * that shape.
+     *
+     * @param slide the slide whose shapes are inspected
+     * @param xhtml handler receiving the generated XHTML events
+     * @throws TikaException if an embedded resource cannot be processed
+     * @throws SAXException  if the handler rejects an event
+     * @throws IOException   if handling or closing an object stream fails
+     */
+    private void handleSlideEmbeddedResources(HSLFSlide slide, XHTMLContentHandler xhtml)
+            throws TikaException, SAXException, IOException {
+        List<HSLFShape> slideShapes;
+        try {
+            slideShapes = slide.getShapes();
+        } catch (NullPointerException e) {
+            // Sometimes HSLF hits problems
+            // Please open POI bugs for any you come across!
+            return;
+        }
+
+        for (HSLFShape candidate : slideShapes) {
+            if (!(candidate instanceof OLEShape)) {
+                continue;
+            }
+            OLEShape ole = (OLEShape) candidate;
+
+            HSLFObjectData objectData;
+            try {
+                objectData = ole.getObjectData();
+            } catch (NullPointerException e) {
+                /* getObjectData throws NPE some times. */
+                objectData = null;
+            }
+            if (objectData == null) {
+                continue;
+            }
+
+            String objID = Integer.toString(ole.getObjectID());
+
+            // Embedded Object: add a <div
+            // class="embedded" id="X"/> so consumer can see where
+            // in the main text each embedded document
+            // occurred:
+            AttributesImpl markerAttributes = new AttributesImpl();
+            markerAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            markerAttributes.addAttribute("", "id", "id", "CDATA", objID);
+            xhtml.startElement("div", markerAttributes);
+            xhtml.endElement("div");
+
+            try (TikaInputStream objectStream = TikaInputStream.get(objectData.getData())) {
+                // Only Excel charts get an explicit media type
+                String mediaType = "Excel.Chart.8".equals(ole.getProgID())
+                        ? "application/vnd.ms-excel"
+                        : null;
+                handleEmbeddedResource(
+                        objectStream, objID, objID,
+                        mediaType, xhtml, false);
+            }
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.microsoft;
+
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.text.DateFormat;
+import java.text.NumberFormat;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import com.healthmarketscience.jackcess.Column;
+import com.healthmarketscience.jackcess.DataType;
+import com.healthmarketscience.jackcess.Database;
+import com.healthmarketscience.jackcess.PropertyMap;
+import com.healthmarketscience.jackcess.Row;
+import com.healthmarketscience.jackcess.Table;
+import com.healthmarketscience.jackcess.query.Query;
+import com.healthmarketscience.jackcess.util.OleBlob;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Internal class.  Needs to be instantiated for each parse because of
+ * the lack of thread safety with the dateTimeFormatter
+ */
+class JackcessExtractor extends AbstractPOIFSExtractor {
+
+    final static String TITLE_PROP_KEY = "Title";
+    final static String AUTHOR_PROP_KEY = "Author";
+    final static String COMPANY_PROP_KEY = "Company";
+
+    final static String TEXT_FORMAT_KEY = "TextFormat";
+    final static String CURRENCY_FORMAT_KEY = "Format";
+    final static byte TEXT_FORMAT = 0;
+    final static byte RICH_TEXT_FORMAT = 1;
+    final static ParseContext EMPTY_PARSE_CONTEXT = new ParseContext();
+
+    final NumberFormat currencyFormatter;
+    final DateFormat shortDateTimeFormatter;
+
+    final HtmlParser htmlParser = new HtmlParser();
+
+    /**
+     * Creates an extractor whose currency formatter and short date formatter
+     * are built for the given locale.
+     *
+     * @param context the parse context forwarded to the superclass constructor
+     * @param locale  the locale used to create both formatters
+     */
+    protected JackcessExtractor(ParseContext context, Locale locale) {
+        super(context);
+        currencyFormatter = NumberFormat.getCurrencyInstance(locale);
+        // Note: a date-only instance, despite the field's "DateTime" name
+        shortDateTimeFormatter = DateFormat.getDateInstance(DateFormat.SHORT, locale);
+    }
+
+    public void parse(Database db, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
+
+
+        String pw = db.getDatabasePassword();
+        if (pw != null) {
+            metadata.set(JackcessParser.MDB_PW, pw);
+        }
+
+        PropertyMap dbp = db.getDatabaseProperties();
+        for (PropertyMap.Property p : dbp) {
+            metadata.add(JackcessParser.MDB_PROPERTY_PREFIX + p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        PropertyMap up = db.getUserDefinedProperties();
+        for (PropertyMap.Property p : up) {
+            metadata.add(JackcessParser.USER_DEFINED_PROPERTY_PREFIX+ p.getName(),
+                    toString(p.getValue(), p.getType()));
+        }
+
+        Set<String> found = new HashSet<>();
+        PropertyMap summaryProperties = db.getSummaryProperties();
+        if (summaryProperties != null) {
+            //try to get core properties
+            PropertyMap.Property title = summaryProperties.get(TITLE_PROP_KEY);
+            if (title != null) {
+                metadata.set(TikaCoreProperties.TITLE, toString(title.getValue(), title.getType()));
+                found.add(title.getName());
+            }
+            PropertyMap.Property author = summaryProperties.get(AUTHOR_PROP_KEY);
+            if (author != null && author.getValue() != null) {
+                String authorString = toString(author.getValue(), author.getType());
+                SummaryExtractor.addMulti(metadata, TikaCoreProperties.CREATOR, authorString);
+                found.add(author.getName());
+            }
+            PropertyMap.Property company = summaryProperties.get(COMPANY_PROP_KEY);
+            if (company != null) {
+                metadata.set(OfficeOpenXMLExtended.COMPANY, toString(company.getValue(), company.getType()));
+                found.add(company.getName());
+            }
+
+            for (PropertyMap.Property p : db.getSummaryProperties()) {
+                if (! found.contains(p.getName())) {
+                    metadata.add(JackcessParser.SUMMARY_PROPERTY_PREFIX + p.getName(),
+                            toString(p.getValue(), p.getType()));
+                }
+            }
+
+        }
+
+        Iterator<Table> it = db.newIterable().
+                setIncludeLinkedTables(false).
+                setIncludeSystemTables(false).iterator();
+
+        while (it.hasNext()) {
+            Table table = it.next();
+            String tableName = table.getName();
+            List<? extends Column> columns = table.getColumns();
+            xhtml.startElement("table", "name", tableName);
+            addHeaders(columns, xhtml);
+            xhtml.startElement("tbody");
+
+            Row r = table.getNextRow();
+
+            while (r != null) {
+                xhtml.startElement("tr");
+                for (Column c : columns) {
+                    handleCell(r, c, xhtml);
+                }
+                xhtml.endElement("tr");
+                r = table.getNextRow();
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+        }
+
+        for (Query q : db.getQueries()) {
+            xhtml.startElement("div", "type", "sqlQuery");
+            xhtml.characters(q.toSQLString());
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Writes the table head: a {@code thead} holding a single {@code tr} with
+     * one {@code th} per column, containing the column name.
+     *
+     * @param columns the columns whose names are written, in order
+     * @param xhtml   handler receiving the generated XHTML events
+     * @throws SAXException if the handler rejects an event
+     */
+    private void addHeaders(List<? extends Column> columns, XHTMLContentHandler xhtml) throws SAXException {
+        xhtml.startElement("thead");
+        xhtml.startElement("tr");
+        Iterator<? extends Column> remaining = columns.iterator();
+        while (remaining.hasNext()) {
+            Column column = remaining.next();
+            xhtml.startElement("th");
+            xhtml.characters(column.getName());
+            xhtml.endElement("th");
+        }
+        xhtml.endElement("tr");
+        xhtml.endElement("thead");
+    }
+
+    /**
+     * Writes one cell of a row as a {@code td} element.
+     * <p>
+     * OLE columns are passed to {@code handleOLE}, binary columns are handed
+     * over as embedded resources, and every other column is written as text.
+     * Rich-text columns are run through the HTML parser first; if that parse
+     * raises a {@link SAXException} the raw value is written instead.
+     *
+     * @param r       the row holding the value
+     * @param c       the column to read from the row
+     * @param handler handler receiving the generated XHTML events
+     * @throws SAXException  if the handler rejects an event
+     * @throws IOException   if reading the value or an embedded stream fails
+     * @throws TikaException if an embedded resource cannot be processed
+     */
+    private void handleCell(Row r, Column c, XHTMLContentHandler handler)
+            throws SAXException, IOException, TikaException {
+
+        handler.startElement("td");
+        // Enum identity checks tolerate a null type, which then falls through to
+        // the text branch where toString and isRichText both accept it.
+        DataType type = c.getType();
+        if (type == DataType.OLE) {
+            handleOLE(r, c.getName(), handler);
+        } else if (type == DataType.BINARY) {
+            Object obj = r.get(c.getName());
+            if (obj != null) {
+                // Close the stream once the resource has been handled
+                try (TikaInputStream stream = TikaInputStream.get((byte[]) obj)) {
+                    handleEmbeddedResource(
+                            stream,
+                            null,//filename
+                            null,//relationshipId
+                            null,//mediatype
+                            handler, false);
+                }
+            }
+        } else {
+            Object obj = r.get(c.getName());
+            String v = toString(obj, type);
+            if (isRichText(c)) {
+                BodyContentHandler h = new BodyContentHandler();
+                Metadata m = new Metadata();
+                m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+                try {
+                    htmlParser.parse(new ByteArrayInputStream(v.getBytes(UTF_8)),
+                            h,
+                            m, EMPTY_PARSE_CONTEXT);
+                    handler.characters(h.toString());
+                } catch (SAXException e) {
+                    //if something went wrong in htmlparser, just append the characters
+                    handler.characters(v);
+                }
+            } else {
+                handler.characters(v);
+            }
+        }
+        handler.endElement("td");
+    }
+
+    /**
+     * Tells whether a column is a MEMO column whose {@code TextFormat} property
+     * is the byte value marking rich text.
+     *
+     * @param c the column to inspect, may be null
+     * @return true only for a MEMO column with the rich-text format byte;
+     *         false for a null column, missing properties or any other type
+     * @throws IOException if the column properties cannot be read
+     */
+    private boolean isRichText(Column c) throws IOException {
+        if (c == null) {
+            return false;
+        }
+
+        // Properties are fetched before the type is looked at
+        PropertyMap columnProperties = c.getProperties();
+        if (columnProperties == null) {
+            return false;
+        }
+
+        boolean isMemo = c.getType() != null && c.getType().equals(DataType.MEMO);
+        if (!isMemo) {
+            return false;
+        }
+
+        Object format = columnProperties.getValue(TEXT_FORMAT_KEY);
+        return format instanceof Byte && ((Byte) format).byteValue() == RICH_TEXT_FORMAT;
+    }
+
+    /**
+     * Converts a column or property value to text according to its data type.
+     *
+     * @param value the value to convert, may be null
+     * @param type  the data type of the value, may be null
+     * @return the empty string for a null value or a type that is not
+     *         converted; {@code value.toString()} for a null type; otherwise
+     *         the text form chosen for the type
+     */
+    private String toString(Object value, DataType type) {
+        if (value == null) {
+            return "";
+        }
+        if (type == null) {
+            //this shouldn't happen
+            return value.toString();
+        }
+        switch (type) {
+            case TEXT:
+            case MEMO:
+                return (String) value;
+            case NUMERIC:
+            case GUID:
+                return value.toString();
+            case BYTE:
+                return Byte.toString((Byte) value);
+            case INT:
+                return Short.toString((Short) value);
+            case LONG:
+                return Integer.toString((Integer) value);
+            case FLOAT:
+                return Float.toString((Float) value);
+            case DOUBLE:
+                return Double.toString((Double) value);
+            case BOOLEAN:
+                return Boolean.toString((Boolean) value);
+            case MONEY:
+                //TODO: consider getting parsing "Format" field from
+                //field properties.
+                return formatCurrency(((BigDecimal) value).doubleValue(), type);
+            case SHORT_DATE_TIME:
+                return formatShortDateTime((Date) value);
+            default:
+                // COMPLEX_TYPE, UNKNOWN_0D, UNKNOWN_11, UNSUPPORTED_FIXEDLEN,
+                // UNSUPPORTED_VARLEN and anything else: skip all these
+                return "";
+        }
+    }
+
+    /**
+     * Handles the OLE blob stored in the named column of a row.
+     * <p>
+     * A link is written as its path, simple-package and other content is
+     * handed over as an embedded resource, and compound storage goes to
+     * {@code handleCompoundContent}. Any remaining content type is ignored.
+     * The blob and the streams opened here are closed before the method returns.
+     *
+     * @param row   the row holding the blob
+     * @param cName the name of the OLE column
+     * @param xhtml handler receiving the generated XHTML events
+     * @throws IOException   if reading the blob or closing a stream fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if an embedded resource cannot be processed
+     */
+    private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+        //lifted shamelessly from Jackcess's OleBlobTest
+        // try-with-resources skips close() when the blob is null
+        try (OleBlob blob = row.getBlob(cName)) {
+            if (blob == null) {
+                return;
+            }
+
+            OleBlob.Content content = blob.getContent();
+            if (content == null) {
+                return;
+            }
+
+            switch (content.getType()) {
+                case LINK:
+                    xhtml.characters(((OleBlob.LinkContent) content).getLinkPath());
+                    break;
+                case SIMPLE_PACKAGE:
+                    OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
+                    try (TikaInputStream stream = TikaInputStream.get(spc.getStream())) {
+                        handleEmbeddedResource(
+                                stream,
+                                spc.getFileName(),//filename
+                                null,//relationshipId
+                                spc.getTypeName(),//mediatype
+                                xhtml, false);
+                    }
+                    break;
+                case OTHER:
+                    OleBlob.OtherContent oc = (OleBlob.OtherContent) content;
+                    try (TikaInputStream stream = TikaInputStream.get(oc.getStream())) {
+                        handleEmbeddedResource(
+                                stream,
+                                null,//filename
+                                null,//relationshipId
+                                oc.getTypeName(),//mediatype
+                                xhtml, false);
+                    }
+                    break;
+                case COMPOUND_STORAGE:
+                    OleBlob.CompoundContent cc = (OleBlob.CompoundContent) content;
+                    handleCompoundContent(cc, xhtml);
+                    break;
+                default:
+                    // any other content type carries nothing to extract here
+                    break;
+            }
+        }
+    }
+
+    /**
+     * Opens the compound-storage content as a POIFS file system and passes its
+     * root to {@code handleEmbeddedOfficeDoc}; the file system is closed
+     * afterwards, also when handling fails.
+     *
+     * @param cc    the compound content of an OLE blob
+     * @param xhtml handler receiving the generated XHTML events
+     * @throws IOException   if the content cannot be read or closed
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if the embedded document cannot be processed
+     */
+    private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
+        try (NPOIFSFileSystem nfs = new NPOIFSFileSystem(cc.getStream())) {
+            handleEmbeddedOfficeDoc(nfs.getRoot(), xhtml);
+        }
+    }
+
+    /**
+     * Formats an amount with the extractor's currency formatter.
+     *
+     * @param d    the amount, may be null
+     * @param type the data type of the value; not used by this method
+     * @return the formatted amount, or the empty string for a null amount
+     */
+    String formatCurrency(Double d, DataType type) {
+        return (d == null) ? "" : currencyFormatter.format(d);
+    }
+
+    /**
+     * Formats a date with the extractor's short date formatter.
+     *
+     * @param d the date, may be null
+     * @return the formatted date, or the empty string for a null date
+     */
+    String formatShortDateTime(Date d) {
+        return (d == null) ? "" : shortDateTimeFormatter.format(d);
+    }
+}
+