You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [8/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ tik...

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,913 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a chm block. Depending on chm block type chooses most relevant
+ * decompressing method. A chm block type can be as follows:</br> <li>UNDEFINED
+ * - no action taken, i.e. skipping the block <li>VERBATIM <li>ALIGNED_OFFSET
+ * <li>UNCOMPRESSED the most simplest In addition there are unknown types (4-7).
+ * Currently relying on previous chm block these types changing according to the
+ * previous chm block type. We need to invent more appropriate way to handle
+ * such types.
+ * 
+ */
+public class ChmLzxBlock {
+    private int block_number;
+    private long block_length;
+    private ChmLzxState state;
+    private byte[] content = null;
+    private ChmSection chmSection = null;
+    private int contentLength = 0;
+
+    // trying to find solution for bad blocks ...
+    private int previousBlockType = -1;
+
+    public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+            ChmLzxBlock prevBlock) throws TikaException {
+        try {
+            if (validateConstructorParams(blockNumber, dataSegment, blockLength)) {
+                setBlockNumber(blockNumber);
+
+                if (prevBlock != null
+                        && prevBlock.getState().getBlockLength() > prevBlock
+                                .getState().getBlockRemaining())
+                    setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
+                else
+                    setChmSection(new ChmSection(dataSegment));
+
+                setBlockLength(blockLength);
+
+                // ============================================
+                // we need to take care of previous context
+                // ============================================
+                checkLzxBlock(prevBlock);
+                if (prevBlock == null
+                        || blockLength < (int) getBlockLength()) {
+                    setContent((int) getBlockLength());
+                }
+                else {
+                    setContent((int) blockLength);
+                }
+
+                if (prevBlock != null && prevBlock.getState() != null)
+                    previousBlockType = prevBlock.getState().getBlockType();
+
+                extractContent();
+            } else
+                throw new TikaException("Check your chm lzx block parameters");
+        } catch (TikaException e) {
+            throw e;
+        }
+    }
+
+    protected int getContentLength() {
+        return contentLength;
+    }
+
+    protected void setContentLength(int contentLength) {
+        this.contentLength = contentLength;
+    }
+
+    private ChmSection getChmSection() {
+        return chmSection;
+    }
+
+    private void setChmSection(ChmSection chmSection) {
+        this.chmSection = chmSection;
+    }
+
+    private void assertStateNotNull() throws TikaException {
+        if (getState() == null)
+            throw new ChmParsingException("state is null");
+    }
+
+    private void extractContent() throws TikaException {
+        assertStateNotNull();
+        if (getChmSection().getData() != null) {
+            boolean continueLoop = true;
+            while (continueLoop && getContentLength() < getBlockLength()) {
+                if (getState() != null && getState().getBlockRemaining() == 0) {
+                    if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+                        getState().setHadStarted(LzxState.STARTED_DECODING);
+                        if (getChmSection().getSyncBits(1) == 1) {
+                            int intelSizeTemp = (getChmSection()
+                                    .getSyncBits(16) << 16)
+                                    + getChmSection().getSyncBits(16);
+                            if (intelSizeTemp >= 0)
+                                getState().setIntelFileSize(intelSizeTemp);
+                            else
+                                getState().setIntelFileSize(0);
+                        }
+                    }
+                    getState().setBlockType(getChmSection().getSyncBits(3));
+                    getState().setBlockLength(
+                            (getChmSection().getSyncBits(16) << 8)
+                                    + getChmSection().getSyncBits(8));
+                    getState().setBlockRemaining(getState().getBlockLength());
+
+                    // ----------------------------------------
+                    // Trying to handle 3 - 7 block types
+                    // ----------------------------------------
+                    if (getState().getBlockType() > 3) {
+                        if (previousBlockType >= 0 && previousBlockType < 3)
+                            getState().setBlockType(previousBlockType);
+                    }
+
+                    switch (getState().getBlockType()) {
+                        case ChmCommons.ALIGNED_OFFSET:
+                            createAlignedTreeTable();
+                            //fall through
+                        case ChmCommons.VERBATIM:
+                            /* Creates mainTreeTable */
+                            createMainTreeTable();
+                            createLengthTreeTable();
+                            if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+                                getState().setIntelState(IntelState.STARTED);
+                            break;
+                        case ChmCommons.UNCOMPRESSED:
+                            getState().setIntelState(IntelState.STARTED);
+                            if (getChmSection().getTotal() > 16)
+                                getChmSection().setSwath(
+                                        getChmSection().getSwath() - 1);
+                            getState().setR0(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            getState().setR1(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            getState().setR2(
+                                    (new BigInteger(getChmSection()
+                                            .reverseByteOrder(
+                                                    getChmSection().unmarshalBytes(
+                                                            4))).longValue()));
+                            break;
+                        default:
+                            break;
+                    }
+                } //end of if BlockRemaining == 0
+
+                int tempLen;
+
+                if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
+                    getState().setBlockRemaining(
+                            getContentLength() + getState().getBlockRemaining()
+                                    - (int) getBlockLength());
+                    tempLen = (int) getBlockLength();
+                } else {
+                    tempLen = getContentLength()
+                            + getState().getBlockRemaining();
+                    getState().setBlockRemaining(0);
+                }
+
+                int lastLength = getContentLength();
+                switch (getState().getBlockType()) {
+                case ChmCommons.ALIGNED_OFFSET:
+                    // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+                    decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
+                    break;
+                case ChmCommons.VERBATIM:
+                    decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+                    break;
+                case ChmCommons.UNCOMPRESSED:
+                    decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+                    break;
+                }
+                getState().increaseFramesRead();
+                if ((getState().getFramesRead() < 32768)
+                        && getState().getIntelFileSize() != 0)
+                    intelE8Decoding();
+
+                continueLoop = getContentLength() > lastLength;
+            }
+        }
+    }
+
+    protected void intelE8Decoding() {
+        if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+                || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+        } else {
+            long curpos = getState().getBlockRemaining();
+            getState().setBlockRemaining(
+                    getState().getBlockRemaining() - (int) getBlockLength());
+            int i = 0;
+            while (i < getBlockLength() - 10) {
+                if (content[i] != 0xe8) {
+                    i++;
+                    continue;
+                }
+                byte[] b = new byte[4];
+                b[0] = getContent()[i + 3];
+                b[1] = getContent()[i + 2];
+                b[2] = getContent()[i + 1];
+                b[3] = getContent()[i + 0];
+                long absoff = (new BigInteger(b)).longValue();
+                if ((absoff >= -curpos)
+                        && (absoff < getState().getIntelFileSize())) {
+                    long reloff = (absoff >= 0) ? absoff - curpos : absoff
+                            + getState().getIntelFileSize();
+                    getContent()[i + 0] = (byte) reloff;
+                    getContent()[i + 1] = (byte) (reloff >>> 8);
+                    getContent()[i + 2] = (byte) (reloff >>> 16);
+                    getContent()[i + 3] = (byte) (reloff >>> 24);
+                }
+                i += 4;
+                curpos += 5;
+            }
+        }
+    }
+
+    private short[] createPreLenTable() {
+        short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+        for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(
+                    ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+        }
+        return tmp;
+    }
+
+    private void createLengthTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+
+        if (prelentable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        if (pretreetable == null) {
+            throw new ChmParsingException("pretreetable is null");
+        }
+
+        //Build Length Tree
+        createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+                pretreetable, prelentable);
+
+        getState().setLengthTreeTable(
+                createTreeTable2(getState().getLengthTreeLengtsTable(),
+                        (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+                                + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_LENGTH_TABLEBITS,
+                        ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+    }
+
+    private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+        if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
+            for (int i = getContentLength(); i < (getContentLength() + getState()
+                    .getBlockRemaining()); i++)
+                content[i] = getChmSection().getByte();
+
+            setContentLength(getContentLength()
+                    + getState().getBlockRemaining());
+            getState().setBlockRemaining(0);
+        } else {
+            for (int i = getContentLength(); i < getBlockLength(); i++)
+                content[i] = getChmSection().getByte();
+            getState().setBlockRemaining(
+                    (int) getBlockLength() - getContentLength());// = blockLen -
+                                                                 // contentlen;
+            setContentLength((int) getBlockLength());
+        }
+    }
+
+    private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
+
+        if ((getChmSection() == null) || (getState() == null)
+                || (getState().getMainTreeTable() == null))
+            throw new ChmParsingException("chm section is null");
+
+        short s;
+        int x, i, border;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            /* new code */
+            //read huffman tree from main tree
+            border = getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS);
+            if (border >= getState().mainTreeTable.length)
+                throw new ChmParsingException("error decompressing aligned block.");
+                //break;
+            /* end new code */
+            s = getState().mainTreeTable[getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS)];
+            if (s >= getState().getMainTreeElements()) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().mainTreeTable[s]) >= getState()
+                        .getMainTreeElements());
+            }
+            //System.out.printf("%d,", s);
+            //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                content[i] = (byte) s;
+            } else {
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    matchfooter = getState().lengthTreeTable[getChmSection()
+                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+                    if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().lengthTreeLengtsTable[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                matchoffset = s >>> 3;
+                if (matchoffset > 2) {
+                    extra = ChmConstants.EXTRA_BITS[matchoffset];
+                    matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+                    if (extra > 3) {
+                        extra -= 3;
+                        long verbatim_bits = getChmSection().getSyncBits(extra);
+                        matchoffset += (verbatim_bits << 3);
+                        //READ HUFF SYM in Aligned Tree
+                        int aligned_bits = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[aligned_bits];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra == 3) {
+                        int g = getChmSection().peekBits(
+                                ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+                        int t = getState().getAlignedTreeTable()[g];
+                        if (t >= getState().getMainTreeElements()) {
+                            x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
+                            do {
+                                x++;
+                                t <<= 1;
+                                t += getChmSection().checkBit(x);
+                            } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+                                    .getMainTreeElements());
+                        }
+                        getChmSection().getSyncBits(
+                                getState().getAlignedLenTable()[t]);
+                        matchoffset += t;
+                    } else if (extra > 0) {
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset += l;
+                    } else
+                        matchoffset = 1;
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /** match_offset == 2 */
+                {
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                i += (matchlen - 1);
+                if (i > len)
+                    break;
+
+                if (runsrc < 0) {
+                    if (matchlen + runsrc <= 0) {
+                        runsrc = prevcontent.length + runsrc;
+                        while (matchlen-- > 0)
+                            content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length)
+                            content[rundest++] = prevcontent[runsrc++];
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrappes around source data */
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps */
+                    while (matchlen-- > 0)
+                        content[rundest++] = content[runsrc++];
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    private void assertShortArrayNotNull(short[] array) throws TikaException {
+        if (array == null)
+            throw new ChmParsingException("short[] is null");
+    }
+
+    private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+        short s;
+        int x, i;
+        int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+        int matchoffset = 0;
+        for (i = getContentLength(); i < len; i++) {
+            int f = getChmSection().peekBits(
+                    ChmConstants.LZX_MAINTREE_TABLEBITS);
+            assertShortArrayNotNull(getState().getMainTreeTable());
+            s = getState().getMainTreeTable()[f];
+            if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+                x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+                do {
+                    x++;
+                    s <<= 1;
+                    s += getChmSection().checkBit(x);
+                } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+            }
+            getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+            if (s < ChmConstants.LZX_NUM_CHARS) {
+                content[i] = (byte) s;
+            } else {
+                s -= ChmConstants.LZX_NUM_CHARS;
+                matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+                if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                    matchfooter = getState().getLengthTreeTable()[getChmSection()
+                            .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+                    if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+                        x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                        do {
+                            x++;
+                            matchfooter <<= 1;
+                            matchfooter += getChmSection().checkBit(x);
+                        } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                    }
+                    getChmSection().getSyncBits(
+                            getState().getLengthTreeLengtsTable()[matchfooter]);
+                    matchlen += matchfooter;
+                }
+                matchlen += ChmConstants.LZX_MIN_MATCH;
+                // shorter than 2
+                matchoffset = s >>> 3;
+                if (matchoffset > 2) {
+                    if (matchoffset != 3) { // should get other bits to retrieve
+                                            // offset
+                        extra = ChmConstants.EXTRA_BITS[matchoffset];
+                        long l = getChmSection().getSyncBits(extra);
+                        matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+                    } else {
+                        matchoffset = 1;
+                    }
+                    getState().setR2(getState().getR1());
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else if (matchoffset == 0) {
+                    matchoffset = (int) getState().getR0();
+                } else if (matchoffset == 1) {
+                    matchoffset = (int) getState().getR1();
+                    getState().setR1(getState().getR0());
+                    getState().setR0(matchoffset);
+                } else /* match_offset == 2 */
+                {
+                    matchoffset = (int) getState().getR2();
+                    getState().setR2(getState().getR0());
+                    getState().setR0(matchoffset);
+                }
+                rundest = i;
+                runsrc = rundest - matchoffset;
+                i += (matchlen - 1);
+                if (i > len)
+                    break;
+                if (runsrc < 0) {
+                    if (matchlen + runsrc <= 0) {
+                        runsrc = prevcontent.length + runsrc;
+                        while ((matchlen-- > 0) && (prevcontent != null)
+                                && ((runsrc + 1) > 0))
+                            if ((rundest < content.length)
+                                    && (runsrc < content.length))
+                                content[rundest++] = prevcontent[runsrc++];
+                    } else {
+                        runsrc = prevcontent.length + runsrc;
+                        while (runsrc < prevcontent.length)
+                            if ((rundest < content.length)
+                                    && (runsrc < content.length))
+                                content[rundest++] = prevcontent[runsrc++];
+                        matchlen = matchlen + runsrc - prevcontent.length;
+                        runsrc = 0;
+                        while (matchlen-- > 0)
+                            content[rundest++] = content[runsrc++];
+                    }
+
+                } else {
+                    /* copies any wrapped source data */
+                    while ((runsrc < 0) && (matchlen-- > 0)) {
+                        content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                        runsrc++;
+                    }
+                    /* copies match data - no worries about destination wraps */
+                    while (matchlen-- > 0) {
+                        if ((rundest < content.length)
+                                && (runsrc < content.length))
+                            content[rundest++] = content[runsrc++];
+                    }
+                }
+            }
+        }
+        setContentLength(len);
+    }
+
+    private void createLengthTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        if (prelentable == null || getChmSection() == null
+                || pretreetable == null || prelentable == null)
+            throw new ChmParsingException("is null");
+
+        int i = offset; // represents offset
+        int z, y, x;// local counters
+        while (i < tablelen) {
+            //Read HUFF sym to z
+            z = pretreetable[getChmSection().peekBits(
+                    ChmConstants.LZX_PRETREE_TABLEBITS)];
+            if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+                                                             // 20
+                x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                do {
+                    x++;
+                    z <<= 1;
+                    z += getChmSection().checkBit(x);
+                } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+            }
+            getChmSection().getSyncBits(prelentable[z]);
+            
+            if (z < 17) {
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                getState().getLengthTreeLengtsTable()[i] = (short) z;
+                i++;
+            } else if (z == 17) {
+                y = getChmSection().getSyncBits(4);
+                y += 4;
+                for (int j = 0; j < y; j++)
+                    if (i < getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 18) {
+                y = getChmSection().getSyncBits(5);
+                y += 20;
+                for (int j = 0; j < y; j++)
+                    //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
+                        getState().getLengthTreeLengtsTable()[i++] = 0;
+            } else if (z == 19) {
+                y = getChmSection().getSyncBits(1);
+                y += 4;
+                z = pretreetable[getChmSection().peekBits(
+                        ChmConstants.LZX_PRETREE_TABLEBITS)];
+                if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+                    x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+                    do {
+                        x++;
+                        z <<= 1;
+                        z += getChmSection().checkBit(x);
+                    } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+                }
+                getChmSection().getSyncBits(prelentable[z]);
+                z = getState().getLengthTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                for (int j = 0; j < y; j++)
+                    getState().getLengthTreeLengtsTable()[i++] = (short) z;
+            }
+        }
+    }
+
+    private void createMainTreeTable() throws TikaException {
+        //Read Pre Tree Table
+        short[] prelentable = createPreLenTable();
+        short[] pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+                prelentable);
+        
+        //Read Pre Tree Table
+        prelentable = createPreLenTable();
+        pretreetable = createTreeTable2(prelentable,
+                (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                        + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+                ChmConstants.LZX_PRETREE_TABLEBITS,
+                ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+        createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+                getState().mainTreeLengtsTable.length, pretreetable,
+                prelentable);
+
+        getState().setMainTreeTable(
+                createTreeTable2(getState().mainTreeLengtsTable,
+                        (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+                                + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+                                .getMainTreeElements()));
+    }
+
+    private void createMainTreeLenTable(int offset, int tablelen,
+            short[] pretreetable, short[] prelentable) throws TikaException {
+        if (pretreetable == null)
+            throw new ChmParsingException("pretreetable is null");
+        int i = offset;
+        int z, y, x;
+        while (i < tablelen) {
+            int f = getChmSection().peekBits(
+                    ChmConstants.LZX_PRETREE_TABLEBITS);
+            z = pretreetable[f];
+            if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+                x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                do {
+                    x++;
+                    z <<= 1;
+                    z += getChmSection().checkBit(x);
+                } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+            }
+            getChmSection().getSyncBits(prelentable[z]);
+            if (z < 17) {
+                z = getState().getMainTreeLengtsTable()[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                getState().mainTreeLengtsTable[i] = (short) z;
+                i++;
+            } else if (z == 17) {
+                y = getChmSection().getSyncBits(4);
+                y += 4;
+                for (int j = 0; j < y; j++) {
+                    assertInRange(getState().getMainTreeLengtsTable(), i);
+                    getState().mainTreeLengtsTable[i++] = 0;
+                }
+            } else if (z == 18) {
+                y = getChmSection().getSyncBits(5);
+                y += 20;
+                for (int j = 0; j < y; j++) {
+                    assertInRange(getState().getMainTreeLengtsTable(), i);
+                    getState().mainTreeLengtsTable[i++] = 0;
+                }
+            } else if (z == 19) {
+                y = getChmSection().getSyncBits(1);
+                y += 4;
+                z = pretreetable[getChmSection().peekBits(
+                        ChmConstants.LZX_PRETREE_TABLEBITS)];
+                if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+                    x = ChmConstants.LZX_PRETREE_TABLEBITS;
+                    do {
+                        x++;
+                        z <<= 1;
+                        z += getChmSection().checkBit(x);
+                    } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+                }
+                getChmSection().getSyncBits(prelentable[z]);
+                z = getState().mainTreeLengtsTable[i] - z;
+                if (z < 0)
+                    z = z + 17;
+                for (int j = 0; j < y; j++)
+                    if (i < getState().getMainTreeLengtsTable().length)
+                        getState().mainTreeLengtsTable[i++] = (short) z;
+            }
+        }
+    }
+
+    private void assertInRange(short[] array, int index) throws ChmParsingException {
+        if (index >= array.length)
+            throw new ChmParsingException(index + " is bigger than "
+                    + array.length);
+    }
+
+    private short[] createAlignedLenTable() {
+        int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+        int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+        short[] tmp = new short[tablelen];
+        for (int i = 0; i < tablelen; i++) {
+            tmp[i] = (short) getChmSection().getSyncBits(bits);
+        }
+        return tmp;
+    }
+
+    private void createAlignedTreeTable() throws ChmParsingException {
+        getState().setAlignedLenTable(createAlignedLenTable());
+        getState().setAlignedTreeTable(//setAlignedLenTable(
+                createTreeTable2(getState().getAlignedLenTable(),
+                        (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+                                + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+                        ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+                        ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+    }
+
+    private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+            int maxsymbol) throws ChmParsingException {
+        short[] tmp = new short[tablelen];
+        short sym;
+        int leaf;
+        int bit_num = 1;
+        long fill;
+        int pos = 0;
+        /* the current position in the decode table */
+        long table_mask = (1 << bits);
+        long bit_mask = (table_mask >> 1);
+        long next_symbol = bit_mask;
+
+        /* fills entries for short codes for a direct mapping */
+        while (bit_num <= bits) {
+            for (sym = 0; sym < maxsymbol; sym++) {
+                if (lentable.length > sym && lentable[sym] == bit_num) {
+                    leaf = pos;
+
+                    if ((pos += bit_mask) > table_mask) {
+                        /* table overflow */
+                        throw new ChmParsingException("Table overflow");
+                    }
+
+                    fill = bit_mask;
+                    while (fill-- > 0)
+                        tmp[leaf++] = sym;
+                }
+            }
+            bit_mask >>= 1;
+            bit_num++;
+        }
+
+        /* if there are any codes longer than nbits */
+        if (pos != table_mask) {
+            /* clears the remainder of the table */
+            for (leaf = pos; leaf < table_mask; leaf++)
+                tmp[leaf] = 0;
+
+            /* gives ourselves room for codes to grow by up to 16 more bits */
+            pos <<= 16;
+            table_mask <<= 16;
+            bit_mask = 1 << 15;
+
+            while (bit_num <= 16) {
+                for (sym = 0; sym < maxsymbol; sym++) {
+                    if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
+                        leaf = pos >> 16;
+                        for (fill = 0; fill < bit_num - bits; fill++) {
+                            /*
+                             * if this path hasn't been taken yet, 'allocate'
+                             * two entries
+                             */
+                            if (tmp[leaf] == 0) {
+                                if (((next_symbol << 1) + 1) < tmp.length) {
+                                    tmp[(int) (next_symbol << 1)] = 0;
+                                    tmp[(int) (next_symbol << 1) + 1] = 0;
+                                    tmp[leaf] = (short) next_symbol++;
+                                }
+
+                            }
+                            /*
+                             * follows the path and select either left or right
+                             * for next bit
+                             */
+                            leaf = tmp[leaf] << 1;
+                            if (((pos >> (15 - fill)) & 1) != 0)
+                                leaf++;
+                        }
+                        tmp[leaf] = sym;
+
+                        if ((pos += bit_mask) > table_mask) {
+                            /* table overflow */
+                            throw new ChmParsingException("Table overflow");
+                        }
+                    }
+                }
+                bit_mask >>= 1;
+                bit_num++;
+            }
+        }
+
+        /* is it full table? */
+        if (pos == table_mask)
+            return tmp;
+
+        return tmp;
+    }
+
+    public byte[] getContent() {
+        return content;
+    }
+
+    public byte[] getContent(int startOffset, int endOffset) {
+        return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+                startOffset, endOffset) : new byte[1];
+    }
+
+    public byte[] getContent(int start) {
+        return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+                start, getContent().length) : new byte[1];
+    }
+
+    private void setContent(int contentLength) {
+        this.content = new byte[contentLength];
+    }
+
+    private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
+        if (chmPrevLzxBlock == null && getBlockLength() < Integer.MAX_VALUE)
+            setState(new ChmLzxState((int) getBlockLength()));
+        else
+            //use clone to avoid changing a cached or to be cached block
+            setState(chmPrevLzxBlock.getState().clone()); 
+    }
+
+    private boolean validateConstructorParams(int blockNumber,
+            byte[] dataSegment, long blockLength) throws TikaException {
+        int goodParameter = 0;
+        if (blockNumber >= 0)
+            ++goodParameter;
+        else
+            throw new ChmParsingException("block number should be possitive");
+        if (dataSegment != null && dataSegment.length > 0)
+            ++goodParameter;
+        else
+            throw new ChmParsingException("data segment should not be null");
+        if (blockLength > 0)
+            ++goodParameter;
+        else
+            throw new ChmParsingException(
+                    "block length should be more than zero");
+        return (goodParameter == 3);
+    }
+
+    public int getBlockNumber() {
+        return block_number;
+    }
+
+    private void setBlockNumber(int block_number) {
+        this.block_number = block_number;
+    }
+
+    private long getBlockLength() {
+        return block_length;
+    }
+
+    private void setBlockLength(long block_length) {
+        this.block_length = block_length;
+    }
+
+    public ChmLzxState getState() {
+        return state;
+    }
+
+    private void setState(ChmLzxState state) {
+        this.state = state;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+    /* Class' members */
+    private int window; /* the actual decoding window */
+    private long window_size; /* window size (32Kb through 2Mb) */
+    private int window_position; /* current offset within the window */
+    private int main_tree_elements; /* number of main tree elements */
+    private LzxState hadStarted; /* have we started decoding at all yet? */
+    private int block_type; /* type of this block */
+    private int block_length; /* uncompressed length of this block */
+    private int block_remaining; /* uncompressed bytes still left to decode */
+    private int frames_read; /* the number of CFDATA blocks processed */
+    private int intel_file_size; /* magic header value used for transform */
+    private long intel_current_possition; /* current offset in transform space */
+    private IntelState intel_state; /* have we seen any translatable data yet? */
+    private long R0; /* for the LRU offset system */
+    private long R1; /* for the LRU offset system */
+    private long R2; /* for the LRU offset system */
+
+    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+    protected short[] mainTreeLengtsTable;
+    protected short[] mainTreeTable;
+
+    protected short[] lengthTreeTable;
+    protected short[] lengthTreeLengtsTable;
+
+    protected short[] alignedLenTable;
+    protected short[] alignedTreeTable;
+
+    @Override
+    public ChmLzxState clone() {
+        try {
+          ChmLzxState clone = (ChmLzxState)super.clone();
+          clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+          clone.mainTreeTable = arrayClone(mainTreeTable);
+          clone.lengthTreeTable = arrayClone(lengthTreeTable);
+          clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+          clone.alignedLenTable = arrayClone(alignedLenTable);
+          clone.alignedTreeTable = arrayClone(alignedTreeTable);
+          return clone;
+        } catch (CloneNotSupportedException ex) {
+           return null;
+        }
+    }
+    
+    protected short[] getMainTreeTable() {
+        return mainTreeTable;
+    }
+
+    protected short[] getAlignedTreeTable() {
+        return alignedTreeTable;
+    }
+
+    protected void setAlignedTreeTable(short[] alignedTreeTable) {
+        this.alignedTreeTable = alignedTreeTable;
+    }
+
+    protected short[] getLengthTreeTable() throws TikaException {
+        if (lengthTreeTable != null)
+            return this.lengthTreeTable;
+        else
+            throw new ChmParsingException("lengthTreeTable is null");
+    }
+
+    protected void setLengthTreeTable(short[] lengthTreeTable) {
+        this.lengthTreeTable = lengthTreeTable;
+    }
+
+    protected void setMainTreeTable(short[] mainTreeTable) {
+        this.mainTreeTable = mainTreeTable;
+    }
+
+    protected short[] getAlignedLenTable() {
+        return this.alignedLenTable;
+    }
+
+    protected void setAlignedLenTable(short[] alignedLenTable) {
+        this.alignedLenTable = alignedLenTable;
+    }
+
+    /**
+     * It suits for informative outlook
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("actual decoding window:=" + getWindow()
+                + System.getProperty("line.separator"));
+        sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+                + System.getProperty("line.separator"));
+        sb.append("current offset within the window:=" + getWindowPosition()
+                + System.getProperty("line.separator"));
+        sb.append("number of main tree elements:=" + getMainTreeElements()
+                + System.getProperty("line.separator"));
+        sb.append("have we started decoding at all yet?:=" + getHadStarted()
+                + System.getProperty("line.separator"));
+        sb.append("type of this block:=" + getBlockType()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed length of this block:=" + getBlockLength()
+                + System.getProperty("line.separator"));
+        sb.append("uncompressed bytes still left to decode:="
+                + getBlockRemaining() + System.getProperty("line.separator"));
+        sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+                + System.getProperty("line.separator"));
+        sb.append("magic header value used for transform:="
+                + getIntelFileSize() + System.getProperty("line.separator"));
+        sb.append("current offset in transform space:="
+                + getIntelCurrentPossition()
+                + System.getProperty("line.separator"));
+        sb.append("have we seen any translatable data yet?:=" + getIntelState()
+                + System.getProperty("line.separator"));
+        sb.append("R0 for the LRU offset system:=" + getR0()
+                + System.getProperty("line.separator"));
+        sb.append("R1 for the LRU offset system:=" + getR1()
+                + System.getProperty("line.separator"));
+        sb.append("R2 for the LRU offset system:=" + getR2()
+                + System.getProperty("line.separator"));
+        sb.append("main tree length:=" + getMainTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+                + System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    public ChmLzxState(int window) throws TikaException {
+        if (window >= 0) {
+            int position_slots;
+            int win = ChmCommons.getWindowSize(window);
+            setWindowSize(1 << win);
+            /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+            if (win < 15 || win > 21)
+                throw new ChmParsingException("window less than 15 or window greater than 21");
+
+            /* Calculates required position slots */
+            if (win == 20)
+                position_slots = 42;
+            else if (win == 21)
+                position_slots = 50;
+            else
+                position_slots = win << 1;
+            //TODO: position_slots is not used ?
+            setR0(1);
+            setR1(1);
+            setR2(1);
+            setMainTreeElements(512);
+            setHadStarted(LzxState.NOT_STARTED_DECODING);
+            setFramesRead(0);
+            setBlockRemaining(0);
+            setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+            setIntelCurrentPossition(0);
+            setIntelState(IntelState.NOT_STARTED);
+            setWindowPosition(0);
+            setMainTreeLengtsTable(new short[getMainTreeElements()]);
+            setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+        } else
+            throw new CancellationException(
+                    "window size should be more than zero");
+    }
+
+    protected void setWindow(int window) {
+        this.window = window;
+    }
+
+    protected int getWindow() {
+        return window;
+    }
+
+    protected void setWindowSize(long window_size) {
+        this.window_size = window_size;
+    }
+
+    protected long getWindowSize() {
+        return window_size;
+    }
+
+    protected void setWindowPosition(int window_position) {
+        this.window_position = window_position;
+    }
+
+    protected int getWindowPosition() {
+        return window_position;
+    }
+
+    protected void setMainTreeElements(int main_tree_elements) {
+        this.main_tree_elements = main_tree_elements;
+    }
+
+    protected int getMainTreeElements() {
+        return main_tree_elements;
+    }
+
+    protected void setHadStarted(LzxState hadStarted) {
+        this.hadStarted = hadStarted;
+    }
+
+    protected LzxState getHadStarted() {
+        return hadStarted;
+    }
+
+    protected void setBlockType(int block_type) {
+        this.block_type = block_type;
+    }
+
+    public int getBlockType() {
+        return block_type;
+    }
+
+    protected void setBlockLength(int block_length) {
+        this.block_length = block_length;
+    }
+
+    protected int getBlockLength() {
+        return block_length;
+    }
+
+    protected void setBlockRemaining(int block_remaining) {
+        this.block_remaining = block_remaining;
+    }
+
+    protected int getBlockRemaining() {
+        return block_remaining;
+    }
+
+    protected void setFramesRead(int frames_read) {
+        this.frames_read = frames_read;
+    }
+
+    protected void increaseFramesRead() {
+        this.frames_read = getFramesRead() + 1;
+    }
+
+    protected int getFramesRead() {
+        return frames_read;
+    }
+
+    protected void setIntelFileSize(int intel_file_size) {
+        this.intel_file_size = intel_file_size;
+    }
+
+    protected int getIntelFileSize() {
+        return intel_file_size;
+    }
+
+    protected void setIntelCurrentPossition(long intel_current_possition) {
+        this.intel_current_possition = intel_current_possition;
+    }
+
+    protected long getIntelCurrentPossition() {
+        return intel_current_possition;
+    }
+
+    protected void setIntelState(IntelState intel_state) {
+        this.intel_state = intel_state;
+    }
+
+    protected IntelState getIntelState() {
+        return intel_state;
+    }
+
+    protected void setR0(long r0) {
+        R0 = r0;
+    }
+
+    protected long getR0() {
+        return R0;
+    }
+
+    protected void setR1(long r1) {
+        R1 = r1;
+    }
+
+    protected long getR1() {
+        return R1;
+    }
+
+    protected void setR2(long r2) {
+        R2 = r2;
+    }
+
+    protected long getR2() {
+        return R2;
+    }
+
+    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+        this.mainTreeLengtsTable = mainTreeLengtsTable;
+    }
+
+    public short[] getMainTreeLengtsTable() {
+        return mainTreeLengtsTable;
+    }
+
+    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+    }
+
+    public short[] getLengthTreeLengtsTable() {
+        return lengthTreeLengtsTable;
+    }
+    
+    private static short[] arrayClone(short[] a) {
+        return a==null ? null : (short[]) a.clone();
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+public class ChmSection {
+    final private byte[] data;
+    final private byte[] prevcontent;
+    private int swath;// kiks
+    private int total;// remains
+    private int buffer;// val
+
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+        //setData(data);
+    }
+    
+    /* Utilities */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+    
+    private int getDesyncBits(int bit, int removeBit) {
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+    
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        if (getData().length - getSwath() < i)
+            i = getData().length - getSwath();
+        byte[] tmp = new byte[i];
+        for (int j = i - 1; j >= 0; j--) {
+            tmp[i - j - 1] = getData()[getSwath() + j];
+        }
+        setSwath(getSwath() + i);
+        return new BigInteger(tmp);
+    }
+
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        for (int j = 0; j < i; j++)
+            t[j] = getData()[j + getSwath()];
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    public BigInteger getEncint() {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+        while ((ob = this.getByte()) < 0) {
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        }
+        nb[0] = (byte) ((ob & 0x7f));
+        bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        return bi;
+    }
+
+    public char unmarshalUtfChar() {
+        byte ob;
+        int i = 1;
+        byte[] ba;
+        ob = this.getByte();
+        if (ob < 0) {
+            i = 2;
+            while ((ob << (24 + i)) < 0)
+                i++;
+        }
+        ba = new byte[i];
+        ba[0] = ob;
+        int j = 1;
+        while (j < i) {
+            ba[j] = this.getByte();
+            j++;
+        }
+        i = ba.length;
+        if (i == 1)
+            return (char) ba[0];
+        else {
+            int n;
+            n = ba[0] & 15; // 00001111b, gets last 4 bits
+            j = 1;
+            while (j < i)
+                n = (n << 6) + (ba[j++] & 63);// 00111111b,gets last 6 bits
+            return (char) n;
+        }
+    }
+
+//    private void setData(byte[] data) {
+//        this.data = data;
+//    }
+
+    public int getSwath() {
+        return swath;
+    }
+
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    public int getTotal() {
+        return total;
+    }
+
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * @param args
+     * @throws TikaException 
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+public abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+
+    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+    //helper class that is docx/doc format agnostic
+    protected class ParagraphLevelCounter {
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private final Integer NOT_SEEN_YET = -1;
+        private final Integer FIRST_SKIPPED = -2;
+        private final LevelTuple[] levelTuples;
+        Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+        private List<Integer> counts = new ArrayList<Integer>();
+        private int lastLevel = -1;
+
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber level number that is being incremented
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            for (int i = lastLevel + 1; i < levelNumber; i++) {
+                if (i >= counts.size()) {
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+                lastLevel = levelNumber;
+                return format(levelNumber, overrideLevelTuples);
+            }
+
+            counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * @param level which level to format
+         * @return the string that represents the number and the surrounding text for this paragraph
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal : levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET + " ";
+            }
+
+            String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ?
+                    levelTuples[level].lvlText : overrideLevelTuples[level].lvlText;
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText.substring(last, m.start()));
+                String lvlString = m.group(1);
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(lvlString);
+                } catch (NumberFormatException e) {
+                    //swallow
+                }
+                String numString = "";
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                numString = formatNum(lvlNum - 1, isLegal, overrideLevelTuples);
+
+                sb.append(numString);
+                last = m.end();
+            }
+            sb.append(lvlText.substring(last));
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        //actual level number; can return empty string if numberformatter fails
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+
+            int numFmtStyle = 0;
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                count = 1;
+            }
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("decimal".equals(numFmt)) {
+                numFmtStyle = 0;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            } else if ("bullet".equals(numFmt)) {
+                return "";
+                //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+            } else if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            } else if ("decimalZero".equals(numFmt)) {
+                return "0" + NumberFormatter.getNumber(count, 0);
+            } else if ("none".equals(numFmt)) {
+                return "";
+            }
+            try {
+                return NumberFormatter.getNumber(count, numFmtStyle);
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
+        }
+
+        private String ordinalize(int count) {
+            //this is only good for locale == English
+            String countString = Integer.toString(count);
+            if (countString.endsWith("1")) {
+                return countString + "st";
+            } else if (countString.endsWith("2")) {
+                return countString + "nd";
+            } else if (countString.endsWith("3")) {
+                return countString + "rd";
+            }
+            return countString + "th";
+        }
+
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+        }
+
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET) {
+                    //do nothing
+                } else if (cnt == FIRST_SKIPPED) {
+                    //do nothing
+                } else if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ?
+                            levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart;
+                    if (restart == 0) {
+                        return;
+                    } else if (restart == -1 ||
+                            startlevelNumber <= restart - 1) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    } else {
+                        //do nothing/don't reset
+                    }
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            } else {
+                return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ?
+                        levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start;
+            }
+        }
+    }
+
+    protected class LevelTuple {
+        private final int start;
+        private final int restart;
+        private final String lvlText;
+        private final String numFmt;
+        private final boolean isLegal;
+
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+abstract class AbstractPOIFSExtractor {
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+    private final EmbeddedDocumentExtractor extractor;
+    private PasswordProvider passwordProvider;
+    private TikaConfig tikaConfig;
+    private MimeTypes mimeTypes;
+    private Detector detector;
+    private Metadata metadata;
+
+    protected AbstractPOIFSExtractor(ParseContext context) {
+        this(context, null);
+    }
+
+    protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+        if (ex == null) {
+            this.extractor = new ParsingEmbeddedDocumentExtractor(context);
+        } else {
+            this.extractor = ex;
+        }
+
+        this.passwordProvider = context.get(PasswordProvider.class);
+        this.tikaConfig = context.get(TikaConfig.class);
+        this.mimeTypes = context.get(MimeTypes.class);
+        this.detector = context.get(Detector.class);
+        this.metadata = metadata;
+    }
+
+    // Note - these cache, but avoid creating the default TikaConfig if not needed
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    protected Detector getDetector() {
+        if (detector != null) return detector;
+
+        detector = getTikaConfig().getDetector();
+        return detector;
+    }
+
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes != null) return mimeTypes;
+
+        mimeTypes = getTikaConfig().getMimeRepository();
+        return mimeTypes;
+    }
+
+    /**
+     * Returns the password to be used for this file, or null
+     * if no / default password should be used
+     */
+    protected String getPassword() {
+        if (passwordProvider != null) {
+            return passwordProvider.getPassword(metadata);
+        }
+        return null;
+    }
+
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
+            throws IOException, SAXException, TikaException {
+        try {
+            Metadata metadata = new Metadata();
+            if (filename != null) {
+                metadata.set(Metadata.TIKA_MIME_FILE, filename);
+                metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            }
+            if (relationshipID != null) {
+                metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+            }
+            if (mediaType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, mediaType);
+            }
+
+            if (extractor.shouldParseEmbedded(metadata)) {
+                extractor.parseEmbedded(resource, xhtml, metadata, outputHtml);
+            }
+        } finally {
+            resource.close();
+        }
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+
+        // Is it an embedded OLE2 document, or an embedded OOXML document?
+
+        if (dir.hasEntry("Package")) {
+            // It's OOXML (has a ZipFile):
+            Entry ooxml = dir.getEntry("Package");
+
+            try (TikaInputStream stream = TikaInputStream.get(
+                    new DocumentInputStream((DocumentEntry) ooxml))) {
+                ZipContainerDetector detector = new ZipContainerDetector();
+                MediaType type = detector.detect(stream, new Metadata());
+                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
+                return;
+            }
+        }
+
+        // It's regular OLE2:
+
+        // What kind of document is it?
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
+        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+        TikaInputStream embedded = null;
+
+        try {
+            if (type == POIFSDocumentType.OLE10_NATIVE) {
+                try {
+                    // Try to un-wrap the OLE10Native record:
+                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+                    if (ole.getLabel() != null) {
+                        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                    }
+                    byte[] data = ole.getDataBuffer();
+                    embedded = TikaInputStream.get(data);
+                } catch (Ole10NativeException ex) {
+                    // Not a valid OLE10Native record, skip it
+                } catch (Exception e) {
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                }
+            } else if (type == POIFSDocumentType.COMP_OBJ) {
+                try {
+                    // Grab the contents and process
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+                    }
+                    DocumentInputStream inp = new DocumentInputStream(contentsEntry);
+                    byte[] contents = new byte[contentsEntry.getSize()];
+                    inp.readFully(contents);
+                    embedded = TikaInputStream.get(contents);
+
+                    // Try to work out what it is
+                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
+                    String extension = type.getExtension();
+                    try {
+                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                        extension = mimeType.getExtension();
+                    } catch (MimeTypeException mte) {
+                        // No details on this type are known
+                    }
+
+                    // Record what we can do about it
+                    metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                } catch (Exception e) {
+                    throw new TikaException("Invalid embedded resource", e);
+                }
+            } else {
+                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+            }
+
+            // Should we parse it?
+            if (extractor.shouldParseEmbedded(metadata)) {
+                if (embedded == null) {
+                    // Make a TikaInputStream that just
+                    // passes the root directory of the
+                    // embedded document, and is otherwise
+                    // empty (byte[0]):
+                    embedded = TikaInputStream.get(new byte[0]);
+                    embedded.setOpenContainer(dir);
+                }
+                extractor.parseEmbedded(embedded, xhtml, metadata, true);
+            }
+        } finally {
+            if (embedded != null) {
+                embedded.close();
+            }
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+    /**
+     * Renders the content to the given XHTML SAX event stream.
+     *
+     * @param handler
+     * @throws SAXException
+     */
+    void render(XHTMLContentHandler handler) throws SAXException;
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell decorator.
+ */
+public class CellDecorator implements Cell {
+
+    private final Cell cell;
+
+    public CellDecorator(Cell cell) {
+        this.cell = cell;
+    }
+
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        cell.render(handler);
+    }
+
+}