You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [8/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ tik...
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,913 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a chm block. Depending on the chm block type, chooses the most
+ * relevant decompression method. A chm block type can be one of the following:
+ * <ul>
+ * <li>UNDEFINED - no action taken, i.e. the block is skipped</li>
+ * <li>VERBATIM</li>
+ * <li>ALIGNED_OFFSET</li>
+ * <li>UNCOMPRESSED - the simplest one</li>
+ * </ul>
+ * In addition there are unknown types (4-7). Currently such a type is replaced
+ * by the type of the previous chm block. We need to invent a more appropriate
+ * way to handle such types.
+ *
+ */
+public class ChmLzxBlock {
+ /* index of this block, as handed to the constructor */
+ private int block_number;
+ /* length of this block, as handed to the constructor */
+ private long block_length;
+ /* decoder state: a fresh one for the first block, otherwise a clone of the previous block's state */
+ private ChmLzxState state;
+ /* output buffer the decompress* methods write into */
+ private byte[] content = null;
+ /* reader over this block's data segment; may also carry the previous block's content */
+ private ChmSection chmSection = null;
+ /* number of bytes of content that have been produced so far */
+ private int contentLength = 0;
+
+ // trying to find solution for bad blocks ...
+ /* block type taken from the previous block's state; -1 when there is no previous block/state */
+ private int previousBlockType = -1;
+
+ /**
+  * Creates a block and immediately decompresses its data segment.
+  *
+  * @param blockNumber index of the block, must not be negative
+  * @param dataSegment compressed bytes of the block, must not be null or empty
+  * @param blockLength length of the block, must be greater than zero
+  * @param prevBlock the preceding block, or {@code null} for the first block;
+  *        its state is cloned and its content may be used as match history
+  * @throws TikaException if a parameter is rejected or decompression fails
+  */
+ public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+         ChmLzxBlock prevBlock) throws TikaException {
+     if (!validateConstructorParams(blockNumber, dataSegment, blockLength)) {
+         throw new TikaException("Check your chm lzx block parameters");
+     }
+     setBlockNumber(blockNumber);
+
+     // The previous block's output is attached to the section only when that
+     // block's state reports length > remaining.
+     boolean attachPrevContent = prevBlock != null
+             && prevBlock.getState().getBlockLength() > prevBlock.getState().getBlockRemaining();
+     if (attachPrevContent) {
+         setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
+     } else {
+         setChmSection(new ChmSection(dataSegment));
+     }
+
+     setBlockLength(blockLength);
+
+     // ============================================
+     // we need to take care of previous context
+     // ============================================
+     checkLzxBlock(prevBlock);
+
+     int bufferSize;
+     if (prevBlock == null || blockLength < (int) getBlockLength()) {
+         bufferSize = (int) getBlockLength();
+     } else {
+         bufferSize = (int) blockLength;
+     }
+     setContent(bufferSize);
+
+     if (prevBlock != null && prevBlock.getState() != null) {
+         previousBlockType = prevBlock.getState().getBlockType();
+     }
+
+     extractContent();
+ }
+
+ /** Returns the number of bytes of the content buffer produced so far. */
+ protected int getContentLength() {
+ return contentLength;
+ }
+
+ /** Records how many bytes of the content buffer have been produced. */
+ protected void setContentLength(int contentLength) {
+ this.contentLength = contentLength;
+ }
+
+ /** Returns the section wrapping this block's data segment. */
+ private ChmSection getChmSection() {
+ return chmSection;
+ }
+
+ /** Replaces the section this block reads its input from. */
+ private void setChmSection(ChmSection chmSection) {
+ this.chmSection = chmSection;
+ }
+
+ /**
+  * Verifies that a decoder state has been attached to this block.
+  *
+  * @throws TikaException (a ChmParsingException) when the state is null
+  */
+ private void assertStateNotNull() throws TikaException {
+     ChmLzxState current = getState();
+     if (current == null) {
+         throw new ChmParsingException("state is null");
+     }
+ }
+
+ /**
+ * Main decode loop: fills the content buffer from the chm section.
+ * Whenever the state reports no remaining bytes, a new LZX block header is
+ * read from the bit stream (type, length and the tables or R0-R2 values the
+ * type needs); then the decompress method matching the block type is run.
+ * The loop ends when the content buffer holds getBlockLength() bytes or when
+ * an iteration produced no new bytes.
+ *
+ * @throws TikaException if the state is missing or decoding fails
+ */
+ private void extractContent() throws TikaException {
+ assertStateNotNull();
+ if (getChmSection().getData() != null) {
+ boolean continueLoop = true;
+ while (continueLoop && getContentLength() < getBlockLength()) {
+ if (getState() != null && getState().getBlockRemaining() == 0) {
+ // very first header only: a 1-bit flag, optionally followed by the
+ // 32-bit intel file size (read as two 16-bit halves)
+ if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+ getState().setHadStarted(LzxState.STARTED_DECODING);
+ if (getChmSection().getSyncBits(1) == 1) {
+ int intelSizeTemp = (getChmSection()
+ .getSyncBits(16) << 16)
+ + getChmSection().getSyncBits(16);
+ // a negative (overflowed) size is stored as 0
+ if (intelSizeTemp >= 0)
+ getState().setIntelFileSize(intelSizeTemp);
+ else
+ getState().setIntelFileSize(0);
+ }
+ }
+ // block header: 3-bit type followed by a 24-bit length (16 + 8 bits)
+ getState().setBlockType(getChmSection().getSyncBits(3));
+ getState().setBlockLength(
+ (getChmSection().getSyncBits(16) << 8)
+ + getChmSection().getSyncBits(8));
+ getState().setBlockRemaining(getState().getBlockLength());
+
+ // ----------------------------------------
+ // Trying to handle 4 - 7 block types
+ // ----------------------------------------
+ // an unknown type is replaced by the previous block's type, but only
+ // when that type is 0, 1 or 2
+ if (getState().getBlockType() > 3) {
+ if (previousBlockType >= 0 && previousBlockType < 3)
+ getState().setBlockType(previousBlockType);
+ }
+
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ createAlignedTreeTable();
+ //fall through
+ case ChmCommons.VERBATIM:
+ /* Creates mainTreeTable */
+ createMainTreeTable();
+ createLengthTreeTable();
+ // a non-zero code length for symbol 0xe8 switches the intel state on
+ if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+ getState().setIntelState(IntelState.STARTED);
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ getState().setIntelState(IntelState.STARTED);
+ if (getChmSection().getTotal() > 16)
+ getChmSection().setSwath(
+ getChmSection().getSwath() - 1);
+ // R0, R1 and R2 are each read as 4 bytes whose order is reversed
+ // before conversion to a number
+ getState().setR0(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR1(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR2(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ break;
+ default:
+ break;
+ }
+ } //end of if BlockRemaining == 0
+
+ // tempLen is the content offset this iteration decodes up to
+ int tempLen;
+
+ if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
+ // the LZX block does not fit: keep the overshoot as remaining and
+ // decode up to the end of the content buffer
+ getState().setBlockRemaining(
+ getContentLength() + getState().getBlockRemaining()
+ - (int) getBlockLength());
+ tempLen = (int) getBlockLength();
+ } else {
+ tempLen = getContentLength()
+ + getState().getBlockRemaining();
+ getState().setBlockRemaining(0);
+ }
+
+ int lastLength = getContentLength();
+ // the history passed down is the previous block's content when the
+ // section has one, otherwise the section's own data
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+ decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
+ break;
+ case ChmCommons.VERBATIM:
+ decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ }
+ getState().increaseFramesRead();
+ if ((getState().getFramesRead() < 32768)
+ && getState().getIntelFileSize() != 0)
+ intelE8Decoding();
+
+ // stop when this iteration did not produce any new bytes
+ continueLoop = getContentLength() > lastLength;
+ }
+ }
+ }
+
+ /**
+ * Post-processing step for the intel E8 transform. In every case the
+ * state's remaining count is reduced by the block length; when the intel
+ * state has started and the block is long enough, the content buffer is
+ * additionally scanned for 0xe8 bytes whose following offset is rewritten.
+ */
+ protected void intelE8Decoding() {
+ if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+ || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ } else {
+ long curpos = getState().getBlockRemaining();
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ int i = 0;
+ while (i < getBlockLength() - 10) {
+ // NOTE(review): content[i] is a signed byte (-128..127) and can never
+ // equal 0xe8 (232), so this test is always true and the rewrite below
+ // is never reached - confirm whether (content[i] & 0xff) was intended.
+ if (content[i] != 0xe8) {
+ i++;
+ continue;
+ }
+ // the four bytes at i..i+3 are put in reverse order for BigInteger
+ byte[] b = new byte[4];
+ b[0] = getContent()[i + 3];
+ b[1] = getContent()[i + 2];
+ b[2] = getContent()[i + 1];
+ b[3] = getContent()[i + 0];
+ long absoff = (new BigInteger(b)).longValue();
+ if ((absoff >= -curpos)
+ && (absoff < getState().getIntelFileSize())) {
+ long reloff = (absoff >= 0) ? absoff - curpos : absoff
+ + getState().getIntelFileSize();
+ // write the new offset back, lowest byte first
+ getContent()[i + 0] = (byte) reloff;
+ getContent()[i + 1] = (byte) (reloff >>> 8);
+ getContent()[i + 2] = (byte) (reloff >>> 16);
+ getContent()[i + 3] = (byte) (reloff >>> 24);
+ }
+ i += 4;
+ curpos += 5;
+ }
+ }
+ }
+
+ private short[] createPreLenTable() {
+ short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+ for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(
+ ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+ }
+ return tmp;
+ }
+
+ /**
+  * Reads a pre-tree, uses it to decode the length-tree code lengths and
+  * stores the resulting length-tree decode table in the state.
+  *
+  * @throws TikaException if a table cannot be built
+  */
+ private void createLengthTreeTable() throws TikaException {
+     // pre-tree code lengths come straight from the bit stream
+     short[] preLengths = createPreLenTable();
+     if (preLengths == null) {
+         throw new ChmParsingException("pretreetable is null");
+     }
+
+     int preTreeSize = (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+             + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1);
+     short[] preTree = createTreeTable2(preLengths, preTreeSize,
+             ChmConstants.LZX_PRETREE_TABLEBITS,
+             ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+     if (preTree == null) {
+         throw new ChmParsingException("pretreetable is null");
+     }
+
+     // decode the length-tree code lengths with the pre-tree
+     createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+             preTree, preLengths);
+
+     int lengthTreeSize = (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+             + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1);
+     short[] lengthTree = createTreeTable2(
+             getState().getLengthTreeLengtsTable(), lengthTreeSize,
+             ChmConstants.LZX_LENGTH_TABLEBITS,
+             ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+     getState().setLengthTreeTable(lengthTree);
+ }
+
+ private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+ if (getContentLength() + getState().getBlockRemaining() <= getBlockLength()) {
+ for (int i = getContentLength(); i < (getContentLength() + getState()
+ .getBlockRemaining()); i++)
+ content[i] = getChmSection().getByte();
+
+ setContentLength(getContentLength()
+ + getState().getBlockRemaining());
+ getState().setBlockRemaining(0);
+ } else {
+ for (int i = getContentLength(); i < getBlockLength(); i++)
+ content[i] = getChmSection().getByte();
+ getState().setBlockRemaining(
+ (int) getBlockLength() - getContentLength());// = blockLen -
+ // contentlen;
+ setContentLength((int) getBlockLength());
+ }
+ }
+
+ /**
+ * Decodes an ALIGNED_OFFSET block into the content buffer, from the current
+ * content length up to len. Each main-tree symbol is either a literal byte
+ * or a match; a match is copied from earlier output in the content buffer
+ * or, when its source lies before the start of the buffer, from prevcontent.
+ *
+ * @param len content offset to decode up to; stored as the new content length
+ * @param prevcontent history used for matches that start before this block
+ * @throws TikaException if required objects are missing or a table index is
+ * out of range
+ */
+ private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
+
+ // NOTE(review): the same "chm section is null" message is used for a
+ // missing state and a missing main tree table as well
+ if ((getChmSection() == null) || (getState() == null)
+ || (getState().getMainTreeTable() == null))
+ throw new ChmParsingException("chm section is null");
+
+ short s;
+ int x, i, border;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ /* new code */
+ //read huffman tree from main tree
+ border = getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS);
+ if (border >= getState().mainTreeTable.length)
+ throw new ChmParsingException("error decompressing aligned block.");
+ //break;
+ /* end new code */
+ s = getState().mainTreeTable[getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS)];
+ // an entry >= the element count is not a symbol yet: keep following the
+ // table one bit at a time
+ if (s >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().mainTreeTable[s]) >= getState()
+ .getMainTreeElements());
+ }
+ //System.out.printf("%d,", s);
+ //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+ // consume as many bits as the code length of the decoded symbol
+ getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ // literal byte
+ content[i] = (byte) s;
+ } else {
+ // match: the low bits carry the length header, s >>> 3 the offset slot
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ // the length continues in a length-tree symbol
+ matchfooter = getState().lengthTreeTable[getChmSection()
+ .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+ if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+ x = ChmConstants.LZX_LENGTH_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().lengthTreeLengtsTable[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ matchoffset = s >>> 3;
+ if (matchoffset > 2) {
+ // explicit offset: position base plus extra bits; with 3 or more
+ // extra bits the low part comes from the aligned tree
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+ if (extra > 3) {
+ extra -= 3;
+ long verbatim_bits = getChmSection().getSyncBits(extra);
+ matchoffset += (verbatim_bits << 3);
+ //READ HUFF SYM in Aligned Tree
+ int aligned_bits = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[aligned_bits];
+ // NOTE(review): aligned-tree entries are compared against the main
+ // tree element count here and below - confirm this is intended
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra == 3) {
+ int g = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[g];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra > 0) {
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset += l;
+ } else
+ matchoffset = 1;
+ // the new offset becomes R0; the older ones shift down
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ // slots 0..2 reuse one of the three most recent offsets
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /** match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ // a match running past len ends the decoding of this block
+ if (i > len)
+ break;
+
+ if (runsrc < 0) {
+ // the match starts before this block's output
+ if (matchlen + runsrc <= 0) {
+ // the whole match lies in prevcontent
+ runsrc = prevcontent.length + runsrc;
+ while (matchlen-- > 0)
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ // the match starts at the tail of prevcontent and continues at
+ // the beginning of content
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrappes around source data */
+ // runsrc is >= 0 in this branch, so this loop body is never entered
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ /**
+  * Rejects a missing table.
+  *
+  * @param array the table to check
+  * @throws TikaException (a ChmParsingException) when {@code array} is null
+  */
+ private void assertShortArrayNotNull(short[] array) throws TikaException {
+     boolean missing = (array == null);
+     if (missing) {
+         throw new ChmParsingException("short[] is null");
+     }
+ }
+
+ /**
+  * Decodes a VERBATIM block into the content buffer, from the current
+  * content length up to {@code len}. Each main-tree symbol is either a
+  * literal byte or a match; a match is copied from earlier output in the
+  * content buffer or, when its source lies before the start of the buffer,
+  * from {@code prevcontent}.
+  *
+  * @param len content offset to decode up to; stored as the new content length
+  * @param prevcontent history used for matches that start before this block
+  * @throws TikaException if the main tree table is missing, or if a match
+  *         that spans prevcontent and content cannot be copied within the
+  *         buffer bounds (this case used to loop forever)
+  */
+ private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+     short s;
+     int x, i;
+     int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+     int matchoffset = 0;
+     for (i = getContentLength(); i < len; i++) {
+         int f = getChmSection().peekBits(
+                 ChmConstants.LZX_MAINTREE_TABLEBITS);
+         assertShortArrayNotNull(getState().getMainTreeTable());
+         s = getState().getMainTreeTable()[f];
+         // an entry >= LZX_MAIN_MAXSYMBOLS is not a symbol yet: keep
+         // following the table one bit at a time
+         if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+             x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+             do {
+                 x++;
+                 s <<= 1;
+                 s += getChmSection().checkBit(x);
+             } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+         }
+         // consume as many bits as the code length of the decoded symbol
+         getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+         if (s < ChmConstants.LZX_NUM_CHARS) {
+             // literal byte
+             content[i] = (byte) s;
+         } else {
+             s -= ChmConstants.LZX_NUM_CHARS;
+             matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+             if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                 // the length continues in a length-tree symbol
+                 matchfooter = getState().getLengthTreeTable()[getChmSection()
+                         .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+                 if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+                     x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                     do {
+                         x++;
+                         matchfooter <<= 1;
+                         matchfooter += getChmSection().checkBit(x);
+                     } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                 }
+                 getChmSection().getSyncBits(
+                         getState().getLengthTreeLengtsTable()[matchfooter]);
+                 matchlen += matchfooter;
+             }
+             matchlen += ChmConstants.LZX_MIN_MATCH;
+             // shorter than 2
+             matchoffset = s >>> 3;
+             if (matchoffset > 2) {
+                 if (matchoffset != 3) { // should get other bits to retrieve
+                                         // offset
+                     extra = ChmConstants.EXTRA_BITS[matchoffset];
+                     long l = getChmSection().getSyncBits(extra);
+                     matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+                 } else {
+                     matchoffset = 1;
+                 }
+                 // the new offset becomes R0; the older ones shift down
+                 getState().setR2(getState().getR1());
+                 getState().setR1(getState().getR0());
+                 getState().setR0(matchoffset);
+             } else if (matchoffset == 0) {
+                 // slots 0..2 reuse one of the three most recent offsets
+                 matchoffset = (int) getState().getR0();
+             } else if (matchoffset == 1) {
+                 matchoffset = (int) getState().getR1();
+                 getState().setR1(getState().getR0());
+                 getState().setR0(matchoffset);
+             } else /* match_offset == 2 */
+             {
+                 matchoffset = (int) getState().getR2();
+                 getState().setR2(getState().getR0());
+                 getState().setR0(matchoffset);
+             }
+             rundest = i;
+             runsrc = rundest - matchoffset;
+             i += (matchlen - 1);
+             // a match running past len ends the decoding of this block
+             if (i > len)
+                 break;
+             if (runsrc < 0) {
+                 // the match starts before this block's output
+                 if (matchlen + runsrc <= 0) {
+                     // the whole match lies in prevcontent; bytes that would
+                     // fall outside the bounds are skipped
+                     runsrc = prevcontent.length + runsrc;
+                     while ((matchlen-- > 0) && (prevcontent != null)
+                             && ((runsrc + 1) > 0)) {
+                         if ((rundest < content.length)
+                                 && (runsrc < content.length)) {
+                             content[rundest++] = prevcontent[runsrc++];
+                         }
+                     }
+                 } else {
+                     // the match starts at the tail of prevcontent and
+                     // continues at the beginning of content
+                     runsrc = prevcontent.length + runsrc;
+                     while (runsrc < prevcontent.length) {
+                         if ((rundest >= content.length)
+                                 || (runsrc >= content.length)) {
+                             // Nothing would advance runsrc here, so the
+                             // loop could never end; fail instead of hanging.
+                             throw new ChmParsingException(
+                                     "error decompressing verbatim block.");
+                         }
+                         content[rundest++] = prevcontent[runsrc++];
+                     }
+                     matchlen = matchlen + runsrc - prevcontent.length;
+                     runsrc = 0;
+                     while (matchlen-- > 0)
+                         content[rundest++] = content[runsrc++];
+                 }
+
+             } else {
+                 /* copies any wrapped source data */
+                 // runsrc is >= 0 in this branch, so this loop body is never entered
+                 while ((runsrc < 0) && (matchlen-- > 0)) {
+                     content[rundest++] = content[(int) (runsrc + getBlockLength())];
+                     runsrc++;
+                 }
+                 /* copies match data - no worries about destination wraps */
+                 while (matchlen-- > 0) {
+                     if ((rundest < content.length)
+                             && (runsrc < content.length))
+                         content[rundest++] = content[runsrc++];
+                 }
+             }
+         }
+     }
+     setContentLength(len);
+ }
+
+ /**
+ * Decodes code lengths into the state's length-tree lengths table for the
+ * index range [offset, tablelen). Every decoded pre-tree symbol z selects an
+ * action: z below 17 updates a single entry relative to its old value, 17
+ * and 18 write runs of zeros, 19 writes a run of one decoded value.
+ *
+ * @param offset first index to fill
+ * @param tablelen index at which filling stops
+ * @param pretreetable decode table of the pre-tree
+ * @param prelentable code lengths of the pre-tree symbols
+ * @throws TikaException if one of the required objects is null
+ */
+ private void createLengthTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ // NOTE(review): prelentable is tested twice in this condition
+ if (prelentable == null || getChmSection() == null
+ || pretreetable == null || prelentable == null)
+ throw new ChmParsingException("is null");
+
+ int i = offset; // represents offset
+ int z, y, x;// local counters
+ while (i < tablelen) {
+ //Read HUFF sym to z
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+ // 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+ }
+ // consume as many bits as the code length of the decoded symbol
+ getChmSection().getSyncBits(prelentable[z]);
+
+ if (z < 17) {
+ // new length = (old length - z), wrapped into 0..16
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().getLengthTreeLengtsTable()[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ // run of 4..19 zeros; writes past the end of the table are skipped
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 18) {
+ // run of 20..51 zeros; no bounds check here
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++)
+ //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 19) {
+ // run of 4..5 copies of one value that is decoded next; no bounds check here
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ getState().getLengthTreeLengtsTable()[i++] = (short) z;
+ }
+ }
+ }
+
+ private void createMainTreeTable() throws TikaException {
+ //Read Pre Tree Table
+ short[] prelentable = createPreLenTable();
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+ prelentable);
+
+ //Read Pre Tree Table
+ prelentable = createPreLenTable();
+ pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+ getState().mainTreeLengtsTable.length, pretreetable,
+ prelentable);
+
+ getState().setMainTreeTable(
+ createTreeTable2(getState().mainTreeLengtsTable,
+ (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+ + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+ .getMainTreeElements()));
+ }
+
+ /**
+ * Decodes code lengths into the state's main-tree lengths table for the
+ * index range [offset, tablelen). Every decoded pre-tree symbol z selects an
+ * action: z below 17 updates a single entry relative to its old value, 17
+ * and 18 write runs of zeros, 19 writes a run of one decoded value.
+ *
+ * @param offset first index to fill
+ * @param tablelen index at which filling stops
+ * @param pretreetable decode table of the pre-tree
+ * @param prelentable code lengths of the pre-tree symbols
+ * @throws TikaException if pretreetable is null or a zero run leaves the table
+ */
+ private void createMainTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ if (pretreetable == null)
+ throw new ChmParsingException("pretreetable is null");
+ int i = offset;
+ int z, y, x;
+ while (i < tablelen) {
+ int f = getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS);
+ z = pretreetable[f];
+ // an entry >= LZX_PRETREE_MAXSYMBOLS is not a symbol yet: keep following
+ // the table one bit at a time
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ // consume as many bits as the code length of the decoded symbol
+ getChmSection().getSyncBits(prelentable[z]);
+ if (z < 17) {
+ // new length = (old length - z), wrapped into 0..16
+ z = getState().getMainTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().mainTreeLengtsTable[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ // run of 4..19 zeros; leaving the table is an error
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 18) {
+ // run of 20..51 zeros; leaving the table is an error
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 19) {
+ // run of 4..5 copies of one value that is decoded next
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().mainTreeLengtsTable[i] - z;
+ if (z < 0)
+ z = z + 17;
+ // writes past the end of the table are skipped
+ for (int j = 0; j < y; j++)
+ if (i < getState().getMainTreeLengtsTable().length)
+ getState().mainTreeLengtsTable[i++] = (short) z;
+ }
+ }
+ }
+
+ /**
+  * Checks that {@code index} does not lie past the end of {@code array}.
+  *
+  * @param array the table about to be written
+  * @param index the position about to be written
+  * @throws ChmParsingException when {@code index >= array.length}
+  */
+ private void assertInRange(short[] array, int index) throws ChmParsingException {
+     boolean withinTable = index < array.length;
+     if (!withinTable) {
+         throw new ChmParsingException(index + " is bigger than "
+                 + array.length);
+     }
+ }
+
+ private short[] createAlignedLenTable() {
+ int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+ int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+ short[] tmp = new short[tablelen];
+ for (int i = 0; i < tablelen; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(bits);
+ }
+ return tmp;
+ }
+
+ /**
+  * Reads the aligned-tree code lengths, stores them in the state and stores
+  * the decode table built from them in the state as well.
+  *
+  * @throws ChmParsingException if the decode table overflows
+  */
+ private void createAlignedTreeTable() throws ChmParsingException {
+     ChmLzxState lzxState = getState();
+     lzxState.setAlignedLenTable(createAlignedLenTable());
+
+     int tableSize = (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+             + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1);
+     short[] alignedTree = createTreeTable2(getState().getAlignedLenTable(),
+             tableSize, ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+             ChmConstants.LZX_ALIGNED_MAXSYMBOLS);
+     getState().setAlignedTreeTable(alignedTree);
+ }
+
+ /**
+  * Builds a decode table from a table of code lengths. Codes of up to
+  * {@code bits} bits get direct entries in the first {@code 1 << bits}
+  * slots; longer codes (up to 16 bits) are stored as a chain of two-entry
+  * nodes allocated behind the direct part.
+  *
+  * @param lentable code length per symbol; symbols beyond its end are ignored
+  * @param tablelen size of the table to allocate
+  * @param bits number of bits covered by the direct part of the table
+  * @param maxsymbol number of symbols to consider
+  * @return the decode table; it is returned even when the codes did not fill
+  *         the whole code space
+  * @throws ChmParsingException if the codes need more room than the code space
+  */
+ private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+         int maxsymbol) throws ChmParsingException {
+     short[] tmp = new short[tablelen];
+     short sym;
+     int leaf;
+     int bit_num = 1;
+     long fill;
+     int pos = 0;
+     /* the current position in the decode table */
+     long table_mask = (1 << bits);
+     long bit_mask = (table_mask >> 1);
+     long next_symbol = bit_mask;
+
+     /* fills entries for short codes for a direct mapping */
+     while (bit_num <= bits) {
+         for (sym = 0; sym < maxsymbol; sym++) {
+             if (lentable.length > sym && lentable[sym] == bit_num) {
+                 leaf = pos;
+
+                 if ((pos += bit_mask) > table_mask) {
+                     /* table overflow */
+                     throw new ChmParsingException("Table overflow");
+                 }
+
+                 fill = bit_mask;
+                 while (fill-- > 0)
+                     tmp[leaf++] = sym;
+             }
+         }
+         bit_mask >>= 1;
+         bit_num++;
+     }
+
+     /* if there are any codes longer than nbits */
+     if (pos != table_mask) {
+         /* clears the remainder of the table */
+         for (leaf = pos; leaf < table_mask; leaf++)
+             tmp[leaf] = 0;
+
+         /* gives ourselves room for codes to grow by up to 16 more bits */
+         pos <<= 16;
+         table_mask <<= 16;
+         bit_mask = 1 << 15;
+
+         while (bit_num <= 16) {
+             for (sym = 0; sym < maxsymbol; sym++) {
+                 if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
+                     leaf = pos >> 16;
+                     for (fill = 0; fill < bit_num - bits; fill++) {
+                         /*
+                          * if this path hasn't been taken yet, 'allocate'
+                          * two entries
+                          */
+                         if (tmp[leaf] == 0) {
+                             // the node is allocated only when both of
+                             // its entries fit into the table
+                             if (((next_symbol << 1) + 1) < tmp.length) {
+                                 tmp[(int) (next_symbol << 1)] = 0;
+                                 tmp[(int) (next_symbol << 1) + 1] = 0;
+                                 tmp[leaf] = (short) next_symbol++;
+                             }
+
+                         }
+                         /*
+                          * follows the path and select either left or right
+                          * for next bit
+                          */
+                         leaf = tmp[leaf] << 1;
+                         if (((pos >> (15 - fill)) & 1) != 0)
+                             leaf++;
+                     }
+                     tmp[leaf] = sym;
+
+                     if ((pos += bit_mask) > table_mask) {
+                         /* table overflow */
+                         throw new ChmParsingException("Table overflow");
+                     }
+                 }
+             }
+             bit_mask >>= 1;
+             bit_num++;
+         }
+     }
+
+     // The table is handed back whether or not it is full (pos == table_mask);
+     // the former check for a full table returned the same value either way.
+     return tmp;
+ }
+
+ /** Returns the content buffer itself (not a copy); may be null. */
+ public byte[] getContent() {
+ return content;
+ }
+
+ /**
+ * Returns a copy of the content between the two offsets, made with
+ * ChmCommons.copyOfRange, or a one-byte array when there is no content.
+ */
+ public byte[] getContent(int startOffset, int endOffset) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ startOffset, endOffset) : new byte[1];
+ }
+
+ /**
+ * Returns a copy of the content from start to the end of the buffer, made
+ * with ChmCommons.copyOfRange, or a one-byte array when there is no content.
+ */
+ public byte[] getContent(int start) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ start, getContent().length) : new byte[1];
+ }
+
+ /** Allocates a new, zero-filled content buffer of the given size. */
+ private void setContent(int contentLength) {
+ this.content = new byte[contentLength];
+ }
+
+ /**
+  * Sets up the decoder state of this block: a clone of the previous block's
+  * state when there is a previous block, otherwise a new state created from
+  * this block's length.
+  *
+  * @param chmPrevLzxBlock the preceding block, or {@code null} for the first one
+  * @throws TikaException if there is no previous block and the block length
+  *         does not fit below Integer.MAX_VALUE, or if the state cannot be created
+  */
+ private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
+     if (chmPrevLzxBlock != null) {
+         //use clone to avoid changing a cached or to be cached block
+         setState(chmPrevLzxBlock.getState().clone());
+         return;
+     }
+     if (getBlockLength() >= Integer.MAX_VALUE) {
+         // This combination used to reach the clone branch and fail with a
+         // NullPointerException on the missing previous block.
+         throw new ChmParsingException("block length is too large: "
+                 + getBlockLength());
+     }
+     setState(new ChmLzxState((int) getBlockLength()));
+ }
+
+ /**
+  * Validates the constructor arguments; the first bad one is reported.
+  *
+  * @param blockNumber must not be negative
+  * @param dataSegment must not be null or empty
+  * @param blockLength must be greater than zero
+  * @return {@code true} when all three arguments are acceptable
+  * @throws TikaException (a ChmParsingException) for the first bad argument
+  */
+ private boolean validateConstructorParams(int blockNumber,
+         byte[] dataSegment, long blockLength) throws TikaException {
+     if (blockNumber < 0) {
+         throw new ChmParsingException("block number should be possitive");
+     }
+     if (dataSegment == null || dataSegment.length == 0) {
+         throw new ChmParsingException("data segment should not be null");
+     }
+     if (blockLength <= 0) {
+         throw new ChmParsingException(
+                 "block length should be more than zero");
+     }
+     // all three checks passed
+     return true;
+ }
+
+ /** Returns the index of this block. */
+ public int getBlockNumber() {
+ return block_number;
+ }
+
+ /** Stores the index of this block. */
+ private void setBlockNumber(int block_number) {
+ this.block_number = block_number;
+ }
+
+ /** Returns the length of this block. */
+ private long getBlockLength() {
+ return block_length;
+ }
+
+ /** Stores the length of this block. */
+ private void setBlockLength(long block_length) {
+ this.block_length = block_length;
+ }
+
+ /** Returns the decoder state of this block (the object itself, not a copy). */
+ public ChmLzxState getState() {
+ return state;
+ }
+
+ /** Replaces the decoder state of this block. */
+ private void setState(ChmLzxState state) {
+ this.state = state;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+ /* Class' members */
+ private int window; /* the actual decoding window */
+ private long window_size; /* window size (32Kb through 2Mb) */
+ private int window_position; /* current offset within the window */
+ private int main_tree_elements; /* number of main tree elements */
+ private LzxState hadStarted; /* have we started decoding at all yet? */
+ private int block_type; /* type of this block */
+ private int block_length; /* uncompressed length of this block */
+ private int block_remaining; /* uncompressed bytes still left to decode */
+ private int frames_read; /* the number of CFDATA blocks processed */
+ private int intel_file_size; /* magic header value used for transform */
+ private long intel_current_possition; /* current offset in transform space */
+ private IntelState intel_state; /* have we seen any translatable data yet? */
+ private long R0; /* for the LRU offset system */
+ private long R1; /* for the LRU offset system */
+ private long R2; /* for the LRU offset system */
+
+ // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+ protected short[] mainTreeLengtsTable;
+ protected short[] mainTreeTable;
+
+ protected short[] lengthTreeTable;
+ protected short[] lengthTreeLengtsTable;
+
+ protected short[] alignedLenTable;
+ protected short[] alignedTreeTable;
+
+ @Override
+ public ChmLzxState clone() {
+ try {
+ ChmLzxState clone = (ChmLzxState)super.clone();
+ clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+ clone.mainTreeTable = arrayClone(mainTreeTable);
+ clone.lengthTreeTable = arrayClone(lengthTreeTable);
+ clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+ clone.alignedLenTable = arrayClone(alignedLenTable);
+ clone.alignedTreeTable = arrayClone(alignedTreeTable);
+ return clone;
+ } catch (CloneNotSupportedException ex) {
+ return null;
+ }
+ }
+
+ protected short[] getMainTreeTable() {
+ return mainTreeTable;
+ }
+
+ protected short[] getAlignedTreeTable() {
+ return alignedTreeTable;
+ }
+
+ protected void setAlignedTreeTable(short[] alignedTreeTable) {
+ this.alignedTreeTable = alignedTreeTable;
+ }
+
+ protected short[] getLengthTreeTable() throws TikaException {
+ if (lengthTreeTable != null)
+ return this.lengthTreeTable;
+ else
+ throw new ChmParsingException("lengthTreeTable is null");
+ }
+
+ protected void setLengthTreeTable(short[] lengthTreeTable) {
+ this.lengthTreeTable = lengthTreeTable;
+ }
+
+ protected void setMainTreeTable(short[] mainTreeTable) {
+ this.mainTreeTable = mainTreeTable;
+ }
+
+ protected short[] getAlignedLenTable() {
+ return this.alignedLenTable;
+ }
+
+ protected void setAlignedLenTable(short[] alignedLenTable) {
+ this.alignedLenTable = alignedLenTable;
+ }
+
+ /**
+ * It suits for informative outlook
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("actual decoding window:=" + getWindow()
+ + System.getProperty("line.separator"));
+ sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+ + System.getProperty("line.separator"));
+ sb.append("current offset within the window:=" + getWindowPosition()
+ + System.getProperty("line.separator"));
+ sb.append("number of main tree elements:=" + getMainTreeElements()
+ + System.getProperty("line.separator"));
+ sb.append("have we started decoding at all yet?:=" + getHadStarted()
+ + System.getProperty("line.separator"));
+ sb.append("type of this block:=" + getBlockType()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed length of this block:=" + getBlockLength()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed bytes still left to decode:="
+ + getBlockRemaining() + System.getProperty("line.separator"));
+ sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+ + System.getProperty("line.separator"));
+ sb.append("magic header value used for transform:="
+ + getIntelFileSize() + System.getProperty("line.separator"));
+ sb.append("current offset in transform space:="
+ + getIntelCurrentPossition()
+ + System.getProperty("line.separator"));
+ sb.append("have we seen any translatable data yet?:=" + getIntelState()
+ + System.getProperty("line.separator"));
+ sb.append("R0 for the LRU offset system:=" + getR0()
+ + System.getProperty("line.separator"));
+ sb.append("R1 for the LRU offset system:=" + getR1()
+ + System.getProperty("line.separator"));
+ sb.append("R2 for the LRU offset system:=" + getR2()
+ + System.getProperty("line.separator"));
+ sb.append("main tree length:=" + getMainTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ public ChmLzxState(int window) throws TikaException {
+ if (window >= 0) {
+ int position_slots;
+ int win = ChmCommons.getWindowSize(window);
+ setWindowSize(1 << win);
+ /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+ if (win < 15 || win > 21)
+ throw new ChmParsingException("window less than 15 or window greater than 21");
+
+ /* Calculates required position slots */
+ if (win == 20)
+ position_slots = 42;
+ else if (win == 21)
+ position_slots = 50;
+ else
+ position_slots = win << 1;
+ //TODO: position_slots is not used ?
+ setR0(1);
+ setR1(1);
+ setR2(1);
+ setMainTreeElements(512);
+ setHadStarted(LzxState.NOT_STARTED_DECODING);
+ setFramesRead(0);
+ setBlockRemaining(0);
+ setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+ setIntelCurrentPossition(0);
+ setIntelState(IntelState.NOT_STARTED);
+ setWindowPosition(0);
+ setMainTreeLengtsTable(new short[getMainTreeElements()]);
+ setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+ } else
+ throw new CancellationException(
+ "window size should be more than zero");
+ }
+
+ protected void setWindow(int window) {
+ this.window = window;
+ }
+
+ protected int getWindow() {
+ return window;
+ }
+
+ protected void setWindowSize(long window_size) {
+ this.window_size = window_size;
+ }
+
+ protected long getWindowSize() {
+ return window_size;
+ }
+
+ protected void setWindowPosition(int window_position) {
+ this.window_position = window_position;
+ }
+
+ protected int getWindowPosition() {
+ return window_position;
+ }
+
+ protected void setMainTreeElements(int main_tree_elements) {
+ this.main_tree_elements = main_tree_elements;
+ }
+
+ protected int getMainTreeElements() {
+ return main_tree_elements;
+ }
+
+ protected void setHadStarted(LzxState hadStarted) {
+ this.hadStarted = hadStarted;
+ }
+
+ protected LzxState getHadStarted() {
+ return hadStarted;
+ }
+
+ protected void setBlockType(int block_type) {
+ this.block_type = block_type;
+ }
+
+ public int getBlockType() {
+ return block_type;
+ }
+
+ protected void setBlockLength(int block_length) {
+ this.block_length = block_length;
+ }
+
+ protected int getBlockLength() {
+ return block_length;
+ }
+
+ protected void setBlockRemaining(int block_remaining) {
+ this.block_remaining = block_remaining;
+ }
+
+ protected int getBlockRemaining() {
+ return block_remaining;
+ }
+
+ protected void setFramesRead(int frames_read) {
+ this.frames_read = frames_read;
+ }
+
+ protected void increaseFramesRead() {
+ this.frames_read = getFramesRead() + 1;
+ }
+
+ protected int getFramesRead() {
+ return frames_read;
+ }
+
+ protected void setIntelFileSize(int intel_file_size) {
+ this.intel_file_size = intel_file_size;
+ }
+
+ protected int getIntelFileSize() {
+ return intel_file_size;
+ }
+
+ protected void setIntelCurrentPossition(long intel_current_possition) {
+ this.intel_current_possition = intel_current_possition;
+ }
+
+ protected long getIntelCurrentPossition() {
+ return intel_current_possition;
+ }
+
+ protected void setIntelState(IntelState intel_state) {
+ this.intel_state = intel_state;
+ }
+
+ protected IntelState getIntelState() {
+ return intel_state;
+ }
+
+ protected void setR0(long r0) {
+ R0 = r0;
+ }
+
+ protected long getR0() {
+ return R0;
+ }
+
+ protected void setR1(long r1) {
+ R1 = r1;
+ }
+
+ protected long getR1() {
+ return R1;
+ }
+
+ protected void setR2(long r2) {
+ R2 = r2;
+ }
+
+ protected long getR2() {
+ return R2;
+ }
+
+ public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+ this.mainTreeLengtsTable = mainTreeLengtsTable;
+ }
+
+ public short[] getMainTreeLengtsTable() {
+ return mainTreeLengtsTable;
+ }
+
+ public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+ this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+ }
+
+ public short[] getLengthTreeLengtsTable() {
+ return lengthTreeLengtsTable;
+ }
+
+ private static short[] arrayClone(short[] a) {
+ return a==null ? null : (short[]) a.clone();
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * Sequential reader over a byte array. It offers byte-wise reads (which
+ * return 0 once the data is exhausted), little-endian integer reads and a
+ * bit-level reader that refills its buffer 16 bits at a time.
+ */
+public class ChmSection {
+    final private byte[] data;
+    final private byte[] prevcontent;
+    private int swath;  // index of the next unread byte in data
+    private int total;  // number of bits currently held in buffer
+    private int buffer; // bit buffer used by getDesyncBits
+
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    /**
+     * @param data the bytes to read from; checked with
+     *        {@code ChmCommons.assertByteArrayNotNull}
+     * @param prevconent content of a previous section, may be {@code null};
+     *        only stored and returned by {@link #getPrevContent()}
+     * @throws TikaException if the check on {@code data} fails
+     */
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+    }
+
+    /* Utilities */
+
+    /**
+     * Passes the array to {@code ChmCommons.reverse} and returns the same
+     * array instance.
+     */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    /**
+     * Tests the bit {@code i} positions below the top of the bit buffer.
+     *
+     * @return 1 if the bit is set, 0 otherwise
+     */
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    /** Reads {@code bit} bits and removes them from the bit buffer. */
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    /** Reads {@code bit} bits without removing them from the bit buffer. */
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+
+    /**
+     * Returns the top {@code bit} bits of the bit buffer and then drops
+     * {@code removeBit} bits from it. The buffer is refilled with two bytes
+     * (low byte first) while it holds fewer than 16 bits.
+     */
+    private int getDesyncBits(int bit, int removeBit) {
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        // keep only the lowest "total" bits
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    /** Reads the next byte as an unsigned value (0..255). */
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    /**
+     * Reads the next byte and advances the read position.
+     *
+     * @return the byte, or 0 when all data has been consumed
+     */
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    /** @return number of bytes between the read position and the end of the data */
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+
+    /**
+     * Reads up to {@code i} bytes in little-endian order and interprets them
+     * as a signed two's-complement number. Fewer bytes are used when the
+     * data ends early.
+     *
+     * @return the value, or {@link BigInteger#ZERO} when no byte is available
+     */
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        int count = Math.min(i, getData().length - getSwath());
+        if (count <= 0) {
+            // new BigInteger(new byte[0]) would throw NumberFormatException
+            return BigInteger.ZERO;
+        }
+        byte[] bigEndian = new byte[count];
+        for (int j = 0; j < count; j++) {
+            bigEndian[count - j - 1] = getData()[getSwath() + j];
+        }
+        setSwath(getSwath() + count);
+        return new BigInteger(bigEndian);
+    }
+
+    /** Converts each char of the string to a byte by narrowing cast. */
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    // NOTE(review): getBigInteger is signed, so a value with the top bit set
+    // comes back negative here despite the "U" in the name - confirm callers.
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    /**
+     * Copies the next {@code i} bytes and advances the read position.
+     *
+     * @return the copy; for {@code i == 0} a one-element array holding 0
+     * @throws ArrayIndexOutOfBoundsException if fewer than {@code i} bytes remain
+     */
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        System.arraycopy(getData(), getSwath(), t, 0, i);
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    /**
+     * Reads a variable-length integer: 7 payload bits per byte, most
+     * significant group first; a byte with the top bit clear ends the number.
+     */
+    public BigInteger getEncint() {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        byte[] nb = new byte[1];
+        while ((ob = this.getByte()) < 0) {
+            nb[0] = (byte) ((ob & 0x7f));
+            bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        }
+        nb[0] = (byte) ((ob & 0x7f));
+        bi = bi.shiftLeft(7).add(new BigInteger(nb));
+        return bi;
+    }
+
+    /**
+     * Reads one multi-byte encoded character. The number of bytes is taken
+     * from the count of leading 1-bits in the first byte.
+     */
+    public char unmarshalUtfChar() {
+        byte lead = this.getByte();
+        if (lead >= 0) {
+            return (char) lead;
+        }
+        int length = 2;
+        // Count further leading 1-bits. The count is capped because for the
+        // byte 0xFF the shifted value is negative for every shift, so the
+        // loop would otherwise never end.
+        while (length < 8 && (lead << (24 + length)) < 0) {
+            length++;
+        }
+        int n = lead & 15; // 00001111b, gets last 4 bits
+        for (int j = 1; j < length; j++) {
+            n = (n << 6) + (this.getByte() & 63); // 00111111b, gets last 6 bits
+        }
+        return (char) n;
+    }
+
+    public int getSwath() {
+        return swath;
+    }
+
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    public int getTotal() {
+        return total;
+    }
+
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * Small manual demo of {@link #reverseByteOrder(byte[])}.
+     *
+     * @param args ignored
+     * @throws TikaException
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractListManager.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.poi.hwpf.converter.NumberFormatter;
+
+/**
+ * Base class for tracking the counters of numbered lists and rendering the
+ * number text of a list paragraph.
+ */
+public abstract class AbstractListManager {
+    private final static String BULLET = "\u00b7";
+    /** Matches a level placeholder such as {@code %1} inside a level text. */
+    private static final Pattern LEVEL_INTERPOLATOR = Pattern.compile("%(\\d+)");
+
+    protected Map<Integer, ParagraphLevelCounter> listLevelMap = new HashMap<Integer, ParagraphLevelCounter>();
+    protected Map<Integer, LevelTuple[]> overrideTupleMap = new HashMap<Integer, LevelTuple[]>();
+
+    //helper class that is docx/doc format agnostic
+    protected class ParagraphLevelCounter {
+
+        //counts can == 0 if the format is decimal, make sure
+        //that flag values are < 0
+        private static final int NOT_SEEN_YET = -1;
+        private static final int FIRST_SKIPPED = -2;
+        private final LevelTuple[] levelTuples;
+        private List<Integer> counts = new ArrayList<Integer>();
+        private int lastLevel = -1;
+
+        public ParagraphLevelCounter(LevelTuple[] levelTuples) {
+            this.levelTuples = levelTuples;
+        }
+
+        public int getNumberOfLevels() {
+            return levelTuples.length;
+        }
+
+        /**
+         * Apply this to every numbered paragraph in order.
+         *
+         * @param levelNumber level number that is being incremented
+         * @param overrideLevelTuples per-level overrides, or {@code null} for none
+         * @return the new formatted number string for this level
+         */
+        public String incrementLevel(int levelNumber, LevelTuple[] overrideLevelTuples) {
+
+            // give every level skipped between the last and this one its start value
+            for (int i = lastLevel + 1; i < levelNumber; i++) {
+                if (i >= counts.size()) {
+                    int val = getStart(i, overrideLevelTuples);
+                    counts.add(i, val);
+                } else {
+                    int count = counts.get(i);
+                    if (count == NOT_SEEN_YET) {
+                        count = getStart(i, overrideLevelTuples);
+                        counts.set(i, count);
+                    }
+                }
+            }
+
+            if (levelNumber < counts.size()) {
+                resetAfter(levelNumber, overrideLevelTuples);
+                int count = counts.get(levelNumber);
+                if (count == NOT_SEEN_YET) {
+                    count = getStart(levelNumber, overrideLevelTuples);
+                } else {
+                    count++;
+                }
+                counts.set(levelNumber, count);
+            } else {
+                counts.add(levelNumber, getStart(levelNumber, overrideLevelTuples));
+            }
+            lastLevel = levelNumber;
+            return format(levelNumber, overrideLevelTuples);
+        }
+
+        /**
+         * @param level which level to format
+         * @return the string that represents the number and the surrounding text for this paragraph
+         */
+        private String format(int level, LevelTuple[] overrideLevelTuples) {
+            if (level < 0 || level >= levelTuples.length) {
+                //log?
+                return "";
+            }
+            boolean isLegal = (overrideLevelTuples != null) ? overrideLevelTuples[level].isLegal : levelTuples[level].isLegal;
+            //short circuit bullet
+            String numFmt = getNumFormat(level, isLegal, overrideLevelTuples);
+            if ("bullet".equals(numFmt)) {
+                return BULLET + " ";
+            }
+
+            String lvlText = (overrideLevelTuples == null || overrideLevelTuples[level].lvlText == null) ?
+                    levelTuples[level].lvlText : overrideLevelTuples[level].lvlText;
+            if (lvlText == null) {
+                // nothing to interpolate; matcher(null) would throw
+                return "";
+            }
+            StringBuilder sb = new StringBuilder();
+            Matcher m = LEVEL_INTERPOLATOR.matcher(lvlText);
+            int last = 0;
+            while (m.find()) {
+                sb.append(lvlText, last, m.start());
+                int lvlNum = -1;
+                try {
+                    lvlNum = Integer.parseInt(m.group(1));
+                } catch (NumberFormatException e) {
+                    //digits do not fit an int; lvlNum stays -1 and the
+                    //defaults of formatNum apply
+                }
+                //need to subtract 1 because, e.g. %1 is the format
+                //for the number at array offset 0
+                sb.append(formatNum(lvlNum - 1, isLegal, overrideLevelTuples));
+                last = m.end();
+            }
+            sb.append(lvlText, last, lvlText.length());
+            if (sb.length() > 0) {
+                //TODO: add in character after number
+                sb.append(" ");
+            }
+            return sb.toString();
+        }
+
+        //actual level number; can return empty string if numberformatter fails
+        private String formatNum(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            String numFmt = getNumFormat(lvlNum, isLegal, overrideLevelTuples);
+
+            int count = getCount(lvlNum);
+            if (count < 0) {
+                count = 1;
+            }
+            if ("bullet".equals(numFmt) || "none".equals(numFmt)) {
+                return "";
+            }
+            //not yet handled by NumberFormatter...TODO: add to NumberFormatter?
+            if ("ordinal".equals(numFmt)) {
+                return ordinalize(count);
+            }
+
+            int numFmtStyle = 0; //decimal, also used for unknown formats
+            if ("lowerLetter".equals(numFmt)) {
+                numFmtStyle = 4;
+            } else if ("lowerRoman".equals(numFmt)) {
+                numFmtStyle = 2;
+            } else if ("upperLetter".equals(numFmt)) {
+                numFmtStyle = 3;
+            } else if ("upperRoman".equals(numFmt)) {
+                numFmtStyle = 1;
+            }
+            try {
+                if ("decimalZero".equals(numFmt)) {
+                    //01, 02, ... 09, 10, 11: only single digits get the leading zero
+                    String decimal = NumberFormatter.getNumber(count, 0);
+                    return (count < 10) ? "0" + decimal : decimal;
+                }
+                return NumberFormatter.getNumber(count, numFmtStyle);
+            } catch (IllegalArgumentException e) {
+                return "";
+            }
+        }
+
+        /** English ordinal for the count: 1st, 2nd, 3rd, 4th, 11th, 12th, 13th, 21st. */
+        private String ordinalize(int count) {
+            //this is only good for locale == English
+            String countString = Integer.toString(count);
+            int lastTwoDigits = Math.abs(count % 100);
+            if (lastTwoDigits >= 11 && lastTwoDigits <= 13) {
+                return countString + "th";
+            }
+            switch (Math.abs(count % 10)) {
+                case 1:
+                    return countString + "st";
+                case 2:
+                    return countString + "nd";
+                case 3:
+                    return countString + "rd";
+                default:
+                    return countString + "th";
+            }
+        }
+
+        private String getNumFormat(int lvlNum, boolean isLegal, LevelTuple[] overrideLevelTuples) {
+            if (lvlNum < 0 || lvlNum >= levelTuples.length) {
+                //log?
+                return "decimal";
+            }
+            if (isLegal) {
+                //return decimal no matter the level if isLegal is true
+                return "decimal";
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[lvlNum].numFmt == null) ?
+                    levelTuples[lvlNum].numFmt : overrideLevelTuples[lvlNum].numFmt;
+        }
+
+        private int getCount(int lvlNum) {
+            if (lvlNum < 0 || lvlNum >= counts.size()) {
+                //log?
+                return 1;
+            }
+            return counts.get(lvlNum);
+        }
+
+        /** Marks the levels below {@code startlevelNumber} as not seen, subject to their restart setting. */
+        private void resetAfter(int startlevelNumber, LevelTuple[] overrideLevelTuples) {
+            for (int levelNumber = startlevelNumber + 1; levelNumber < counts.size(); levelNumber++) {
+                int cnt = counts.get(levelNumber);
+                if (cnt == NOT_SEEN_YET || cnt == FIRST_SKIPPED) {
+                    //do nothing
+                    continue;
+                }
+                if (levelTuples.length > levelNumber) {
+                    //never reset if restarts == 0
+                    int restart = (overrideLevelTuples == null || overrideLevelTuples[levelNumber].restart < 0) ?
+                            levelTuples[levelNumber].restart : overrideLevelTuples[levelNumber].restart;
+                    if (restart == 0) {
+                        //NOTE(review): this stops processing all deeper levels too,
+                        //not just this one - confirm that is intended
+                        return;
+                    } else if (restart == -1 ||
+                            startlevelNumber <= restart - 1) {
+                        counts.set(levelNumber, NOT_SEEN_YET);
+                    }
+                    //otherwise: do nothing/don't reset
+                } else {
+                    //reset!
+                    counts.set(levelNumber, NOT_SEEN_YET);
+                }
+            }
+        }
+
+        private int getStart(int levelNumber, LevelTuple[] overrideLevelTuples) {
+            if (levelNumber >= levelTuples.length) {
+                return 1;
+            }
+            return (overrideLevelTuples == null || overrideLevelTuples[levelNumber].start < 0) ?
+                    levelTuples[levelNumber].start : overrideLevelTuples[levelNumber].start;
+        }
+    }
+
+    /** Immutable settings of one list level. */
+    protected class LevelTuple {
+        private final int start;
+        private final int restart;
+        private final String lvlText;
+        private final String numFmt;
+        private final boolean isLegal;
+
+        /** Level with the given text and defaults: start 1, restart -1, decimal, not legal. */
+        public LevelTuple(String lvlText) {
+            this.lvlText = lvlText;
+            start = 1;
+            restart = -1;
+            numFmt = "decimal";
+            isLegal = false;
+        }
+
+        public LevelTuple(int start, int restart, String lvlText, String numFmt, boolean isLegal) {
+            this.start = start;
+            this.restart = restart;
+            this.lvlText = lvlText;
+            this.numFmt = numFmt;
+            this.isLegal = isLegal;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Shared support for extractors that work on POIFS (OLE2) containers:
+ * lazily resolved configuration objects, password lookup and hand-over of
+ * embedded resources to the {@link EmbeddedDocumentExtractor}.
+ */
+abstract class AbstractPOIFSExtractor {
+    private static final Log logger = LogFactory.getLog(AbstractPOIFSExtractor.class);
+    private final EmbeddedDocumentExtractor extractor;
+    private final PasswordProvider passwordProvider;
+    private final Metadata metadata;
+    // the following three are resolved lazily, see the getters
+    private TikaConfig tikaConfig;
+    private MimeTypes mimeTypes;
+    private Detector detector;
+
+    protected AbstractPOIFSExtractor(ParseContext context) {
+        this(context, null);
+    }
+
+    /**
+     * @param context source of the optional {@link EmbeddedDocumentExtractor},
+     *        {@link PasswordProvider}, {@link TikaConfig}, {@link MimeTypes}
+     *        and {@link Detector}; a {@link ParsingEmbeddedDocumentExtractor}
+     *        is used when no extractor is supplied
+     * @param metadata metadata handed to the password provider, may be {@code null}
+     */
+    protected AbstractPOIFSExtractor(ParseContext context, Metadata metadata) {
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        this.extractor = (ex != null) ? ex : new ParsingEmbeddedDocumentExtractor(context);
+
+        this.passwordProvider = context.get(PasswordProvider.class);
+        this.tikaConfig = context.get(TikaConfig.class);
+        this.mimeTypes = context.get(MimeTypes.class);
+        this.detector = context.get(Detector.class);
+        this.metadata = metadata;
+    }
+
+    // Note - these cache, but avoid creating the default TikaConfig if not needed
+    protected TikaConfig getTikaConfig() {
+        if (tikaConfig == null) {
+            tikaConfig = TikaConfig.getDefaultConfig();
+        }
+        return tikaConfig;
+    }
+
+    protected Detector getDetector() {
+        if (detector == null) {
+            detector = getTikaConfig().getDetector();
+        }
+        return detector;
+    }
+
+    protected MimeTypes getMimeTypes() {
+        if (mimeTypes == null) {
+            mimeTypes = getTikaConfig().getMimeRepository();
+        }
+        return mimeTypes;
+    }
+
+    /**
+     * Returns the password to be used for this file, or null
+     * if no / default password should be used
+     */
+    protected String getPassword() {
+        if (passwordProvider != null) {
+            return passwordProvider.getPassword(metadata);
+        }
+        return null;
+    }
+
+    /**
+     * Hands an embedded resource to the extractor and closes the stream
+     * afterwards, whether or not it was parsed.
+     *
+     * @param resource stream of the embedded resource; always closed here
+     * @param filename name of the resource, may be {@code null}
+     * @param relationshipID relationship id, may be {@code null}
+     * @param mediaType content type hint, may be {@code null}
+     * @param xhtml handler receiving the output
+     * @param outputHtml passed through to the extractor
+     */
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+                                          String relationshipID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
+            throws IOException, SAXException, TikaException {
+        try {
+            Metadata embeddedMetadata = new Metadata();
+            if (filename != null) {
+                embeddedMetadata.set(Metadata.TIKA_MIME_FILE, filename);
+                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+            }
+            if (relationshipID != null) {
+                embeddedMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
+            }
+            if (mediaType != null) {
+                embeddedMetadata.set(Metadata.CONTENT_TYPE, mediaType);
+            }
+
+            if (extractor.shouldParseEmbedded(embeddedMetadata)) {
+                extractor.parseEmbedded(resource, xhtml, embeddedMetadata, outputHtml);
+            }
+        } finally {
+            resource.close();
+        }
+    }
+
+    /**
+     * Handle an office document that's embedded at the POIFS level
+     */
+    protected void handleEmbeddedOfficeDoc(
+            DirectoryEntry dir, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+
+        // Is it an embedded OLE2 document, or an embedded OOXML document?
+
+        if (dir.hasEntry("Package")) {
+            // It's OOXML (has a ZipFile):
+            Entry ooxml = dir.getEntry("Package");
+
+            try (TikaInputStream stream = TikaInputStream.get(
+                    new DocumentInputStream((DocumentEntry) ooxml))) {
+                ZipContainerDetector zipDetector = new ZipContainerDetector();
+                MediaType type = zipDetector.detect(stream, new Metadata());
+                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
+                return;
+            }
+        }
+
+        // It's regular OLE2:
+
+        // What kind of document is it?
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
+        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+        TikaInputStream embedded = null;
+
+        try {
+            if (type == POIFSDocumentType.OLE10_NATIVE) {
+                try {
+                    // Try to un-wrap the OLE10Native record:
+                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+                    if (ole.getLabel() != null) {
+                        metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
+                    }
+                    byte[] data = ole.getDataBuffer();
+                    embedded = TikaInputStream.get(data);
+                } catch (Ole10NativeException ex) {
+                    // Not a valid OLE10Native record, skip it
+                } catch (Exception e) {
+                    logger.warn("Ignoring unexpected exception while parsing possible OLE10_NATIVE embedded document " + dir.getName(), e);
+                }
+            } else if (type == POIFSDocumentType.COMP_OBJ) {
+                try {
+                    // Grab the contents and process
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+                    }
+                    byte[] contents;
+                    // close the entry stream once it has been copied
+                    try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
+                        contents = new byte[contentsEntry.getSize()];
+                        inp.readFully(contents);
+                    }
+                    embedded = TikaInputStream.get(contents);
+
+                    // Try to work out what it is
+                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
+                    String extension = type.getExtension();
+                    try {
+                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                        extension = mimeType.getExtension();
+                    } catch (MimeTypeException mte) {
+                        // No details on this type are known
+                    }
+
+                    // Record what we can do about it.
+                    // Use the full media type: MediaType.getType() is only the
+                    // primary type (e.g. "application"), not "type/subtype".
+                    metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + extension);
+                } catch (Exception e) {
+                    throw new TikaException("Invalid embedded resource", e);
+                }
+            } else {
+                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+            }
+
+            // Should we parse it?
+            if (extractor.shouldParseEmbedded(metadata)) {
+                if (embedded == null) {
+                    // Make a TikaInputStream that just
+                    // passes the root directory of the
+                    // embedded document, and is otherwise
+                    // empty (byte[0]):
+                    embedded = TikaInputStream.get(new byte[0]);
+                    embedded.setOpenContainer(dir);
+                }
+                extractor.parseEmbedded(embedded, xhtml, metadata, true);
+            }
+        } finally {
+            if (embedded != null) {
+                embedded.close();
+            }
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/Cell.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Cell of content. Classes that implement this interface are used by
+ * Tika parsers (currently just the MS Excel parser) to keep track of
+ * individual pieces of content before they are rendered to the XHTML
+ * SAX event stream.
+ */
+public interface Cell {
+
+ /**
+ * Renders the content to the given XHTML SAX event stream.
+ *
+ * @param handler the XHTML content handler that receives this cell's output
+ * @throws SAXException declared so implementations can propagate SAX errors raised while rendering
+ */
+ void render(XHTMLContentHandler handler) throws SAXException;
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-module/src/main/java/org/apache/tika/parser/microsoft/CellDecorator.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/** Cell decorator: wraps another {@link Cell} and forwards {@link #render} to it unchanged. */
+public class CellDecorator implements Cell {
+    /** The wrapped cell that rendering is delegated to. */
+    private final Cell cell;
+
+    /** Wraps {@code cell}; the reference is stored as given, with no null check. */
+    public CellDecorator(Cell cell) {
+        this.cell = cell;
+    }
+
+    /** Renders by calling {@code render} on the wrapped cell with the same handler. */
+    @Override
+    public void render(XHTMLContentHandler handler) throws SAXException {
+        cell.render(handler);
+    }
+}