You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [7/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-mo...
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,913 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a CHM LZX block. Depending on the block type, the most
+ * appropriate decompression method is chosen. A block type can be one of:
+ * <ul>
+ * <li>UNDEFINED - no action is taken, i.e. the block is skipped</li>
+ * <li>VERBATIM</li>
+ * <li>ALIGNED_OFFSET</li>
+ * <li>UNCOMPRESSED - the simplest type</li>
+ * </ul>
+ * In addition there are unknown types (4-7). Currently such a type is
+ * replaced by the type of the previous CHM block. We need to invent a more
+ * appropriate way to handle such types.
+ */
+public class ChmLzxBlock {
+ private int block_number;
+ private long block_length;
+ private ChmLzxState state;
+ private byte[] content = null;
+ private ChmSection chmSection = null;
+ private int contentLength = 0;
+
+ // trying to find solution for bad blocks ...
+ private int previousBlockType = -1;
+
+ /**
+  * Creates a block and immediately decompresses {@code dataSegment} into it.
+  *
+  * @param blockNumber index of this block; must not be negative
+  * @param dataSegment compressed bytes of this block; must not be null or empty
+  * @param blockLength number of bytes to decode into this block; must be
+  *                    greater than zero
+  * @param prevBlock   previously decoded block whose state (and possibly
+  *                    content) is carried over, or {@code null} if there is none
+  * @throws TikaException if a parameter is invalid or the data cannot be decoded
+  */
+ public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+         ChmLzxBlock prevBlock) throws TikaException {
+     if (!validateConstructorParams(blockNumber, dataSegment, blockLength)) {
+         throw new TikaException("Check your chm lzx block parameters");
+     }
+     setBlockNumber(blockNumber);
+
+     // The previous content is handed to the section only when the previous
+     // state reports a block length larger than what remains of that block.
+     boolean usePrevContent = prevBlock != null
+             && prevBlock.getState().getBlockLength() > prevBlock.getState().getBlockRemaining();
+     if (usePrevContent) {
+         setChmSection(new ChmSection(dataSegment, prevBlock.getContent()));
+     } else {
+         setChmSection(new ChmSection(dataSegment));
+     }
+
+     setBlockLength(blockLength);
+
+     // ============================================
+     // we need to take care of previous context
+     // ============================================
+     checkLzxBlock(prevBlock);
+
+     // The output buffer always has blockLength bytes; the former two-way
+     // branch compared blockLength with itself and allocated the same size
+     // on both paths.
+     setContent((int) blockLength);
+
+     if (prevBlock != null && prevBlock.getState() != null) {
+         previousBlockType = prevBlock.getState().getBlockType();
+     }
+
+     extractContent();
+ }
+
+ /** Returns the number of bytes of {@code content} decoded so far. */
+ protected int getContentLength() {
+ return contentLength;
+ }
+
+ /** Records the number of bytes of {@code content} decoded so far. */
+ protected void setContentLength(int contentLength) {
+ this.contentLength = contentLength;
+ }
+
+ /** Returns the section wrapping this block's data segment. */
+ private ChmSection getChmSection() {
+ return chmSection;
+ }
+
+ /** Sets the section wrapping this block's data segment. */
+ private void setChmSection(ChmSection chmSection) {
+ this.chmSection = chmSection;
+ }
+
+ /**
+  * Guards against decoding without a decoder state.
+  *
+  * @throws TikaException (a {@link ChmParsingException}) if no state has been set
+  */
+ private void assertStateNotNull() throws TikaException {
+     if (getState() != null) {
+         return;
+     }
+     throw new ChmParsingException("state is null");
+ }
+
+ /**
+ * Decodes the section data into {@code content}. A new LZX block header is
+ * read whenever the state reports that nothing remains of the current LZX
+ * block. The loop ends when {@code block_length} bytes have been decoded or
+ * when a pass produced no new bytes.
+ *
+ * @throws TikaException if the state is missing or decoding fails
+ */
+ private void extractContent() throws TikaException {
+ assertStateNotNull();
+ if (getChmSection().getData() != null) {
+ boolean continueLoop = true;
+ while (continueLoop && getContentLength() < getBlockLength()) {
+ // nothing remains of the current LZX block: read the next block header
+ if (getState() != null && getState().getBlockRemaining() == 0) {
+ if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+ getState().setHadStarted(LzxState.STARTED_DECODING);
+ // a 1-bit flag; when set, two 16-bit reads are combined (high part
+ // first) into the Intel file size, negative results are stored as 0
+ if (getChmSection().getSyncBits(1) == 1) {
+ int intelSizeTemp = (getChmSection()
+ .getSyncBits(16) << 16)
+ + getChmSection().getSyncBits(16);
+ if (intelSizeTemp >= 0)
+ getState().setIntelFileSize(intelSizeTemp);
+ else
+ getState().setIntelFileSize(0);
+ }
+ }
+ // block type comes from a 3-bit read, block length from a 16-bit read
+ // shifted left by 8 plus an 8-bit read
+ getState().setBlockType(getChmSection().getSyncBits(3));
+ getState().setBlockLength(
+ (getChmSection().getSyncBits(16) << 8)
+ + getChmSection().getSyncBits(8));
+ getState().setBlockRemaining(getState().getBlockLength());
+
+ // ----------------------------------------
+ // Trying to handle 4 - 7 block types (the check below is "> 3"):
+ // such a type is replaced by the previous block's type
+ // NOTE(review): a previous type of exactly 3 is not reused because
+ // the condition is "< 3" - confirm this is intended
+ // ----------------------------------------
+ if (getState().getBlockType() > 3) {
+ if (previousBlockType >= 0 && previousBlockType < 3)
+ getState().setBlockType(previousBlockType);
+ }
+
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ createAlignedTreeTable();
+ //fall through
+ case ChmCommons.VERBATIM:
+ /* Creates mainTreeTable */
+ createMainTreeTable();
+ createLengthTreeTable();
+ // a non-zero code length for symbol 0xe8 switches the Intel state on
+ if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+ getState().setIntelState(IntelState.STARTED);
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ getState().setIntelState(IntelState.STARTED);
+ if (getChmSection().getTotal() > 16)
+ getChmSection().setSwath(
+ getChmSection().getSwath() - 1);
+ // R0, R1 and R2 are each taken from 4 bytes whose order is reversed
+ // before they are interpreted by BigInteger
+ getState().setR0(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR1(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR2(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ break;
+ default:
+ break;
+ }
+ } //end of if BlockRemaining == 0
+
+ // tempLen is the content offset up to which this pass decodes; whatever
+ // would exceed this block's length stays in blockRemaining
+ int tempLen;
+
+ if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
+ getState().setBlockRemaining(
+ getContentLength() + getState().getBlockRemaining()
+ - (int) getBlockLength());
+ tempLen = (int) getBlockLength();
+ } else {
+ tempLen = getContentLength()
+ + getState().getBlockRemaining();
+ getState().setBlockRemaining(0);
+ }
+
+ int lastLength = getContentLength();
+ // the section data doubles as "previous content" when the section has none
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+ decompressAlignedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());// prevcontext
+ break;
+ case ChmCommons.VERBATIM:
+ decompressVerbatimBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ decompressUncompressedBlock(tempLen, getChmSection().getPrevContent() == null ? getChmSection().getData() : getChmSection().getPrevContent());
+ break;
+ }
+ getState().increaseFramesRead();
+ if ((getState().getFramesRead() < 32768)
+ && getState().getIntelFileSize() != 0)
+ intelE8Decoding();
+
+ // stop as soon as a pass did not produce any new bytes
+ continueLoop = getContentLength() > lastLength;
+ }
+ }
+ }
+
+ /**
+ * Post-processes the decoded content for the Intel E8 translation and
+ * reduces the state's remaining block count by this block's length.
+ * When the block is not longer than {@code LZX_PRETREE_TABLEBITS} or the
+ * Intel state has not started, only the remaining count is adjusted.
+ */
+ protected void intelE8Decoding() {
+ if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+ || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ } else {
+ // curpos starts from the remaining count as it was before the adjustment
+ long curpos = getState().getBlockRemaining();
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ int i = 0;
+ while (i < getBlockLength() - 10) {
+ // NOTE(review): content[i] is a signed byte (-128..127) and is promoted
+ // to int for this comparison, so it can never equal 0xe8 (232); the
+ // test is always true and the translation below is never reached.
+ // A fix needs (content[i] & 0xff) and a re-check of the offsets and of
+ // the curpos bookkeeping below - confirm against the LZX reference.
+ if (content[i] != 0xe8) {
+ i++;
+ continue;
+ }
+ // the four bytes starting at i are taken in reverse order as the
+ // absolute offset
+ byte[] b = new byte[4];
+ b[0] = getContent()[i + 3];
+ b[1] = getContent()[i + 2];
+ b[2] = getContent()[i + 1];
+ b[3] = getContent()[i + 0];
+ long absoff = (new BigInteger(b)).longValue();
+ if ((absoff >= -curpos)
+ && (absoff < getState().getIntelFileSize())) {
+ long reloff = (absoff >= 0) ? absoff - curpos : absoff
+ + getState().getIntelFileSize();
+ // the relative offset is written back lowest byte first
+ getContent()[i + 0] = (byte) reloff;
+ getContent()[i + 1] = (byte) (reloff >>> 8);
+ getContent()[i + 2] = (byte) (reloff >>> 16);
+ getContent()[i + 3] = (byte) (reloff >>> 24);
+ }
+ i += 4;
+ curpos += 5;
+ }
+ }
+ }
+
+ private short[] createPreLenTable() {
+ short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+ for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(
+ ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+ }
+ return tmp;
+ }
+
+ /**
+  * Reads the pre-tree, uses it to read the length-tree code lengths and
+  * stores the resulting length-tree decode table in the state.
+  *
+  * @throws TikaException if a table is missing or cannot be built
+  */
+ private void createLengthTreeTable() throws TikaException {
+     // Read Pre Tree Table
+     short[] prelentable = createPreLenTable();
+     if (prelentable == null) {
+         // the message used to name the wrong table ("pretreetable")
+         throw new ChmParsingException("prelentable is null");
+     }
+
+     short[] pretreetable = createTreeTable2(prelentable,
+             (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+                     + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+             ChmConstants.LZX_PRETREE_TABLEBITS,
+             ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+     if (pretreetable == null) {
+         throw new ChmParsingException("pretreetable is null");
+     }
+
+     // Build Length Tree
+     createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+             pretreetable, prelentable);
+
+     getState().setLengthTreeTable(
+             createTreeTable2(getState().getLengthTreeLengtsTable(),
+                     (1 << ChmConstants.LZX_LENGTH_TABLEBITS)
+                             + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+                     ChmConstants.LZX_LENGTH_TABLEBITS,
+                     ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+ }
+
+ /**
+  * Copies raw bytes from the section into
+  * {@code content[getContentLength() .. len)}.
+  *
+  * <p>The caller ({@code extractContent}) has already computed {@code len}
+  * and updated the state's remaining count before calling this method. The
+  * previous implementation ignored {@code len} and derived the copy size
+  * from the already-updated remaining count, so it copied nothing whenever
+  * the LZX block fitted into this block and overwrote the remaining count
+  * otherwise. This version follows the same contract as the verbatim and
+  * aligned decoders: fill up to {@code len} and leave the state untouched.
+  *
+  * @param len         content offset (exclusive) up to which bytes are copied
+  * @param prevcontent unused; kept for symmetry with the other decoders
+  */
+ private void decompressUncompressedBlock(int len, byte[] prevcontent) {
+     for (int i = getContentLength(); i < len; i++) {
+         content[i] = getChmSection().getByte();
+     }
+     setContentLength(len);
+ }
+
+ private void decompressAlignedBlock(int len, byte[] prevcontent) throws TikaException {
+
+ if ((getChmSection() == null) || (getState() == null)
+ || (getState().getMainTreeTable() == null))
+ throw new ChmParsingException("chm section is null");
+
+ short s;
+ int x, i, border;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ /* new code */
+ //read huffman tree from main tree
+ border = getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS);
+ if (border >= getState().mainTreeTable.length)
+ throw new ChmParsingException("error decompressing aligned block.");
+ //break;
+ /* end new code */
+ s = getState().mainTreeTable[getChmSection().peekBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS)];
+ if (s >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().mainTreeTable[s]) >= getState()
+ .getMainTreeElements());
+ }
+ //System.out.printf("%d,", s);
+ //?getChmSection().getSyncBits(getState().mainTreeTable[s]);
+ getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ content[i] = (byte) s;
+ } else {
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ matchfooter = getState().lengthTreeTable[getChmSection()
+ .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];//.LZX_MAINTREE_TABLEBITS)];
+ if (matchfooter >= ChmConstants.LZX_LENGTH_MAXSYMBOLS/*?LZX_LENGTH_TABLEBITS*/) {
+ x = ChmConstants.LZX_LENGTH_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().lengthTreeLengtsTable[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ matchoffset = s >>> 3;
+ if (matchoffset > 2) {
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+ if (extra > 3) {
+ extra -= 3;
+ long verbatim_bits = getChmSection().getSyncBits(extra);
+ matchoffset += (verbatim_bits << 3);
+ //READ HUFF SYM in Aligned Tree
+ int aligned_bits = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[aligned_bits];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS; //?LZX_ALIGNED_TABLEBITS
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra == 3) {
+ int g = getChmSection().peekBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS);
+ int t = getState().getAlignedTreeTable()[g];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_ALIGNED_TABLEBITS; //?LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedLenTable()[t]);
+ matchoffset += t;
+ } else if (extra > 0) {
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset += l;
+ } else
+ matchoffset = 1;
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /** match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ if (i > len)
+ break;
+
+ if (runsrc < 0) {
+ if (matchlen + runsrc <= 0) {
+ runsrc = prevcontent.length + runsrc;
+ while (matchlen-- > 0)
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrappes around source data */
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ /**
+  * Rejects a missing table.
+  *
+  * @param array table to check
+  * @throws TikaException (a {@link ChmParsingException}) if {@code array} is null
+  */
+ private void assertShortArrayNotNull(short[] array) throws TikaException {
+     if (array != null) {
+         return;
+     }
+     throw new ChmParsingException("short[] is null");
+ }
+
+ /**
+  * Decodes a verbatim block into {@code content[getContentLength() .. len)}.
+  *
+  * <p>Each main-tree symbol is either a literal byte or a match. For a match
+  * the length and offset are decoded, the R0/R1/R2 offsets in the state are
+  * updated, and the bytes are copied from earlier output. A match that
+  * reaches back before the start of {@code content} takes its leading bytes
+  * from the tail of {@code prevcontent}.
+  *
+  * @param len         content offset (exclusive) up to which to decode
+  * @param prevcontent source for matches reaching before the start of content
+  * @throws TikaException if the main tree table is missing or a match would
+  *                       write past the end of the content buffer
+  */
+ private void decompressVerbatimBlock(int len, byte[] prevcontent) throws TikaException {
+     short s;
+     int x, i;
+     int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+     int matchoffset = 0;
+     for (i = getContentLength(); i < len; i++) {
+         // read huffman symbol from the main tree
+         int f = getChmSection().peekBits(
+                 ChmConstants.LZX_MAINTREE_TABLEBITS);
+         assertShortArrayNotNull(getState().getMainTreeTable());
+         s = getState().getMainTreeTable()[f];
+         if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+             // code longer than the direct table: walk the tree bit by bit
+             x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+             do {
+                 x++;
+                 s <<= 1;
+                 s += getChmSection().checkBit(x);
+             } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+         }
+         getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+         if (s < ChmConstants.LZX_NUM_CHARS) {
+             // literal
+             content[i] = (byte) s;
+         } else {
+             // match: low bits select the length, the rest the position slot
+             s -= ChmConstants.LZX_NUM_CHARS;
+             matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+             if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+                 matchfooter = getState().getLengthTreeTable()[getChmSection()
+                         .peekBits(ChmConstants.LZX_LENGTH_TABLEBITS)];
+                 if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+                     x = ChmConstants.LZX_LENGTH_TABLEBITS;
+                     do {
+                         x++;
+                         matchfooter <<= 1;
+                         matchfooter += getChmSection().checkBit(x);
+                     } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+                 }
+                 getChmSection().getSyncBits(
+                         getState().getLengthTreeLengtsTable()[matchfooter]);
+                 matchlen += matchfooter;
+             }
+             matchlen += ChmConstants.LZX_MIN_MATCH;
+             // shorter than 2
+             matchoffset = s >>> 3;
+             if (matchoffset > 2) {
+                 if (matchoffset != 3) { // should get other bits to retrieve
+                                         // offset
+                     extra = ChmConstants.EXTRA_BITS[matchoffset];
+                     long l = getChmSection().getSyncBits(extra);
+                     matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+                 } else {
+                     matchoffset = 1;
+                 }
+                 getState().setR2(getState().getR1());
+                 getState().setR1(getState().getR0());
+                 getState().setR0(matchoffset);
+             } else if (matchoffset == 0) {
+                 matchoffset = (int) getState().getR0();
+             } else if (matchoffset == 1) {
+                 matchoffset = (int) getState().getR1();
+                 getState().setR1(getState().getR0());
+                 getState().setR0(matchoffset);
+             } else /* match_offset == 2 */
+             {
+                 matchoffset = (int) getState().getR2();
+                 getState().setR2(getState().getR0());
+                 getState().setR0(matchoffset);
+             }
+             rundest = i;
+             runsrc = rundest - matchoffset;
+             i += (matchlen - 1);
+             if (i > len)
+                 break;
+             if (runsrc < 0) {
+                 if (matchlen + runsrc <= 0) {
+                     // the whole match lies in the previous content
+                     runsrc = prevcontent.length + runsrc;
+                     while ((matchlen-- > 0) && (prevcontent != null)
+                             && ((runsrc + 1) > 0))
+                         if ((rundest < content.length)
+                                 && (runsrc < content.length))
+                             content[rundest++] = prevcontent[runsrc++];
+                 } else {
+                     // The match straddles the boundary: -runsrc bytes come from
+                     // the tail of prevcontent, the rest from the start of
+                     // content. The remainder has to be computed before runsrc
+                     // is advanced; computing it afterwards left matchlen
+                     // unchanged and copied -runsrc bytes too many.
+                     matchlen = matchlen + runsrc;
+                     runsrc = prevcontent.length + runsrc;
+                     while (runsrc < prevcontent.length) {
+                         // The former guarded copy skipped the assignment without
+                         // advancing either index when the guard failed, which
+                         // made this loop spin forever. Fail instead when the
+                         // destination is exhausted.
+                         if (rundest >= content.length) {
+                             throw new ChmParsingException(
+                                     "error decompressing verbatim block.");
+                         }
+                         content[rundest++] = prevcontent[runsrc++];
+                     }
+                     runsrc = 0;
+                     while (matchlen-- > 0)
+                         content[rundest++] = content[runsrc++];
+                 }
+
+             } else {
+                 // runsrc >= 0 here, so the former "wrapped source" loop could
+                 // never execute and has been dropped
+                 /* copies match data - no worries about destination wraps */
+                 while (matchlen-- > 0) {
+                     if ((rundest < content.length)
+                             && (runsrc < content.length))
+                         content[rundest++] = content[runsrc++];
+                 }
+             }
+         }
+     }
+     setContentLength(len);
+ }
+
+ /**
+ * Fills the state's length-tree code length table for indexes
+ * {@code offset} (inclusive) to {@code tablelen} (exclusive) by decoding
+ * pre-tree symbols from the section.
+ *
+ * @param offset first index to fill
+ * @param tablelen index at which filling stops
+ * @param pretreetable decode table of the pre-tree
+ * @param prelentable code lengths of the pre-tree symbols
+ * @throws TikaException if a table or the section is null
+ */
+ private void createLengthTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ // NOTE(review): prelentable is checked twice in this condition
+ if (prelentable == null || getChmSection() == null
+ || pretreetable == null || prelentable == null)
+ throw new ChmParsingException("is null");
+
+ int i = offset; // represents offset
+ int z, y, x;// local counters
+ while (i < tablelen) {
+ //Read HUFF sym to z
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+ // 20
+ // code longer than the direct table: walk the tree bit by bit
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+
+ if (z < 17) {
+ // symbols 0-16: new length = (old length - z), wrapped by adding 17
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().getLengthTreeLengtsTable()[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ // symbol 17: run of (4-bit value + 4) zeros, clipped to the table
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 18) {
+ // symbol 18: run of (5-bit value + 20) zeros, not clipped
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++)
+ //no tolerate //if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 19) {
+ // symbol 19: run of (1-bit value + 4) copies of a length decoded from
+ // one more pre-tree symbol
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);//LZX_MAINTREE_TABLEBITS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ getState().getLengthTreeLengtsTable()[i++] = (short) z;
+ }
+ }
+ }
+
+ private void createMainTreeTable() throws TikaException {
+ //Read Pre Tree Table
+ short[] prelentable = createPreLenTable();
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+ prelentable);
+
+ //Read Pre Tree Table
+ prelentable = createPreLenTable();
+ pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+ getState().mainTreeLengtsTable.length, pretreetable,
+ prelentable);
+
+ getState().setMainTreeTable(
+ createTreeTable2(getState().mainTreeLengtsTable,
+ (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+ + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+ .getMainTreeElements()));
+ }
+
+ /**
+ * Fills the state's main-tree code length table for indexes {@code offset}
+ * (inclusive) to {@code tablelen} (exclusive) by decoding pre-tree symbols
+ * from the section.
+ *
+ * @param offset first index to fill
+ * @param tablelen index at which filling stops
+ * @param pretreetable decode table of the pre-tree
+ * @param prelentable code lengths of the pre-tree symbols
+ * @throws TikaException if {@code pretreetable} is null or a zero run would
+ * leave the table
+ */
+ private void createMainTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) throws TikaException {
+ if (pretreetable == null)
+ throw new ChmParsingException("pretreetable is null");
+ int i = offset;
+ int z, y, x;
+ while (i < tablelen) {
+ int f = getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS);
+ z = pretreetable[f];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ // code longer than the direct table: walk the tree bit by bit
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ if (z < 17) {
+ // symbols 0-16: new length = (old length - z), wrapped by adding 17
+ z = getState().getMainTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().mainTreeLengtsTable[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ // symbol 17: run of (4-bit value + 4) zeros; leaving the table is an error
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 18) {
+ // symbol 18: run of (5-bit value + 20) zeros; leaving the table is an error
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 19) {
+ // symbol 19: run of (1-bit value + 4) copies of a length decoded from
+ // one more pre-tree symbol; the run is clipped to the table
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().peekBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS)];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().mainTreeLengtsTable[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getMainTreeLengtsTable().length)
+ getState().mainTreeLengtsTable[i++] = (short) z;
+ }
+ }
+ }
+
+ private void assertInRange(short[] array, int index) throws ChmParsingException {
+ if (index >= array.length)
+ throw new ChmParsingException(index + " is bigger than "
+ + array.length);
+ }
+
+ private short[] createAlignedLenTable() {
+ int tablelen = ChmConstants.LZX_ALIGNED_NUM_ELEMENTS;//LZX_BLOCKTYPE_UNCOMPRESSED;//
+ int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+ short[] tmp = new short[tablelen];
+ for (int i = 0; i < tablelen; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(bits);
+ }
+ return tmp;
+ }
+
+ private void createAlignedTreeTable() throws ChmParsingException {
+ getState().setAlignedLenTable(createAlignedLenTable());
+ getState().setAlignedTreeTable(//setAlignedLenTable(
+ createTreeTable2(getState().getAlignedLenTable(),
+ (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+ + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+ ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+ }
+
+ /**
+  * Builds a decode table from a table of code lengths.
+  *
+  * <p>Codes of at most {@code bits} bits get direct entries in the first
+  * {@code 1 << bits} slots. Longer codes (up to 16 bits) are reached through
+  * pairs of slots allocated behind the direct part.
+  *
+  * @param lentable  code length per symbol
+  * @param tablelen  size of the decode table to allocate
+  * @param bits      number of bits covered by the direct part
+  * @param maxsymbol number of symbols to consider
+  * @return the decode table
+  * @throws ChmParsingException if the code lengths overflow the table
+  */
+ private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+         int maxsymbol) throws ChmParsingException {
+     short[] table = new short[tablelen];
+     short sym;
+     int leaf;
+     int codeLen = 1;
+     long fill;
+     int pos = 0; /* the current position in the decode table */
+     long tableEnd = (1 << bits);
+     long step = (tableEnd >> 1);
+     long nextFree = step;
+
+     /* fills entries for short codes for a direct mapping */
+     for (; codeLen <= bits; codeLen++, step >>= 1) {
+         for (sym = 0; sym < maxsymbol; sym++) {
+             if (lentable.length <= sym || lentable[sym] != codeLen) {
+                 continue;
+             }
+             leaf = pos;
+             if ((pos += step) > tableEnd) {
+                 /* table overflow */
+                 throw new ChmParsingException("Table overflow");
+             }
+             for (fill = step; fill > 0; fill--) {
+                 table[leaf++] = sym;
+             }
+         }
+     }
+
+     /* if there are any codes longer than nbits */
+     if (pos != tableEnd) {
+         /* clears the remainder of the table */
+         for (leaf = pos; leaf < tableEnd; leaf++) {
+             table[leaf] = 0;
+         }
+
+         /* gives ourselves room for codes to grow by up to 16 more bits */
+         pos <<= 16;
+         tableEnd <<= 16;
+         step = 1 << 15;
+
+         for (; codeLen <= 16; codeLen++, step >>= 1) {
+             for (sym = 0; sym < maxsymbol; sym++) {
+                 if (lentable.length <= sym || lentable[sym] != codeLen) {
+                     continue;
+                 }
+                 leaf = pos >> 16;
+                 for (fill = 0; fill < codeLen - bits; fill++) {
+                     /* if this path hasn't been taken yet, 'allocate' two entries */
+                     if (table[leaf] == 0 && ((nextFree << 1) + 1) < table.length) {
+                         table[(int) (nextFree << 1)] = 0;
+                         table[(int) (nextFree << 1) + 1] = 0;
+                         table[leaf] = (short) nextFree++;
+                     }
+                     /* follows the path and select either left or right for next bit */
+                     leaf = table[leaf] << 1;
+                     if (((pos >> (15 - fill)) & 1) != 0) {
+                         leaf++;
+                     }
+                 }
+                 table[leaf] = sym;
+
+                 if ((pos += step) > tableEnd) {
+                     /* table overflow */
+                     throw new ChmParsingException("Table overflow");
+                 }
+             }
+         }
+     }
+
+     /* the table is returned whether or not it ended up full */
+     return table;
+ }
+
+ public byte[] getContent() {
+ return content;
+ }
+
+ public byte[] getContent(int startOffset, int endOffset) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ startOffset, endOffset) : new byte[1];
+ }
+
+ public byte[] getContent(int start) {
+ return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
+ start, getContent().length) : new byte[1];
+ }
+
+ /** Allocates a fresh, zero-filled content buffer of {@code contentLength} bytes. */
+ private void setContent(int contentLength) {
+ this.content = new byte[contentLength];
+ }
+
+ /**
+  * Initialises the decoder state: a clone of the previous block's state when
+  * there is a previous block, otherwise a fresh state sized by this block's
+  * length.
+  *
+  * @param chmPrevLzxBlock previously decoded block, or {@code null}
+  * @throws TikaException if there is no previous block and the block length
+  *                       does not fit the state (previously this case ended
+  *                       in a NullPointerException)
+  */
+ private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) throws TikaException {
+     if (chmPrevLzxBlock != null) {
+         //use clone to avoid changing a cached or to be cached block
+         setState(chmPrevLzxBlock.getState().clone());
+         return;
+     }
+     if (getBlockLength() >= Integer.MAX_VALUE) {
+         throw new ChmParsingException("block length is too large: "
+                 + getBlockLength());
+     }
+     setState(new ChmLzxState((int) getBlockLength()));
+ }
+
+ /**
+  * Validates the constructor arguments.
+  *
+  * @param blockNumber must not be negative
+  * @param dataSegment must be non-null and non-empty
+  * @param blockLength must be greater than zero
+  * @return {@code true} when all arguments are acceptable
+  * @throws TikaException (a {@link ChmParsingException}) naming the first
+  *                       argument that is not acceptable
+  */
+ private boolean validateConstructorParams(int blockNumber,
+         byte[] dataSegment, long blockLength) throws TikaException {
+     if (blockNumber < 0) {
+         throw new ChmParsingException("block number should be possitive");
+     }
+     if (dataSegment == null || dataSegment.length == 0) {
+         throw new ChmParsingException("data segment should not be null");
+     }
+     if (blockLength <= 0) {
+         throw new ChmParsingException(
+                 "block length should be more than zero");
+     }
+     return true;
+ }
+
+ /** Returns the block number given to the constructor. */
+ public int getBlockNumber() {
+ return block_number;
+ }
+
+ /** Stores the block number. */
+ private void setBlockNumber(int block_number) {
+ this.block_number = block_number;
+ }
+
+ /** Returns the number of bytes this block decodes into {@code content}. */
+ private long getBlockLength() {
+ return block_length;
+ }
+
+ /** Stores the number of bytes this block decodes into {@code content}. */
+ private void setBlockLength(long block_length) {
+ this.block_length = block_length;
+ }
+
+ /** Returns the decoder state of this block (the instance itself, not a copy). */
+ public ChmLzxState getState() {
+ return state;
+ }
+
+ /** Replaces the decoder state of this block. */
+ private void setState(ChmLzxState state) {
+ this.state = state;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmLzxState implements Cloneable {
+ /* Class' members */
+ private int window; /* the actual decoding window */
+ private long window_size; /* window size (32Kb through 2Mb) */
+ private int window_position; /* current offset within the window */
+ private int main_tree_elements; /* number of main tree elements */
+ private LzxState hadStarted; /* have we started decoding at all yet? */
+ private int block_type; /* type of this block */
+ private int block_length; /* uncompressed length of this block */
+ private int block_remaining; /* uncompressed bytes still left to decode */
+ private int frames_read; /* the number of CFDATA blocks processed */
+ private int intel_file_size; /* magic header value used for transform */
+ private long intel_current_possition; /* current offset in transform space */
+ private IntelState intel_state; /* have we seen any translatable data yet? */
+ private long R0; /* for the LRU offset system */
+ private long R1; /* for the LRU offset system */
+ private long R2; /* for the LRU offset system */
+
+ // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+ protected short[] mainTreeLengtsTable;
+ protected short[] mainTreeTable;
+
+ protected short[] lengthTreeTable;
+ protected short[] lengthTreeLengtsTable;
+
+ protected short[] alignedLenTable;
+ protected short[] alignedTreeTable;
+
+ @Override
+ public ChmLzxState clone() {
+ try {
+ ChmLzxState clone = (ChmLzxState)super.clone();
+ clone.mainTreeLengtsTable = arrayClone(mainTreeLengtsTable);
+ clone.mainTreeTable = arrayClone(mainTreeTable);
+ clone.lengthTreeTable = arrayClone(lengthTreeTable);
+ clone.lengthTreeLengtsTable = arrayClone(lengthTreeLengtsTable);
+ clone.alignedLenTable = arrayClone(alignedLenTable);
+ clone.alignedTreeTable = arrayClone(alignedTreeTable);
+ return clone;
+ } catch (CloneNotSupportedException ex) {
+ return null;
+ }
+ }
+
+ protected short[] getMainTreeTable() {
+ return mainTreeTable;
+ }
+
+ protected short[] getAlignedTreeTable() {
+ return alignedTreeTable;
+ }
+
+ protected void setAlignedTreeTable(short[] alignedTreeTable) {
+ this.alignedTreeTable = alignedTreeTable;
+ }
+
+ protected short[] getLengthTreeTable() throws TikaException {
+ if (lengthTreeTable != null)
+ return this.lengthTreeTable;
+ else
+ throw new ChmParsingException("lengthTreeTable is null");
+ }
+
+ protected void setLengthTreeTable(short[] lengthTreeTable) {
+ this.lengthTreeTable = lengthTreeTable;
+ }
+
+ protected void setMainTreeTable(short[] mainTreeTable) {
+ this.mainTreeTable = mainTreeTable;
+ }
+
+ protected short[] getAlignedLenTable() {
+ return this.alignedLenTable;
+ }
+
+ protected void setAlignedLenTable(short[] alignedLenTable) {
+ this.alignedLenTable = alignedLenTable;
+ }
+
+ /**
+ * It suits for informative outlook
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("actual decoding window:=" + getWindow()
+ + System.getProperty("line.separator"));
+ sb.append("window size (32Kb through 2Mb):=" + getWindowSize()
+ + System.getProperty("line.separator"));
+ sb.append("current offset within the window:=" + getWindowPosition()
+ + System.getProperty("line.separator"));
+ sb.append("number of main tree elements:=" + getMainTreeElements()
+ + System.getProperty("line.separator"));
+ sb.append("have we started decoding at all yet?:=" + getHadStarted()
+ + System.getProperty("line.separator"));
+ sb.append("type of this block:=" + getBlockType()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed length of this block:=" + getBlockLength()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed bytes still left to decode:="
+ + getBlockRemaining() + System.getProperty("line.separator"));
+ sb.append("the number of CFDATA blocks processed:=" + getFramesRead()
+ + System.getProperty("line.separator"));
+ sb.append("magic header value used for transform:="
+ + getIntelFileSize() + System.getProperty("line.separator"));
+ sb.append("current offset in transform space:="
+ + getIntelCurrentPossition()
+ + System.getProperty("line.separator"));
+ sb.append("have we seen any translatable data yet?:=" + getIntelState()
+ + System.getProperty("line.separator"));
+ sb.append("R0 for the LRU offset system:=" + getR0()
+ + System.getProperty("line.separator"));
+ sb.append("R1 for the LRU offset system:=" + getR1()
+ + System.getProperty("line.separator"));
+ sb.append("R2 for the LRU offset system:=" + getR2()
+ + System.getProperty("line.separator"));
+ sb.append("main tree length:=" + getMainTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ sb.append("secondary tree length:=" + getLengthTreeLengtsTable().length
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ public ChmLzxState(int window) throws TikaException {
+ if (window >= 0) {
+ int position_slots;
+ int win = ChmCommons.getWindowSize(window);
+ setWindowSize(1 << win);
+ /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+ if (win < 15 || win > 21)
+ throw new ChmParsingException("window less than 15 or window greater than 21");
+
+ /* Calculates required position slots */
+ if (win == 20)
+ position_slots = 42;
+ else if (win == 21)
+ position_slots = 50;
+ else
+ position_slots = win << 1;
+ //TODO: position_slots is not used ?
+ setR0(1);
+ setR1(1);
+ setR2(1);
+ setMainTreeElements(512);
+ setHadStarted(LzxState.NOT_STARTED_DECODING);
+ setFramesRead(0);
+ setBlockRemaining(0);
+ setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+ setIntelCurrentPossition(0);
+ setIntelState(IntelState.NOT_STARTED);
+ setWindowPosition(0);
+ setMainTreeLengtsTable(new short[getMainTreeElements()]);
+ setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+ } else
+ throw new CancellationException(
+ "window size should be more than zero");
+ }
+
+ protected void setWindow(int window) {
+ this.window = window;
+ }
+
+ protected int getWindow() {
+ return window;
+ }
+
+ protected void setWindowSize(long window_size) {
+ this.window_size = window_size;
+ }
+
+ protected long getWindowSize() {
+ return window_size;
+ }
+
+ protected void setWindowPosition(int window_position) {
+ this.window_position = window_position;
+ }
+
+ protected int getWindowPosition() {
+ return window_position;
+ }
+
+ protected void setMainTreeElements(int main_tree_elements) {
+ this.main_tree_elements = main_tree_elements;
+ }
+
+ protected int getMainTreeElements() {
+ return main_tree_elements;
+ }
+
+ protected void setHadStarted(LzxState hadStarted) {
+ this.hadStarted = hadStarted;
+ }
+
+ protected LzxState getHadStarted() {
+ return hadStarted;
+ }
+
+ protected void setBlockType(int block_type) {
+ this.block_type = block_type;
+ }
+
+ public int getBlockType() {
+ return block_type;
+ }
+
+ protected void setBlockLength(int block_length) {
+ this.block_length = block_length;
+ }
+
+ protected int getBlockLength() {
+ return block_length;
+ }
+
+ protected void setBlockRemaining(int block_remaining) {
+ this.block_remaining = block_remaining;
+ }
+
+ protected int getBlockRemaining() {
+ return block_remaining;
+ }
+
+ protected void setFramesRead(int frames_read) {
+ this.frames_read = frames_read;
+ }
+
+ protected void increaseFramesRead() {
+ this.frames_read = getFramesRead() + 1;
+ }
+
+ protected int getFramesRead() {
+ return frames_read;
+ }
+
+ protected void setIntelFileSize(int intel_file_size) {
+ this.intel_file_size = intel_file_size;
+ }
+
+ protected int getIntelFileSize() {
+ return intel_file_size;
+ }
+
+ protected void setIntelCurrentPossition(long intel_current_possition) {
+ this.intel_current_possition = intel_current_possition;
+ }
+
+ protected long getIntelCurrentPossition() {
+ return intel_current_possition;
+ }
+
+ protected void setIntelState(IntelState intel_state) {
+ this.intel_state = intel_state;
+ }
+
+ protected IntelState getIntelState() {
+ return intel_state;
+ }
+
+ protected void setR0(long r0) {
+ R0 = r0;
+ }
+
+ protected long getR0() {
+ return R0;
+ }
+
+ protected void setR1(long r1) {
+ R1 = r1;
+ }
+
+ protected long getR1() {
+ return R1;
+ }
+
+ protected void setR2(long r2) {
+ R2 = r2;
+ }
+
+ protected long getR2() {
+ return R2;
+ }
+
+ public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+ this.mainTreeLengtsTable = mainTreeLengtsTable;
+ }
+
+ public short[] getMainTreeLengtsTable() {
+ return mainTreeLengtsTable;
+ }
+
+ public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+ this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+ }
+
+ public short[] getLengthTreeLengtsTable() {
+ return lengthTreeLengtsTable;
+ }
+
+ private static short[] arrayClone(short[] a) {
+ return a==null ? null : (short[]) a.clone();
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * Read cursor over a byte array that offers byte-wise reads (integers,
+ * encoded integers, UTF characters) and bit-wise reads through an internal
+ * bit buffer. Reading a single byte past the end of the data yields 0
+ * instead of failing.
+ */
+public class ChmSection {
+    final private byte[] data;
+    final private byte[] prevcontent;
+    private int swath; // index of the next unread byte in data
+    private int total; // number of unread bits currently held in buffer
+    private int buffer; // bit buffer filled by getDesyncBits
+
+    /**
+     * @param data the bytes to read from; checked with
+     *        {@code ChmCommons.assertByteArrayNotNull}
+     * @throws TikaException if that check fails
+     */
+    public ChmSection(byte[] data) throws TikaException {
+        this(data, null);
+    }
+
+    /**
+     * @param data the bytes to read from; checked with
+     *        {@code ChmCommons.assertByteArrayNotNull}
+     * @param prevconent content associated with this section, may be
+     *        {@code null}; only stored and handed out by {@link #getPrevContent()}
+     * @throws TikaException if the check of {@code data} fails
+     */
+    public ChmSection(byte[] data, byte[] prevconent) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(data);
+        this.data = data;
+        this.prevcontent = prevconent;
+    }
+
+    /* Utilities */
+
+    /**
+     * Reverses the given array via {@code ChmCommons.reverse} and returns the
+     * same array instance.
+     *
+     * @throws TikaException if the array fails the not-null check
+     */
+    public byte[] reverseByteOrder(byte[] toBeReversed) throws TikaException {
+        ChmCommons.assertByteArrayNotNull(toBeReversed);
+        ChmCommons.reverse(toBeReversed);
+        return toBeReversed;
+    }
+
+    /**
+     * @return 1 if the bit {@code i} positions below the top of the buffered
+     *         bits is set, otherwise 0; does not consume anything
+     */
+    public int checkBit(int i) {
+        return ((getBuffer() & (1 << (getTotal() - i))) == 0) ? 0 : 1;
+    }
+
+    /** Reads {@code bit} bits from the bit buffer and consumes them. */
+    public int getSyncBits(int bit) {
+        return getDesyncBits(bit, bit);
+    }
+
+    /** Reads {@code bit} bits from the bit buffer without consuming them. */
+    public int peekBits(int bit) {
+        return getDesyncBits(bit, 0);
+    }
+
+    /**
+     * Returns the top {@code bit} buffered bits and consumes {@code removeBit}
+     * of them.
+     */
+    private int getDesyncBits(int bit, int removeBit) {
+        // Refill 16 bits at a time: the first byte read becomes the low byte,
+        // the second the high byte, placed below the bits already buffered.
+        while (getTotal() < 16) {
+            setBuffer((getBuffer() << 16) + unmarshalUByte()
+                    + (unmarshalUByte() << 8));
+            setTotal(getTotal() + 16);
+        }
+        int tmp = (getBuffer() >>> (getTotal() - bit));
+        setTotal(getTotal() - removeBit);
+        // Clear everything above the bits that are still unread.
+        setBuffer(getBuffer() - ((getBuffer() >>> getTotal()) << getTotal()));
+        return tmp;
+    }
+
+    /** @return the next byte as a value in 0..255 (0 past the end of the data) */
+    public int unmarshalUByte() {
+        return getByte() & 255;
+    }
+
+    /**
+     * @return the next byte and advances the cursor, or 0 (without advancing)
+     *         when the cursor is at or past the end of the data
+     */
+    public byte getByte() {
+        if (getSwath() < getData().length) {
+            setSwath(getSwath() + 1);
+            return getData()[getSwath() - 1];
+        } else
+            return 0;
+    }
+
+    /** @return the number of bytes between the cursor and the end of the data */
+    public int getLeft() {
+        return (getData().length - getSwath());
+    }
+
+    /** @return the underlying data array (not a copy) */
+    public byte[] getData() {
+        return data;
+    }
+
+    /** @return the array passed as second constructor argument, possibly {@code null} */
+    public byte[] getPrevContent() {
+        return prevcontent;
+    }
+
+    /**
+     * Reads up to {@code i} bytes, least significant byte first, and advances
+     * the cursor by the number of bytes actually read.
+     * <p>
+     * The bytes are interpreted by {@code BigInteger(byte[])}, i.e. as a
+     * signed two's-complement value, so the result is negative when the top
+     * bit of the last byte read is set.
+     *
+     * @param i the number of bytes wanted; clamped to the bytes remaining
+     * @return the value read, or {@link BigInteger#ZERO} when no byte is
+     *         available or {@code i} is not positive
+     */
+    public BigInteger getBigInteger(int i) {
+        if (getData() == null)
+            return BigInteger.ZERO;
+        int available = getData().length - getSwath();
+        if (available < i)
+            i = available;
+        // Nothing to read: new BigInteger(new byte[0]) would throw
+        // NumberFormatException and a negative count could not be allocated.
+        if (i <= 0)
+            return BigInteger.ZERO;
+        byte[] tmp = new byte[i];
+        for (int j = i - 1; j >= 0; j--) {
+            tmp[i - j - 1] = getData()[getSwath() + j];
+        }
+        setSwath(getSwath() + i);
+        return new BigInteger(tmp);
+    }
+
+    /** Converts each char of {@code s} to a byte by keeping its low 8 bits. */
+    public byte[] stringToAsciiBytes(String s) {
+        char[] c = s.toCharArray();
+        byte[] byteval = new byte[c.length];
+        for (int i = 0; i < c.length; i++)
+            byteval[i] = (byte) c[i];
+        return byteval;
+    }
+
+    /** Reads 8 bytes through {@link #getBigInteger(int)}. */
+    public BigInteger unmarshalUlong() {
+        return getBigInteger(8);
+    }
+
+    /**
+     * Reads 4 bytes through {@link #getBigInteger(int)}.
+     * NOTE(review): because of the signed interpretation there, a value with
+     * the top bit set comes back negative - confirm callers expect that.
+     */
+    public long unmarshalUInt() {
+        return getBigInteger(4).longValue();
+    }
+
+    /** Reads 4 bytes through {@link #getBigInteger(int)} as a signed int. */
+    public int unmarshalInt() {
+        return getBigInteger(4).intValue();
+    }
+
+    /**
+     * Copies the next {@code i} bytes and advances the cursor.
+     *
+     * @return the copy; for {@code i == 0} a one-element zero array, with the
+     *         cursor left untouched
+     * @throws ArrayIndexOutOfBoundsException if fewer than {@code i} bytes remain
+     */
+    public byte[] unmarshalBytes(int i) {
+        if (i == 0)
+            return new byte[1];
+        byte[] t = new byte[i];
+        for (int j = 0; j < i; j++)
+            t[j] = getData()[j + getSwath()];
+        setSwath(getSwath() + i);
+        return t;
+    }
+
+    /**
+     * Reads a variable-length integer: 7 payload bits per byte, most
+     * significant group first; a set top bit means another byte follows.
+     */
+    public BigInteger getEncint() {
+        byte ob;
+        BigInteger bi = BigInteger.ZERO;
+        while ((ob = this.getByte()) < 0) {
+            bi = bi.shiftLeft(7).add(BigInteger.valueOf(ob & 0x7f));
+        }
+        bi = bi.shiftLeft(7).add(BigInteger.valueOf(ob & 0x7f));
+        return bi;
+    }
+
+    /**
+     * Reads one UTF-8 encoded character.
+     * <p>
+     * A lead byte below 0x80 is returned as is. Otherwise the sequence length
+     * is the number of leading 1 bits of the lead byte (at least 2); the
+     * payload bits of the lead byte and 6 bits of every following byte are
+     * combined. Values wider than 16 bits are truncated by the cast to
+     * {@code char}.
+     */
+    public char unmarshalUtfChar() {
+        byte lead = this.getByte();
+        if (lead >= 0) {
+            return (char) lead;
+        }
+        // Bounded count of leading 1 bits (at most 8), so a lead byte of
+        // 0xFF can no longer spin forever.
+        int leadingOnes = Integer.numberOfLeadingZeros(~(lead << 24));
+        int length = Math.max(2, leadingOnes);
+        // Keep only the payload bits of the lead byte: 5 bits for a 2-byte
+        // sequence, 4 bits for a 3-byte sequence, and so on.
+        int n = lead & (0xFF >>> (length + 1));
+        for (int j = 1; j < length; j++) {
+            n = (n << 6) + (this.getByte() & 63); // 00111111b, gets last 6 bits
+        }
+        return (char) n;
+    }
+
+    /** @return the index of the next unread byte */
+    public int getSwath() {
+        return swath;
+    }
+
+    /** @param swath the index of the next byte to read */
+    public void setSwath(int swath) {
+        this.swath = swath;
+    }
+
+    /** @return the number of unread bits held in the bit buffer */
+    public int getTotal() {
+        return total;
+    }
+
+    /** @param total the number of unread bits held in the bit buffer */
+    public void setTotal(int total) {
+        this.total = total;
+    }
+
+    private int getBuffer() {
+        return buffer;
+    }
+
+    private void setBuffer(int buffer) {
+        this.buffer = buffer;
+    }
+
+    /**
+     * Small manual demo: prints an array before and after
+     * {@link #reverseByteOrder(byte[])}.
+     *
+     * @param args unused
+     * @throws TikaException if the section cannot be created
+     */
+    public static void main(String[] args) throws TikaException {
+        byte[] array = { 4, 78, -67, 90, 1, -33 };
+        ChmSection chmSection = new ChmSection(array);
+        System.out.println("before " + Arrays.toString(array));
+        System.out.println("after " + Arrays.toString(chmSection.reverseByteOrder(array)));
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Queue;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Mbox (mailbox) parser. This version extracts each mail from Mbox and uses the
+ * DelegatingParser to process each mail.
+ */
+public class MboxParser extends AbstractParser {
+
+ public static final String MBOX_MIME_TYPE = "application/mbox";
+ public static final String MBOX_RECORD_DIVIDER = "From ";
+ public static final int MAIL_MAX_SIZE = 50000000;
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1762689436731160661L;
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("mbox"));
+ private static final Pattern EMAIL_HEADER_PATTERN = Pattern.compile("([^ ]+):[ \t]*(.*)");
+ private static final Pattern EMAIL_ADDRESS_PATTERN = Pattern.compile("<(.*@.*)>");
+
+ private static final String EMAIL_HEADER_METADATA_PREFIX = "MboxParser-";
+ private static final String EMAIL_FROMLINE_METADATA = EMAIL_HEADER_METADATA_PREFIX + "from";
+ private final Map<Integer, Metadata> trackingMetadata = new HashMap<Integer, Metadata>();
+ private boolean tracking = false;
+
+ public static Date parseDate(String headerContent) throws ParseException {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+ return dateFormat.parse(headerContent);
+ }
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
+
+ EmbeddedDocumentExtractor extractor = context.get(EmbeddedDocumentExtractor.class,
+ new ParsingEmbeddedDocumentExtractor(context));
+
+ String charsetName = "windows-1252";
+
+ metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
+ metadata.set(Metadata.CONTENT_ENCODING, charsetName);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ InputStreamReader isr = new InputStreamReader(stream, charsetName);
+ try (BufferedReader reader = new BufferedReader(isr)) {
+ String curLine = reader.readLine();
+ int mailItem = 0;
+ do {
+ if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
+ Metadata mailMetadata = new Metadata();
+ Queue<String> multiline = new LinkedList<String>();
+ mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
+ mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+ curLine = reader.readLine();
+
+ ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
+ do {
+ if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
+ String latestLine = multiline.poll();
+ latestLine += " " + curLine.trim();
+ multiline.add(latestLine);
+ } else {
+ multiline.add(curLine);
+ }
+
+ message.write(curLine.getBytes(charsetName));
+ message.write(0x0A);
+ curLine = reader.readLine();
+ }
+ while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
+
+ for (String item : multiline) {
+ saveHeaderInMetadata(mailMetadata, item);
+ }
+
+ ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
+ message = null;
+
+ if (extractor.shouldParseEmbedded(mailMetadata)) {
+ extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
+ }
+
+ if (tracking) {
+ getTrackingMetadata().put(mailItem++, mailMetadata);
+ }
+ } else {
+ curLine = reader.readLine();
+ }
+
+ } while (curLine != null && !Thread.currentThread().isInterrupted());
+ }
+
+ xhtml.endDocument();
+ }
+
+ public boolean isTracking() {
+ return tracking;
+ }
+
+ public void setTracking(boolean tracking) {
+ this.tracking = tracking;
+ }
+
+ public Map<Integer, Metadata> getTrackingMetadata() {
+ return trackingMetadata;
+ }
+
+ private void saveHeaderInMetadata(Metadata metadata, String curLine) {
+ Matcher headerMatcher = EMAIL_HEADER_PATTERN.matcher(curLine);
+ if (!headerMatcher.matches()) {
+ return; // ignore malformed header lines
+ }
+
+ String headerTag = headerMatcher.group(1).toLowerCase(Locale.ROOT);
+ String headerContent = headerMatcher.group(2);
+
+ if (headerTag.equalsIgnoreCase("From")) {
+ metadata.set(TikaCoreProperties.CREATOR, headerContent);
+ } else if (headerTag.equalsIgnoreCase("To") || headerTag.equalsIgnoreCase("Cc")
+ || headerTag.equalsIgnoreCase("Bcc")) {
+ Matcher address = EMAIL_ADDRESS_PATTERN.matcher(headerContent);
+ if (address.find()) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, address.group(1));
+ } else if (headerContent.indexOf('@') > -1) {
+ metadata.add(Metadata.MESSAGE_RECIPIENT_ADDRESS, headerContent);
+ }
+
+ String property = Metadata.MESSAGE_TO;
+ if (headerTag.equalsIgnoreCase("Cc")) {
+ property = Metadata.MESSAGE_CC;
+ } else if (headerTag.equalsIgnoreCase("Bcc")) {
+ property = Metadata.MESSAGE_BCC;
+ }
+ metadata.add(property, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Subject")) {
+ metadata.add(Metadata.SUBJECT, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Date")) {
+ try {
+ Date date = parseDate(headerContent);
+ metadata.set(TikaCoreProperties.CREATED, date);
+ } catch (ParseException e) {
+ // ignoring date because format was not understood
+ }
+ } else if (headerTag.equalsIgnoreCase("Message-Id")) {
+ metadata.set(TikaCoreProperties.IDENTIFIER, headerContent);
+ } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
+ metadata.set(TikaCoreProperties.RELATION, headerContent);
+ } else if (headerTag.equalsIgnoreCase("Content-Type")) {
+ // TODO - key off content-type in headers to
+ // set mapping to use for content and convert if necessary.
+
+ metadata.add(Metadata.CONTENT_TYPE, headerContent);
+ metadata.set(TikaCoreProperties.FORMAT, headerContent);
+ } else {
+ metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-office-parser-module/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mbox;
+
+import static java.lang.String.valueOf;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.util.Collections.singleton;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Set;
+
+import com.pff.PSTAttachment;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Parser for MS Outlook PST email storage files.
+ * <p>
+ * The folder tree is walked recursively starting at the root folder. For every
+ * message a {@code div} with class {@code embedded} is written, and the message
+ * body as well as each attachment is handed to the
+ * {@link EmbeddedDocumentExtractor} taken from the {@link ParseContext}.
+ */
+public class OutlookPSTParser extends AbstractParser {
+
+    private static final long serialVersionUID = 620998217748364063L;
+
+    public static final MediaType MS_OUTLOOK_PST_MIMETYPE = MediaType.application("vnd.ms-outlook-pst");
+    private static final Set<MediaType> SUPPORTED_TYPES = singleton(MS_OUTLOOK_PST_MIMETYPE);
+
+    /**
+     * Builds an attribute list holding a single CDATA attribute.
+     *
+     * @param attName  local and qualified name of the attribute
+     * @param attValue value of the attribute
+     * @return a new attribute list containing only that attribute
+     */
+    private static AttributesImpl createAttribute(String attName, String attValue) {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", attName, attName, "CDATA", attValue);
+        return attributes;
+    }
+
+    /**
+     * Builds the attributes of a {@code div} that wraps an embedded item:
+     * {@code class="embedded"} plus the given {@code id}.
+     *
+     * @param id value for the {@code id} attribute
+     * @return a new attribute list with the {@code class} and {@code id} attributes
+     */
+    private static AttributesImpl createEmbeddedAttributes(String id) {
+        AttributesImpl attributes = createAttribute("class", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", id);
+        return attributes;
+    }
+
+    /**
+     * Closes the file handle of the given PST file, ignoring any failure to do so.
+     *
+     * @param pstFile the PST file to release, may be {@code null}
+     */
+    private static void closeQuietly(PSTFile pstFile) {
+        if (pstFile != null && pstFile.getFileHandle() != null) {
+            try {
+                pstFile.getFileHandle().close();
+            } catch (IOException ignored) {
+                // swallow closing exception
+            }
+        }
+    }
+
+    /**
+     * @param context the parse context (not consulted)
+     * @return the single media type handled by this parser
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a PST file and writes its folders, messages and attachments as XHTML.
+     * <p>
+     * {@link PSTFile} is opened from a file path, so the stream is accessed through
+     * {@link TikaInputStream#getFile()}. Temporary resources created for that
+     * purpose are tracked in a {@link TemporaryResources} instance and disposed
+     * once parsing is finished.
+     *
+     * @param stream   the PST document
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata document metadata; content type, content length and the
+     *                 {@code isValid} flag are set here
+     * @param context  parse context, used to look up the {@link EmbeddedDocumentExtractor}
+     * @throws IOException   declared by the parser contract
+     * @throws SAXException  if starting or ending the XHTML document fails
+     * @throws TikaException if reading the PST file or walking its folders fails
+     *                       (any exception raised there is wrapped), or if the
+     *                       temporary resources cannot be disposed
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        // Use the delegate parser to parse the contained document
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class,
+                new ParsingEmbeddedDocumentExtractor(context));
+
+        metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Track temporary resources explicitly so they can be released below
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream in = TikaInputStream.get(stream, tmp);
+        PSTFile pstFile = null;
+        try {
+            pstFile = new PSTFile(in.getFile().getPath());
+            metadata.set(Metadata.CONTENT_LENGTH, valueOf(pstFile.getFileHandle().length()));
+            boolean isValid = pstFile.getFileHandle().getFD().valid();
+            metadata.set("isValid", valueOf(isValid));
+            if (isValid) {
+                parseFolder(xhtml, pstFile.getRootFolder(), embeddedExtractor);
+            }
+        } catch (Exception e) {
+            throw new TikaException(e.getMessage(), e);
+        } finally {
+            try {
+                // release the file handle before the temporary resources are disposed
+                closeQuietly(pstFile);
+            } finally {
+                tmp.dispose();
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Writes all messages of a folder and then recurses into its subfolders.
+     * Each subfolder is wrapped in a {@code div} with class {@code email-folder}
+     * that starts with the folder's display name as heading.
+     *
+     * @param handler           receiver of the XHTML output
+     * @param pstFolder         the folder to process
+     * @param embeddedExtractor extractor for message bodies and attachments
+     * @throws Exception if reading the folder or writing the output fails
+     */
+    private void parseFolder(XHTMLContentHandler handler, PSTFolder pstFolder, EmbeddedDocumentExtractor embeddedExtractor)
+            throws Exception {
+        if (pstFolder.getContentCount() > 0) {
+            PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
+            while (pstMail != null) {
+                handler.startElement("div", createEmbeddedAttributes(pstMail.getInternetMessageId()));
+                handler.element("h1", pstMail.getSubject());
+
+                parseMailItem(handler, pstMail, embeddedExtractor);
+                parseMailAttachments(handler, pstMail, embeddedExtractor);
+
+                handler.endElement("div");
+
+                pstMail = (PSTMessage) pstFolder.getNextChild();
+            }
+        }
+
+        if (pstFolder.hasSubfolders()) {
+            for (PSTFolder pstSubFolder : pstFolder.getSubFolders()) {
+                handler.startElement("div", createAttribute("class", "email-folder"));
+                handler.element("h1", pstSubFolder.getDisplayName());
+                parseFolder(handler, pstSubFolder, embeddedExtractor);
+                handler.endElement("div");
+            }
+        }
+    }
+
+    /**
+     * Collects the metadata of a single message and passes the message body,
+     * encoded as UTF-8 bytes, to the embedded document extractor.
+     *
+     * @param handler           receiver of the XHTML output
+     * @param pstMail           the message to process
+     * @param embeddedExtractor extractor that receives the message body
+     * @throws SAXException if the extractor fails to write its output
+     * @throws IOException  if the extractor fails to read the body
+     */
+    private void parseMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor)
+            throws SAXException, IOException {
+        Metadata mailMetadata = new Metadata();
+        mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
+        mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
+        mailMetadata.set(TikaCoreProperties.TITLE, pstMail.getSubject());
+        mailMetadata.set(Metadata.MESSAGE_FROM, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATOR, pstMail.getSenderName());
+        mailMetadata.set(TikaCoreProperties.CREATED, pstMail.getCreationTime());
+        mailMetadata.set(TikaCoreProperties.MODIFIED, pstMail.getLastModificationTime());
+        mailMetadata.set(TikaCoreProperties.COMMENTS, pstMail.getComment());
+        mailMetadata.set("descriptorNodeId", valueOf(pstMail.getDescriptorNodeId()));
+        mailMetadata.set("senderEmailAddress", pstMail.getSenderEmailAddress());
+        mailMetadata.set("recipients", pstMail.getRecipientsString());
+        mailMetadata.set("displayTo", pstMail.getDisplayTo());
+        mailMetadata.set("displayCC", pstMail.getDisplayCC());
+        mailMetadata.set("displayBCC", pstMail.getDisplayBCC());
+        mailMetadata.set("importance", valueOf(pstMail.getImportance()));
+        mailMetadata.set("priority", valueOf(pstMail.getPriority()));
+        mailMetadata.set("flagged", valueOf(pstMail.isFlagged()));
+
+        byte[] mailContent = pstMail.getBody().getBytes(UTF_8);
+        embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
+    }
+
+    /**
+     * Writes every attachment of a message: a paragraph with the file name,
+     * followed by a {@code div} with class {@code embedded} into which the
+     * attachment is extracted if the extractor accepts it.
+     *
+     * @param xhtml             receiver of the XHTML output
+     * @param email             the message whose attachments are processed
+     * @param embeddedExtractor extractor that decides on and performs the extraction
+     * @throws TikaException if processing any attachment fails; the cause is attached
+     */
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+            throws TikaException {
+        int numberOfAttachments = email.getNumberOfAttachments();
+        for (int i = 0; i < numberOfAttachments; i++) {
+            try {
+                PSTAttachment attach = email.getAttachment(i);
+
+                // Get the filename; both long and short filenames can be used for attachments
+                String filename = attach.getLongFilename();
+                if (filename.isEmpty()) {
+                    filename = attach.getFilename();
+                }
+
+                xhtml.element("p", filename);
+
+                Metadata attachMeta = new Metadata();
+                attachMeta.set(Metadata.RESOURCE_NAME_KEY, filename);
+                attachMeta.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filename);
+                xhtml.startElement("div", createEmbeddedAttributes(filename));
+                if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
+                    TemporaryResources tmp = new TemporaryResources();
+                    try {
+                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
+                        embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
+                    } finally {
+                        tmp.dispose();
+                    }
+                }
+                xhtml.endElement("div");
+
+            } catch (Exception e) {
+                throw new TikaException("Unable to unpack document stream", e);
+            }
+        }
+    }
+
+}