You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:31 UTC
[25/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
index 119a47b..e423871 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
@@ -1,102 +1,102 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-public class ChmConstants {
- /* Prevents instantiation */
- private ChmConstants() {
- }
-
- public static final String DEFAULT_CHARSET = UTF_8.name();
- public static final String ITSF = "ITSF";
- public static final String ITSP = "ITSP";
- public static final String PMGL = "PMGL";
- public static final String LZXC = "LZXC";
- public static final String CHM_PMGI_MARKER = "PMGI";
- public static final int BYTE_ARRAY_LENGHT = 16;
- public static final int CHM_ITSF_V2_LEN = 0x58;
- public static final int CHM_ITSF_V3_LEN = 0x60;
- public static final int CHM_ITSP_V1_LEN = 0x54;
- public static final int CHM_PMGL_LEN = 0x14;
- public static final int CHM_PMGI_LEN = 0x08;
- public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
- public static final int CHM_LZXC_MIN_LEN = 0x18;
- public static final int CHM_LZXC_V2_LEN = 0x1c;
- public static final int CHM_SIGNATURE_LEN = 4;
- public static final int CHM_VER_2 = 2;
- public static final int CHM_VER_3 = 3;
- public static final int CHM_VER_1 = 1;
- public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
-
- /* my hacking */
- public static final int START_PMGL = 0xCC;
- public static final String CONTROL_DATA = "ControlData";
- public static final String RESET_TABLE = "ResetTable";
- public static final String CONTENT = "Content";
-
- /* some constants defined by the LZX specification */
- public static final int LZX_MIN_MATCH = 2;
- public static final int LZX_MAX_MATCH = 257;
- public static final int LZX_NUM_CHARS = 256;
- public static final int LZX_BLOCKTYPE_INVALID = 0; /*
- * also blocktypes 4-7
- * invalid
- */
- public static final int LZX_BLOCKTYPE_VERBATIM = 1;
- public static final int LZX_BLOCKTYPE_ALIGNED = 2;
- public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
- public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
- public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
- public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
- * aligned offset tree
- * #elements
- */
- public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
- * this one missing
- * from spec!
- */
- public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
- * length tree
- * #elements
- */
-
- /* LZX huffman defines: tweak tablebits as desired */
- public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
- public static final int LZX_PRETREE_TABLEBITS = 6;
- public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
- public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
- public static final int LZX_MAINTREE_TABLEBITS = 12;
- public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
- public static final int LZX_LENGTH_TABLEBITS = 12;
- public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
- public static final int LZX_ALIGNED_TABLEBITS = 7;
- public static final int LZX_LENTABLE_SAFETY = 64;
-
- public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
- 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
- 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
- 17, 17 };
-
- public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
- 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
- 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
- 131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
- 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
- 1966080, 2097152 };
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class ChmConstants {
+ /* Prevents instantiation */
+ private ChmConstants() {
+ }
+
+ public static final String DEFAULT_CHARSET = UTF_8.name();
+ public static final String ITSF = "ITSF";
+ public static final String ITSP = "ITSP";
+ public static final String PMGL = "PMGL";
+ public static final String LZXC = "LZXC";
+ public static final String CHM_PMGI_MARKER = "PMGI";
+ public static final int BYTE_ARRAY_LENGHT = 16;
+ public static final int CHM_ITSF_V2_LEN = 0x58;
+ public static final int CHM_ITSF_V3_LEN = 0x60;
+ public static final int CHM_ITSP_V1_LEN = 0x54;
+ public static final int CHM_PMGL_LEN = 0x14;
+ public static final int CHM_PMGI_LEN = 0x08;
+ public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
+ public static final int CHM_LZXC_MIN_LEN = 0x18;
+ public static final int CHM_LZXC_V2_LEN = 0x1c;
+ public static final int CHM_SIGNATURE_LEN = 4;
+ public static final int CHM_VER_2 = 2;
+ public static final int CHM_VER_3 = 3;
+ public static final int CHM_VER_1 = 1;
+ public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
+
+ /* my hacking */
+ public static final int START_PMGL = 0xCC;
+ public static final String CONTROL_DATA = "ControlData";
+ public static final String RESET_TABLE = "ResetTable";
+ public static final String CONTENT = "Content";
+
+ /* some constants defined by the LZX specification */
+ public static final int LZX_MIN_MATCH = 2;
+ public static final int LZX_MAX_MATCH = 257;
+ public static final int LZX_NUM_CHARS = 256;
+ public static final int LZX_BLOCKTYPE_INVALID = 0; /*
+ * also blocktypes 4-7
+ * invalid
+ */
+ public static final int LZX_BLOCKTYPE_VERBATIM = 1;
+ public static final int LZX_BLOCKTYPE_ALIGNED = 2;
+ public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
+ public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
+ public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
+ public static final int LZX_ALIGNED_NUM_ELEMENTS = 8; /*
+ * aligned offset tree
+ * #elements
+ */
+ public static final int LZX_NUM_PRIMARY_LENGTHS = 7; /*
+ * this one missing
+ * from spec!
+ */
+ public static final int LZX_NUM_SECONDARY_LENGTHS = 249; /*
+ * length tree
+ * #elements
+ */
+
+ /* LZX huffman defines: tweak tablebits as desired */
+ public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
+ public static final int LZX_PRETREE_TABLEBITS = 6;
+ public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
+ public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
+ public static final int LZX_MAINTREE_TABLEBITS = 12;
+ public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
+ public static final int LZX_LENGTH_TABLEBITS = 12;
+ public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
+ public static final int LZX_ALIGNED_TABLEBITS = 7;
+ public static final int LZX_LENTABLE_SAFETY = 64;
+
+ public static short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+ 17, 17 };
+
+ public static int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32,
+ 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536, 2048, 3072,
+ 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152, 65536, 98304,
+ 131072, 196608, 262144, 393216, 524288, 655360, 786432, 917504,
+ 1048576, 1179648, 1310720, 1441792, 1572864, 1703936, 1835008,
+ 1966080, 2097152 };
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 85f4177..454c1c4 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -1,392 +1,392 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.core;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.assertion.ChmAssert;
-import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
-import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-
-/**
- * Extracts text from chm file. Enumerates chm entries.
- */
-public class ChmExtractor {
- private List<ChmLzxBlock> lzxBlocksCache = null;
- private ChmDirectoryListingSet chmDirList = null;
- private ChmItsfHeader chmItsfHeader = null;
- private ChmItspHeader chmItspHeader = null;
- private ChmLzxcResetTable chmLzxcResetTable = null;
- private ChmLzxcControlData chmLzxcControlData = null;
- private byte[] data = null;
- private int indexOfContent;
- private long lzxBlockOffset;
- private long lzxBlockLength;
-
- /**
- * Returns lzxc control data.
- *
- * @return ChmLzxcControlData
- */
- private ChmLzxcControlData getChmLzxcControlData() {
- return chmLzxcControlData;
- }
-
- /**
- * Sets lzxc control data
- *
- * @param chmLzxcControlData
- */
- private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
- this.chmLzxcControlData = chmLzxcControlData;
- }
-
- private ChmItspHeader getChmItspHeader() {
- return chmItspHeader;
- }
-
- private void setChmItspHeader(ChmItspHeader chmItspHeader) {
- this.chmItspHeader = chmItspHeader;
- }
-
- /**
- * Returns lzxc reset table
- *
- * @return ChmLzxcResetTable
- */
- private ChmLzxcResetTable getChmLzxcResetTable() {
- return chmLzxcResetTable;
- }
-
- /**
- * Sets lzxc reset table
- *
- * @param chmLzxcResetTable
- */
- private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
- this.chmLzxcResetTable = chmLzxcResetTable;
- }
-
- /**
- * Returns lzxc hit_cache length
- *
- * @return lzxBlockLength
- */
- private long getLzxBlockLength() {
- return lzxBlockLength;
- }
-
- /**
- * Sets lzxc hit_cache length
- *
- * @param lzxBlockLength
- */
- private void setLzxBlockLength(long lzxBlockLength) {
- this.lzxBlockLength = lzxBlockLength;
- }
-
- /**
- * Returns lzxc hit_cache offset
- *
- * @return lzxBlockOffset
- */
- private long getLzxBlockOffset() {
- return lzxBlockOffset;
- }
-
- /**
- * Sets lzxc hit_cache offset
- */
- private void setLzxBlockOffset(long lzxBlockOffset) {
- this.lzxBlockOffset = lzxBlockOffset;
- }
-
- private int getIndexOfContent() {
- return indexOfContent;
- }
-
- private void setIndexOfContent(int indexOfContent) {
- this.indexOfContent = indexOfContent;
- }
-
- private byte[] getData() {
- return data;
- }
-
- private void setData(byte[] data) {
- this.data = data;
- }
-
- public ChmExtractor(InputStream is) throws TikaException, IOException {
- ChmAssert.assertInputStreamNotNull(is);
- try {
- setData(IOUtils.toByteArray(is));
-
- /* Creates and parses chm itsf header */
- setChmItsfHeader(new ChmItsfHeader());
- // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
- // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
- getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
- ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
-
- /* Creates and parses chm itsp header */
- setChmItspHeader(new ChmItspHeader());
- // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
- // getChmItsfHeader().getDirOffset(),
- // (int) getChmItsfHeader().getDirOffset() +
- // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
- getChmItspHeader().parse(
- ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
- .getDirOffset(), (int) getChmItsfHeader().getDirOffset() +
- ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
-
- /* Creates instance of ChmDirListingContainer */
- setChmDirList(new ChmDirectoryListingSet(getData(),
- getChmItsfHeader(), getChmItspHeader()));
-
- int indexOfControlData = getChmDirList().getControlDataIndex();
- int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
- ChmConstants.LZXC.getBytes(UTF_8));
- byte[] dir_chunk = null;
- if (indexOfResetData > 0)
- dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData
- + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
- // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
- // indexOfResetData
- // +
- // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
-
- /* Creates and parses chm control data */
- setChmLzxcControlData(new ChmLzxcControlData());
- getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
-
- int indexOfResetTable = getChmDirList().getResetTableIndex();
- setChmLzxcResetTable(new ChmLzxcResetTable());
-
- int startIndex = (int) getChmDirList().getDataOffset()
- + getChmDirList().getDirectoryListingEntryList()
- .get(indexOfResetTable).getOffset();
-
- // assert startIndex < data.length
- ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
-
- // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
- // +
- // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
- dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
- + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
-
- getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
-
- setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
- ChmConstants.CONTENT));
- setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
- + getChmItsfHeader().getDataOffset()));
- setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
-
- setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
-
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- /**
- * Enumerates chm entities
- *
- * @return list of chm entities
- */
- public List<String> enumerateChm() {
- List<String> listOfEntries = new ArrayList<String>();
- for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
- listOfEntries.add(directoryListingEntry.getName());
- }
- return listOfEntries;
- }
-
- /**
- * Decompresses a chm entry
- *
- * @param directoryListingEntry
- *
- * @return decompressed data
- * @throws TikaException
- */
- public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- ChmLzxBlock lzxBlock = null;
- try {
- /* UNCOMPRESSED type is easiest one */
- if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
- && directoryListingEntry.getLength() > 0
- && !ChmCommons.hasSkip(directoryListingEntry)) {
- int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
- .getOffset());
- // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
- // dataOffset + directoryListingEntry.getLength());
- buffer.write(ChmCommons.copyOfRange(
- getData(), dataOffset,
- dataOffset + directoryListingEntry.getLength()));
- } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
- && !ChmCommons.hasSkip(directoryListingEntry)) {
- /* Gets a chm hit_cache info */
- ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
- directoryListingEntry, (int) getChmLzxcResetTable()
- .getBlockLen(), getChmLzxcControlData());
-
- int i = 0, start = 0, hit_cache = 0;
-
- if ((getLzxBlockLength() < Integer.MAX_VALUE)
- && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
- // TODO: Improve the caching
- // caching ... = O(n^2) - depends on startBlock and endBlock
- start = -1;
- if (!getLzxBlocksCache().isEmpty()) {
- for (i = 0; i < getLzxBlocksCache().size(); i++) {
- //lzxBlock = getLzxBlocksCache().get(i);
- int bn = getLzxBlocksCache().get(i).getBlockNumber();
- for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
- if (bn == j) {
- if (j > start) {
- start = j;
- hit_cache = i;
- }
- }
- }
- if (start == bb.getStartBlock())
- break;
- }
- }
-
-// if (i == getLzxBlocksCache().size() && i == 0) {
- if (start<0) {
- start = bb.getIniBlock();
-
- byte[] dataSegment = ChmCommons.getChmBlockSegment(
- getData(),
- getChmLzxcResetTable(), start,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength());
-
- lzxBlock = new ChmLzxBlock(start, dataSegment,
- getChmLzxcResetTable().getBlockLen(), null);
-
- getLzxBlocksCache().add(lzxBlock);
- } else {
- lzxBlock = getLzxBlocksCache().get(hit_cache);
- }
-
- for (i = start; i <= bb.getEndBlock();) {
- if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent(
- bb.getStartOffset(), bb.getEndOffset()));
- break;
- }
-
- if (i == bb.getStartBlock()) {
- buffer.write(lzxBlock.getContent(
- bb.getStartOffset()));
- }
-
- if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent());
- }
-
- if (i == bb.getEndBlock()) {
- buffer.write(lzxBlock.getContent(
- 0, bb.getEndOffset()));
- break;
- }
-
- i++;
-
- if (i % getChmLzxcControlData().getResetInterval() == 0) {
- lzxBlock = new ChmLzxBlock(i,
- ChmCommons.getChmBlockSegment(getData(),
- getChmLzxcResetTable(), i,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength()),
- getChmLzxcResetTable().getBlockLen(), null);
- } else {
- lzxBlock = new ChmLzxBlock(i,
- ChmCommons.getChmBlockSegment(getData(),
- getChmLzxcResetTable(), i,
- (int) getLzxBlockOffset(),
- (int) getLzxBlockLength()),
- getChmLzxcResetTable().getBlockLen(),
- lzxBlock);
- }
-
- getLzxBlocksCache().add(lzxBlock);
- }
-
- if (getLzxBlocksCache().size() > getChmLzxcResetTable()
- .getBlockCount()) {
- getLzxBlocksCache().clear();
- }
- } //end of if
-
- if (buffer.size() != directoryListingEntry.getLength()) {
- throw new TikaException("CHM file extract error: extracted Length is wrong.");
- }
- } //end of if compressed
- } catch (Exception e) {
- throw new TikaException(e.getMessage());
- }
-
- return buffer.toByteArray();
- }
-
- private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
- this.lzxBlocksCache = lzxBlocksCache;
- }
-
- private List<ChmLzxBlock> getLzxBlocksCache() {
- return lzxBlocksCache;
- }
-
- private void setChmDirList(ChmDirectoryListingSet chmDirList) {
- this.chmDirList = chmDirList;
- }
-
- public ChmDirectoryListingSet getChmDirList() {
- return chmDirList;
- }
-
- private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
- this.chmItsfHeader = chmItsfHeader;
- }
-
- private ChmItsfHeader getChmItsfHeader() {
- return chmItsfHeader;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * Extracts text from chm file. Enumerates chm entries.
+ */
+public class ChmExtractor {
+ private List<ChmLzxBlock> lzxBlocksCache = null;
+ private ChmDirectoryListingSet chmDirList = null;
+ private ChmItsfHeader chmItsfHeader = null;
+ private ChmItspHeader chmItspHeader = null;
+ private ChmLzxcResetTable chmLzxcResetTable = null;
+ private ChmLzxcControlData chmLzxcControlData = null;
+ private byte[] data = null;
+ private int indexOfContent;
+ private long lzxBlockOffset;
+ private long lzxBlockLength;
+
+ /**
+ * Returns lzxc control data.
+ *
+ * @return ChmLzxcControlData
+ */
+ private ChmLzxcControlData getChmLzxcControlData() {
+ return chmLzxcControlData;
+ }
+
+ /**
+ * Sets lzxc control data
+ *
+ * @param chmLzxcControlData
+ */
+ private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+ this.chmLzxcControlData = chmLzxcControlData;
+ }
+
+ private ChmItspHeader getChmItspHeader() {
+ return chmItspHeader;
+ }
+
+ private void setChmItspHeader(ChmItspHeader chmItspHeader) {
+ this.chmItspHeader = chmItspHeader;
+ }
+
+ /**
+ * Returns lzxc reset table
+ *
+ * @return ChmLzxcResetTable
+ */
+ private ChmLzxcResetTable getChmLzxcResetTable() {
+ return chmLzxcResetTable;
+ }
+
+ /**
+ * Sets lzxc reset table
+ *
+ * @param chmLzxcResetTable
+ */
+ private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+ this.chmLzxcResetTable = chmLzxcResetTable;
+ }
+
+ /**
+ * Returns lzxc hit_cache length
+ *
+ * @return lzxBlockLength
+ */
+ private long getLzxBlockLength() {
+ return lzxBlockLength;
+ }
+
+ /**
+ * Sets lzxc hit_cache length
+ *
+ * @param lzxBlockLength
+ */
+ private void setLzxBlockLength(long lzxBlockLength) {
+ this.lzxBlockLength = lzxBlockLength;
+ }
+
+ /**
+ * Returns lzxc hit_cache offset
+ *
+ * @return lzxBlockOffset
+ */
+ private long getLzxBlockOffset() {
+ return lzxBlockOffset;
+ }
+
+ /**
+ * Sets lzxc hit_cache offset
+ */
+ private void setLzxBlockOffset(long lzxBlockOffset) {
+ this.lzxBlockOffset = lzxBlockOffset;
+ }
+
+ private int getIndexOfContent() {
+ return indexOfContent;
+ }
+
+ private void setIndexOfContent(int indexOfContent) {
+ this.indexOfContent = indexOfContent;
+ }
+
+ private byte[] getData() {
+ return data;
+ }
+
+ private void setData(byte[] data) {
+ this.data = data;
+ }
+
+ public ChmExtractor(InputStream is) throws TikaException, IOException {
+ ChmAssert.assertInputStreamNotNull(is);
+ try {
+ setData(IOUtils.toByteArray(is));
+
+ /* Creates and parses chm itsf header */
+ setChmItsfHeader(new ChmItsfHeader());
+ // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
+ // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+ getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
+ ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
+
+ /* Creates and parses chm itsp header */
+ setChmItspHeader(new ChmItspHeader());
+ // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
+ // getChmItsfHeader().getDirOffset(),
+ // (int) getChmItsfHeader().getDirOffset() +
+ // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+ getChmItspHeader().parse(
+ ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
+ .getDirOffset(), (int) getChmItsfHeader().getDirOffset() +
+ ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
+
+ /* Creates instance of ChmDirListingContainer */
+ setChmDirList(new ChmDirectoryListingSet(getData(),
+ getChmItsfHeader(), getChmItspHeader()));
+
+ int indexOfControlData = getChmDirList().getControlDataIndex();
+ int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
+ ChmConstants.LZXC.getBytes(UTF_8));
+ byte[] dir_chunk = null;
+ if (indexOfResetData > 0)
+ dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData
+ + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+ // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
+ // indexOfResetData
+ // +
+ // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
+
+ /* Creates and parses chm control data */
+ setChmLzxcControlData(new ChmLzxcControlData());
+ getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
+
+ int indexOfResetTable = getChmDirList().getResetTableIndex();
+ setChmLzxcResetTable(new ChmLzxcResetTable());
+
+ int startIndex = (int) getChmDirList().getDataOffset()
+ + getChmDirList().getDirectoryListingEntryList()
+ .get(indexOfResetTable).getOffset();
+
+ // assert startIndex < data.length
+ ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
+
+ // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
+ // +
+ // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+ dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
+ + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
+
+ getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
+
+ setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(),
+ ChmConstants.CONTENT));
+ setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset()
+ + getChmItsfHeader().getDataOffset()));
+ setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());
+
+ setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ /**
+ * Enumerates chm entities
+ *
+ * @return list of chm entities
+ */
+ public List<String> enumerateChm() {
+ List<String> listOfEntries = new ArrayList<String>();
+ for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
+ listOfEntries.add(directoryListingEntry.getName());
+ }
+ return listOfEntries;
+ }
+
+ /**
+ * Decompresses a chm entry
+ *
+ * @param directoryListingEntry
+ *
+ * @return decompressed data
+ * @throws TikaException
+ */
+ public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
+ ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+ ChmLzxBlock lzxBlock = null;
+ try {
+ /* UNCOMPRESSED type is easiest one */
+ if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
+ && directoryListingEntry.getLength() > 0
+ && !ChmCommons.hasSkip(directoryListingEntry)) {
+ int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
+ .getOffset());
+ // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
+ // dataOffset + directoryListingEntry.getLength());
+ buffer.write(ChmCommons.copyOfRange(
+ getData(), dataOffset,
+ dataOffset + directoryListingEntry.getLength()));
+ } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
+ && !ChmCommons.hasSkip(directoryListingEntry)) {
+ /* Gets a chm hit_cache info */
+ ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
+ directoryListingEntry, (int) getChmLzxcResetTable()
+ .getBlockLen(), getChmLzxcControlData());
+
+ int i = 0, start = 0, hit_cache = 0;
+
+ if ((getLzxBlockLength() < Integer.MAX_VALUE)
+ && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
+ // TODO: Improve the caching
+ // caching ... = O(n^2) - depends on startBlock and endBlock
+ start = -1;
+ if (!getLzxBlocksCache().isEmpty()) {
+ for (i = 0; i < getLzxBlocksCache().size(); i++) {
+ //lzxBlock = getLzxBlocksCache().get(i);
+ int bn = getLzxBlocksCache().get(i).getBlockNumber();
+ for (int j = bb.getIniBlock(); j <= bb.getStartBlock(); j++) {
+ if (bn == j) {
+ if (j > start) {
+ start = j;
+ hit_cache = i;
+ }
+ }
+ }
+ if (start == bb.getStartBlock())
+ break;
+ }
+ }
+
+// if (i == getLzxBlocksCache().size() && i == 0) {
+ if (start<0) {
+ start = bb.getIniBlock();
+
+ byte[] dataSegment = ChmCommons.getChmBlockSegment(
+ getData(),
+ getChmLzxcResetTable(), start,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength());
+
+ lzxBlock = new ChmLzxBlock(start, dataSegment,
+ getChmLzxcResetTable().getBlockLen(), null);
+
+ getLzxBlocksCache().add(lzxBlock);
+ } else {
+ lzxBlock = getLzxBlocksCache().get(hit_cache);
+ }
+
+ for (i = start; i <= bb.getEndBlock();) {
+ if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset(), bb.getEndOffset()));
+ break;
+ }
+
+ if (i == bb.getStartBlock()) {
+ buffer.write(lzxBlock.getContent(
+ bb.getStartOffset()));
+ }
+
+ if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent());
+ }
+
+ if (i == bb.getEndBlock()) {
+ buffer.write(lzxBlock.getContent(
+ 0, bb.getEndOffset()));
+ break;
+ }
+
+ i++;
+
+ if (i % getChmLzxcControlData().getResetInterval() == 0) {
+ lzxBlock = new ChmLzxBlock(i,
+ ChmCommons.getChmBlockSegment(getData(),
+ getChmLzxcResetTable(), i,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength()),
+ getChmLzxcResetTable().getBlockLen(), null);
+ } else {
+ lzxBlock = new ChmLzxBlock(i,
+ ChmCommons.getChmBlockSegment(getData(),
+ getChmLzxcResetTable(), i,
+ (int) getLzxBlockOffset(),
+ (int) getLzxBlockLength()),
+ getChmLzxcResetTable().getBlockLen(),
+ lzxBlock);
+ }
+
+ getLzxBlocksCache().add(lzxBlock);
+ }
+
+ if (getLzxBlocksCache().size() > getChmLzxcResetTable()
+ .getBlockCount()) {
+ getLzxBlocksCache().clear();
+ }
+ } //end of if
+
+ if (buffer.size() != directoryListingEntry.getLength()) {
+ throw new TikaException("CHM file extract error: extracted Length is wrong.");
+ }
+ } //end of if compressed
+ } catch (Exception e) {
+ throw new TikaException(e.getMessage());
+ }
+
+ return buffer.toByteArray();
+ }
+
+ private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+ this.lzxBlocksCache = lzxBlocksCache;
+ }
+
+ private List<ChmLzxBlock> getLzxBlocksCache() {
+ return lzxBlocksCache;
+ }
+
+ private void setChmDirList(ChmDirectoryListingSet chmDirList) {
+ this.chmDirList = chmDirList;
+ }
+
+ public ChmDirectoryListingSet getChmDirList() {
+ return chmDirList;
+ }
+
+ private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+ this.chmItsfHeader = chmItsfHeader;
+ }
+
+ private ChmItsfHeader getChmItsfHeader() {
+ return chmItsfHeader;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
index 03f81d3..9ed1898 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
@@ -1,147 +1,147 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.chm.core;
-
-import java.util.List;
-
-import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
-import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
-import org.apache.tika.parser.chm.accessor.ChmItspHeader;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
-import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-
-public class ChmWrapper {
- private List<ChmLzxBlock> lzxBlocksCache = null;
- private ChmDirectoryListingSet chmDirList = null;
- private ChmItsfHeader chmItsfHeader = null;
- private ChmItspHeader chmItspHeader = null;
- private ChmLzxcResetTable chmLzxcResetTable = null;
- private ChmLzxcControlData chmLzxcControlData = null;
- private byte[] data = null;
- private int indexOfContent;
- private long lzxBlockOffset;
- private long lzxBlockLength;
- private int indexOfResetData;
- private int indexOfResetTable;
- private int startIndex;
-
- protected int getStartIndex() {
- return startIndex;
- }
-
- protected void setStartIndex(int startIndex) {
- this.startIndex = startIndex;
- }
-
- protected int getIndexOfResetTable() {
- return indexOfResetTable;
- }
-
- protected void setIndexOfResetTable(int indexOfResetTable) {
- this.indexOfResetTable = indexOfResetTable;
- }
-
- protected List<ChmLzxBlock> getLzxBlocksCache() {
- return lzxBlocksCache;
- }
-
- protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
- this.lzxBlocksCache = lzxBlocksCache;
- }
-
- protected ChmDirectoryListingSet getChmDirList() {
- return chmDirList;
- }
-
- protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
- this.chmDirList = chmDirList;
- }
-
- protected ChmItsfHeader getChmItsfHeader() {
- return chmItsfHeader;
- }
-
- protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
- this.chmItsfHeader = chmItsfHeader;
- }
-
- protected ChmLzxcResetTable getChmLzxcResetTable() {
- return chmLzxcResetTable;
- }
-
- protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
- this.chmLzxcResetTable = chmLzxcResetTable;
- }
-
- protected ChmLzxcControlData getChmLzxcControlData() {
- return chmLzxcControlData;
- }
-
- protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
- this.chmLzxcControlData = chmLzxcControlData;
- }
-
- protected byte[] getData() {
- return data;
- }
-
- protected void setData(byte[] data) {
- this.data = data;
- }
-
- protected int getIndexOfContent() {
- return indexOfContent;
- }
-
- protected void setIndexOfContent(int indexOfContent) {
- this.indexOfContent = indexOfContent;
- }
-
- protected long getLzxBlockOffset() {
- return lzxBlockOffset;
- }
-
- protected void setLzxBlockOffset(long lzxBlockOffset) {
- this.lzxBlockOffset = lzxBlockOffset;
- }
-
- protected long getLzxBlockLength() {
- return lzxBlockLength;
- }
-
- protected void setLzxBlockLength(long lzxBlockLength) {
- this.lzxBlockLength = lzxBlockLength;
- }
-
- protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
- this.chmItspHeader = chmItspHeader;
- }
-
- protected ChmItspHeader getChmItspHeader() {
- return chmItspHeader;
- }
-
- protected void setIndexOfResetData(int indexOfResetData) {
- this.indexOfResetData = indexOfResetData;
- }
-
- protected int getIndexOfResetData() {
- return indexOfResetData;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.chm.core;
+
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+/**
+ * Plain mutable holder in the CHM core package for a byte array, the ITSF and
+ * ITSP headers, the directory listing, the LZXC control data and reset table,
+ * a list of cached LZX blocks, and several index/offset values. It contains no
+ * logic of its own; each field is exposed through a protected getter/setter
+ * pair.
+ */
+public class ChmWrapper {
+    /* Object-valued state; every reference starts out as null. */
+    private byte[] data;
+    private ChmItsfHeader chmItsfHeader;
+    private ChmItspHeader chmItspHeader;
+    private ChmDirectoryListingSet chmDirList;
+    private ChmLzxcControlData chmLzxcControlData;
+    private ChmLzxcResetTable chmLzxcResetTable;
+    private List<ChmLzxBlock> lzxBlocksCache;
+
+    /* Numeric state; every value starts out as 0. */
+    private int startIndex;
+    private int indexOfContent;
+    private int indexOfResetData;
+    private int indexOfResetTable;
+    private long lzxBlockOffset;
+    private long lzxBlockLength;
+
+    /** @return the stored byte array */
+    protected byte[] getData() {
+        return this.data;
+    }
+
+    /** @param data the byte array to store (kept by reference, not copied) */
+    protected void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /** @return the stored ITSF header */
+    protected ChmItsfHeader getChmItsfHeader() {
+        return this.chmItsfHeader;
+    }
+
+    /** @param chmItsfHeader the ITSF header to store */
+    protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+        this.chmItsfHeader = chmItsfHeader;
+    }
+
+    /** @return the stored ITSP header */
+    protected ChmItspHeader getChmItspHeader() {
+        return this.chmItspHeader;
+    }
+
+    /** @param chmItspHeader the ITSP header to store */
+    protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
+        this.chmItspHeader = chmItspHeader;
+    }
+
+    /** @return the stored directory listing set */
+    protected ChmDirectoryListingSet getChmDirList() {
+        return this.chmDirList;
+    }
+
+    /** @param chmDirList the directory listing set to store */
+    protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
+        this.chmDirList = chmDirList;
+    }
+
+    /** @return the stored LZXC control data */
+    protected ChmLzxcControlData getChmLzxcControlData() {
+        return this.chmLzxcControlData;
+    }
+
+    /** @param chmLzxcControlData the LZXC control data to store */
+    protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+        this.chmLzxcControlData = chmLzxcControlData;
+    }
+
+    /** @return the stored LZXC reset table */
+    protected ChmLzxcResetTable getChmLzxcResetTable() {
+        return this.chmLzxcResetTable;
+    }
+
+    /** @param chmLzxcResetTable the LZXC reset table to store */
+    protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+        this.chmLzxcResetTable = chmLzxcResetTable;
+    }
+
+    /** @return the list of cached LZX blocks */
+    protected List<ChmLzxBlock> getLzxBlocksCache() {
+        return this.lzxBlocksCache;
+    }
+
+    /** @param lzxBlocksCache the list to use as the LZX block cache */
+    protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+        this.lzxBlocksCache = lzxBlocksCache;
+    }
+
+    /** @return the stored start index */
+    protected int getStartIndex() {
+        return this.startIndex;
+    }
+
+    /** @param startIndex the start index to store */
+    protected void setStartIndex(int startIndex) {
+        this.startIndex = startIndex;
+    }
+
+    /** @return the stored index of the content */
+    protected int getIndexOfContent() {
+        return this.indexOfContent;
+    }
+
+    /** @param indexOfContent the content index to store */
+    protected void setIndexOfContent(int indexOfContent) {
+        this.indexOfContent = indexOfContent;
+    }
+
+    /** @return the stored index of the reset data */
+    protected int getIndexOfResetData() {
+        return this.indexOfResetData;
+    }
+
+    /** @param indexOfResetData the reset-data index to store */
+    protected void setIndexOfResetData(int indexOfResetData) {
+        this.indexOfResetData = indexOfResetData;
+    }
+
+    /** @return the stored index of the reset table */
+    protected int getIndexOfResetTable() {
+        return this.indexOfResetTable;
+    }
+
+    /** @param indexOfResetTable the reset-table index to store */
+    protected void setIndexOfResetTable(int indexOfResetTable) {
+        this.indexOfResetTable = indexOfResetTable;
+    }
+
+    /** @return the stored LZX block offset */
+    protected long getLzxBlockOffset() {
+        return this.lzxBlockOffset;
+    }
+
+    /** @param lzxBlockOffset the LZX block offset to store */
+    protected void setLzxBlockOffset(long lzxBlockOffset) {
+        this.lzxBlockOffset = lzxBlockOffset;
+    }
+
+    /** @return the stored LZX block length */
+    protected long getLzxBlockLength() {
+        return this.lzxBlockLength;
+    }
+
+    /** @param lzxBlockLength the LZX block length to store */
+    protected void setLzxBlockLength(long lzxBlockLength) {
+        this.lzxBlockLength = lzxBlockLength;
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
index fbed908..46c522b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
@@ -1,27 +1,27 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.exception;
-
-import org.apache.tika.exception.TikaException;
-
-public class ChmParsingException extends TikaException {
- private static final long serialVersionUID = 6497936044733665210L;
-
- public ChmParsingException(String description) {
- super(description);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.exception;
+
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Checked exception raised by the CHM parsing code; it is a
+ * {@link TikaException}, so callers that already handle that type need no
+ * additional catch clause.
+ */
+public class ChmParsingException extends TikaException {
+    private static final long serialVersionUID = 6497936044733665210L;
+
+    /**
+     * Creates an exception that carries only a message.
+     *
+     * @param description
+     *            text describing what went wrong
+     */
+    public ChmParsingException(String description) {
+        super(description);
+    }
+
+    /**
+     * Creates an exception that carries a message and the underlying cause,
+     * so the original stack trace is not lost when a lower-level failure is
+     * rethrown as a parsing error.
+     *
+     * @param description
+     *            text describing what went wrong
+     * @param cause
+     *            the exception that triggered this one
+     */
+    public ChmParsingException(String description, Throwable cause) {
+        super(description, cause);
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
index 7f7564d..cda829c 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
@@ -1,235 +1,235 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.chm.lzx;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
-import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
-import org.apache.tika.parser.chm.exception.ChmParsingException;
-
-/**
- * A container that contains chm block information such as: i. initial block is
- * using to reset main tree ii. start block is using for knowing where to start
- * iii. end block is using for knowing where to stop iv. start offset is using
- * for knowing where to start reading v. end offset is using for knowing where
- * to stop reading
- *
- */
-public class ChmBlockInfo {
- /* class members */
- private int iniBlock;
- private int startBlock;
- private int endBlock;
- private int startOffset;
- private int endOffset;
-
- private static ChmBlockInfo chmBlockInfo = null;
-
- private ChmBlockInfo() {
-
- }
-
- /**
- * Returns an information related to the chmBlockInfo
- *
- * @param dle
- * - DirectoryListingEntry
- * @param bytesPerBlock
- * - int, = chmLzxcResetTable.block_length
- * @param clcd
- * - ChmLzxcControlData
- * @param chmBlockInfo
- * - ChmBlockInfo
- *
- * @return ChmBlockInfo
- * @throws TikaException
- */
- protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
- int bytesPerBlock, ChmLzxcControlData clcd,
- ChmBlockInfo chmBlockInfo) throws TikaException {
- if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
- throw new ChmParsingException("Please check you parameters");
-
- chmBlockInfo.setStartBlock(dle.getOffset() / bytesPerBlock);
- chmBlockInfo.setEndBlock((dle.getOffset() + dle.getLength())
- / bytesPerBlock);
- chmBlockInfo.setStartOffset(dle.getOffset() % bytesPerBlock);
- chmBlockInfo.setEndOffset((dle.getOffset() + dle.getLength())
- % bytesPerBlock);
- // potential problem with casting long to int
- chmBlockInfo
- .setIniBlock(chmBlockInfo.startBlock -
- chmBlockInfo.startBlock % (int) clcd.getResetInterval());
-// .setIniBlock((chmBlockInfo.startBlock - chmBlockInfo.startBlock)
-// % (int) clcd.getResetInterval());
- return chmBlockInfo;
- }
-
- public static ChmBlockInfo getChmBlockInfoInstance(
- DirectoryListingEntry dle, int bytesPerBlock,
- ChmLzxcControlData clcd) {
- setChmBlockInfo(new ChmBlockInfo());
- getChmBlockInfo().setStartBlock(dle.getOffset() / bytesPerBlock);
- getChmBlockInfo().setEndBlock(
- (dle.getOffset() + dle.getLength()) / bytesPerBlock);
- getChmBlockInfo().setStartOffset(dle.getOffset() % bytesPerBlock);
- getChmBlockInfo().setEndOffset(
- (dle.getOffset() + dle.getLength()) % bytesPerBlock);
- // potential problem with casting long to int
- getChmBlockInfo().setIniBlock(
- getChmBlockInfo().startBlock - getChmBlockInfo().startBlock
- % (int) clcd.getResetInterval());
-// (getChmBlockInfo().startBlock - getChmBlockInfo().startBlock)
-// % (int) clcd.getResetInterval());
- return getChmBlockInfo();
- }
-
- /**
- * Returns textual representation of ChmBlockInfo
- */
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append("iniBlock:=" + getIniBlock() + ", ");
- sb.append("startBlock:=" + getStartBlock() + ", ");
- sb.append("endBlock:=" + getEndBlock() + ", ");
- sb.append("startOffset:=" + getStartOffset() + ", ");
- sb.append("endOffset:=" + getEndOffset()
- + System.getProperty("line.separator"));
- return sb.toString();
- }
-
- private boolean validateParameters(DirectoryListingEntry dle,
- int bytesPerBlock, ChmLzxcControlData clcd,
- ChmBlockInfo chmBlockInfo) {
- int goodParameter = 0;
- if (dle != null)
- ++goodParameter;
- if (bytesPerBlock > 0)
- ++goodParameter;
- if (clcd != null)
- ++goodParameter;
- if (chmBlockInfo != null)
- ++goodParameter;
- return (goodParameter == 4);
- }
-
- public static void main(String[] args) {
- }
-
- /**
- * Returns an initial block index
- *
- * @return int
- */
- public int getIniBlock() {
- return iniBlock;
- }
-
- /**
- * Sets the initial block index
- *
- * @param iniBlock
- * - int
- */
- private void setIniBlock(int iniBlock) {
- this.iniBlock = iniBlock;
- }
-
- /**
- * Returns the start block index
- *
- * @return int
- */
- public int getStartBlock() {
- return startBlock;
- }
-
- /**
- * Sets the start block index
- *
- * @param startBlock
- * - int
- */
- private void setStartBlock(int startBlock) {
- this.startBlock = startBlock;
- }
-
- /**
- * Returns the end block index
- *
- * @return - int
- */
- public int getEndBlock() {
- return endBlock;
- }
-
- /**
- * Sets the end block index
- *
- * @param endBlock
- * - int
- */
- private void setEndBlock(int endBlock) {
- this.endBlock = endBlock;
- }
-
- /**
- * Returns the start offset index
- *
- * @return - int
- */
- public int getStartOffset() {
- return startOffset;
- }
-
- /**
- * Sets the start offset index
- *
- * @param startOffset
- * - int
- */
- private void setStartOffset(int startOffset) {
- this.startOffset = startOffset;
- }
-
- /**
- * Returns the end offset index
- *
- * @return - int
- */
- public int getEndOffset() {
- return endOffset;
- }
-
- /**
- * Sets the end offset index
- *
- * @param endOffset
- * - int
- */
- private void setEndOffset(int endOffset) {
- this.endOffset = endOffset;
- }
-
- public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
- ChmBlockInfo.chmBlockInfo = chmBlockInfo;
- }
-
- public static ChmBlockInfo getChmBlockInfo() {
- return chmBlockInfo;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * A container that contains chm block information such as: i. initial block is
+ * using to reset main tree ii. start block is using for knowing where to start
+ * iii. end block is using for knowing where to stop iv. start offset is using
+ * for knowing where to start reading v. end offset is using for knowing where
+ * to stop reading
+ *
+ */
+/**
+ * Describes where a directory listing entry lies in terms of fixed-size
+ * blocks. For an entry with a given offset and length and a block size of
+ * {@code bytesPerBlock}:
+ * <ul>
+ * <li>start block = offset / bytesPerBlock</li>
+ * <li>end block = (offset + length) / bytesPerBlock</li>
+ * <li>start offset = offset % bytesPerBlock</li>
+ * <li>end offset = (offset + length) % bytesPerBlock</li>
+ * <li>initial block = start block rounded down to a multiple of the reset
+ * interval taken from the LZXC control data</li>
+ * </ul>
+ * Instances are created through {@link #getChmBlockInfoInstance}; the most
+ * recently created one is also kept in a static field for callers that read
+ * it back through {@link #getChmBlockInfo()}.
+ */
+public class ChmBlockInfo {
+    /* class members */
+    private int iniBlock;
+    private int startBlock;
+    private int endBlock;
+    private int startOffset;
+    private int endOffset;
+
+    /* Last instance published by getChmBlockInfoInstance / setChmBlockInfo. */
+    private static ChmBlockInfo chmBlockInfo = null;
+
+    private ChmBlockInfo() {
+    }
+
+    /**
+     * Recomputes the block information of an existing instance.
+     *
+     * @param dle
+     *            - DirectoryListingEntry
+     * @param bytesPerBlock
+     *            - int, = chmLzxcResetTable.block_length
+     * @param clcd
+     *            - ChmLzxcControlData
+     * @param chmBlockInfo
+     *            - ChmBlockInfo to update; it is also the return value
+     *
+     * @return the updated {@code chmBlockInfo}
+     * @throws TikaException
+     *             (a {@link ChmParsingException}) if any object argument is
+     *             null or {@code bytesPerBlock} is not positive
+     */
+    protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
+            int bytesPerBlock, ChmLzxcControlData clcd,
+            ChmBlockInfo chmBlockInfo) throws TikaException {
+        if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo)) {
+            throw new ChmParsingException("Please check you parameters");
+        }
+        populate(chmBlockInfo, dle, bytesPerBlock, clcd);
+        return chmBlockInfo;
+    }
+
+    /**
+     * Builds a new block information object for the given entry.
+     * <p>
+     * The object is fully populated on a local reference before it is
+     * published to the static field, so the returned instance is always the
+     * one built by this call and a failure during the computation never
+     * leaves a half-initialised object behind in the static field.
+     *
+     * @param dle
+     *            - DirectoryListingEntry
+     * @param bytesPerBlock
+     *            - int, block length used as divisor (must not be 0)
+     * @param clcd
+     *            - ChmLzxcControlData supplying the reset interval
+     * @return the newly created ChmBlockInfo
+     */
+    public static ChmBlockInfo getChmBlockInfoInstance(
+            DirectoryListingEntry dle, int bytesPerBlock,
+            ChmLzxcControlData clcd) {
+        ChmBlockInfo info = new ChmBlockInfo();
+        populate(info, dle, bytesPerBlock, clcd);
+        // Kept for callers that read the last instance via getChmBlockInfo().
+        setChmBlockInfo(info);
+        return info;
+    }
+
+    /**
+     * Computes all five values for {@code target}. The arithmetic is done in
+     * {@code long} so that {@code offset + length} cannot overflow and the
+     * reset interval is not truncated before the modulo is taken; every
+     * result is bounded by the {@code int} inputs and is narrowed afterwards.
+     */
+    private static void populate(ChmBlockInfo target,
+            DirectoryListingEntry dle, int bytesPerBlock,
+            ChmLzxcControlData clcd) {
+        long start = dle.getOffset();
+        long end = start + dle.getLength();
+
+        target.setStartBlock((int) (start / bytesPerBlock));
+        target.setEndBlock((int) (end / bytesPerBlock));
+        target.setStartOffset((int) (start % bytesPerBlock));
+        target.setEndOffset((int) (end % bytesPerBlock));
+        // Round the start block down to a multiple of the reset interval.
+        target.setIniBlock((int) (target.startBlock
+                - target.startBlock % clcd.getResetInterval()));
+    }
+
+    /**
+     * Returns textual representation of ChmBlockInfo
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("iniBlock:=").append(getIniBlock()).append(", ");
+        sb.append("startBlock:=").append(getStartBlock()).append(", ");
+        sb.append("endBlock:=").append(getEndBlock()).append(", ");
+        sb.append("startOffset:=").append(getStartOffset()).append(", ");
+        sb.append("endOffset:=").append(getEndOffset())
+                .append(System.getProperty("line.separator"));
+        return sb.toString();
+    }
+
+    /**
+     * Checks that every object argument is non-null and that the block size
+     * is positive.
+     */
+    private boolean validateParameters(DirectoryListingEntry dle,
+            int bytesPerBlock, ChmLzxcControlData clcd,
+            ChmBlockInfo chmBlockInfo) {
+        return dle != null && bytesPerBlock > 0 && clcd != null
+                && chmBlockInfo != null;
+    }
+
+    /* Intentionally empty; retained only so the public entry point still exists. */
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Returns an initial block index
+     *
+     * @return int
+     */
+    public int getIniBlock() {
+        return iniBlock;
+    }
+
+    /**
+     * Sets the initial block index
+     *
+     * @param iniBlock
+     *            - int
+     */
+    private void setIniBlock(int iniBlock) {
+        this.iniBlock = iniBlock;
+    }
+
+    /**
+     * Returns the start block index
+     *
+     * @return int
+     */
+    public int getStartBlock() {
+        return startBlock;
+    }
+
+    /**
+     * Sets the start block index
+     *
+     * @param startBlock
+     *            - int
+     */
+    private void setStartBlock(int startBlock) {
+        this.startBlock = startBlock;
+    }
+
+    /**
+     * Returns the end block index
+     *
+     * @return - int
+     */
+    public int getEndBlock() {
+        return endBlock;
+    }
+
+    /**
+     * Sets the end block index
+     *
+     * @param endBlock
+     *            - int
+     */
+    private void setEndBlock(int endBlock) {
+        this.endBlock = endBlock;
+    }
+
+    /**
+     * Returns the start offset index
+     *
+     * @return - int
+     */
+    public int getStartOffset() {
+        return startOffset;
+    }
+
+    /**
+     * Sets the start offset index
+     *
+     * @param startOffset
+     *            - int
+     */
+    private void setStartOffset(int startOffset) {
+        this.startOffset = startOffset;
+    }
+
+    /**
+     * Returns the end offset index
+     *
+     * @return - int
+     */
+    public int getEndOffset() {
+        return endOffset;
+    }
+
+    /**
+     * Sets the end offset index
+     *
+     * @param endOffset
+     *            - int
+     */
+    private void setEndOffset(int endOffset) {
+        this.endOffset = endOffset;
+    }
+
+    /**
+     * Replaces the statically held instance.
+     *
+     * @param chmBlockInfo
+     *            - the instance to publish; may be null
+     */
+    public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
+        ChmBlockInfo.chmBlockInfo = chmBlockInfo;
+    }
+
+    /**
+     * Returns the statically held instance, i.e. the one most recently
+     * passed to {@link #setChmBlockInfo} or built by
+     * {@link #getChmBlockInfoInstance}.
+     *
+     * @return the published instance, or null if none has been set
+     */
+    public static ChmBlockInfo getChmBlockInfo() {
+        return chmBlockInfo;
+    }
+}