You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/06/07 17:44:42 UTC
svn commit: r1133047 [2/3] - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm:
accessor/ assertion/ core/ exception/ lzx/
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.assertion;
+
+import java.io.InputStream;
+
+import org.apache.tika.parser.chm.accessor.ChmAccessor;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Contains chm extractor assertions
+ */
+public class ChmAssert {
+    /**
+     * Checks the validity of the parameters used to cut a block segment out
+     * of the chm data.
+     *
+     * @param data
+     *            raw bytes; must be non-null and non-empty
+     * @param resetTable
+     *            lzxc reset table; must be non-null and expose more than one
+     *            block address
+     * @param blockNumber
+     *            block number; must not be negative
+     * @param lzxcBlockOffset
+     *            offset of the lzxc block; must not be negative
+     * @param lzxcBlockLength
+     *            length of the lzxc block; must not be negative
+     * @throws ChmParsingException
+     *             if any of the checks fails
+     */
+    public static final void assertChmBlockSegment(byte[] data,
+            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+            int lzxcBlockLength) {
+        if (data == null) {
+            throw new ChmParsingException("data[] is null");
+        }
+
+        if (data.length <= 0) {
+            throw new ChmParsingException(
+                    "data[] length should be greater than zero");
+        }
+
+        if (resetTable == null) {
+            throw new ChmParsingException("resetTable is null");
+        }
+
+        // NOTE(review): the condition rejects a table with a single address;
+        // the message now states what is actually enforced. Confirm whether
+        // a one-entry table should be accepted instead.
+        if (resetTable.getBlockAddress().length <= 1) {
+            throw new ChmParsingException(
+                    "resetTable.getBlockAddress().length should be greater than one");
+        }
+
+        if (blockNumber < 0) {
+            throw new ChmParsingException(
+                    "blockNumber should be positive number");
+        }
+
+        if (lzxcBlockOffset < 0) {
+            throw new ChmParsingException(
+                    "lzxcBlockOffset should be positive number");
+        }
+
+        if (lzxcBlockLength < 0) {
+            throw new ChmParsingException(
+                    "lzxcBlockLength should be positive number");
+        }
+    }
+
+    /**
+     * Checks that the InputStream is not null.
+     *
+     * @param is
+     *            InputStream
+     * @throws ChmParsingException
+     *             if the stream is null
+     */
+    public static final void assertInputStreamNotNull(InputStream is) {
+        if (is == null) {
+            throw new ChmParsingException("input stream is null");
+        }
+    }
+
+    /**
+     * Checks the validity of the parameters handed to a ChmAccessor.
+     *
+     * @param data
+     *            bytes to be parsed; must not be null
+     * @param chmAccessor
+     *            accessor that will receive the parsed values; must not be
+     *            null
+     * @param count
+     *            currently not validated; kept for interface compatibility
+     * @throws ChmParsingException
+     *             if data or chmAccessor is null
+     */
+    public static final void assertChmAccessorParameters(byte[] data,
+            ChmAccessor<?> chmAccessor, int count) {
+        assertByteArrayNotNull(data);
+        assertChmAccessorNotNull(chmAccessor);
+    }
+
+    /**
+     * Checks that the byte[] is not null.
+     *
+     * @param data
+     *            array to check
+     * @throws ChmParsingException
+     *             if data is null
+     */
+    public static final void assertByteArrayNotNull(byte[] data) {
+        if (data == null) {
+            throw new ChmParsingException("byte[] data is null");
+        }
+    }
+
+    /**
+     * Checks that the ChmAccessor is not null.
+     *
+     * @param chmAccessor
+     *            accessor to check
+     * @throws ChmParsingException
+     *             if chmAccessor is null
+     */
+    public static final void assertChmAccessorNotNull(ChmAccessor<?> chmAccessor) {
+        if (chmAccessor == null) {
+            throw new ChmParsingException("chm header is null");
+        }
+    }
+
+    /**
+     * Checks the validity of the DirectoryListingEntry's parameters.
+     *
+     * @param name_length
+     *            length of the chm entry name; must not be negative
+     * @param name
+     *            chm entry name; must not be null
+     * @param entryType
+     *            must be EntryType.COMPRESSED or EntryType.UNCOMPRESSED
+     * @param offset
+     *            must not be negative
+     * @param length
+     *            must not be negative
+     * @throws ChmParsingException
+     *             if any parameter is invalid
+     */
+    public static final void assertDirectoryListingEntry(int name_length,
+            String name, ChmCommons.EntryType entryType, int offset, int length) {
+        if (name_length < 0) {
+            throw new ChmParsingException("invalid name length");
+        }
+        if (name == null) {
+            throw new ChmParsingException("invalid name");
+        }
+
+        if ((entryType != ChmCommons.EntryType.COMPRESSED)
+                && (entryType != ChmCommons.EntryType.UNCOMPRESSED)) {
+            throw new ChmParsingException(
+                    "invalid compressed type, should be EntryType.COMPRESSED | EntryType.UNCOMPRESSED");
+        }
+
+        if (offset < 0) {
+            throw new ChmParsingException("invalid offset");
+        }
+
+        if (length < 0) {
+            throw new ChmParsingException("invalid length");
+        }
+    }
+
+    /**
+     * Checks that an index may be used as the start of a copy from an array
+     * of the given length, i.e. that 0 <= index < dataLength.
+     *
+     * @param index
+     *            start index of the copy
+     * @param dataLength
+     *            length of the array that is going to be copied from
+     * @throws ChmParsingException
+     *             if the index is negative or not smaller than dataLength
+     */
+    public static void assertCopyingDataIndex(int index, int dataLength) {
+        // A negative start index would otherwise only fail later, inside the
+        // array copy, with an ArrayIndexOutOfBoundsException.
+        if (index < 0) {
+            throw new ChmParsingException("cannot parse chm file index < 0");
+        }
+        if (index >= dataLength) {
+            throw new ChmParsingException(
+                    "cannot parse chm file index > data.length");
+        }
+    }
+
+    /**
+     * Checks that the int parameter is greater than zero.
+     *
+     * @param param
+     *            value to check
+     * @throws ChmParsingException
+     *             if param <= 0
+     */
+    public static void assertPositiveInt(int param) {
+        if (param <= 0) {
+            throw new ChmParsingException(
+                    "parameter should be greater than zero, but was " + param);
+        }
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,374 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+public class ChmCommons {
+    /* Prevents initialization */
+    private ChmCommons() {
+    }
+
+    /**
+     * Checks that the given byte array is not null.
+     *
+     * @param data
+     *            array to check
+     * @throws ChmParsingException
+     *             if data is null
+     */
+    public static void assertByteArrayNotNull(byte[] data) {
+        if (data == null) {
+            throw new ChmParsingException("byte[] is null");
+        }
+    }
+
+    /**
+     * Represents entry types: uncompressed, compressed
+     */
+    public enum EntryType {
+        UNCOMPRESSED, COMPRESSED
+    }
+
+    /**
+     * Represents lzx states: started decoding, not started decoding
+     */
+    public enum LzxState {
+        STARTED_DECODING, NOT_STARTED_DECODING
+    }
+
+    /**
+     * Represents intel file states during decompression
+     */
+    public enum IntelState {
+        STARTED, NOT_STARTED
+    }
+
+    /*
+     * Lzx block types; each type is decompressed differently
+     */
+    public final static int UNDEFINED = 0;
+    public final static int VERBATIM = 1;
+    public final static int ALIGNED_OFFSET = 2;
+    public final static int UNCOMPRESSED = 3;
+
+    /**
+     * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb). Returns X,
+     * i.e. the exponent of 2^X, by counting how many unsigned right shifts
+     * are needed until the value is no longer greater than one.
+     *
+     * @param window
+     *            chmLzxControlData.getWindowSize()
+     *
+     * @return exponent of the window size; 0 for window values of 0 and 1
+     */
+    public static int getWindowSize(int window) {
+        int win = 0;
+        while (window > 1) {
+            window >>>= 1;
+            win++;
+        }
+        return win;
+    }
+
+    /**
+     * Cuts the bytes of one lzx block out of the chm data. The block starts
+     * at lzxcBlockOffset plus the block's address from the reset table; its
+     * length is the distance to the next block address or, for the last
+     * block in the table, lzxcBlockLength minus the block's address.
+     *
+     * @param data
+     *            chm bytes
+     * @param resetTable
+     *            lzxc reset table holding the block addresses
+     * @param blockNumber
+     *            index of the block inside the reset table
+     * @param lzxcBlockOffset
+     *            offset of the compressed content inside data
+     * @param lzxcBlockLength
+     *            length of the compressed content
+     * @return copy of the block's bytes
+     * @throws ChmParsingException
+     *             if ChmAssert.assertChmBlockSegment rejects the parameters
+     */
+    public static byte[] getChmBlockSegment(byte[] data,
+            ChmLzxcResetTable resetTable, int blockNumber, int lzxcBlockOffset,
+            int lzxcBlockLength) {
+        ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber,
+                lzxcBlockOffset, lzxcBlockLength);
+        int blockLength = -1;
+        // TODO add int_max_value checking
+        if (blockNumber < (resetTable.getBlockAddress().length - 1)) {
+            blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - resetTable
+                    .getBlockAddress()[blockNumber]);
+        } else {
+            // NOTE(review): for blockNumber >= length the length is set to
+            // zero here, but the address lookup below still indexes the
+            // table with blockNumber and therefore fails with an
+            // ArrayIndexOutOfBoundsException. Confirm the intended result.
+            if (blockNumber >= resetTable.getBlockAddress().length) {
+                blockLength = 0;
+            } else {
+                blockLength = (int) (lzxcBlockLength - resetTable
+                        .getBlockAddress()[blockNumber]);
+            }
+        }
+        // Arrays.copyOfRange never returns null, so the result is returned
+        // directly.
+        return Arrays
+                .copyOfRange(
+                        data,
+                        (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]),
+                        (int) (lzxcBlockOffset
+                                + resetTable.getBlockAddress()[blockNumber] + blockLength));
+    }
+
+    /**
+     * Returns textual representation of LangID
+     *
+     * @param langID
+     *            language identifier
+     *
+     * @return language name, or a text starting with "unknown" for
+     *         identifiers that are not listed
+     */
+    public static String getLanguage(long langID) {
+        // Values that do not fit into an int must not be truncated into a
+        // known identifier; -1 is not a listed identifier and falls through
+        // to the default branch.
+        int id = (langID == (int) langID) ? (int) langID : -1;
+        switch (id) {
+        case 1025:
+            return "Arabic";
+        case 1069:
+            return "Basque";
+        case 1027:
+            return "Catalan";
+        case 2052:
+            return "Chinese (Simplified)";
+        case 1028:
+            return "Chinese (Traditional)";
+        case 1029:
+            return "Czech";
+        case 1030:
+            return "Danish";
+        case 1043:
+            return "Dutch";
+        case 1033:
+            return "English (United States)";
+        case 1035:
+            return "Finnish";
+        case 1036:
+            return "French";
+        case 1031:
+            return "German";
+        case 1032:
+            return "Greek";
+        case 1037:
+            return "Hebrew";
+        case 1038:
+            return "Hungarian";
+        case 1040:
+            return "Italian";
+        case 1041:
+            return "Japanese";
+        case 1042:
+            return "Korean";
+        case 1044:
+            return "Norwegian";
+        case 1045:
+            return "Polish";
+        case 2070:
+            return "Portuguese";
+        case 1046:
+            return "Portuguese (Brazil)";
+        case 1049:
+            return "Russian";
+        case 1051:
+            return "Slovakian";
+        case 1060:
+            return "Slovenian";
+        case 3082:
+            return "Spanish";
+        case 1053:
+            return "Swedish";
+        case 1055:
+            return "Turkish";
+        default:
+            return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx";
+        }
+    }
+
+    /**
+     * Checks skippable patterns: entry names starting with "/$", "/#" or
+     * "::".
+     *
+     * @param directoryListingEntry
+     *            entry whose name is checked
+     *
+     * @return true if the entry name starts with one of the patterns
+     */
+    public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) {
+        String name = directoryListingEntry.getName();
+        return name.startsWith("/$") || name.startsWith("/#")
+                || name.startsWith("::");
+    }
+
+    /**
+     * Writes byte[][] to the file. Does nothing if the buffer is null or the
+     * file name is null or empty. Null rows of the buffer are skipped. I/O
+     * problems are reported on System.err and not propagated.
+     *
+     * @param buffer
+     *            rows to be written one after another
+     * @param fileToBeSaved
+     *            file name
+     */
+    public static void writeFile(byte[][] buffer, String fileToBeSaved) {
+        if (buffer == null || fileToBeSaved == null || fileToBeSaved.isEmpty()) {
+            return;
+        }
+        FileOutputStream output = null;
+        try {
+            output = new FileOutputStream(fileToBeSaved);
+            for (int i = 0; i < buffer.length; i++) {
+                // Rows may be left unset, e.g. in a freshly allocated
+                // byte[n][]; writing null would throw a NullPointerException.
+                if (buffer[i] != null) {
+                    output.write(buffer[i]);
+                }
+            }
+        } catch (FileNotFoundException e) {
+            System.err.println("The " + fileToBeSaved
+                    + " does not seem correct");
+        } catch (IOException e) {
+            e.printStackTrace();
+        } finally {
+            if (output != null) {
+                try {
+                    // close() is the only call needed here; a separate
+                    // flush() that fails would skip the close.
+                    output.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * Reverses the order of given array in place. A null array is ignored.
+     *
+     * @param array
+     *            array to be reversed
+     */
+    public static void reverse(byte[] array) {
+        if (array == null) {
+            return;
+        }
+        int i = 0;
+        int j = array.length - 1;
+        byte tmp;
+        while (j > i) {
+            tmp = array[j];
+            array[j] = array[i];
+            array[i] = tmp;
+            j--;
+            i++;
+        }
+    }
+
+    /**
+     * Reads the whole InputStream into a byte array and closes the stream,
+     * also when reading fails.
+     *
+     * @param is
+     *            InputStream of chm file
+     *
+     * @return byte array
+     *
+     * @throws IOException
+     *             if reading from the stream fails
+     * @throws ChmParsingException
+     *             if the stream is null
+     */
+    public static byte[] toByteArray(InputStream is) throws IOException {
+        if (is == null) {
+            throw new ChmParsingException("InputStream is null");
+        }
+        try {
+            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+            int nRead;
+            byte[] data = new byte[16384];
+            while ((nRead = is.read(data, 0, data.length)) != -1) {
+                buffer.write(data, 0, nRead);
+            }
+            return buffer.toByteArray();
+        } finally {
+            // A failing close is only reported, it must not hide the bytes
+            // already read or the original read failure.
+            try {
+                is.close();
+            } catch (Exception e) {
+                System.err.println(e.getMessage());
+            }
+        }
+    }
+
+    /**
+     * Returns an index of the reset table: the position of the pattern
+     * inside the text minus 4.
+     *
+     * @param text
+     *            bytes to search in
+     * @param pattern
+     *            bytes to search for
+     * @return index of the pattern minus 4; -5 if the pattern is not found
+     */
+    public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) {
+        return (indexOf(text, pattern)) - 4;
+    }
+
+    /**
+     * Searches some pattern in byte[]
+     *
+     * @param text
+     *            byte[]
+     * @param pattern
+     *            byte[]
+     * @return index of the first occurrence, 0 for an empty pattern, -1 if
+     *         nothing found
+     * @throws ChmParsingException
+     *             if text or pattern is null
+     */
+    public static int indexOf(byte[] text, byte[] pattern) {
+        if (pattern == null || text == null) {
+            throw new ChmParsingException(
+                    "pattern and/or text should not be null");
+        }
+        // An empty pattern matches at the very beginning; without this guard
+        // the preprocessing below would write to next[0] of an empty array.
+        if (pattern.length == 0) {
+            return 0;
+        }
+
+        int i = 0, j = -1;
+
+        /* Preprocessing */
+        int[] next = new int[pattern.length];
+        next[0] = -1;
+
+        /* Computes a failure function */
+        while (i < pattern.length - 1) {
+            if (j == -1 || pattern[i] == pattern[j]) {
+                i++;
+                j++;
+                if (pattern[i] != pattern[j]) {
+                    next[i] = j;
+                } else {
+                    next[i] = next[j];
+                }
+            } else {
+                j = next[j];
+            }
+        }
+
+        /* Reinitializes local variables */
+        i = j = 0;
+
+        /* Matching */
+        while (i < text.length && j < pattern.length) {
+            if (j == -1 || pattern[j] == text[i]) {
+                i++;
+                j++;
+            } else {
+                j = next[j];
+            }
+        }
+        if (j == pattern.length) {
+            return (i - j); // match found at offset i - pattern.length
+        }
+        return -1; // not found
+    }
+
+    /**
+     * Searches for some pattern in the directory listing entry list. An
+     * entry matches if its toString() value contains the pattern.
+     *
+     * @param list
+     *            entries to search in
+     * @param pattern
+     *            text to search for
+     * @return index of the first matching entry, if nothing found returns -1
+     */
+    public static int indexOf(List<DirectoryListingEntry> list, String pattern) {
+        int place = 0;
+        for (DirectoryListingEntry directoryListingEntry : list) {
+            if (directoryListingEntry.toString().contains(pattern)) {
+                return place;
+            }
+            ++place;
+        }
+        return -1;// not found
+    }
+
+    /**
+     * Does nothing.
+     *
+     * @param args
+     *            ignored
+     */
+    public static void main(String[] args) {
+    }
+
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+/**
+ * Constants used while parsing chm files and decompressing lzx blocks.
+ */
+public class ChmConstants {
+    /* Prevents instantiation */
+    private ChmConstants() {
+    }
+
+    public static final String DEFAULT_CHARSET = "UTF-8";
+
+    /* Signatures / markers */
+    public static final String ITSF = "ITSF";
+    public static final String ITSP = "ITSP";
+    public static final String PMGL = "PMGL";
+    public static final String LZXC = "LZXC";
+    public static final String CHM_PMGI_MARKER = "PMGI";
+
+    /* Lengths, in bytes */
+    public static final int BYTE_ARRAY_LENGHT = 16;
+    public static final int CHM_ITSF_V2_LEN = 0x58;
+    public static final int CHM_ITSF_V3_LEN = 0x60;
+    public static final int CHM_ITSP_V1_LEN = 0x54;
+    public static final int CHM_PMGL_LEN = 0x14;
+    public static final int CHM_PMGI_LEN = 0x08;
+    public static final int CHM_LZXC_RESETTABLE_V1_LEN = 0x28;
+    public static final int CHM_LZXC_MIN_LEN = 0x18;
+    public static final int CHM_LZXC_V2_LEN = 0x1c;
+    public static final int CHM_SIGNATURE_LEN = 4;
+
+    /* Versions */
+    public static final int CHM_VER_2 = 2;
+    public static final int CHM_VER_3 = 3;
+    public static final int CHM_VER_1 = 1;
+
+    public static final int CHM_WINDOW_SIZE_BLOCK = 0x8000;
+
+    /* my hacking */
+    public static final int START_PMGL = 0xCC;
+    public static final String CONTROL_DATA = "ControlData";
+    public static final String RESET_TABLE = "ResetTable";
+    public static final String CONTENT = "Content";
+
+    /* some constants defined by the LZX specification */
+    public static final int LZX_MIN_MATCH = 2;
+    public static final int LZX_MAX_MATCH = 257;
+    public static final int LZX_NUM_CHARS = 256;
+    /* also blocktypes 4-7 invalid */
+    public static final int LZX_BLOCKTYPE_INVALID = 0;
+    public static final int LZX_BLOCKTYPE_VERBATIM = 1;
+    public static final int LZX_BLOCKTYPE_ALIGNED = 2;
+    public static final int LZX_BLOCKTYPE_UNCOMPRESSED = 3;
+    public static final int LZX_PRETREE_NUM_ELEMENTS_BITS = 4; /* ??? */
+    public static final int LZX_PRETREE_NUM_ELEMENTS = 20;
+    /* aligned offset tree #elements */
+    public static final int LZX_ALIGNED_NUM_ELEMENTS = 8;
+    /* this one missing from spec! */
+    public static final int LZX_NUM_PRIMARY_LENGTHS = 7;
+    /* length tree #elements */
+    public static final int LZX_NUM_SECONDARY_LENGTHS = 249;
+
+    /* LZX huffman defines: tweak tablebits as desired */
+    public static final int LZX_PRETREE_MAXSYMBOLS = LZX_PRETREE_NUM_ELEMENTS;
+    public static final int LZX_PRETREE_TABLEBITS = 6;
+    public static final int LZX_MAINTREE_MAXSYMBOLS = LZX_NUM_CHARS + 50 * 8;
+    public static final int LZX_MAIN_MAXSYMBOLS = LZX_NUM_CHARS * 2;
+    public static final int LZX_MAINTREE_TABLEBITS = 12;
+    public static final int LZX_LENGTH_MAXSYMBOLS = LZX_NUM_SECONDARY_LENGTHS + 1;
+    public static final int LZX_LENGTH_TABLEBITS = 12;
+    public static final int LZX_ALIGNED_MAXSYMBOLS = LZX_ALIGNED_NUM_ELEMENTS;
+    public static final int LZX_ALIGNED_TABLEBITS = 7;
+    public static final int LZX_LENTABLE_SAFETY = 64;
+
+    /*
+     * Lookup tables with 51 entries each. The references are final so that
+     * no code can swap the tables; the array contents are still writable and
+     * must be treated as read-only.
+     */
+    public static final short[] EXTRA_BITS = { 0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
+            4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+            13, 14, 14, 15, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+            17, 17, 17, 17, 17, 17 };
+
+    public static final int[] POSITION_BASE = { 0, 1, 2, 3, 4, 6, 8, 12, 16,
+            24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 768, 1024, 1536,
+            2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, 49152,
+            65536, 98304, 131072, 196608, 262144, 393216, 524288, 655360,
+            786432, 917504, 1048576, 1179648, 1310720, 1441792, 1572864,
+            1703936, 1835008, 1966080, 2097152 };
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,384 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.core;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+/**
+ * Extracts text from chm file. Enumerates chm entries.
+ */
+public class ChmExtractor {
+    /* Lzx blocks created by extractChmEntry, kept for later calls */
+    private List<ChmLzxBlock> lzxBlocksCache = null;
+    private ChmDirectoryListingSet chmDirList = null;
+    private ChmItsfHeader chmItsfHeader = null;
+    private ChmItspHeader chmItspHeader = null;
+    private ChmLzxcResetTable chmLzxcResetTable = null;
+    private ChmLzxcControlData chmLzxcControlData = null;
+    /* Complete content of the chm stream */
+    private byte[] data = null;
+    /* Position of the "Content" entry inside the directory listing */
+    private int indexOfContent;
+    private long lzxBlockOffset;
+    private long lzxBlockLength;
+
+    /**
+     * Returns lzxc control data.
+     *
+     * @return ChmLzxcControlData
+     */
+    private ChmLzxcControlData getChmLzxcControlData() {
+        return chmLzxcControlData;
+    }
+
+    /**
+     * Sets lzxc control data
+     *
+     * @param chmLzxcControlData
+     */
+    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+        this.chmLzxcControlData = chmLzxcControlData;
+    }
+
+    private ChmItspHeader getChmItspHeader() {
+        return chmItspHeader;
+    }
+
+    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
+        this.chmItspHeader = chmItspHeader;
+    }
+
+    /**
+     * Returns lzxc reset table
+     *
+     * @return ChmLzxcResetTable
+     */
+    private ChmLzxcResetTable getChmLzxcResetTable() {
+        return chmLzxcResetTable;
+    }
+
+    /**
+     * Sets lzxc reset table
+     *
+     * @param chmLzxcResetTable
+     */
+    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+        this.chmLzxcResetTable = chmLzxcResetTable;
+    }
+
+    /**
+     * Returns lzxc block length
+     *
+     * @return lzxBlockLength
+     */
+    private long getLzxBlockLength() {
+        return lzxBlockLength;
+    }
+
+    /**
+     * Sets lzxc block length
+     *
+     * @param lzxBlockLength
+     */
+    private void setLzxBlockLength(long lzxBlockLength) {
+        this.lzxBlockLength = lzxBlockLength;
+    }
+
+    /**
+     * Returns lzxc block offset
+     *
+     * @return lzxBlockOffset
+     */
+    private long getLzxBlockOffset() {
+        return lzxBlockOffset;
+    }
+
+    /**
+     * Sets lzxc block offset
+     */
+    private void setLzxBlockOffset(long lzxBlockOffset) {
+        this.lzxBlockOffset = lzxBlockOffset;
+    }
+
+    private int getIndexOfContent() {
+        return indexOfContent;
+    }
+
+    private void setIndexOfContent(int indexOfContent) {
+        this.indexOfContent = indexOfContent;
+    }
+
+    private byte[] getData() {
+        return data;
+    }
+
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /**
+     * Reads the whole stream into memory and parses the structures needed
+     * for extraction: itsf header, itsp header, directory listing, lzxc
+     * control data and lzxc reset table. Also determines offset and length
+     * of the compressed content and creates an empty block cache.
+     *
+     * An IOException is only reported on System.err; in that case the
+     * fields that were not reached stay unset.
+     *
+     * @param is
+     *            chm stream; it is closed by ChmCommons.toByteArray
+     * @throws ChmParsingException
+     *             if the stream is null, or if one of the called assertions
+     *             fails
+     */
+    public ChmExtractor(InputStream is) {
+        ChmAssert.assertInputStreamNotNull(is);
+        try {
+            setData(ChmCommons.toByteArray(is));
+
+            /* Creates and parses chm itsf header */
+            // NOTE(review): the end index is exclusive, so this passes
+            // CHM_ITSF_V3_LEN - 1 bytes to the parser; confirm that the
+            // "- 1" is intended.
+            setChmItsfHeader(new ChmItsfHeader());
+            getChmItsfHeader().parse(
+                    Arrays.copyOfRange(getData(), 0,
+                            ChmConstants.CHM_ITSF_V3_LEN - 1),
+                    getChmItsfHeader());
+
+            /* Creates and parses chm itsp header */
+            setChmItspHeader(new ChmItspHeader());
+            getChmItspHeader().parse(
+                    Arrays.copyOfRange(getData(), (int) getChmItsfHeader()
+                            .getDirOffset(), (int) getChmItsfHeader()
+                            .getDirOffset() + ChmConstants.CHM_ITSP_V1_LEN),
+                    getChmItspHeader());
+
+            /* Creates instance of ChmDirListingContainer */
+            setChmDirList(new ChmDirectoryListingSet(getData(),
+                    getChmItsfHeader(), getChmItspHeader()));
+
+            int indexOfControlData = getChmDirList().getControlDataIndex();
+            // The signature is encoded with an explicit charset so that the
+            // searched bytes do not depend on the platform default.
+            int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
+                    ChmConstants.LZXC.getBytes(ChmConstants.DEFAULT_CHARSET));
+            // Stays null when the signature is not found (the index is then
+            // negative) and is handed to the parser as is.
+            byte[] dir_chunk = null;
+            if (indexOfResetData > 0) {
+                dir_chunk = Arrays.copyOfRange(
+                        getData(),
+                        indexOfResetData,
+                        indexOfResetData
+                                + getChmDirList()
+                                        .getDirectoryListingEntryList()
+                                        .get(indexOfControlData).getLength());
+            }
+
+            /* Creates and parses chm control data */
+            setChmLzxcControlData(new ChmLzxcControlData());
+            getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());
+
+            int indexOfResetTable = getChmDirList().getResetTableIndex();
+            setChmLzxcResetTable(new ChmLzxcResetTable());
+
+            int startIndex = (int) getChmDirList().getDataOffset()
+                    + getChmDirList().getDirectoryListingEntryList()
+                            .get(indexOfResetTable).getOffset();
+
+            // assert startIndex < data.length
+            ChmAssert.assertCopyingDataIndex(startIndex, getData().length);
+
+            dir_chunk = Arrays.copyOfRange(getData(), startIndex,
+                    startIndex
+                            + getChmDirList().getDirectoryListingEntryList()
+                                    .get(indexOfResetTable).getLength());
+
+            getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());
+
+            setIndexOfContent(ChmCommons.indexOf(getChmDirList()
+                    .getDirectoryListingEntryList(), ChmConstants.CONTENT));
+            setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList()
+                    .get(getIndexOfContent()).getOffset() + getChmItsfHeader()
+                    .getDataOffset()));
+            setLzxBlockLength(getChmDirList().getDirectoryListingEntryList()
+                    .get(getIndexOfContent()).getLength());
+
+            setLzxBlocksCache(new ArrayList<ChmLzxBlock>());
+
+        } catch (IOException e) {
+            // NOTE(review): the failure is only printed, the extractor is
+            // left partially initialized.
+            System.err.println(e.getMessage());
+        }
+    }
+
+    /**
+     * Enumerates chm entities
+     *
+     * @return list with the names of all directory listing entries
+     */
+    public List<String> enumerateChm() {
+        List<String> listOfEntries = new ArrayList<String>();
+        for (Iterator<DirectoryListingEntry> it = getChmDirList()
+                .getDirectoryListingEntryList().iterator(); it.hasNext();) {
+            listOfEntries.add(it.next().getName());
+        }
+        return listOfEntries;
+    }
+
+    /**
+     * Extracts the content of a chm entry.
+     *
+     * Uncompressed entries are copied straight out of the chm data and
+     * returned as a single row. Compressed entries are decoded block by
+     * block; the result then holds one row per lzx block the entry spans.
+     * Entries matched by ChmCommons.hasSkip are not extracted.
+     *
+     * A ChmParsingException raised while extracting is swallowed; whatever
+     * has been collected up to that point is returned.
+     *
+     * @param directoryListingEntry
+     *            entry to be extracted
+     *
+     * @return extracted data; a one-row array whose row is null if nothing
+     *         was extracted
+     */
+    public byte[][] extractChmEntry(DirectoryListingEntry directoryListingEntry) {
+        byte[][] tmp = null;
+        byte[] dataSegment = null;
+        ChmLzxBlock lzxBlock = null;
+        try {
+            /* UNCOMPRESSED type is easiest one */
+            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
+                    && directoryListingEntry.getLength() > 0
+                    && !ChmCommons.hasSkip(directoryListingEntry)) {
+                int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
+                        .getOffset());
+                dataSegment = Arrays.copyOfRange(getData(), dataOffset,
+                        dataOffset + directoryListingEntry.getLength());
+                // The copied bytes have to be handed to the caller;
+                // previously they were dropped and the placeholder result
+                // was returned instead.
+                tmp = new byte[][] { dataSegment };
+            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
+                    && !ChmCommons.hasSkip(directoryListingEntry)) {
+                /* Gets a chm block info */
+                ChmBlockInfo bb = ChmBlockInfo.getChmBlockInfoInstance(
+                        directoryListingEntry, (int) getChmLzxcResetTable()
+                                .getBlockLen(), getChmLzxcControlData());
+                /* One row per block between start block and end block */
+                tmp = new byte[bb.getEndBlock() - bb.getStartBlock() + 1][];
+
+                int i = 0, start = 0, block = 0;
+
+                if ((getLzxBlockLength() < Integer.MAX_VALUE)
+                        && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
+                    // TODO: Improve the caching
+                    // caching ... = O(n^2) - depends on startBlock and endBlock
+                    // Looks for the cached block with the highest block
+                    // number between iniBlock and startBlock.
+                    if (getLzxBlocksCache().size() != 0) {
+                        for (i = 0; i < getLzxBlocksCache().size(); i++) {
+                            lzxBlock = getLzxBlocksCache().get(i);
+                            for (int j = bb.getIniBlock(); j <= bb
+                                    .getStartBlock(); j++) {
+                                if (lzxBlock.getBlockNumber() == j) {
+                                    if (j > start) {
+                                        start = j;
+                                        block = i;
+                                    }
+                                }
+                                if (start == bb.getStartBlock()) {
+                                    break;
+                                }
+                            }
+                        }
+                    }
+
+                    if (i == getLzxBlocksCache().size() && i == 0) {
+                        /* Empty cache: decoding starts at iniBlock */
+                        start = bb.getIniBlock();
+
+                        dataSegment = ChmCommons.getChmBlockSegment(getData(),
+                                getChmLzxcResetTable(), start,
+                                (int) getLzxBlockOffset(),
+                                (int) getLzxBlockLength());
+
+                        lzxBlock = new ChmLzxBlock(start, dataSegment,
+                                getChmLzxcResetTable().getBlockLen(), null);
+
+                        getLzxBlocksCache().add(lzxBlock);
+                    } else {
+                        lzxBlock = getLzxBlocksCache().get(block);
+                    }
+
+                    for (i = start; i <= bb.getEndBlock();) {
+                        /* Entry lies completely inside one block */
+                        if (i == bb.getStartBlock() && i == bb.getEndBlock()) {
+                            dataSegment = lzxBlock.getContent(
+                                    bb.getStartOffset(), bb.getEndOffset());
+                            tmp[0] = dataSegment;
+                            break;
+                        }
+
+                        /* First block: from the start offset on */
+                        if (i == bb.getStartBlock()) {
+                            dataSegment = lzxBlock.getContent(bb
+                                    .getStartOffset());
+                            tmp[0] = dataSegment;
+                        }
+
+                        /* Blocks in between: whole content */
+                        if (i > bb.getStartBlock() && i < bb.getEndBlock()) {
+                            dataSegment = lzxBlock.getContent();
+                            tmp[i - bb.getStartBlock()] = dataSegment;
+                        }
+
+                        /* Last block: up to the end offset */
+                        if (i == bb.getEndBlock()) {
+                            dataSegment = lzxBlock.getContent(0,
+                                    bb.getEndOffset());
+                            tmp[i - bb.getStartBlock()] = dataSegment;
+                            break;
+                        }
+
+                        i++;
+
+                        // At a reset interval boundary the next block is
+                        // created without a previous block, otherwise the
+                        // current block is passed on as its predecessor.
+                        if (i % getChmLzxcControlData().getResetInterval() == 0) {
+                            lzxBlock = new ChmLzxBlock(i,
+                                    ChmCommons.getChmBlockSegment(getData(),
+                                            getChmLzxcResetTable(), i,
+                                            (int) getLzxBlockOffset(),
+                                            (int) getLzxBlockLength()),
+                                    getChmLzxcResetTable().getBlockLen(), null);
+                        } else {
+                            lzxBlock = new ChmLzxBlock(i,
+                                    ChmCommons.getChmBlockSegment(getData(),
+                                            getChmLzxcResetTable(), i,
+                                            (int) getLzxBlockOffset(),
+                                            (int) getLzxBlockLength()),
+                                    getChmLzxcResetTable().getBlockLen(),
+                                    lzxBlock);
+                        }
+
+                        getLzxBlocksCache().add(lzxBlock);
+                    }
+
+                    /* Drops the cache once it holds more blocks than exist */
+                    if (getLzxBlocksCache().size() > getChmLzxcResetTable()
+                            .getBlockCount()) {
+                        getLzxBlocksCache().clear();
+                    }
+                }
+            }
+        } catch (ChmParsingException e) {
+            // NOTE(review): parsing problems are ignored on purpose here
+            // (best effort); the caller gets what was extracted so far.
+        }
+        return (tmp != null) ? tmp : (new byte[1][]);
+    }
+
+    private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+        this.lzxBlocksCache = lzxBlocksCache;
+    }
+
+    private List<ChmLzxBlock> getLzxBlocksCache() {
+        return lzxBlocksCache;
+    }
+
+    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
+        this.chmDirList = chmDirList;
+    }
+
+    /**
+     * Returns the directory listing of the chm file.
+     *
+     * @return ChmDirectoryListingSet
+     */
+    public ChmDirectoryListingSet getChmDirList() {
+        return chmDirList;
+    }
+
+    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+        this.chmItsfHeader = chmItsfHeader;
+    }
+
+    private ChmItsfHeader getChmItsfHeader() {
+        return chmItsfHeader;
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,130 @@
+package org.apache.tika.parser.chm.core;
+
+import java.util.List;
+
+import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
+import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
+import org.apache.tika.parser.chm.accessor.ChmItspHeader;
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
+import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
+
+/**
+ * Plain value holder bundling everything that is kept around for one chm
+ * file: the raw bytes, the header/table accessor objects, the lzx blocks
+ * cache and a handful of indexes and offsets. The class contains no logic,
+ * only a getter/setter pair per field. Accessors hand out and store the very
+ * same references, nothing is copied.
+ */
+public class ChmWrapper {
+ /* raw bytes */
+ private byte[] data;
+
+ /* header and table accessors */
+ private ChmItsfHeader chmItsfHeader;
+ private ChmItspHeader chmItspHeader;
+ private ChmDirectoryListingSet chmDirList;
+ private ChmLzxcResetTable chmLzxcResetTable;
+ private ChmLzxcControlData chmLzxcControlData;
+
+ /* lzx related state */
+ private List<ChmLzxBlock> lzxBlocksCache;
+ private long lzxBlockOffset;
+ private long lzxBlockLength;
+
+ /* indexes */
+ private int startIndex;
+ private int indexOfContent;
+ private int indexOfResetData;
+ private int indexOfResetTable;
+
+ /** @return the data array */
+ protected byte[] getData() {
+ return this.data;
+ }
+
+ /** @param data - the data array to keep */
+ protected void setData(byte[] data) {
+ this.data = data;
+ }
+
+ /** @return the itsf header */
+ protected ChmItsfHeader getChmItsfHeader() {
+ return this.chmItsfHeader;
+ }
+
+ /** @param chmItsfHeader - the itsf header to keep */
+ protected void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
+ this.chmItsfHeader = chmItsfHeader;
+ }
+
+ /** @return the itsp header */
+ protected ChmItspHeader getChmItspHeader() {
+ return this.chmItspHeader;
+ }
+
+ /** @param chmItspHeader - the itsp header to keep */
+ protected void setChmItspHeader(ChmItspHeader chmItspHeader) {
+ this.chmItspHeader = chmItspHeader;
+ }
+
+ /** @return the directory listing set */
+ protected ChmDirectoryListingSet getChmDirList() {
+ return this.chmDirList;
+ }
+
+ /** @param chmDirList - the directory listing set to keep */
+ protected void setChmDirList(ChmDirectoryListingSet chmDirList) {
+ this.chmDirList = chmDirList;
+ }
+
+ /** @return the lzxc reset table */
+ protected ChmLzxcResetTable getChmLzxcResetTable() {
+ return this.chmLzxcResetTable;
+ }
+
+ /** @param chmLzxcResetTable - the lzxc reset table to keep */
+ protected void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
+ this.chmLzxcResetTable = chmLzxcResetTable;
+ }
+
+ /** @return the lzxc control data */
+ protected ChmLzxcControlData getChmLzxcControlData() {
+ return this.chmLzxcControlData;
+ }
+
+ /** @param chmLzxcControlData - the lzxc control data to keep */
+ protected void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
+ this.chmLzxcControlData = chmLzxcControlData;
+ }
+
+ /** @return the lzx blocks cache */
+ protected List<ChmLzxBlock> getLzxBlocksCache() {
+ return this.lzxBlocksCache;
+ }
+
+ /** @param lzxBlocksCache - the list to use as lzx blocks cache */
+ protected void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) {
+ this.lzxBlocksCache = lzxBlocksCache;
+ }
+
+ /** @return the lzx block offset */
+ protected long getLzxBlockOffset() {
+ return this.lzxBlockOffset;
+ }
+
+ /** @param lzxBlockOffset - the lzx block offset to keep */
+ protected void setLzxBlockOffset(long lzxBlockOffset) {
+ this.lzxBlockOffset = lzxBlockOffset;
+ }
+
+ /** @return the lzx block length */
+ protected long getLzxBlockLength() {
+ return this.lzxBlockLength;
+ }
+
+ /** @param lzxBlockLength - the lzx block length to keep */
+ protected void setLzxBlockLength(long lzxBlockLength) {
+ this.lzxBlockLength = lzxBlockLength;
+ }
+
+ /** @return the start index */
+ protected int getStartIndex() {
+ return this.startIndex;
+ }
+
+ /** @param startIndex - the start index to keep */
+ protected void setStartIndex(int startIndex) {
+ this.startIndex = startIndex;
+ }
+
+ /** @return the index of content */
+ protected int getIndexOfContent() {
+ return this.indexOfContent;
+ }
+
+ /** @param indexOfContent - the index of content to keep */
+ protected void setIndexOfContent(int indexOfContent) {
+ this.indexOfContent = indexOfContent;
+ }
+
+ /** @return the index of reset data */
+ protected int getIndexOfResetData() {
+ return this.indexOfResetData;
+ }
+
+ /** @param indexOfResetData - the index of reset data to keep */
+ protected void setIndexOfResetData(int indexOfResetData) {
+ this.indexOfResetData = indexOfResetData;
+ }
+
+ /** @return the index of the reset table */
+ protected int getIndexOfResetTable() {
+ return this.indexOfResetTable;
+ }
+
+ /** @param indexOfResetTable - the index of the reset table to keep */
+ protected void setIndexOfResetTable(int indexOfResetTable) {
+ this.indexOfResetTable = indexOfResetTable;
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.exception;
+
+/**
+ * Unchecked exception signalling that chm data could not be parsed.
+ */
+public class ChmParsingException extends RuntimeException {
+ private static final long serialVersionUID = 6497936044733665210L;
+
+ /**
+ * Creates an exception without a detail message.
+ */
+ public ChmParsingException() {
+ super();
+ }
+
+ /**
+ * Creates an exception with a detail message.
+ *
+ * @param description
+ * - text describing what went wrong
+ */
+ public ChmParsingException(String description) {
+ super(description);
+ }
+
+ /**
+ * Creates an exception with a detail message and the underlying cause, so
+ * that the original stack trace is not lost when a lower level failure
+ * is translated into a parsing failure.
+ *
+ * @param description
+ * - text describing what went wrong
+ * @param cause
+ * - the exception that triggered this one, may be null
+ */
+ public ChmParsingException(String description, Throwable cause) {
+ super(description, cause);
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
+import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * A container that contains chm block information such as: i. initial block is
+ * using to reset main tree ii. start block is using for knowing where to start
+ * iii. end block is using for knowing where to stop iv. start offset is using
+ * for knowing where to start reading v. end offset is using for knowing where
+ * to stop reading
+ *
+ */
+public class ChmBlockInfo {
+ /* class members */
+ private int iniBlock;
+ private int startBlock;
+ private int endBlock;
+ private int startOffset;
+ private int endOffset;
+
+ /* instance created by the most recent successful factory call */
+ private static ChmBlockInfo chmBlockInfo = null;
+
+ private ChmBlockInfo() {
+
+ }
+
+ /**
+ * Fills the given chmBlockInfo with the block information of a directory
+ * listing entry and returns it.
+ *
+ * @param dle
+ * - DirectoryListingEntry, must not be null
+ * @param bytesPerBlock
+ * - int, = chmLzxcResetTable.block_length, must be positive
+ * @param clcd
+ * - ChmLzxcControlData, must not be null
+ * @param chmBlockInfo
+ * - ChmBlockInfo to fill, must not be null
+ *
+ * @return the chmBlockInfo that was passed in
+ *
+ * @throws ChmParsingException
+ * if a parameter is invalid or the reset interval is zero
+ */
+ protected ChmBlockInfo getChmBlockInfo(DirectoryListingEntry dle,
+ int bytesPerBlock, ChmLzxcControlData clcd,
+ ChmBlockInfo chmBlockInfo) {
+ if (!validateParameters(dle, bytesPerBlock, clcd, chmBlockInfo))
+ throw new ChmParsingException("Please check you parameters");
+
+ populate(chmBlockInfo, dle, bytesPerBlock, clcd);
+ return chmBlockInfo;
+ }
+
+ /**
+ * Creates a new ChmBlockInfo for a directory listing entry. The created
+ * instance is also stored in a static field that can be read back with
+ * {@link #getChmBlockInfo()}; it is stored only after it has been filled
+ * completely.
+ *
+ * @param dle
+ * - DirectoryListingEntry, must not be null
+ * @param bytesPerBlock
+ * - int, = chmLzxcResetTable.block_length, must be positive
+ * @param clcd
+ * - ChmLzxcControlData, must not be null
+ *
+ * @return the newly created ChmBlockInfo
+ *
+ * @throws ChmParsingException
+ * if a parameter is invalid or the reset interval is zero
+ */
+ public static ChmBlockInfo getChmBlockInfoInstance(
+ DirectoryListingEntry dle, int bytesPerBlock,
+ ChmLzxcControlData clcd) {
+ ChmBlockInfo info = new ChmBlockInfo();
+ if (!validateParameters(dle, bytesPerBlock, clcd, info))
+ throw new ChmParsingException("Please check you parameters");
+
+ populate(info, dle, bytesPerBlock, clcd);
+ setChmBlockInfo(info);
+ return info;
+ }
+
+ /**
+ * Computes all five values for an entry and writes them into target.
+ * Parameters are expected to have passed validateParameters already.
+ */
+ private static void populate(ChmBlockInfo target,
+ DirectoryListingEntry dle, int bytesPerBlock,
+ ChmLzxcControlData clcd) {
+ // potential problem with casting long to int
+ int resetInterval = (int) clcd.getResetInterval();
+ if (resetInterval == 0)
+ throw new ChmParsingException("reset interval must not be zero");
+
+ // long arithmetic so that offset + length cannot overflow
+ long entryStart = dle.getOffset();
+ long entryEnd = entryStart + dle.getLength();
+
+ target.setStartBlock((int) (entryStart / bytesPerBlock));
+ target.setEndBlock((int) (entryEnd / bytesPerBlock));
+ target.setStartOffset((int) (entryStart % bytesPerBlock));
+ target.setEndOffset((int) (entryEnd % bytesPerBlock));
+ // round the start block down to a multiple of the reset interval; the
+ // previous form "(startBlock - startBlock) % resetInterval" was
+ // always 0
+ target.setIniBlock(target.startBlock - target.startBlock
+ % resetInterval);
+ }
+
+ /**
+ * Returns textual representation of ChmBlockInfo
+ */
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("iniBlock:=").append(getIniBlock()).append(", ");
+ sb.append("startBlock:=").append(getStartBlock()).append(", ");
+ sb.append("endBlock:=").append(getEndBlock()).append(", ");
+ sb.append("startOffset:=").append(getStartOffset()).append(", ");
+ sb.append("endOffset:=").append(getEndOffset())
+ .append(System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ /**
+ * Checks that no object parameter is null and that bytesPerBlock is
+ * positive.
+ */
+ private static boolean validateParameters(DirectoryListingEntry dle,
+ int bytesPerBlock, ChmLzxcControlData clcd,
+ ChmBlockInfo chmBlockInfo) {
+ return dle != null && bytesPerBlock > 0 && clcd != null
+ && chmBlockInfo != null;
+ }
+
+ /**
+ * Does nothing; kept so that existing references to it keep compiling.
+ */
+ public static void main(String[] args) {
+ }
+
+ /**
+ * Returns an initial block index
+ *
+ * @return int
+ */
+ public int getIniBlock() {
+ return iniBlock;
+ }
+
+ /**
+ * Sets the initial block index
+ *
+ * @param iniBlock
+ * - int
+ */
+ private void setIniBlock(int iniBlock) {
+ this.iniBlock = iniBlock;
+ }
+
+ /**
+ * Returns the start block index
+ *
+ * @return int
+ */
+ public int getStartBlock() {
+ return startBlock;
+ }
+
+ /**
+ * Sets the start block index
+ *
+ * @param startBlock
+ * - int
+ */
+ private void setStartBlock(int startBlock) {
+ this.startBlock = startBlock;
+ }
+
+ /**
+ * Returns the end block index
+ *
+ * @return - int
+ */
+ public int getEndBlock() {
+ return endBlock;
+ }
+
+ /**
+ * Sets the end block index
+ *
+ * @param endBlock
+ * - int
+ */
+ private void setEndBlock(int endBlock) {
+ this.endBlock = endBlock;
+ }
+
+ /**
+ * Returns the start offset index
+ *
+ * @return - int
+ */
+ public int getStartOffset() {
+ return startOffset;
+ }
+
+ /**
+ * Sets the start offset index
+ *
+ * @param startOffset
+ * - int
+ */
+ private void setStartOffset(int startOffset) {
+ this.startOffset = startOffset;
+ }
+
+ /**
+ * Returns the end offset index
+ *
+ * @return - int
+ */
+ public int getEndOffset() {
+ return endOffset;
+ }
+
+ /**
+ * Sets the end offset index
+ *
+ * @param endOffset
+ * - int
+ */
+ private void setEndOffset(int endOffset) {
+ this.endOffset = endOffset;
+ }
+
+ /**
+ * Replaces the statically stored ChmBlockInfo
+ *
+ * @param chmBlockInfo
+ * - ChmBlockInfo
+ */
+ public static void setChmBlockInfo(ChmBlockInfo chmBlockInfo) {
+ ChmBlockInfo.chmBlockInfo = chmBlockInfo;
+ }
+
+ /**
+ * Returns the statically stored ChmBlockInfo, i.e. the one created by the
+ * last successful getChmBlockInfoInstance call or passed to
+ * setChmBlockInfo
+ *
+ * @return ChmBlockInfo, null if none has been stored yet
+ */
+ public static ChmBlockInfo getChmBlockInfo() {
+ return chmBlockInfo;
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,906 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Decompresses a chm block. Depending on the chm block type, the most
+ * relevant decompression method is chosen. A chm block type can be one of the
+ * following:
+ * <ul>
+ * <li>UNDEFINED - no action taken, i.e. the block is skipped</li>
+ * <li>VERBATIM</li>
+ * <li>ALIGNED_OFFSET</li>
+ * <li>UNCOMPRESSED - the simplest one</li>
+ * </ul>
+ * In addition there are unknown types (4-7). Currently such a type is replaced
+ * by the type of the previous chm block. We need to invent a more appropriate
+ * way to handle such types.
+ */
+public class ChmLzxBlock {
+ private int block_number;
+ private long block_length;
+ private ChmLzxState state;
+ private byte[] content = null;
+ private ChmSection chmSection = null;
+ private int contentLength = 0;
+
+ // trying to find solution for bad blocks ...
+ private int previousBlockType = -1;
+
+ /**
+ * Builds a block and immediately tries to extract its content.
+ * ChmParsingExceptions raised on the way are swallowed, so a block whose
+ * extraction failed is still constructed. When the constructor
+ * parameters do not validate, a message is printed to System.err and
+ * nothing else is done.
+ *
+ * @param blockNumber
+ * - number of this block
+ * @param dataSegment
+ * - bytes to read from, unless the previous block's content is
+ * used instead (see below)
+ * @param blockLength
+ * - length of the block, also used to size the content
+ * @param prevBlock
+ * - preceding block, may be null
+ */
+ public ChmLzxBlock(int blockNumber, byte[] dataSegment, long blockLength,
+ ChmLzxBlock prevBlock) {
+ try {
+ if (!validateConstructorParams(blockNumber, dataSegment,
+ blockLength)) {
+ System.err.println("Check your chm lzx block parameters");
+ return;
+ }
+
+ setBlockNumber(blockNumber);
+
+ // read from the previous block's content when that block's state
+ // length exceeds its remaining count, from dataSegment otherwise
+ boolean usePrevContent = prevBlock != null
+ && prevBlock.getState().getBlockLength() > prevBlock
+ .getState().getBlockRemaining();
+ if (usePrevContent) {
+ setChmSection(new ChmSection(prevBlock.getContent()));
+ } else {
+ setChmSection(new ChmSection(dataSegment));
+ }
+
+ setBlockLength(blockLength);
+
+ // take over whatever is needed from the previous block
+ checkLzxBlock(prevBlock);
+ setContent((int) blockLength);
+ boolean resizeContent = prevBlock == null
+ || getContent().length < (int) getBlockLength();
+ if (resizeContent) {
+ setContent((int) getBlockLength());
+ }
+
+ // remembered so that unknown block types can fall back to it
+ if (prevBlock != null && prevBlock.getState() != null) {
+ previousBlockType = prevBlock.getState().getBlockType();
+ }
+
+ extractContent();
+ } catch (ChmParsingException e) {
+ // TODO: handle exception
+ }
+ }
+
+ /**
+ * Returns the content length counter of this block.
+ *
+ * @return the value of the contentLength field
+ */
+ protected int getContentLength() {
+ return contentLength;
+ }
+
+ /**
+ * Sets the content length counter of this block.
+ *
+ * @param contentLength
+ * - new value of the contentLength field
+ */
+ protected void setContentLength(int contentLength) {
+ this.contentLength = contentLength;
+ }
+
+ /**
+ * Returns the chm section this block reads from.
+ *
+ * @return the value of the chmSection field
+ */
+ private ChmSection getChmSection() {
+ return chmSection;
+ }
+
+ /**
+ * Sets the chm section this block reads from.
+ *
+ * @param chmSection
+ * - new value of the chmSection field
+ */
+ private void setChmSection(ChmSection chmSection) {
+ this.chmSection = chmSection;
+ }
+
+ /**
+ * Guard used before the state is touched.
+ *
+ * @throws ChmParsingException
+ * when getState() yields null
+ */
+ private void assertStateNotNull() {
+ boolean stateMissing = (null == getState());
+ if (stateMissing) {
+ throw new ChmParsingException("state is null");
+ }
+ }
+
+ /**
+ * Fills the content of this block. The loop runs until the content
+ * length reaches the block length. Each time the state's remaining
+ * counter is zero a new lzx block header is read from the chm section (3
+ * bits block type, then 16 + 8 bits block length) and the tables needed
+ * by that block type are built. After that the decompress method
+ * matching the block type is invoked. Nothing is done when the chm
+ * section has no data.
+ *
+ * @throws ChmParsingException
+ * if the state is null
+ */
+ private void extractContent() {
+ assertStateNotNull();
+ if (getChmSection().getData() != null) {
+ while (getContentLength() < getBlockLength()) {// && tempStopLoop
+ if (getState() != null && getState().getBlockRemaining() == 0) {
+ // only while decoding has not started yet: one flag bit which,
+ // when set, is followed by two 16 bit reads combined into the
+ // intel file size (a negative result is replaced by 0)
+ if (getState().getHadStarted() == LzxState.NOT_STARTED_DECODING) {
+ getState().setHadStarted(LzxState.STARTED_DECODING);
+ if (getChmSection().getSyncBits(1) == 1) {
+ int intelSizeTemp = (getChmSection()
+ .getSyncBits(16) << 16)
+ + getChmSection().getSyncBits(16);
+ if (intelSizeTemp >= 0)
+ getState().setIntelFileSize(intelSizeTemp);
+ else
+ getState().setIntelFileSize(0);
+ }
+ }
+ // block header: type, then length; the whole length is still
+ // remaining at this point
+ getState().setBlockType(getChmSection().getSyncBits(3));
+ getState().setBlockLength(
+ (getChmSection().getSyncBits(16) << 8)
+ + getChmSection().getSyncBits(8));
+ getState().setBlockRemaining(getState().getBlockLength());
+
+ // ----------------------------------------
+ // Trying to handle 3 - 7 block types
+ // ----------------------------------------
+ // (the check below remaps types greater than 3 only, and only
+ // when the previous block type is 0, 1 or 2)
+ if (getState().getBlockType() > 3) {
+ if (previousBlockType >= 0 && previousBlockType < 3)
+ getState().setBlockType(previousBlockType);
+ }
+
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ createAlignedTreeTable();
+ // no break here: continues into the VERBATIM case, so the main
+ // and length tree tables are built for aligned blocks as well
+ case ChmCommons.VERBATIM:
+ /* Creates mainTreeTable */
+ createMainTreeTable();
+ createLengthTreeTable();
+ if (getState().getMainTreeLengtsTable()[0xe8] != 0)
+ getState().setIntelState(IntelState.STARTED);
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ getState().setIntelState(IntelState.STARTED);
+ if (getChmSection().getTotal() > 16)
+ getChmSection().setSwath(
+ getChmSection().getSwath() - 1);
+ // R0, R1 and R2 are each read as 4 bytes whose order is
+ // reversed before they are turned into a number
+ getState().setR0(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR1(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ getState().setR2(
+ (new BigInteger(getChmSection()
+ .reverseByteOrder(
+ getChmSection().unmarshalBytes(
+ 4))).longValue()));
+ break;
+ default:
+ break;
+ }
+ }
+
+ // tempLen is the content index handed to the decompress methods:
+ // the block length when content + remaining would exceed it (the
+ // excess stays in the remaining counter), content + remaining
+ // otherwise (the remaining counter is then set to 0)
+ int tempLen;
+
+ if (getContentLength() + getState().getBlockRemaining() > getBlockLength()) {
+ getState().setBlockRemaining(
+ getContentLength() + getState().getBlockRemaining()
+ - (int) getBlockLength());
+ tempLen = (int) getBlockLength();
+ } else {
+ tempLen = getContentLength()
+ + getState().getBlockRemaining();
+ getState().setBlockRemaining(0);
+ }
+
+ // NOTE(review): there is no default case, so a block type other
+ // than the three below decodes nothing in this pass - confirm
+ // that this cannot stall the enclosing loop
+ switch (getState().getBlockType()) {
+ case ChmCommons.ALIGNED_OFFSET:
+ // if(prevblock.lzxState.length>prevblock.lzxState.remaining)
+ decompressAlignedBlock(tempLen, getChmSection().getData());// prevcontext
+ break;
+ case ChmCommons.VERBATIM:
+ decompressVerbatimBlock(tempLen, getChmSection().getData());
+ break;
+ case ChmCommons.UNCOMPRESSED:
+ decompressUncompressedBlock(tempLen, getChmSection()
+ .getData());
+ break;
+ }
+ getState().increaseFramesRead();
+ // intel decoding runs only for the first 32767 frames and only
+ // when an intel file size other than 0 was read
+ if ((getState().getFramesRead() < 32768)
+ && getState().getIntelFileSize() != 0)
+ intelE8Decoding();
+ }
+ }
+ }
+
+ /**
+ * Post-processes the content after a frame was decoded. In every case the
+ * block length is subtracted from the state's remaining counter. When
+ * the block length is greater than LZX_PRETREE_TABLEBITS and the intel
+ * state is not NOT_STARTED, the content is additionally scanned for 0xe8
+ * bytes whose following 32 bit value is to be rewritten.
+ *
+ * NOTE(review): content[i] is a signed byte (-128..127) while 0xe8 is the
+ * int 232, so "content[i] != 0xe8" is always true and the rewriting code
+ * after the check is never reached. Before enabling it (e.g. by comparing
+ * against (byte) 0xe8) the rest needs checking as well: the four bytes
+ * are taken from i + 0 .. i + 3, i.e. starting at the 0xe8 byte itself,
+ * i advances by 4 while curpos advances by 5, and curpos is not advanced
+ * for bytes that are skipped - presumably the operand should be the four
+ * bytes after the 0xe8 byte; verify against an lzx reference decoder.
+ */
+ protected void intelE8Decoding() {
+ if (getBlockLength() <= ChmConstants.LZX_PRETREE_TABLEBITS
+ || (getState().getIntelState() == IntelState.NOT_STARTED)) {
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ } else {
+ // curpos starts from the remaining counter as it was before the
+ // adjustment below
+ long curpos = getState().getBlockRemaining();
+ getState().setBlockRemaining(
+ getState().getBlockRemaining() - (int) getBlockLength());
+ int i = 0;
+ // the last 10 bytes of the block are never inspected
+ while (i < getBlockLength() - 10) {
+ if (content[i] != 0xe8) {
+ i++;
+ continue;
+ }
+ // bytes are stored in reverse order so that BigInteger, which
+ // expects big-endian input, reads them as a little-endian value
+ byte[] b = new byte[4];
+ b[0] = getContent()[i + 3];
+ b[1] = getContent()[i + 2];
+ b[2] = getContent()[i + 1];
+ b[3] = getContent()[i + 0];
+ long absoff = (new BigInteger(b)).longValue();
+ if ((absoff >= -curpos)
+ && (absoff < getState().getIntelFileSize())) {
+ long reloff = (absoff >= 0) ? absoff - curpos : absoff
+ + getState().getIntelFileSize();
+ // written back low byte first
+ getContent()[i + 0] = (byte) reloff;
+ getContent()[i + 1] = (byte) (reloff >>> 8);
+ getContent()[i + 2] = (byte) (reloff >>> 16);
+ getContent()[i + 3] = (byte) (reloff >>> 24);
+ }
+ i += 4;
+ curpos += 5;
+ }
+ }
+ }
+
+ private short[] createPreLenTable() {
+ short[] tmp = new short[ChmConstants.LZX_PRETREE_MAXSYMBOLS];
+ for (int i = 0; i < ChmConstants.LZX_PRETREE_MAXSYMBOLS; i++) {
+ tmp[i] = (short) getChmSection().getSyncBits(
+ ChmConstants.LZX_PRETREE_NUM_ELEMENTS_BITS);
+ }
+ return tmp;
+ }
+
+ /**
+ * Builds the length tree table of the state. The steps are: read the
+ * pre-tree lengths, build the pre-tree decode table from them, decode the
+ * length tree lengths with the help of that table and finally build the
+ * length tree decode table, which is stored in the state.
+ *
+ * @throws ChmParsingException
+ * if the pre-tree lengths or the pre-tree table are null
+ */
+ private void createLengthTreeTable() {
+ short[] prelentable = createPreLenTable();
+
+ if (prelentable == null) {
+ // message used to name pretreetable, which is checked further down
+ throw new ChmParsingException("prelentable is null");
+ }
+
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+
+ if (pretreetable == null) {
+ throw new ChmParsingException("pretreetable is null");
+ }
+
+ createLengthTreeLenTable(0, ChmConstants.LZX_NUM_SECONDARY_LENGTHS,
+ pretreetable, prelentable);
+
+ getState().setLengthTreeTable(
+ createTreeTable2(getState().getLengthTreeLengtsTable(),
+ (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+ + (ChmConstants.LZX_LENGTH_MAXSYMBOLS << 1),
+ ChmConstants.LZX_MAINTREE_TABLEBITS,
+ ChmConstants.LZX_NUM_SECONDARY_LENGTHS));
+ }
+
+ /**
+ * Copies bytes one by one from the chm section into the content,
+ * starting at the current content length. When content length plus the
+ * state's remaining counter fits into the block length, that many bytes
+ * are copied and the remaining counter is cleared; otherwise the content
+ * is filled up to the block length and the remaining counter is set to
+ * the number of bytes that were still free in the content.
+ *
+ * @param len
+ * - not used by this method
+ * @param prevcontent
+ * - not used by this method
+ */
+ public void decompressUncompressedBlock(int len, byte[] prevcontent) {
+ final int from = getContentLength();
+ final int pending = getState().getBlockRemaining();
+ final long blockLen = getBlockLength();
+ final boolean fits = from + pending <= blockLen;
+
+ // exclusive end index of the copy
+ final long to = fits ? from + pending : blockLen;
+ for (int pos = from; pos < to; pos++) {
+ content[pos] = getChmSection().getByte();
+ }
+
+ if (fits) {
+ setContentLength(from + pending);
+ getState().setBlockRemaining(0);
+ return;
+ }
+ getState().setBlockRemaining((int) blockLen - from);
+ setContentLength((int) blockLen);
+ }
+
+ /**
+ * Decodes an ALIGNED_OFFSET block into the content, from the current
+ * content length up to (excluding) len. Each decoded main tree symbol is
+ * either a literal byte (symbol below LZX_NUM_CHARS) or a match made of
+ * a length and an offset; a match copies bytes that were produced
+ * earlier. When a match reaches back before index 0 of the content, the
+ * bytes are taken from the end of prevcontent. The content length is set
+ * to len at the end, also when the loop was left early.
+ *
+ * @param len
+ * - content index to decode up to (exclusive)
+ * @param prevcontent
+ * - source for matches that start before the content
+ *
+ * @throws ChmParsingException
+ * if the chm section, the state or the main tree table is null
+ */
+ public void decompressAlignedBlock(int len, byte[] prevcontent) {
+
+ if ((getChmSection() == null) || (getState() == null)
+ || (getState().getMainTreeTable() == null))
+ throw new ChmParsingException("chm section is null");
+
+ short s;
+ int x, i, border;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ /* new code */
+ // stop decoding when the next table index would be out of range
+ border = getChmSection().getDesyncBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS, 0);
+ if (border >= getState().mainTreeTable.length)
+ break;
+ /* end new code */
+ s = getState().mainTreeTable[getChmSection().getDesyncBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS, 0)];
+ // values at or above the element count are not symbols yet: keep
+ // following the table one checked bit at a time
+ if (s >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().mainTreeTable[s]) >= getState()
+ .getMainTreeElements());
+ }
+ // NOTE(review): the bit count is taken from mainTreeTable here,
+ // decompressVerbatimBlock takes it from getMainTreeLengtsTable()
+ // - confirm which one is intended
+ getChmSection().getSyncBits(getState().mainTreeTable[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ content[i] = (byte) s;
+ } else {
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ // a primary length equal to LZX_NUM_PRIMARY_LENGTHS means that
+ // a footer from the length tree has to be added
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ matchfooter = getState().lengthTreeTable[getChmSection()
+ .getDesyncBits(ChmConstants.LZX_MAINTREE_TABLEBITS,
+ 0)];
+ // NOTE(review): compared against LZX_MAINTREE_TABLEBITS while
+ // the loop below and decompressVerbatimBlock compare against
+ // LZX_NUM_SECONDARY_LENGTHS - confirm
+ if (matchfooter >= ChmConstants.LZX_MAINTREE_TABLEBITS) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().lengthTreeTable[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().lengthTreeLengtsTable[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ matchoffset = s >>> 3;
+ // slots 0, 1 and 2 reuse the offsets kept in R0, R1 and R2; any
+ // other slot computes a new offset and shifts it into R0..R2
+ if (matchoffset > 2) {
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ matchoffset = (ChmConstants.POSITION_BASE[matchoffset] - 2);
+ if (extra > 3) {
+ // upper bits are read directly, the lowest three come
+ // from the aligned tree
+ extra -= 3;
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset += (l << 3);
+ int g = getChmSection().getDesyncBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS, 0);
+ int t = getState().getAlignedTreeTable()[g];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedTreeTable()[t]);
+ matchoffset += t;
+ } else if (extra == 3) {
+ // all three extra bits come from the aligned tree
+ int g = (int) getChmSection().getDesyncBits(
+ ChmConstants.LZX_NUM_PRIMARY_LENGTHS, 0);
+ int t = getState().getAlignedTreeTable()[g];
+ if (t >= getState().getMainTreeElements()) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ t <<= 1;
+ t += getChmSection().checkBit(x);
+ } while ((t = getState().getAlignedTreeTable()[t]) >= getState()
+ .getMainTreeElements());
+ }
+ getChmSection().getSyncBits(
+ getState().getAlignedTreeTable()[t]);
+ matchoffset += t;
+ } else if (extra > 0) {
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset += l;
+ } else
+ matchoffset = 1;
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /** match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ // a match that would end beyond len is not copied at all
+ if (i > len)
+ break;
+
+ if (runsrc < 0) {
+ if (matchlen + runsrc <= 0) {
+ // the whole match lies in prevcontent
+ runsrc = prevcontent.length + runsrc;
+ while (matchlen-- > 0)
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ // the match starts in prevcontent and continues at the
+ // beginning of content
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrappes around source data */
+ // runsrc is not negative in this branch, so the loop below
+ // does not execute
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ /**
+ * Guard for short arrays.
+ *
+ * @param array
+ * - array to check
+ *
+ * @throws ChmParsingException
+ * when array is null
+ */
+ private void assertShortArrayNotNull(short[] array) {
+ if (array != null) {
+ return;
+ }
+ throw new ChmParsingException("short[] is null");
+ }
+
+ /**
+ * Decodes a VERBATIM block into the content, from the current content
+ * length up to (excluding) len. Each decoded main tree symbol is either
+ * a literal byte (symbol below LZX_NUM_CHARS) or a match made of a
+ * length and an offset; a match copies bytes that were produced earlier.
+ * When a match reaches back before index 0 of the content, the bytes are
+ * taken from the end of prevcontent. The content length is set to len at
+ * the end, also when the loop was left early.
+ *
+ * @param len
+ * - content index to decode up to (exclusive)
+ * @param prevcontent
+ * - source for matches that start before the content
+ *
+ * @throws ChmParsingException
+ * if the main tree table of the state is null
+ */
+ private void decompressVerbatimBlock(int len, byte[] prevcontent) {
+ short s;
+ int x, i;
+ int matchlen = 0, matchfooter = 0, extra, rundest, runsrc;
+ int matchoffset = 0;
+ for (i = getContentLength(); i < len; i++) {
+ int f = (int) getChmSection().getDesyncBits(
+ ChmConstants.LZX_MAINTREE_TABLEBITS, 0);
+ assertShortArrayNotNull(getState().getMainTreeTable());
+ s = getState().getMainTreeTable()[f];
+ // values at or above LZX_MAIN_MAXSYMBOLS are not symbols yet: keep
+ // following the table one checked bit at a time
+ if (s >= ChmConstants.LZX_MAIN_MAXSYMBOLS) {
+ x = ChmConstants.LZX_MAINTREE_TABLEBITS;
+ do {
+ x++;
+ s <<= 1;
+ s += getChmSection().checkBit(x);
+ } while ((s = getState().getMainTreeTable()[s]) >= ChmConstants.LZX_MAIN_MAXSYMBOLS);
+ }
+ // the number of bits is the code length of the decoded symbol
+ getChmSection().getSyncBits(getState().getMainTreeLengtsTable()[s]);
+ if (s < ChmConstants.LZX_NUM_CHARS) {
+ content[i] = (byte) s;
+ } else {
+ s -= ChmConstants.LZX_NUM_CHARS;
+ matchlen = s & ChmConstants.LZX_NUM_PRIMARY_LENGTHS;
+ // a primary length equal to LZX_NUM_PRIMARY_LENGTHS means that
+ // a footer from the length tree has to be added
+ if (matchlen == ChmConstants.LZX_NUM_PRIMARY_LENGTHS) {
+ matchfooter = getState().getLengthTreeTable()[(int) getChmSection()
+ .getDesyncBits(ChmConstants.LZX_LENGTH_TABLEBITS, 0)];
+ if (matchfooter >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS) {
+ x = ChmConstants.LZX_LENGTH_TABLEBITS;
+ do {
+ x++;
+ matchfooter <<= 1;
+ matchfooter += getChmSection().checkBit(x);
+ } while ((matchfooter = getState().getLengthTreeTable()[matchfooter]) >= ChmConstants.LZX_NUM_SECONDARY_LENGTHS);
+ }
+ getChmSection().getSyncBits(
+ getState().getLengthTreeLengtsTable()[matchfooter]);
+ matchlen += matchfooter;
+ }
+ matchlen += ChmConstants.LZX_MIN_MATCH;
+ // shorter than 2
+ matchoffset = s >>> 3;
+ // slots 0, 1 and 2 reuse the offsets kept in R0, R1 and R2; any
+ // other slot computes a new offset and shifts it into R0..R2
+ if (matchoffset > 2) {
+ if (matchoffset != 3) { // should get other bits to retrieve
+ // offset
+ extra = ChmConstants.EXTRA_BITS[matchoffset];
+ long l = getChmSection().getSyncBits(extra);
+ matchoffset = (int) (ChmConstants.POSITION_BASE[matchoffset] - 2 + l);
+ } else {
+ matchoffset = 1;
+ }
+ getState().setR2(getState().getR1());
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else if (matchoffset == 0) {
+ matchoffset = (int) getState().getR0();
+ } else if (matchoffset == 1) {
+ matchoffset = (int) getState().getR1();
+ getState().setR1(getState().getR0());
+ getState().setR0(matchoffset);
+ } else /* match_offset == 2 */
+ {
+ matchoffset = (int) getState().getR2();
+ getState().setR2(getState().getR0());
+ getState().setR0(matchoffset);
+ }
+ rundest = i;
+ runsrc = rundest - matchoffset;
+ i += (matchlen - 1);
+ // a match that would end beyond len is not copied at all
+ if (i > len)
+ break;
+ if (runsrc < 0) {
+ if (matchlen + runsrc <= 0) {
+ // the whole match lies in prevcontent
+ // NOTE(review): prevcontent.length is read before the
+ // "prevcontent != null" test in the loop condition, and
+ // runsrc is bounds-checked against content.length although
+ // it indexes prevcontent - confirm
+ runsrc = prevcontent.length + runsrc;
+ while ((matchlen-- > 0) && (prevcontent != null)
+ && ((runsrc + 1) > 0))
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = prevcontent[runsrc++];
+ } else {
+ // the match starts in prevcontent and continues at the
+ // beginning of content
+ // NOTE(review): runsrc only advances when the bounds
+ // check holds; if it fails, nothing in the loop below
+ // changes and the loop cannot terminate - confirm
+ runsrc = prevcontent.length + runsrc;
+ while (runsrc < prevcontent.length)
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = prevcontent[runsrc++];
+ matchlen = matchlen + runsrc - prevcontent.length;
+ runsrc = 0;
+ while (matchlen-- > 0)
+ content[rundest++] = content[runsrc++];
+ }
+
+ } else {
+ /* copies any wrapped source data */
+ // runsrc is not negative in this branch, so the loop below
+ // does not execute
+ while ((runsrc < 0) && (matchlen-- > 0)) {
+ content[rundest++] = content[(int) (runsrc + getBlockLength())];
+ runsrc++;
+ }
+ /* copies match data - no worries about destination wraps */
+ while (matchlen-- > 0) {
+ if ((rundest < content.length)
+ && (runsrc < content.length))
+ content[rundest++] = content[runsrc++];
+ }
+ }
+ }
+ }
+ setContentLength(len);
+ }
+
+ /**
+ * Decodes the entries offset .. tablelen - 1 of the state's length tree
+ * lengths table. Every step decodes one pre-tree symbol z:
+ * <ul>
+ * <li>z below 17: the entry becomes (old entry - z), plus 17 when that is
+ * negative</li>
+ * <li>z = 17: a run of 4 + (4 bit value) zero entries</li>
+ * <li>z = 18: a run of 20 + (5 bit value) zero entries</li>
+ * <li>z = 19: a run of 4 + (1 bit value) entries, all set to a value
+ * derived from one more decoded symbol</li>
+ * </ul>
+ *
+ * @param offset
+ * - index of the first entry to decode
+ * @param tablelen
+ * - index at which decoding stops (exclusive)
+ * @param pretreetable
+ * - pre-tree decode table
+ * @param prelentable
+ * - pre-tree code lengths
+ *
+ * @throws ChmParsingException
+ * if one of the tables or the chm section is null
+ */
+ private void createLengthTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) {
+ // (prelentable is tested twice in the condition below)
+ if (prelentable == null || getChmSection() == null
+ || pretreetable == null || prelentable == null)
+ throw new ChmParsingException("is null");
+
+ int i = offset; // represents offset
+ int z, y, x;// local counters
+ while (i < tablelen) {
+ z = pretreetable[(int) getChmSection().getDesyncBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 1 bug, should be
+ // 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS);
+ }
+ // the number of bits is the code length of the decoded symbol
+ getChmSection().getSyncBits(prelentable[z]);
+ if (z < 17) {
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().getLengthTreeLengtsTable()[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ y = (int) getChmSection().getSyncBits(4);
+ y += 4;
+ // entries past the end of the table are silently dropped
+ for (int j = 0; j < y; j++)
+ if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 18) {
+ y = (int) getChmSection().getSyncBits(5);
+ y += 20;
+ // entries past the end of the table are silently dropped
+ for (int j = 0; j < y; j++)
+ if (i < getState().getLengthTreeLengtsTable().length)
+ getState().getLengthTreeLengtsTable()[i++] = 0;
+ } else if (z == 19) {
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[(int) getChmSection().getDesyncBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
+ if (z >= ChmConstants.LZX_PRETREE_NUM_ELEMENTS) {// 20
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;// 6
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ // NOTE(review): this loop compares against
+ // LZX_MAINTREE_TABLEBITS, the equivalent loop above and the
+ // enclosing if compare against LZX_PRETREE_NUM_ELEMENTS -
+ // confirm
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_MAINTREE_TABLEBITS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().getLengthTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ // NOTE(review): unlike the runs for 17 and 18 there is no
+ // bounds check on i here - confirm
+ for (int j = 0; j < y; j++)
+ getState().getLengthTreeLengtsTable()[i++] = (short) z;
+ }
+ }
+ }
+
+ private void createMainTreeTable() {
+ short[] prelentable = createPreLenTable();
+ short[] pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ createMainTreeLenTable(0, ChmConstants.LZX_NUM_CHARS, pretreetable,
+ prelentable);
+ prelentable = createPreLenTable();
+ pretreetable = createTreeTable2(prelentable,
+ (1 << ChmConstants.LZX_PRETREE_TABLEBITS)
+ + (ChmConstants.LZX_PRETREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_PRETREE_TABLEBITS,
+ ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ createMainTreeLenTable(ChmConstants.LZX_NUM_CHARS,
+ getState().mainTreeLengtsTable.length, pretreetable,
+ prelentable);
+
+ getState().setMainTreeTable(
+ createTreeTable2(getState().mainTreeLengtsTable,
+ (1 << ChmConstants.LZX_MAINTREE_TABLEBITS)
+ + (ChmConstants.LZX_MAINTREE_MAXSYMBOLS << 1),
+ ChmConstants.LZX_MAINTREE_TABLEBITS, getState()
+ .getMainTreeElements()));
+
+ }
+
+ private void createMainTreeLenTable(int offset, int tablelen,
+ short[] pretreetable, short[] prelentable) {
+ if (pretreetable == null)
+ throw new ChmParsingException("pretreetable is null");
+ int i = offset;
+ int z, y, x;
+ while (i < tablelen) {
+ int f = getChmSection().getDesyncBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS, 0);
+ z = pretreetable[f];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ if (z < 17) {
+ z = getState().getMainTreeLengtsTable()[i] - z;
+ if (z < 0)
+ z = z + 17;
+ getState().mainTreeLengtsTable[i] = (short) z;
+ i++;
+ } else if (z == 17) {
+ y = getChmSection().getSyncBits(4);
+ y += 4;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 18) {
+ y = getChmSection().getSyncBits(5);
+ y += 20;
+ for (int j = 0; j < y; j++) {
+ assertInRange(getState().getMainTreeLengtsTable(), i);
+ getState().mainTreeLengtsTable[i++] = 0;
+ }
+ } else if (z == 19) {
+ y = getChmSection().getSyncBits(1);
+ y += 4;
+ z = pretreetable[getChmSection().getDesyncBits(
+ ChmConstants.LZX_PRETREE_TABLEBITS, 0)];
+ if (z >= ChmConstants.LZX_PRETREE_MAXSYMBOLS) {
+ x = ChmConstants.LZX_PRETREE_TABLEBITS;
+ do {
+ x++;
+ z <<= 1;
+ z += getChmSection().checkBit(x);
+ } while ((z = pretreetable[z]) >= ChmConstants.LZX_PRETREE_MAXSYMBOLS);
+ }
+ getChmSection().getSyncBits(prelentable[z]);
+ z = getState().mainTreeLengtsTable[i] - z;
+ if (z < 0)
+ z = z + 17;
+ for (int j = 0; j < y; j++)
+ if (i < getState().getMainTreeLengtsTable().length)
+ getState().mainTreeLengtsTable[i++] = (short) z;
+ }
+ }
+ }
+
+ private void assertInRange(short[] array, int index) {
+ if (index >= array.length)
+ throw new ChmParsingException(index + " is bigger than "
+ + array.length);
+ }
+
+ /**
+  * Reads the code lengths of the aligned tree from the section, one value
+  * per symbol.
+  * <p>
+  * The table holds {@code LZX_ALIGNED_MAXSYMBOLS} entries, the same symbol
+  * count that is used when the aligned decode table is built from it.
+  *
+  * @return a new table with one code length per aligned-tree symbol
+  */
+ private short[] createAlignedLenTable() {
+     // One length per aligned-tree symbol; sizing the table with a
+     // block-type constant left the remaining symbols unread.
+     int tablelen = ChmConstants.LZX_ALIGNED_MAXSYMBOLS;
+     // NOTE(review): the block-type constant doubles as the bit width of
+     // each length here; the LZX format stores these lengths in 3 bits, so
+     // confirm the constant's value and consider a dedicated constant.
+     int bits = ChmConstants.LZX_BLOCKTYPE_UNCOMPRESSED;
+     short[] tmp = new short[tablelen];
+     for (int i = 0; i < tablelen; i++) {
+         tmp[i] = (short) getChmSection().getSyncBits(bits);
+     }
+     return tmp;
+ }
+
+ /**
+  * Reads the aligned-tree code lengths into the state and stores the
+  * decode table built from them as the state's aligned tree table.
+  */
+ private void createAlignedTreeTable() {
+     getState().setAlignedLenTable(createAlignedLenTable());
+     // The decode table goes into the tree-table slot; storing it through
+     // setAlignedLenTable would overwrite the lengths that were just read
+     // and leave the aligned tree table unset.
+     getState().setAlignedTreeTable(
+             createTreeTable2(getState().getAlignedLenTable(),
+                     (1 << ChmConstants.LZX_NUM_PRIMARY_LENGTHS)
+                             + (ChmConstants.LZX_ALIGNED_MAXSYMBOLS << 1),
+                     ChmConstants.LZX_NUM_PRIMARY_LENGTHS,
+                     ChmConstants.LZX_ALIGNED_MAXSYMBOLS));
+ }
+
+ private short[] createTreeTable2(short[] lentable, int tablelen, int bits,
+ int maxsymbol) {
+ short[] tmp = new short[tablelen];
+ short sym;
+ int leaf;
+ int bit_num = 1;
+ long fill;
+ int pos = 0;
+ /* the current position in the decode table */
+ long table_mask = (1 << bits);
+ long bit_mask = (table_mask >> 1);
+ long next_symbol = bit_mask;
+
+ /* fills entries for short codes for a direct mapping */
+ while (bit_num <= bits) {
+ for (sym = 0; sym < maxsymbol; sym++) {
+ if (lentable.length > sym && lentable[sym] == bit_num) {
+ leaf = pos;// pos=0
+
+ if ((pos += bit_mask) > table_mask)
+ return null;
+
+ fill = bit_mask;
+ while (fill-- > 0)
+ tmp[leaf++] = sym;
+ }
+ }
+ bit_mask >>= 1;
+ bit_num++;
+ }
+
+ /* if there are any codes longer than nbits */
+ if (pos != table_mask) {
+ /* clears the remainder of the table */
+ for (leaf = pos; leaf < table_mask; leaf++)
+ tmp[leaf] = 0;
+
+ /* gives ourselves room for codes to grow by up to 16 more bits */
+ pos <<= 16;
+ table_mask <<= 16;
+ bit_mask = 1 << 15;
+
+ while (bit_num <= 16) {
+ for (sym = 0; sym < maxsymbol; sym++) {
+ if ((lentable.length > sym) && (lentable[sym] == bit_num)) {
+ leaf = pos >> 16;
+ for (fill = 0; fill < bit_num - bits; fill++) {
+ /*
+ * if this path hasn't been taken yet, 'allocate'
+ * two entries
+ */
+ if (tmp[leaf] == 0) {
+ if (((next_symbol << 1) + 1) < tmp.length) {
+ tmp[(int) (next_symbol << 1)] = 0;
+ tmp[(int) (next_symbol << 1) + 1] = 0;
+ tmp[leaf] = (short) next_symbol++;
+ }
+
+ }
+ /*
+ * follows the path and select either left or right
+ * for next bit
+ */
+ leaf = tmp[leaf] << 1;
+ if (((pos >> (15 - fill)) & 1) != 0)
+ leaf++;
+ }
+ tmp[leaf] = sym;
+
+ if ((pos += bit_mask) > table_mask)
+ return null;
+ /* table overflow */
+ } else {
+ // return null;
+ }
+ }
+ bit_mask >>= 1;
+ bit_num++;
+ }
+ }
+
+ /* is it full table? */
+ if (pos == table_mask)
+ return tmp;
+
+ return tmp;
+ }
+
+ /**
+  * Returns the content buffer of this block; the array itself is returned,
+  * not a copy.
+  *
+  * @return the content buffer
+  */
+ public byte[] getContent() {
+     return this.content;
+ }
+
+ /**
+  * Returns a copy of the content between two offsets.
+  *
+  * @param startOffset index of the first byte to copy
+  * @param endOffset index one past the last byte to copy
+  * @return the copied range, or a one-byte array when there is no content
+  */
+ public byte[] getContent(int startOffset, int endOffset) {
+     final byte[] source = getContent();
+     if (source == null) {
+         return new byte[1];
+     }
+     return Arrays.copyOfRange(source, startOffset, endOffset);
+ }
+
+ /**
+  * Returns a copy of the content from {@code start} to the end of the
+  * buffer.
+  *
+  * @param start index of the first byte to copy
+  * @return the tail of the content, or a one-byte array when there is no
+  *         content
+  * @throws ArrayIndexOutOfBoundsException if {@code start} is negative
+  * @throws IllegalArgumentException if {@code start} is beyond the end of
+  *         the content
+  */
+ public byte[] getContent(int start) {
+     final byte[] source = getContent();
+     if (source == null) {
+         return new byte[1];
+     }
+     // The upper bound is the end of the buffer. Using length + start made
+     // copyOfRange pad the result with 'start' trailing zero bytes.
+     return Arrays.copyOfRange(source, start, source.length);
+ }
+
+ private void setContent(int contentLength) {
+ this.content = new byte[contentLength];
+ }
+
+ /**
+  * Chooses the decoder state for this block: the previous block's state
+  * object is reused when there is a previous block, otherwise a new state
+  * is created from this block's length.
+  *
+  * @param chmPrevLzxBlock the preceding block, or {@code null} if none
+  * @throws ChmParsingException if there is no previous block and the block
+  *         length does not fit the {@code int} the state constructor takes
+  */
+ private void checkLzxBlock(ChmLzxBlock chmPrevLzxBlock) {
+     if (chmPrevLzxBlock != null) {
+         setState(chmPrevLzxBlock.getState());
+         return;
+     }
+     // Without a previous block there is no state to fall back on, so an
+     // over-long block is reported instead of dereferencing null.
+     if (getBlockLength() >= Integer.MAX_VALUE)
+         throw new ChmParsingException("block length " + getBlockLength()
+                 + " is too big and there is no previous block");
+     setState(new ChmLzxState((int) getBlockLength()));
+ }
+
+ /**
+  * Checks the constructor arguments, failing on the first invalid one.
+  *
+  * @param blockNumber must not be negative
+  * @param dataSegment must be non-null and non-empty
+  * @param blockLength must be greater than zero
+  * @return {@code true} whenever the method returns normally
+  * @throws ChmParsingException if any argument is invalid
+  */
+ private boolean validateConstructorParams(int blockNumber,
+         byte[] dataSegment, long blockLength) {
+     if (blockNumber < 0) {
+         throw new ChmParsingException("block number should be possitive");
+     }
+     if (dataSegment == null || dataSegment.length == 0) {
+         throw new ChmParsingException("data segment should not be null");
+     }
+     if (blockLength <= 0) {
+         throw new ChmParsingException("block length should be more than zero");
+     }
+     return true;
+ }
+
+ /** Returns the number of this block. */
+ public int getBlockNumber() {
+     return this.block_number;
+ }
+
+ /** Stores the number of this block. */
+ private void setBlockNumber(int block_number) {
+     this.block_number = block_number;
+ }
+
+ /** Returns the length of this block. */
+ private long getBlockLength() {
+     return this.block_length;
+ }
+
+ /** Stores the length of this block. */
+ private void setBlockLength(long block_length) {
+     this.block_length = block_length;
+ }
+
+ /** Returns the decoder state used by this block. */
+ public ChmLzxState getState() {
+     return this.state;
+ }
+
+ /** Stores the decoder state used by this block. */
+ private void setState(ChmLzxState state) {
+     this.state = state;
+ }
+
+ /**
+  * Entry point with an empty body; invoking it performs no work.
+  *
+  * @param args command-line arguments; never read
+  */
+ public static void main(String[] args) {
+ // Empty body: there is nothing to run.
+
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.lzx;
+
+import java.util.concurrent.CancellationException;
+
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
+import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Holds the LZX decoding state: window and block bookkeeping, the R0-R2
+ * offsets, and the length and decode tables of the main, length and aligned
+ * trees.
+ * <p>
+ * The table fields are {@code protected} and are also accessed directly by
+ * code in the same package.
+ */
+public class ChmLzxState {
+    /* Class' members */
+    private int window; /* the actual decoding window */
+    private long window_size; /* window size (32Kb through 2Mb) */
+    private int window_position; /* current offset within the window */
+    private int main_tree_elements; /* number of main tree elements */
+    private LzxState hadStarted; /* have we started decoding at all yet? */
+    private int block_type; /* type of this block */
+    private int block_length; /* uncompressed length of this block */
+    private int block_remaining; /* uncompressed bytes still left to decode */
+    private int frames_read; /* the number of CFDATA blocks processed */
+    private int intel_file_size; /* magic header value used for transform */
+    private long intel_current_possition; /* current offset in transform space */
+    private IntelState intel_state; /* have we seen any translatable data yet? */
+    private long R0; /* for the LRU offset system */
+    private long R1; /* for the LRU offset system */
+    private long R2; /* for the LRU offset system */
+
+    // Trees - PRETREE, MAINTREE, LENGTH, ALIGNED
+    protected short[] mainTreeLengtsTable;
+    protected short[] mainTreeTable;
+
+    protected short[] lengthTreeTable;
+    protected short[] lengthTreeLengtsTable;
+
+    protected short[] alignedLenTable;
+    protected short[] alignedTreeTable;
+
+    /** Returns the main tree decode table; may be {@code null} if never set. */
+    protected short[] getMainTreeTable() {
+        return mainTreeTable;
+    }
+
+    /** Returns the aligned tree decode table; may be {@code null} if never set. */
+    protected short[] getAlignedTreeTable() {
+        return alignedTreeTable;
+    }
+
+    /** Stores the aligned tree decode table. */
+    protected void setAlignedTreeTable(short[] alignedTreeTable) {
+        this.alignedTreeTable = alignedTreeTable;
+    }
+
+    /**
+     * Returns the length tree decode table.
+     *
+     * @throws ChmParsingException if the table has not been set
+     */
+    protected short[] getLengthTreeTable() {
+        if (lengthTreeTable != null)
+            return this.lengthTreeTable;
+        else
+            throw new ChmParsingException("lengthTreeTable is null");
+    }
+
+    /** Stores the length tree decode table. */
+    protected void setLengthTreeTable(short[] lengthTreeTable) {
+        this.lengthTreeTable = lengthTreeTable;
+    }
+
+    /** Stores the main tree decode table. */
+    protected void setMainTreeTable(short[] mainTreeTable) {
+        this.mainTreeTable = mainTreeTable;
+    }
+
+    /** Returns the aligned tree length table; may be {@code null} if never set. */
+    protected short[] getAlignedLenTable() {
+        return this.alignedLenTable;
+    }
+
+    /** Stores the aligned tree length table. */
+    protected void setAlignedLenTable(short[] alignedLenTable) {
+        this.alignedLenTable = alignedLenTable;
+    }
+
+    /**
+     * Returns a multi-line, human-readable dump of the state, one
+     * {@code label:=value} pair per line. The two length tables are reported
+     * by their length, or as {@code null} when they are not set.
+     */
+    @Override
+    public String toString() {
+        final String eol = System.getProperty("line.separator");
+        StringBuilder sb = new StringBuilder();
+        sb.append("actual decoding window:=").append(getWindow()).append(eol);
+        sb.append("window size (32Kb through 2Mb):=").append(getWindowSize())
+                .append(eol);
+        sb.append("current offset within the window:=")
+                .append(getWindowPosition()).append(eol);
+        sb.append("number of main tree elements:=")
+                .append(getMainTreeElements()).append(eol);
+        sb.append("have we started decoding at all yet?:=")
+                .append(getHadStarted()).append(eol);
+        sb.append("type of this block:=").append(getBlockType()).append(eol);
+        sb.append("uncompressed length of this block:=")
+                .append(getBlockLength()).append(eol);
+        sb.append("uncompressed bytes still left to decode:=")
+                .append(getBlockRemaining()).append(eol);
+        sb.append("the number of CFDATA blocks processed:=")
+                .append(getFramesRead()).append(eol);
+        sb.append("magic header value used for transform:=")
+                .append(getIntelFileSize()).append(eol);
+        sb.append("current offset in transform space:=")
+                .append(getIntelCurrentPossition()).append(eol);
+        sb.append("have we seen any translatable data yet?:=")
+                .append(getIntelState()).append(eol);
+        sb.append("R0 for the LRU offset system:=").append(getR0()).append(eol);
+        sb.append("R1 for the LRU offset system:=").append(getR1()).append(eol);
+        sb.append("R2 for the LRU offset system:=").append(getR2()).append(eol);
+        sb.append("main tree length:=")
+                .append(describeLength(getMainTreeLengtsTable())).append(eol);
+        sb.append("secondary tree length:=")
+                .append(describeLength(getLengthTreeLengtsTable())).append(eol);
+        return sb.toString();
+    }
+
+    /**
+     * Formats a table's length for {@link #toString()}; the public setters
+     * accept {@code null}, so a missing table is printed as {@code null}.
+     */
+    private static String describeLength(short[] table) {
+        return (table == null) ? "null" : String.valueOf(table.length);
+    }
+
+    /**
+     * Creates a state in its initial, not-yet-decoding configuration.
+     * <p>
+     * The window size is set to {@code 1 << ChmCommons.getWindowSize(window)};
+     * an exponent outside 15..21 is only reported on {@code System.err}.
+     * The main tree is sized with a fixed 512 elements, independent of the
+     * window size.
+     *
+     * @param window value handed to {@code ChmCommons.getWindowSize}; must
+     *        not be negative
+     * @throws CancellationException if {@code window} is negative
+     */
+    public ChmLzxState(int window) {
+        if (window < 0)
+            throw new CancellationException(
+                    "window size should be more than zero");
+
+        int win = ChmCommons.getWindowSize(window);
+        setWindowSize(1 << win);
+        /* LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) */
+        if (win < 15 || win > 21)
+            System.err
+                    .println("window less than 15 or window greater than 21");
+
+        setR0(1);
+        setR1(1);
+        setR2(1);
+        setMainTreeElements(512);
+        setHadStarted(LzxState.NOT_STARTED_DECODING);
+        setFramesRead(0);
+        setBlockRemaining(0);
+        setBlockType(ChmConstants.LZX_BLOCKTYPE_INVALID);
+        setIntelCurrentPossition(0);
+        setIntelState(IntelState.NOT_STARTED);
+        setWindowPosition(0);
+        setMainTreeLengtsTable(new short[getMainTreeElements()]);
+        setLengthTreeLengtsTable(new short[ChmConstants.LZX_NUM_SECONDARY_LENGTHS]);
+    }
+
+    /** Stores the decoding window value. */
+    protected void setWindow(int window) {
+        this.window = window;
+    }
+
+    /** Returns the decoding window value. */
+    protected int getWindow() {
+        return window;
+    }
+
+    /** Stores the window size. */
+    protected void setWindowSize(long window_size) {
+        this.window_size = window_size;
+    }
+
+    /** Returns the window size. */
+    protected long getWindowSize() {
+        return window_size;
+    }
+
+    /** Stores the current offset within the window. */
+    protected void setWindowPosition(int window_position) {
+        this.window_position = window_position;
+    }
+
+    /** Returns the current offset within the window. */
+    protected int getWindowPosition() {
+        return window_position;
+    }
+
+    /** Stores the number of main tree elements. */
+    protected void setMainTreeElements(int main_tree_elements) {
+        this.main_tree_elements = main_tree_elements;
+    }
+
+    /** Returns the number of main tree elements. */
+    protected int getMainTreeElements() {
+        return main_tree_elements;
+    }
+
+    /** Stores whether decoding has started. */
+    protected void setHadStarted(LzxState hadStarted) {
+        this.hadStarted = hadStarted;
+    }
+
+    /** Returns whether decoding has started. */
+    protected LzxState getHadStarted() {
+        return hadStarted;
+    }
+
+    /** Stores the type of the current block. */
+    protected void setBlockType(int block_type) {
+        this.block_type = block_type;
+    }
+
+    /** Returns the type of the current block. */
+    public int getBlockType() {
+        return block_type;
+    }
+
+    /** Stores the uncompressed length of the current block. */
+    protected void setBlockLength(int block_length) {
+        this.block_length = block_length;
+    }
+
+    /** Returns the uncompressed length of the current block. */
+    protected int getBlockLength() {
+        return block_length;
+    }
+
+    /** Stores the number of uncompressed bytes still left to decode. */
+    protected void setBlockRemaining(int block_remaining) {
+        this.block_remaining = block_remaining;
+    }
+
+    /** Returns the number of uncompressed bytes still left to decode. */
+    protected int getBlockRemaining() {
+        return block_remaining;
+    }
+
+    /** Stores the frame counter. */
+    protected void setFramesRead(int frames_read) {
+        this.frames_read = frames_read;
+    }
+
+    /** Increments the frame counter by one. */
+    protected void increaseFramesRead() {
+        this.frames_read = getFramesRead() + 1;
+    }
+
+    /** Returns the frame counter. */
+    protected int getFramesRead() {
+        return frames_read;
+    }
+
+    /** Stores the Intel file size header value. */
+    protected void setIntelFileSize(int intel_file_size) {
+        this.intel_file_size = intel_file_size;
+    }
+
+    /** Returns the Intel file size header value. */
+    protected int getIntelFileSize() {
+        return intel_file_size;
+    }
+
+    /** Stores the current offset in transform space. */
+    protected void setIntelCurrentPossition(long intel_current_possition) {
+        this.intel_current_possition = intel_current_possition;
+    }
+
+    /** Returns the current offset in transform space. */
+    protected long getIntelCurrentPossition() {
+        return intel_current_possition;
+    }
+
+    /** Stores the Intel transform state. */
+    protected void setIntelState(IntelState intel_state) {
+        this.intel_state = intel_state;
+    }
+
+    /** Returns the Intel transform state. */
+    protected IntelState getIntelState() {
+        return intel_state;
+    }
+
+    /** Stores the R0 offset. */
+    protected void setR0(long r0) {
+        R0 = r0;
+    }
+
+    /** Returns the R0 offset. */
+    protected long getR0() {
+        return R0;
+    }
+
+    /** Stores the R1 offset. */
+    protected void setR1(long r1) {
+        R1 = r1;
+    }
+
+    /** Returns the R1 offset. */
+    protected long getR1() {
+        return R1;
+    }
+
+    /** Stores the R2 offset. */
+    protected void setR2(long r2) {
+        R2 = r2;
+    }
+
+    /** Returns the R2 offset. */
+    protected long getR2() {
+        return R2;
+    }
+
+    /** Entry point with an empty body; invoking it performs no work. */
+    public static void main(String[] args) {
+    }
+
+    /** Stores the main tree length table; {@code null} is accepted. */
+    public void setMainTreeLengtsTable(short[] mainTreeLengtsTable) {
+        this.mainTreeLengtsTable = mainTreeLengtsTable;
+    }
+
+    /** Returns the main tree length table (the array itself, not a copy). */
+    public short[] getMainTreeLengtsTable() {
+        return mainTreeLengtsTable;
+    }
+
+    /** Stores the length tree length table; {@code null} is accepted. */
+    public void setLengthTreeLengtsTable(short[] lengthTreeLengtsTable) {
+        this.lengthTreeLengtsTable = lengthTreeLengtsTable;
+    }
+
+    /** Returns the length tree length table (the array itself, not a copy). */
+    public short[] getLengthTreeLengtsTable() {
+        return lengthTreeLengtsTable;
+    }
+}