You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/06/07 17:44:42 UTC

svn commit: r1133047 [1/3] - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm: accessor/ assertion/ core/ exception/ lzx/

Author: mattmann
Date: Tue Jun  7 15:44:41 2011
New Revision: 1133047

URL: http://svn.apache.org/viewvc?rev=1133047&view=rev
Log:
- progress towards TIKA-245 Support of CHM Format (Oleg's patch, in parts, as suggested by Jukka)

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+/**
+ * Defines an accessor interface: a serializable chm structure whose values
+ * are filled in from raw bytes
+ * 
+ * @param <T> type of the structure that receives the parsed values
+ */
+public interface ChmAccessor<T> extends Serializable {
+	/**
+	 * Parses the given bytes and stores the result in the given accessor
+	 * 
+	 * @param data
+	 *            raw bytes to parse
+	 * @param chmAccessor structure that receives the parsed values
+	 */
+	void parse(byte[] data, T chmAccessor);
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+
+/**
+ * Holds chm listing entries
+ */
+public class ChmDirectoryListingSet {
+	private List<DirectoryListingEntry> dlel;
+	private byte[] data;
+	private int placeHolder = -1;
+	private long dataOffset = -1;
+	private int controlDataIndex = -1;
+	private int resetTableIndex = -1;
+
+	private boolean isNotControlDataFound = true;
+	private boolean isNotResetTableFound = true;
+
+	/**
+	 * Constructs chm directory listing set
+	 * 
+	 * @param data
+	 *            byte[]
+	 * @param chmItsHeader
+	 * @param chmItspHeader
+	 */
+	public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+			ChmItspHeader chmItspHeader) {
+		setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+		ChmCommons.assertByteArrayNotNull(data);
+		setData(data);
+		enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+	}
+
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("list:=" + getDirectoryListingEntryList().toString()
+				+ System.getProperty("line.separator"));
+		sb.append("number of list items:="
+				+ getDirectoryListingEntryList().size());
+		return sb.toString();
+	}
+
+	/**
+	 * Returns control data index that located in List
+	 * 
+	 * @return control data index
+	 */
+	public int getControlDataIndex() {
+		return controlDataIndex;
+	}
+
+	/**
+	 * Sets control data index
+	 * 
+	 * @param controlDataIndex
+	 */
+	protected void setControlDataIndex(int controlDataIndex) {
+		this.controlDataIndex = controlDataIndex;
+	}
+
+	/**
+	 * Return index of reset table
+	 * 
+	 * @return reset table index
+	 */
+	public int getResetTableIndex() {
+		return resetTableIndex;
+	}
+
+	/**
+	 * Sets reset table index
+	 * 
+	 * @param resetTableIndex
+	 */
+	protected void setResetTableIndex(int resetTableIndex) {
+		this.resetTableIndex = resetTableIndex;
+	}
+
+	/**
+	 * Gets place holder
+	 * 
+	 * @return place holder
+	 */
+	private int getPlaceHolder() {
+		return placeHolder;
+	}
+
+	/**
+	 * Sets place holder
+	 * 
+	 * @param placeHolder
+	 */
+	private void setPlaceHolder(int placeHolder) {
+		this.placeHolder = placeHolder;
+	}
+
+	/**
+	 * Enumerates chm directory listing entries
+	 * 
+	 * @param chmItsHeader
+	 *            chm itsf header
+	 * @param chmItspHeader
+	 *            chm itsp header
+	 */
+	private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+			ChmItspHeader chmItspHeader) {
+		try {
+			int startPmgl = chmItspHeader.getIndex_head();
+			int stopPmgl = chmItspHeader.getUnknown_0024();
+			int dir_offset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+					.getHeader_len());
+			setDataOffset(chmItsHeader.getDataOffset());
+
+			/* loops over all pmgls */
+			int previous_index = 0;
+			byte[] dir_chunk = null;
+			for (int i = startPmgl; i <= stopPmgl; i++) {
+				int data_copied = ((1 + i) * (int) chmItspHeader.getBlock_len())
+						+ dir_offset;
+				if (i == 0) {
+					dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+					dir_chunk = Arrays
+							.copyOfRange(getData(), dir_offset,
+									(((1 + i) * (int) chmItspHeader
+											.getBlock_len()) + dir_offset));
+					previous_index = data_copied;
+				} else {
+					dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
+					dir_chunk = Arrays
+							.copyOfRange(getData(), previous_index,
+									(((1 + i) * (int) chmItspHeader
+											.getBlock_len()) + dir_offset));
+					previous_index = data_copied;
+				}
+				enumerateOneSegment(dir_chunk);
+				dir_chunk = null;
+			}
+		} catch (Exception e) {
+			e.printStackTrace();
+		} finally {
+			setData(null);
+		}
+	}
+
+	/**
+	 * Checks control data
+	 * 
+	 * @param dle
+	 *            chm directory listing entry
+	 */
+	private void checkControlData(DirectoryListingEntry dle) {
+		if (isNotControlDataFound) {
+			if (dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+				setControlDataIndex(getDirectoryListingEntryList().size());
+				isNotControlDataFound = false;
+			}
+		}
+	}
+
+	/**
+	 * Checks reset table
+	 * 
+	 * @param dle
+	 *            chm directory listing entry
+	 */
+	private void checkResetTable(DirectoryListingEntry dle) {
+		if (isNotResetTableFound) {
+			if (dle.getName().contains(ChmConstants.RESET_TABLE)) {
+				setResetTableIndex(getDirectoryListingEntryList().size());
+				isNotResetTableFound = false;
+			}
+		}
+	}
+
+	/**
+	 * Enumerates chm directory listing entries in single chm segment
+	 * 
+	 * @param dir_chunk
+	 */
+	private void enumerateOneSegment(byte[] dir_chunk) {
+		try {
+			if (dir_chunk != null) {
+
+				int indexWorkData = ChmCommons.indexOf(dir_chunk,
+						"::".getBytes());
+				int indexUserData = ChmCommons.indexOf(dir_chunk,
+						"/".getBytes());
+
+				if (indexUserData < indexWorkData)
+					setPlaceHolder(indexUserData);
+				else
+					setPlaceHolder(indexWorkData);
+
+				if (getPlaceHolder() > 0
+						&& dir_chunk[getPlaceHolder() - 1] != 115) {// #{
+					do {
+						if (dir_chunk[getPlaceHolder() - 1] > 0) {
+							DirectoryListingEntry dle = new DirectoryListingEntry();
+
+							// two cases: 1. when dir_chunk[getPlaceHolder() -
+							// 1] == 0x73
+							// 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
+							doNameCheck(dir_chunk, dle);
+
+							dle.setName(new String(Arrays.copyOfRange(
+									dir_chunk, getPlaceHolder(),
+									(getPlaceHolder() + dle.getNameLength()))));
+							checkControlData(dle);
+							checkResetTable(dle);
+							setPlaceHolder(getPlaceHolder()
+									+ dle.getNameLength());
+
+							/* Sets entry type */
+							if (getPlaceHolder() < dir_chunk.length
+									&& dir_chunk[getPlaceHolder()] == 0)
+								dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+							else
+								dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+
+							setPlaceHolder(getPlaceHolder() + 1);
+							dle.setOffset(getEncint(dir_chunk));
+							dle.setLength(getEncint(dir_chunk));
+							getDirectoryListingEntryList().add(dle);
+						} else
+							setPlaceHolder(getPlaceHolder() + 1);
+
+					} while (hasNext(dir_chunk));
+				}
+			}
+
+		} catch (Exception e) {
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * Checks if a name and name length are correct. If not then handles it as
+	 * follows: 1. when dir_chunk[getPlaceHolder() - 1] == 0x73 ('/') 2. when
+	 * dir_chunk[getPlaceHolder() + 1] == 0x2f ('s')
+	 * 
+	 * @param dir_chunk
+	 * @param dle
+	 */
+	private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
+		if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
+			dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
+		} else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
+			dle.setNameLength(dir_chunk[getPlaceHolder()]);
+			setPlaceHolder(getPlaceHolder() + 1);
+		} else {
+			dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
+		}
+	}
+
+	/**
+	 * Checks if it's possible move further on byte[]
+	 * 
+	 * @param dir_chunk
+	 * 
+	 * @return boolean
+	 */
+	private boolean hasNext(byte[] dir_chunk) {
+		while (getPlaceHolder() < dir_chunk.length) {
+			if (dir_chunk[getPlaceHolder()] == 47
+					&& dir_chunk[getPlaceHolder() + 1] != ':') {
+				setPlaceHolder(getPlaceHolder());
+				return true;
+			} else if (dir_chunk[getPlaceHolder()] == ':'
+					&& dir_chunk[getPlaceHolder() + 1] == ':') {
+				setPlaceHolder(getPlaceHolder());
+				return true;
+			} else
+				setPlaceHolder(getPlaceHolder() + 1);
+		}
+		return false;
+	}
+
+	/**
+	 * Returns encrypted integer
+	 * 
+	 * @param data_chunk
+	 * 
+	 * @return
+	 */
+	private int getEncint(byte[] data_chunk) {
+		byte ob;
+		BigInteger bi = BigInteger.ZERO;
+		byte[] nb = new byte[1];
+
+		if (getPlaceHolder() < data_chunk.length) {
+			while ((ob = data_chunk[getPlaceHolder()]) < 0) {
+				nb[0] = (byte) ((ob & 0x7f));
+				bi = bi.shiftLeft(7).add(new BigInteger(nb));
+				setPlaceHolder(getPlaceHolder() + 1);
+			}
+			nb[0] = (byte) ((ob & 0x7f));
+			bi = bi.shiftLeft(7).add(new BigInteger(nb));
+			setPlaceHolder(getPlaceHolder() + 1);
+		}
+		return bi.intValue();
+	}
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+	}
+
+	/**
+	 * Sets chm directory listing entry list
+	 * 
+	 * @param dlel
+	 *            chm directory listing entry list
+	 */
+	public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+		this.dlel = dlel;
+	}
+
+	/**
+	 * Returns chm directory listing entry list
+	 * 
+	 * @return List<DirectoryListingEntry>
+	 */
+	public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+		return dlel;
+	}
+
+	/**
+	 * Sets data
+	 * 
+	 * @param data
+	 */
+	private void setData(byte[] data) {
+		this.data = data;
+	}
+
+	/**
+	 * Returns data
+	 * 
+	 * @return
+	 */
+	private byte[] getData() {
+		return data;
+	}
+
+	/**
+	 * Sets data offset
+	 * 
+	 * @param dataOffset
+	 */
+	private void setDataOffset(long dataOffset) {
+		this.dataOffset = dataOffset;
+	}
+
+	/**
+	 * Returns data offset
+	 * 
+	 * @return dataOffset
+	 */
+	public long getDataOffset() {
+		return dataOffset;
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * The Header 0000: char[4] 'ITSF' 0004: DWORD 3 (Version number) 0008: DWORD
+ * Total header length, including header section table and following data. 000C:
+ * DWORD 1 (unknown) 0010: DWORD a timestamp 0014: DWORD Windows Language ID
+ * 0018: GUID {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC} 0028: GUID
+ * {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC} Note: a GUID is $10 bytes, arranged
+ * as 1 DWORD, 2 WORDs, and 8 BYTEs. 0000: QWORD Offset of section from
+ * beginning of file 0008: QWORD Length of section Following the header section
+ * table is 8 bytes of additional header data. In Version 2 files, this data is
+ * not there and the content section starts immediately after the directory.
+ * 
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">
+ *      Microsoft's HTML Help (.chm) format</a>
+ * 
+ */
+/* structure of ITSF headers */
+public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
+	private static final long serialVersionUID = 2215291838533213826L;
+	private byte[] signature = { 'I', 'T', 'S', 'F' }; /* 0 (ITSF) */
+	private int version; /* 4 */
+	private int header_len; /* 8 */
+	private int unknown_000c; /* c */
+	private long last_modified; /* 10 */
+	private long lang_id; /* 14 */
+	private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
+	private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
+	private long unknown_offset; /* 38 */
+	private long unknown_len; /* 40 */
+	private long dir_offset; /* 48 */
+	private long dir_len; /* 50 */
+	private long data_offset; /* 58 (Not present before V3) */
+
+	/* local usage */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	/**
+	 * Returns the header values separated by spaces, uuids as byte lists
+	 */
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append(new String(getSignature()) + " ");
+		sb.append(getVersion() + " ");
+		sb.append(getHeaderLen() + " ");
+		sb.append(getUnknown_000c() + " ");
+		sb.append(getLastModified() + " ");
+		sb.append(getLangId() + " ");
+		sb.append(java.util.Arrays.toString(getDir_uuid()) + " ");
+		sb.append(java.util.Arrays.toString(getStream_uuid()) + " ");
+		sb.append(getUnknownOffset() + " ");
+		sb.append(getUnknownLen() + " ");
+		sb.append(getDirOffset() + " ");
+		sb.append(getDirLen() + " ");
+		sb.append(getDataOffset() + " ");
+		return sb.toString();
+	}
+
+	/**
+	 * Returns a signature of itsf header
+	 * 
+	 * @return itsf signature bytes
+	 */
+	public byte[] getSignature() {
+		return signature;
+	}
+
+	/**
+	 * Sets itsf header signature
+	 * 
+	 * @param signature
+	 */
+	protected void setSignature(byte[] signature) {
+		this.signature = signature;
+	}
+
+	/**
+	 * Returns itsf header version
+	 * 
+	 * @return itsf version
+	 */
+	public int getVersion() {
+		return version;
+	}
+
+	/**
+	 * Sets itsf version
+	 * 
+	 * @param version
+	 */
+	protected void setVersion(int version) {
+		this.version = version;
+	}
+
+	/**
+	 * Returns itsf header length
+	 * 
+	 * @return length
+	 */
+	public int getHeaderLen() {
+		return header_len;
+	}
+
+	/**
+	 * Sets itsf header length
+	 * 
+	 * @param header_len
+	 */
+	protected void setHeaderLen(int header_len) {
+		this.header_len = header_len;
+	}
+
+	/**
+	 * Returns unknown_000c value
+	 * 
+	 * @return unknown_000c
+	 */
+	public int getUnknown_000c() {
+		return unknown_000c;
+	}
+
+	/**
+	 * Sets unknown_000c
+	 * 
+	 * @param unknown_000c
+	 */
+	protected void setUnknown_000c(int unknown_000c) {
+		this.unknown_000c = unknown_000c;
+	}
+
+	/**
+	 * Returns last modified date of the chm file
+	 * 
+	 * @return last modified date as long
+	 */
+	public long getLastModified() {
+		return last_modified;
+	}
+
+	/**
+	 * Sets last modified date of the chm file
+	 * 
+	 * @param last_modified
+	 */
+	protected void setLastModified(long last_modified) {
+		this.last_modified = last_modified;
+	}
+
+	/**
+	 * Returns language ID
+	 * 
+	 * @return language_id
+	 */
+	public long getLangId() {
+		return lang_id;
+	}
+
+	/**
+	 * Sets language_id
+	 * 
+	 * @param lang_id
+	 */
+	protected void setLangId(long lang_id) {
+		this.lang_id = lang_id;
+	}
+
+	/**
+	 * Returns directory uuid
+	 * 
+	 * @return dir_uuid
+	 */
+	public byte[] getDir_uuid() {
+		return dir_uuid;
+	}
+
+	/**
+	 * Sets directory uuid
+	 * 
+	 * @param dir_uuid
+	 */
+	protected void setDir_uuid(byte[] dir_uuid) {
+		this.dir_uuid = dir_uuid;
+	}
+
+	/**
+	 * Returns stream uuid
+	 * 
+	 * @return stream_uuid
+	 */
+	public byte[] getStream_uuid() {
+		return stream_uuid;
+	}
+
+	/**
+	 * Sets stream uuid
+	 * 
+	 * @param stream_uuid
+	 */
+	protected void setStream_uuid(byte[] stream_uuid) {
+		this.stream_uuid = stream_uuid;
+	}
+
+	/**
+	 * Returns unknown offset
+	 * 
+	 * @return unknown_offset
+	 */
+	public long getUnknownOffset() {
+		return unknown_offset;
+	}
+
+	/**
+	 * Sets unknown offset
+	 * 
+	 * @param unknown_offset
+	 */
+	protected void setUnknownOffset(long unknown_offset) {
+		this.unknown_offset = unknown_offset;
+	}
+
+	/**
+	 * Returns unknown length
+	 * 
+	 * @return unknown_length
+	 */
+	public long getUnknownLen() {
+		return unknown_len;
+	}
+
+	/**
+	 * Sets unknown length
+	 * 
+	 * @param unknown_len
+	 */
+	protected void setUnknownLen(long unknown_len) {
+		this.unknown_len = unknown_len;
+	}
+
+	/**
+	 * Returns directory offset
+	 * 
+	 * @return directory_offset
+	 */
+	public long getDirOffset() {
+		return dir_offset;
+	}
+
+	/**
+	 * Sets directory offset
+	 * 
+	 * @param dir_offset
+	 */
+	protected void setDirOffset(long dir_offset) {
+		this.dir_offset = dir_offset;
+	}
+
+	/**
+	 * Returns directory length
+	 * 
+	 * @return directory_length
+	 */
+	public long getDirLen() {
+		return dir_len;
+	}
+
+	/**
+	 * Sets directory length
+	 * 
+	 * @param dir_len
+	 */
+	protected void setDirLen(long dir_len) {
+		this.dir_len = dir_len;
+	}
+
+	/**
+	 * Returns data offset
+	 * 
+	 * @return data_offset
+	 */
+	public long getDataOffset() {
+		return data_offset;
+	}
+
+	/**
+	 * Sets data offset
+	 * 
+	 * @param data_offset
+	 */
+	protected void setDataOffset(long data_offset) {
+		this.data_offset = data_offset;
+	}
+
+	/**
+	 * Copies the first count bytes of the byte[] into the signature
+	 * 
+	 * @param data
+	 * @param chmItsfHeader
+	 * @param count
+	 */
+	private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
+			int count) {
+		ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
+		System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+	}
+
+	/**
+	 * Copies X bytes of source byte[] to the dest byte[]
+	 * 
+	 * @param data
+	 * @param dest
+	 * @param count
+	 * @return dest, filled from the current place in data
+	 */
+	private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
+		System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+		return dest;
+	}
+
+	/**
+	 * Takes 8 bytes at the current place and reverses them (little endian)
+	 * 
+	 * @param data
+	 * @param dest
+	 * @return value of the 8 bytes; the incoming dest value is not used
+	 */
+	private long unmarshalUint64(byte[] data, long dest) {
+		byte[] temp = new byte[8];
+		int i, j;
+
+		if (8 > this.getDataRemained())
+			throw new ChmParsingException("8 > this.getDataRemained()");
+
+		for (i = 8, j = 7; i > 0; i--) {
+			temp[j--] = data[this.getCurrentPlace()];
+			this.setCurrentPlace(this.getCurrentPlace() + 1);
+		}
+
+		dest = new BigInteger(temp).longValue();
+		this.setDataRemained(this.getDataRemained() - 8);
+		return dest;
+	}
+
+	/* Reads a little-endian DWORD; "& 0xff" stops bytes from sign extending */
+	private int unmarshalInt32(byte[] data, int dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > this.getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		dest = (data[this.getCurrentPlace()] & 0xff)
+				| (data[this.getCurrentPlace() + 1] & 0xff) << 8
+				| (data[this.getCurrentPlace() + 2] & 0xff) << 16
+				| (data[this.getCurrentPlace() + 3] & 0xff) << 24;
+
+		this.setCurrentPlace(this.getCurrentPlace() + 4);
+		this.setDataRemained(this.getDataRemained() - 4);
+		return dest;
+	}
+
+	/* Same as unmarshalInt32, but the result is kept unsigned in a long */
+	private long unmarshalUInt32(byte[] data, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		dest = (data[this.getCurrentPlace()] & 0xffL)
+				| (data[this.getCurrentPlace() + 1] & 0xffL) << 8
+				| (data[this.getCurrentPlace() + 2] & 0xffL) << 16
+				| (data[this.getCurrentPlace() + 3] & 0xffL) << 24;
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(this.getCurrentPlace() + 4);
+		return dest;
+	}
+
+	public static void main(String[] args) {
+	}
+
+	/**
+	 * Sets data remained to be processed
+	 * 
+	 * @param dataRemained
+	 */
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	/**
+	 * Returns data remained
+	 * 
+	 * @return data_remained
+	 */
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	/**
+	 * Sets current place in the byte[]
+	 * 
+	 * @param currentPlace
+	 */
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	/**
+	 * Returns current place in the byte[]
+	 * 
+	 * @return current place
+	 */
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	// @Override
+	public void parse(byte[] data, ChmItsfHeader chmItsfHeader) {
+		if (data.length < ChmConstants.CHM_ITSF_V2_LEN
+				|| data.length > ChmConstants.CHM_ITSF_V3_LEN)
+			throw new ChmParsingException(
+					"we only know how to deal with the 0x58 and 0x60 byte structures");
+
+		chmItsfHeader.setDataRemained(data.length);
+		chmItsfHeader.unmarshalCharArray(data, chmItsfHeader,
+				ChmConstants.CHM_SIGNATURE_LEN);
+		chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data,
+				chmItsfHeader.getVersion()));
+		chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data,
+				chmItsfHeader.getHeaderLen()));
+		chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data,
+				chmItsfHeader.getUnknown_000c()));
+		chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data,
+				chmItsfHeader.getLastModified()));
+		chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data,
+				chmItsfHeader.getLangId()));
+		chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data,
+				chmItsfHeader.getDir_uuid(), 16));
+		chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data,
+				chmItsfHeader.getStream_uuid(), 16));
+		chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data,
+				chmItsfHeader.getUnknownOffset()));
+		chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data,
+				chmItsfHeader.getUnknownLen()));
+		chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data,
+				chmItsfHeader.getDirOffset()));
+		chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data,
+				chmItsfHeader.getDirLen()));
+
+		if (!new String(chmItsfHeader.getSignature()).equals(ChmConstants.ITSF))
+			throw new ChmParsingException("seems not valid file");
+		if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
+			if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
+				throw new ChmParsingException("something wrong with header");
+		} else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+			if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
+				throw new ChmParsingException("unknown v3 header lenght");
+		} else
+			throw new ChmParsingException("unsupported chm format");
+
+		/*
+		 * the data offset is computed as dir offset + dir length for both
+		 * versions; the V3 field at 0x58 is not read from data
+		 */
+		if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+			if (chmItsfHeader.getDataRemained() >= 0)
+				chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+						+ chmItsfHeader.getDirLen());
+			else
+				throw new ChmParsingException(
+						"cannot set data offset, no data remained");
+		} else
+			chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+					+ chmItsfHeader.getDirLen());
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,548 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Directory header The directory starts with a header; its format is as
+ * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
+ * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
+ * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
+ * Depth of the index tree - 1 there is no index, 2 if there is one level of
+ * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
+ * (though at least one file has 0 despite there being no index chunk, probably
+ * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
+ * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
+ * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
+ * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
+ * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
+ * DWORD -1 (unknown)
+ * 
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?show-translation-form=1}
+ * 
+ */
+public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
+	// TODO: refactor all unmarshals
+	private static final long serialVersionUID = 1962394421998181341L;
+	private byte[] signature = new String(ChmConstants.ITSP).getBytes(); /*
+																		 * 0
+																		 * (ITSP
+																		 * )
+																		 */
+	private int version; /* 4 */
+	private int header_len; /* 8 */
+	private int unknown_000c; /* c */
+	private long block_len; /* 10 */
+	private int blockidx_intvl; /* 14 */
+	private int index_depth; /* 18 */
+	private int index_root; /* 1c */
+	private int index_head; /* 20 */
+	private int unknown_0024; /* 24 */
+	private long num_blocks; /* 28 */
+	private int unknown_002c; /* 2c */
+	private long lang_id; /* 30 */
+	private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
+	private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
+
+	/* local usage */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	/**
+	 * Returns a multi-line dump of every ITSP header field. The two byte
+	 * arrays are rendered with Arrays.toString so that their contents are
+	 * shown instead of the JVM's array identity string.
+	 *
+	 * @return textual representation of this header
+	 */
+	public String toString() {
+		/* the line separator is looked up once instead of once per field */
+		final String nl = System.getProperty("line.separator");
+		StringBuilder sb = new StringBuilder();
+		sb.append("[ signature:=" + new String(getSignature()) + nl);
+		sb.append("version:=\t" + getVersion() + nl);
+		sb.append("header_len:=\t" + getHeader_len() + nl);
+		sb.append("unknown_00c:=\t" + getUnknown_000c() + nl);
+		sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
+				+ nl);
+		sb.append("blockidx_intvl:=" + getBlockidx_intvl()
+				+ ", density of quickref section, usually 2" + nl);
+		sb.append("index_depth:=\t" + getIndex_depth()
+				+ ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
+				+ nl);
+		sb.append("index_root:=\t" + getIndex_root()
+				+ ", chunk number of root index chunk, -1 if there is none"
+				+ nl);
+		sb.append("index_head:=\t" + getIndex_head()
+				+ ", chunk number of first PMGL (listing) chunk" + nl);
+		sb.append("unknown_0024:=\t" + getUnknown_0024()
+				+ ", chunk number of last PMGL (listing) chunk" + nl);
+		sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)" + nl);
+		sb.append("unknown_002c:=\t" + getUnknown_002c()
+				+ ", number of directory chunks (total)" + nl);
+		sb.append("lang_id:=\t" + getLang_id() + " - "
+				+ ChmCommons.getLanguage(getLang_id()) + nl);
+		/* a byte[] concatenated directly would print as "[B@<hash>" */
+		sb.append("system_uuid:="
+				+ java.util.Arrays.toString(getSystem_uuid()) + nl);
+		sb.append("unknown_0044:="
+				+ java.util.Arrays.toString(getUnknown_0044()) + " ]");
+		return sb.toString();
+	}
+
+	/**
+	 * Copies the first count bytes of data[] into chmItspHeader's signature; sets
+	 * this object's dataRemained to data.length - count, advances currentPlace by count.
+	 * @param data source bytes, always read from index 0
+	 * @param chmItspHeader header whose signature array is overwritten
+	 * @param count number of bytes to copy
+	 */
+	private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
+			int count) {
+		ChmAssert.assertByteArrayNotNull(data);
+		ChmAssert.assertChmAccessorNotNull(chmItspHeader);
+		this.setDataRemained(data.length);
+		System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+	}
+
+	/** Reads a little-endian 32-bit value at currentPlace, then advances currentPlace and
+	 *  shrinks dataRemained by 4. dataLenght and the incoming dest are not consulted. */
+	private int unmarshalInt32(byte[] data, int dataLenght, int dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > this.getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		int pos = this.getCurrentPlace();
+		dest = (data[pos] & 0xff) | (data[pos + 1] & 0xff) << 8 // & 0xff: Java bytes are signed
+				| (data[pos + 2] & 0xff) << 16 | (data[pos + 3] & 0xff) << 24;
+		this.setCurrentPlace(pos + 4);
+		this.setDataRemained(this.getDataRemained() - 4);
+		return dest;
+	}
+
+	/** Reads 4 little-endian bytes at currentPlace and advances the cursor by 4. The value is
+	 *  built as an int and then widened, so a set top bit still yields a negative long. */
+	private long unmarshalUInt32(byte[] data, int dataLenght, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > dataLenght)
+			throw new ChmParsingException("4 > dataLenght");
+		int pos = this.getCurrentPlace();
+		dest = (data[pos] & 0xff) | (data[pos + 1] & 0xff) << 8 // & 0xff: Java bytes are signed
+				| (data[pos + 2] & 0xff) << 16 | (data[pos + 3] & 0xff) << 24;
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(pos + 4);
+		return dest;
+	}
+
+	private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
+			int count) {
+		System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+		return dest;
+	}
+
+	/**
+	 * Returns how many bytes remained
+	 * 
+	 * @return int
+	 */
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	/**
+	 * Sets how many bytes remained
+	 * 
+	 * @param dataRemained
+	 */
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	/**
+	 * Returns a place holder
+	 * 
+	 * @return current place
+	 */
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	/**
+	 * Sets current place
+	 * 
+	 * @param currentPlace
+	 */
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	/**
+	 * Returns a signature of the header
+	 * 
+	 * @return itsp signature
+	 */
+	public byte[] getSignature() {
+		return signature;
+	}
+
+	/**
+	 * Sets itsp signature
+	 * 
+	 * @param signature
+	 */
+	protected void setSignature(byte[] signature) {
+		this.signature = signature;
+	}
+
+	/**
+	 * Returns version of itsp header
+	 * 
+	 * @return version
+	 */
+	public int getVersion() {
+		return version;
+	}
+
+	/**
+	 * Sets a version of itsp header
+	 * 
+	 * @param version
+	 */
+	protected void setVersion(int version) {
+		this.version = version;
+	}
+
+	/**
+	 * Returns header length
+	 * 
+	 * @return header length
+	 */
+	public int getHeader_len() {
+		return header_len;
+	}
+
+	/**
+	 * Sets itsp header length
+	 * 
+	 * @param header_len
+	 */
+	protected void setHeader_len(int header_len) {
+		this.header_len = header_len;
+	}
+
+	/**
+	 * Returns 000c unknown bytes
+	 */
+	public int getUnknown_000c() {
+		return unknown_000c;
+	}
+
+	/**
+	 * Sets 000c unknown bytes Unknown means here that those guys who cracked
+	 * the chm format do not know what's it purposes for
+	 * 
+	 * @param unknown_000c
+	 */
+	protected void setUnknown_000c(int unknown_000c) {
+		this.unknown_000c = unknown_000c;
+	}
+
+	/**
+	 * Returns block's length
+	 * 
+	 * @return block_length
+	 */
+	public long getBlock_len() {
+		return block_len;
+	}
+
+	/**
+	 * Sets block length
+	 * 
+	 * @param block_len
+	 */
+	protected void setBlock_len(long block_len) {
+		this.block_len = block_len;
+	}
+
+	/**
+	 * Returns block index interval
+	 * 
+	 * @return blockidx_intvl
+	 */
+	public int getBlockidx_intvl() {
+		return blockidx_intvl;
+	}
+
+	/**
+	 * Sets block index interval
+	 * 
+	 * @param blockidx_intvl
+	 */
+	protected void setBlockidx_intvl(int blockidx_intvl) {
+		this.blockidx_intvl = blockidx_intvl;
+	}
+
+	/**
+	 * Returns an index depth
+	 * 
+	 * @return index_depth
+	 */
+	public int getIndex_depth() {
+		return index_depth;
+	}
+
+	/**
+	 * Sets an index depth
+	 * 
+	 * @param index_depth
+	 */
+	protected void setIndex_depth(int index_depth) {
+		this.index_depth = index_depth;
+	}
+
+	/**
+	 * Returns index root
+	 * 
+	 * @return index_root
+	 */
+	public int getIndex_root() {
+		return index_root;
+	}
+
+	/**
+	 * Sets an index root
+	 * 
+	 * @param index_root
+	 */
+	protected void setIndex_root(int index_root) {
+		this.index_root = index_root;
+	}
+
+	/**
+	 * Returns an index head
+	 * 
+	 * @return index_head
+	 */
+	public int getIndex_head() {
+		return index_head;
+	}
+
+	/**
+	 * Sets an index head
+	 * 
+	 * @param index_head
+	 */
+	protected void setIndex_head(int index_head) {
+		this.index_head = index_head;
+	}
+
+	/**
+	 * Returns 0024 unknown bytes
+	 * 
+	 * @return unknown_0024
+	 */
+	public int getUnknown_0024() {
+		return unknown_0024;
+	}
+
+	/**
+	 * Sets 0024 unknown bytes
+	 * 
+	 * @param unknown_0024
+	 */
+	protected void setUnknown_0024(int unknown_0024) {
+		this.unknown_0024 = unknown_0024;
+	}
+
+	/**
+	 * Returns number of blocks
+	 * 
+	 * @return num_blocks
+	 */
+	public long getNum_blocks() {
+		return num_blocks;
+	}
+
+	/**
+	 * Sets number of blocks containing in the chm file
+	 * 
+	 * @param num_blocks
+	 */
+	protected void setNum_blocks(long num_blocks) {
+		this.num_blocks = num_blocks;
+	}
+
+	/**
+	 * Returns 002c unknown bytes
+	 * 
+	 * @return unknown_002c
+	 */
+	public int getUnknown_002c() {
+		return unknown_002c;
+	}
+
+	/**
+	 * Sets 002c unknown bytes
+	 * 
+	 * @param unknown_002c
+	 */
+	protected void setUnknown_002c(int unknown_002c) {
+		this.unknown_002c = unknown_002c;
+	}
+
+	/**
+	 * Returns language id
+	 * 
+	 * @return lang_id
+	 */
+	public long getLang_id() {
+		return lang_id;
+	}
+
+	/**
+	 * Sets language id
+	 * 
+	 * @param lang_id
+	 */
+	protected void setLang_id(long lang_id) {
+		this.lang_id = lang_id;
+	}
+
+	/**
+	 * Returns system uuid
+	 * 
+	 * @return system_uuid
+	 */
+	public byte[] getSystem_uuid() {
+		return system_uuid;
+	}
+
+	/**
+	 * Sets system uuid
+	 * 
+	 * @param system_uuid
+	 */
+	protected void setSystem_uuid(byte[] system_uuid) {
+		this.system_uuid = system_uuid;
+	}
+
+	/**
+	 * Returns 0044 unknown bytes
+	 * 
+	 * @return unknown_0044
+	 */
+	public byte[] getUnknown_0044() {
+		return unknown_0044;
+	}
+
+	/**
+	 * Sets 0044 unknown bytes
+	 * 
+	 * @param unknown_0044
+	 */
+	protected void setUnknown_0044(byte[] unknown_0044) {
+		this.unknown_0044 = unknown_0044;
+	}
+
+	/**
+	 * Unmarshals an ITSP directory header from data into chmItspHeader and
+	 * validates its signature, version and declared header length.
+	 * @param data raw header bytes; length must equal CHM_ITSP_V1_LEN
+	 * @param chmItspHeader header object that receives the parsed fields
+	 */
+	// @Override
+	public void parse(byte[] data, ChmItspHeader chmItspHeader) {
+		/* check the arguments before data.length is dereferenced */
+		ChmAssert.assertByteArrayNotNull(data);
+		ChmAssert.assertChmAccessorNotNull(chmItspHeader);
+		/* exactly one header length is accepted for ITSP */
+		if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
+			throw new ChmParsingException(
+					"unexpected ITSP header length: " + data.length);
+
+		/* unmarshal common fields */
+		chmItspHeader.unmarshalCharArray(data, chmItspHeader,
+				ChmConstants.CHM_SIGNATURE_LEN);
+		chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
+		chmItspHeader.setHeader_len(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getHeader_len()));
+		chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getUnknown_000c()));
+		chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
+				chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
+		chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getBlockidx_intvl()));
+		chmItspHeader.setIndex_depth(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getIndex_depth()));
+		chmItspHeader.setIndex_root(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getIndex_root()));
+		chmItspHeader.setIndex_head(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getIndex_head()));
+		chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getUnknown_0024()));
+		chmItspHeader.setNum_blocks(chmItspHeader.unmarshalUInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getNum_blocks()));
+		chmItspHeader.setUnknown_002c(chmItspHeader.unmarshalInt32(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getUnknown_002c()));
+		chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
+				chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
+		chmItspHeader.setSystem_uuid(chmItspHeader.unmarshalUuid(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getSystem_uuid(),
+				ChmConstants.BYTE_ARRAY_LENGHT));
+		chmItspHeader.setUnknown_0044(chmItspHeader.unmarshalUuid(data,
+				chmItspHeader.getDataRemained(),
+				chmItspHeader.getUnknown_0044(),
+				ChmConstants.BYTE_ARRAY_LENGHT));
+
+		/* Checks validity of the itsp header */
+		if (!new String(chmItspHeader.getSignature()).equals(ChmConstants.ITSP))
+			throw new ChmParsingException("seems not valid signature");
+
+		if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
+			throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
+
+		if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
+			throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
+	}
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * 
+ * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
+ * information on the compression. The information is partially known: 0000:
+ * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
+ * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
+ * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
+ * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
+ * 001C: DWORD 0 (unknown)
+ * 
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?page=2 }
+ * 
+ */
+public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
+	private static final long serialVersionUID = -7897854774939631565L;
+	/* class' members */
+	private long size; /* 0 */
+	private byte[] signature = new String(ChmConstants.LZXC).getBytes(); /*
+																		 * 4
+																		 * (LZXC
+																		 * )
+																		 */
+	private long version; /* 8 */
+	private long resetInterval; /* c */
+	private long windowSize; /* 10 */
+	private long windowsPerReset; /* 14 */
+	private long unknown_18; /* 18 */
+
+	/* local usage */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	/**
+	 * Returns a remained data
+	 * 
+	 * @return dataRemained
+	 */
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	/**
+	 * Sets a remained data
+	 * 
+	 * @param dataRemained
+	 */
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	/**
+	 * Returns a place holder
+	 * 
+	 * @return current_place
+	 */
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	/**
+	 * Sets a place holder
+	 * 
+	 * @param current_place
+	 */
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	/**
+	 * Returns a size of control data
+	 * 
+	 * @return size
+	 */
+	public long getSize() {
+		return size;
+	}
+
+	/**
+	 * Sets a size of control data
+	 * 
+	 * @param size
+	 */
+	protected void setSize(long size) {
+		this.size = size;
+	}
+
+	/**
+	 * Returns a signature of control data block
+	 * 
+	 * @return signature
+	 */
+	public byte[] getSignature() {
+		return signature;
+	}
+
+	/**
+	 * Sets a signature of control data block
+	 * 
+	 * @param signature
+	 */
+	protected void setSignature(byte[] signature) {
+		this.signature = signature;
+	}
+
+	/**
+	 * Returns a version of control data block
+	 * 
+	 * @return version
+	 */
+	public long getVersion() {
+		return version;
+	}
+
+	/**
+	 * Sets version of control data block
+	 * 
+	 * @param version
+	 */
+	protected void setVersion(long version) {
+		this.version = version;
+	}
+
+	/**
+	 * Returns reset interval
+	 * 
+	 * @return reset_interval
+	 */
+	public long getResetInterval() {
+		return resetInterval;
+	}
+
+	/**
+	 * Sets a reset interval
+	 * 
+	 * @param resetInterval
+	 */
+	protected void setResetInterval(long resetInterval) {
+		this.resetInterval = resetInterval;
+	}
+
+	/**
+	 * Returns a window size
+	 * 
+	 * @return window_size
+	 */
+	public long getWindowSize() {
+		return windowSize;
+	}
+
+	/**
+	 * Sets a window size
+	 * 
+	 * @param window_size
+	 */
+	protected void setWindowSize(long windowSize) {
+		this.windowSize = windowSize;
+	}
+
+	/**
+	 * Returns windows per reset
+	 * 
+	 * @return
+	 */
+	public long getWindowsPerReset() {
+		return windowsPerReset;
+	}
+
+	/**
+	 * Sets windows per reset
+	 * 
+	 * @param windows_per_reset
+	 */
+	protected void setWindowsPerReset(long windowsPerReset) {
+		this.windowsPerReset = windowsPerReset;
+	}
+
+	/**
+	 * Returns unknown 18 bytes
+	 * 
+	 * @return unknown_18
+	 */
+	public long getUnknown_18() {
+		return unknown_18;
+	}
+
+	/**
+	 * Sets unknown 18 bytes
+	 * 
+	 * @param unknown_18
+	 */
+	protected void setUnknown_18(long unknown_18) {
+		this.unknown_18 = unknown_18;
+	}
+
+	/** Reads 4 little-endian bytes at currentPlace and advances the cursor by 4. The value is
+	 *  built as an int and then widened, so a set top bit still yields a negative long. */
+	private long unmarshalUInt32(byte[] data, long dest) {
+		assert (data != null && data.length > 0);
+		if (4 > getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		int pos = this.getCurrentPlace();
+		dest = (data[pos] & 0xff) | (data[pos + 1] & 0xff) << 8 // & 0xff: Java bytes are signed
+				| (data[pos + 2] & 0xff) << 16 | (data[pos + 3] & 0xff) << 24;
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(pos + 4);
+		return dest;
+	}
+
+	private void unmarshalCharArray(byte[] data,
+			ChmLzxcControlData chmLzxcControlData, int count) {
+		ChmAssert.assertByteArrayNotNull(data);
+		ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
+		ChmAssert.assertPositiveInt(count);
+		System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+	}
+
+	/**
+	 * Returns textual representation of ChmLzxcControlData
+	 */
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("size(unknown):=" + this.getSize() + ", ");
+		sb.append("signature(Compression type identifier):="
+				+ new String(this.getSignature()) + ", ");
+		sb.append("version(Possibly numeric code for LZX):="
+				+ this.getVersion() + System.getProperty("line.separator"));
+		sb.append("resetInterval(The Huffman reset interval):="
+				+ this.getResetInterval() + ", ");
+		sb.append("windowSize:=" + this.getWindowSize() + ", ");
+		sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
+				+ this.getWindowsPerReset() + ", ");
+		sb.append("unknown_18:=" + this.getUnknown_18()
+				+ System.getProperty("line.separator"));
+		return sb.toString();
+	}
+
+	/**
+	 * Unmarshals the LZXC control data block from data into
+	 * chmLzxcControlData and validates it. Every read goes through
+	 * chmLzxcControlData, so the read cursor and the populated fields always
+	 * belong to the same object.
+	 * @param data raw control data, at least CHM_LZXC_MIN_LEN bytes
+	 * @param chmLzxcControlData object that receives the parsed fields
+	 */
+	// @Override
+	public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) {
+		if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
+			throw new ChmParsingException("we want at least 0x18 bytes");
+		final ChmLzxcControlData ctl = chmLzxcControlData;
+		ctl.setDataRemained(data.length);
+		ctl.setSize(ctl.unmarshalUInt32(data, ctl.getSize()));
+		ctl.unmarshalCharArray(data, ctl, ChmConstants.CHM_SIGNATURE_LEN);
+		ctl.setVersion(ctl.unmarshalUInt32(data, ctl.getVersion()));
+		ctl.setResetInterval(ctl.unmarshalUInt32(data, ctl.getResetInterval()));
+		ctl.setWindowSize(ctl.unmarshalUInt32(data, ctl.getWindowSize()));
+		ctl.setWindowsPerReset(ctl.unmarshalUInt32(data, ctl.getWindowsPerReset()));
+
+		/* unknown_18 is only read when at least CHM_LZXC_V2_LEN bytes exist */
+		if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
+			ctl.setUnknown_18(ctl.unmarshalUInt32(data, ctl.getUnknown_18()));
+		else
+			ctl.setUnknown_18(0);
+
+		/* for version 2 the stored window size is scaled by the block size */
+		if (ctl.getVersion() == 2) {
+			ctl.setWindowSize(ctl.getWindowSize() * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
+		}
+
+		if (ctl.getWindowSize() == 0 || ctl.getResetInterval() == 0)
+			throw new ChmParsingException(
+					"window size / resetInterval should be more than zero");
+
+		if (ctl.getWindowSize() == 1)
+			throw new ChmParsingException(
+					"window size / resetInterval should be more than 1");
+
+		/* checks a signature */
+		if (!new String(ctl.getSignature()).equals(ChmConstants.LZXC))
+			throw new ChmParsingException("the signature does not seem to be correct");
+	}
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * LZXC reset table For ensuring a decompression. Reads the block named
+ * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
+ * .
+ * 
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?page=2 }
+ * 
+ */
+public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
+	private static final long serialVersionUID = -8209574429411707460L;
+	/* class members */
+	private long version; // 0000: DWORD 2 unknown (possibly a version number)
+	private long block_count; // 0004: DWORD Number of entries in reset table
+	private long unknown; // 0008: DWORD 8 unknown
+	private long table_offset; // 000C: DWORD $28 Length of table header (area
+								// before table entries)
+	private long uncompressed_len; // 0010: QWORD Uncompressed Length
+	private long compressed_len; // 0018: QWORD Compressed Length
+	private long block_len; // 0020: QWORD 0x8000 block size for locations below
+	private long[] block_address;
+
+	/* local usage */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	/**
+	 * Returns block addresses
+	 * 
+	 * @return block addresses
+	 */
+	public long[] getBlockAddress() {
+		return block_address;
+	}
+
+	/**
+	 * Sets block addresses
+	 * 
+	 * @param block_address
+	 */
+	public void setBlockAddress(long[] block_address) {
+		this.block_address = block_address;
+	}
+
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	@Override
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("version:=" + getVersion()
+				+ System.getProperty("line.separator"));
+		sb.append("block_count:=" + getBlockCount()
+				+ System.getProperty("line.separator"));
+		sb.append("unknown:=" + getUnknown()
+				+ System.getProperty("line.separator"));
+		sb.append("table_offset:=" + getTableOffset()
+				+ System.getProperty("line.separator"));
+		sb.append("uncompressed_len:=" + getUncompressedLen()
+				+ System.getProperty("line.separator"));
+		sb.append("compressed_len:=" + getCompressedLen()
+				+ System.getProperty("line.separator"));
+		sb.append("block_len:=" + getBlockLen()
+				+ System.getProperty("line.separator"));
+		sb.append("block_addresses:=" + Arrays.toString(getBlockAddress()));
+		return sb.toString();
+	}
+
+	/**
+	 * Reads the table of 64-bit block addresses that follows the header.
+	 * The block count is capped at 5000; a negative count is replaced by the
+	 * number of whole entries left in data. Slots that cannot be filled
+	 * because data ends early keep their default value of 0.
+	 * 
+	 * @param data reset table bytes, read from currentPlace onwards
+	 * @return long[] of addresses, never longer than the block count
+	 */
+	private long[] enumerateBlockAddresses(byte[] data) {
+		ChmAssert.assertByteArrayNotNull(data);
+		int rem = getDataRemained() / 8;
+		/* we have limit of number of blocks to be extracted */
+		if (getBlockCount() > 5000)
+			setBlockCount(5000);
+		if (getBlockCount() < 0 && rem > 0)
+			setBlockCount(rem);
+
+		/* a count that is still negative would make the allocation throw */
+		long[] addresses = new long[(int) Math.max(0L, getBlockCount())];
+		int toRead = Math.min(rem, addresses.length);
+		try {
+			for (int i = 0; i < toRead; i++)
+				addresses[i] = unmarshalUint64(data, -1);
+		} catch (ChmParsingException ignored) {
+			/* data ended early: keep what was read, the rest stays 0 */
+		}
+		return addresses;
+	}
+
+	/**
+	 * Runs the ChmAssert checks on both arguments, first the byte array and
+	 * then the reset table. The result is true whenever both checks return
+	 * normally.
+	 * 
+	 * @param data
+	 *            bytes that are about to be parsed
+	 * @param chmLzxcResetTable
+	 *            reset table that will receive the parsed values
+	 * @return boolean, true once both checks have completed
+	 */
+	private boolean validateParamaters(byte[] data,
+			ChmLzxcResetTable chmLzxcResetTable) {
+		ChmAssert.assertByteArrayNotNull(data);
+		ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
+		return true;
+	}
+
+	/* Reads 4 little-endian bytes at currentPlace (each masked, since Java bytes are signed), widens the int result, advances by 4. */
+	private long unmarshalUInt32(byte[] data, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		int pos = this.getCurrentPlace();
+		if (pos + 4 > data.length)
+			throw new ChmParsingException("data is too small to read a 32-bit value");
+		dest = (data[pos] & 0xff) | (data[pos + 1] & 0xff) << 8 | (data[pos + 2] & 0xff) << 16 | (data[pos + 3] & 0xff) << 24;
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(pos + 4);
+		return dest;
+	}
+
+	private long unmarshalUint64(byte[] data, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		byte[] temp = new byte[8];
+		int i, j;// counters
+
+		for (i = 8, j = 7; i > 0; i--) {
+			if (data.length > this.getCurrentPlace()) {
+				temp[j--] = data[this.getCurrentPlace()];
+				this.setCurrentPlace(this.getCurrentPlace() + 1);
+			} else
+				throw new ChmParsingException(
+						"data is too small to calculate address block");
+		}
+		dest = new BigInteger(temp).longValue();
+		this.setDataRemained(this.getDataRemained() - 8);
+		return dest;
+	}
+
+	/**
+	 * Returns the version
+	 * 
+	 * @return - long
+	 */
+	public long getVersion() {
+		return version;
+	}
+
+	/**
+	 * Sets the version
+	 * 
+	 * @param version
+	 *            - long
+	 */
+	public void setVersion(long version) {
+		this.version = version;
+	}
+
+	/**
+	 * Gets a block count
+	 * 
+	 * @return - int
+	 */
+	public long getBlockCount() {
+		return block_count;
+	}
+
+	/**
+	 * Sets a block count
+	 * 
+	 * @param block_count
+	 *            - long
+	 */
+	public void setBlockCount(long block_count) {
+		this.block_count = block_count;
+	}
+
+	/**
+	 * Gets unknown
+	 * 
+	 * @return - long
+	 */
+	public long getUnknown() {
+		return unknown;
+	}
+
+	/**
+	 * Sets an unknown
+	 * 
+	 * @param unknown
+	 *            - long
+	 */
+	public void setUnknown(long unknown) {
+		this.unknown = unknown;
+	}
+
+	/**
+	 * Gets a table offset
+	 * 
+	 * @return - long
+	 */
+	public long getTableOffset() {
+		return table_offset;
+	}
+
+	/**
+	 * Sets a table offset
+	 * 
+	 * @param table_offset
+	 *            - long
+	 */
+	public void setTableOffset(long table_offset) {
+		this.table_offset = table_offset;
+	}
+
+	/**
+	 * Gets uncompressed length
+	 * 
+	 * @return - {@link BigInteger }
+	 */
+	public long getUncompressedLen() {
+		return uncompressed_len;
+	}
+
+	/**
+	 * Sets uncompressed length
+	 * 
+	 * @param uncompressed_len
+	 *            - {@link BigInteger}
+	 */
+	public void setUncompressedLen(long uncompressed_len) {
+		this.uncompressed_len = uncompressed_len;
+	}
+
+	/**
+	 * Gets compressed length
+	 * 
+	 * @return - {@link BigInteger}
+	 */
+	public long getCompressedLen() {
+		return compressed_len;
+	}
+
+	/**
+	 * Sets compressed length
+	 * 
+	 * @param compressed_len
+	 *            - {@link BigInteger}
+	 */
+	public void setCompressedLen(long compressed_len) {
+		this.compressed_len = compressed_len;
+	}
+
+	/**
+	 * Gets a block length
+	 * 
+	 * @return - {@link BigInteger}
+	 */
+	public long getBlockLen() {
+		return block_len;
+	}
+
+	/**
+	 * Sets a block length
+	 * 
+	 * @param block_len
+	 *            - {@link BigInteger}
+	 */
+	public void setBlockLlen(long block_len) {
+		this.block_len = block_len;
+	}
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+
+	}
+
+	/**
+	 * Unmarshals the LZXC reset table from data into chmLzxcResetTable and
+	 * verifies that its version is CHM_VER_2. The arguments are validated
+	 * before data.length is read, and all reads go through
+	 * chmLzxcResetTable so cursor and fields belong to one object.
+	 */
+	// @Override
+	public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) {
+		if (validateParamaters(data, chmLzxcResetTable)) {
+			final ChmLzxcResetTable table = chmLzxcResetTable;
+			table.setDataRemained(data.length);
+			/* unmarshal fields */
+			table.setVersion(table.unmarshalUInt32(data, table.getVersion()));
+			table.setBlockCount(table.unmarshalUInt32(data, table.getBlockCount()));
+			table.setUnknown(table.unmarshalUInt32(data, table.getUnknown()));
+			table.setTableOffset(table.unmarshalUInt32(data, table.getTableOffset()));
+			table.setUncompressedLen(table.unmarshalUint64(data, table.getUncompressedLen()));
+			table.setCompressedLen(table.unmarshalUint64(data, table.getCompressedLen()));
+			table.setBlockLlen(table.unmarshalUint64(data, table.getBlockLen()));
+			table.setBlockAddress(table.enumerateBlockAddresses(data));
+		}
+
+		/* checks chmLzxcResetTable */
+		if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
+			throw new ChmParsingException(
+					"does not seem currect version of chmLzxcResetTable");
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.util.Arrays;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Accessor for the PMGI header of a CHM directory index chunk.
+ * <p>
+ * Note: an index chunk does not always exist. When it does, it has the
+ * following format:
+ * 
+ * <pre>
+ * 0000: char[4] 'PMGI'
+ * 0004: DWORD   Length of quickref/free area at end of directory chunk
+ * 0008: Directory index entries (to quickref/free area)
+ * </pre>
+ * 
+ * The quickref area in a PMGI is the same as in a PMGL. The format of a
+ * directory index entry is as follows:
+ * 
+ * <pre>
+ * BYTE:   length of name
+ * BYTEs:  name (UTF-8 encoded)
+ * ENCINT: directory listing chunk which starts with name
+ * </pre>
+ * 
+ * Encoded integers (ENCINT): an ENCINT is a variable-length integer. The high
+ * bit of each byte indicates "continued to the next byte". Bytes are stored
+ * most significant to least significant. So, for example, $EA $15 is
+ * {@code (((0xEA&0x7F)<<7)|0x15) = 0x3515}.
+ * 
+ * <p>
+ * Note: This class is not in use
+ * 
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Microsoft's
+ *      HTML Help (.chm) format (incomplete)</a>
+ */
+public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
+	private static final long serialVersionUID = -2092282339894303701L;
+	/* offset 0: signature (PMGI) */
+	private byte[] signature = new String(ChmConstants.CHM_PMGI_MARKER)
+			.getBytes();
+	/* offset 4: length of the quickref/free area */
+	private long free_space;
+
+	/* local usage: parsing cursor and number of bytes left to read */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	/**
+	 * Reads the signature: looks up the PMGI marker in {@code data} through
+	 * {@code ChmCommons.indexOf} and, when the returned index is not
+	 * negative, copies {@code count} bytes starting there into the header's
+	 * signature buffer. When the marker is not found a message is printed to
+	 * {@code System.err} and the signature is left untouched. In both cases
+	 * the cursor advances by {@code count} bytes.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk
+	 * @param chmPmgiHeader
+	 *            header whose signature buffer receives the bytes
+	 * @param count
+	 *            number of signature bytes to copy
+	 */
+	private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
+			int count) {
+		int index = -1;
+		ChmAssert.assertByteArrayNotNull(data);
+		ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
+		ChmAssert.assertPositiveInt(count);
+		this.setDataRemained(data.length);
+		index = ChmCommons.indexOf(data,
+				ChmConstants.CHM_PMGI_MARKER.getBytes());
+		if (index >= 0)
+			System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0,
+					count);
+		else
+			System.err.println(ChmPmgiHeader.class.getName()
+					+ " does not exist a PMGI, use PMGL instead");
+		// NOTE(review): the cursor moves by count from its current place, not
+		// from the marker's index, so the following field is read right after
+		// the marker only when the marker sits at the cursor - confirm.
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+	}
+
+	/**
+	 * Reads a little-endian unsigned 32-bit integer at the cursor and moves
+	 * the cursor four bytes forward.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk
+	 * @param dest
+	 *            ignored on input; overwritten with the decoded value
+	 * @return the decoded value, in the range 0 .. 0xFFFFFFFF
+	 * @throws ChmParsingException
+	 *             if fewer than four bytes remain
+	 */
+	private long unmarshalUInt32(byte[] data, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+
+		if (4 > getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		final int place = this.getCurrentPlace();
+		// Java bytes are signed: mask every byte before shifting, otherwise
+		// any byte >= 0x80 sign-extends and clobbers the higher-order bits.
+		// Long masks keep the top bit of the fourth byte from turning the
+		// result negative.
+		dest = (data[place] & 0xFFL)
+				| (data[place + 1] & 0xFFL) << 8
+				| (data[place + 2] & 0xFFL) << 16
+				| (data[place + 3] & 0xFFL) << 24;
+
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(place + 4);
+		return dest;
+	}
+
+	/**
+	 * Returns pmgi signature if exists
+	 * 
+	 * @return signature
+	 */
+	public byte[] getSignature() {
+		return signature;
+	}
+
+	/**
+	 * Sets pmgi signature
+	 * 
+	 * @param signature
+	 */
+	protected void setSignature(byte[] signature) {
+		this.signature = signature;
+	}
+
+	/**
+	 * Returns pmgi free space
+	 * 
+	 * @return free_space
+	 */
+	public long getFreeSpace() {
+		return free_space;
+	}
+
+	/**
+	 * Sets pmgi free space
+	 * 
+	 * @param free_space
+	 */
+	protected void setFreeSpace(long free_space) {
+		this.free_space = free_space;
+	}
+
+	/**
+	 * Returns textual representation of the pmgi header
+	 */
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("signature:=" + new String(getSignature()) + ", ");
+		sb.append("free space:=" + getFreeSpace()
+				+ System.getProperty("line.separator"));
+		return sb.toString();
+	}
+
+	/**
+	 * Parses the PMGI header fields out of {@code data} into the given header.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk; must hold at least
+	 *            {@code ChmConstants.CHM_PMGI_LEN} bytes
+	 * @param chmPmgiHeader
+	 *            header instance that receives the parsed values
+	 * @throws ChmParsingException
+	 *             if {@code data} is too short or the header's signature does
+	 *             not equal the PMGI marker after parsing
+	 */
+	// @Override
+	public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) {
+		/* we only know how to deal with a 0x8 byte structures */
+		if (data.length < ChmConstants.CHM_PMGI_LEN)
+			throw new ChmParsingException(
+					"we only know how to deal with a 0x8 byte structures");
+
+		/* unmarshal fields */
+		chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader,
+				ChmConstants.CHM_SIGNATURE_LEN);
+		chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data,
+				chmPmgiHeader.getFreeSpace()));
+
+		/* check structure */
+		if (!Arrays.equals(chmPmgiHeader.getSignature(),
+				ChmConstants.CHM_PMGI_MARKER.getBytes()))
+			throw new ChmParsingException(
+					"it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
+
+	}
+
+	/**
+	 * Empty entry point; it performs no work.
+	 * 
+	 * @param args
+	 *            command-line arguments (unused)
+	 */
+	public static void main(String[] args) {
+
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Accessor for the PMGL header of a CHM directory listing chunk.
+ * <p>
+ * There are two types of directory chunks -- index chunks, and listing chunks.
+ * The index chunk will be omitted if there is only one listing chunk. A
+ * listing chunk has the following format:
+ * 
+ * <pre>
+ * 0000: char[4] 'PMGL'
+ * 0004: DWORD   Length of free space and/or quickref area at end of
+ *               directory chunk
+ * 0008: DWORD   Always 0
+ * 000C: DWORD   Chunk number of previous listing chunk when reading
+ *               directory in sequence (-1 if this is the first listing chunk)
+ * 0010: DWORD   Chunk number of next listing chunk when reading directory
+ *               in sequence (-1 if this is the last listing chunk)
+ * 0014: Directory listing entries (to quickref area). Sorted by filename;
+ *       the sort is case-insensitive
+ * </pre>
+ * 
+ * The quickref area is written backwards from the end of the chunk. One
+ * quickref entry exists for every n entries in the file, where n is calculated
+ * as {@code 1 + (1 << quickref density)}. So for density = 2, n = 5.
+ * 
+ * <pre>
+ * Chunklen-0002: WORD Number of entries in the chunk
+ * Chunklen-0004: WORD Offset of entry n from entry 0
+ * Chunklen-0008: WORD Offset of entry 2n from entry 0
+ * Chunklen-000C: WORD Offset of entry 3n from entry 0
+ * ...
+ * </pre>
+ * 
+ * The format of a directory listing entry is as follows:
+ * 
+ * <pre>
+ * BYTE:   length of name
+ * BYTEs:  name (UTF-8 encoded)
+ * ENCINT: content section
+ * ENCINT: offset
+ * ENCINT: length
+ * </pre>
+ * 
+ * The offset is from the beginning of the content section the file is in,
+ * after the section has been decompressed (if appropriate). The length also
+ * refers to length of the file in the section after decompression. There are
+ * two kinds of file represented in the directory: user data and format related
+ * files. The files which are format-related have names which begin with '::',
+ * the user data files have names which begin with "/".
+ * 
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Microsoft's
+ *      HTML Help (.chm) format (incomplete)</a>
+ * 
+ * @author olegt
+ * 
+ */
+public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
+	private static final long serialVersionUID = -6139486487475923593L;
+	/* offset 0: signature (PMGL) */
+	private byte[] signature = new String(ChmConstants.PMGL).getBytes();
+	/* offset 4 */
+	private long free_space;
+	/* offset 8 */
+	private long unknown_0008;
+	/* offset c */
+	private int block_prev;
+	/* offset 10 */
+	private int block_next;
+
+	/* local usage: parsing cursor and number of bytes left to read */
+	private int dataRemained;
+	private int currentPlace = 0;
+
+	private int getDataRemained() {
+		return dataRemained;
+	}
+
+	private void setDataRemained(int dataRemained) {
+		this.dataRemained = dataRemained;
+	}
+
+	private int getCurrentPlace() {
+		return currentPlace;
+	}
+
+	private void setCurrentPlace(int currentPlace) {
+		this.currentPlace = currentPlace;
+	}
+
+	public long getFreeSpace() {
+		return free_space;
+	}
+
+	public void setFreeSpace(long free_space) {
+		this.free_space = free_space;
+	}
+
+	/**
+	 * Returns textual representation of the pmgl header
+	 */
+	public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append("signatute:=" + new String(getSignature()) + ", ");
+		sb.append("free space:=" + getFreeSpace() + ", ");
+		sb.append("unknown0008:=" + getUnknown0008() + ", ");
+		sb.append("prev block:=" + getBlockPrev() + ", ");
+		sb.append("next block:=" + getBlockNext()
+				+ System.getProperty("line.separator"));
+		return sb.toString();
+	}
+
+	/**
+	 * Copies the first {@code count} bytes of {@code data} into the header's
+	 * signature buffer and advances the cursor by {@code count}.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk
+	 * @param chmPmglHeader
+	 *            header whose signature buffer receives the bytes
+	 * @param count
+	 *            number of signature bytes to copy
+	 */
+	protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
+			int count) {
+		ChmAssert.assertByteArrayNotNull(data);
+		this.setDataRemained(data.length);
+		System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
+		this.setCurrentPlace(this.getCurrentPlace() + count);
+		this.setDataRemained(this.getDataRemained() - count);
+	}
+
+	/**
+	 * Reads a little-endian signed 32-bit integer at the cursor and moves the
+	 * cursor four bytes forward.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk
+	 * @param dest
+	 *            ignored on input; overwritten with the decoded value
+	 * @return the decoded value
+	 * @throws ChmParsingException
+	 *             if fewer than four bytes remain
+	 */
+	private int unmarshalInt32(byte[] data, int dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > this.getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		final int place = this.getCurrentPlace();
+		// Java bytes are signed: mask every byte before shifting, otherwise
+		// any byte >= 0x80 sign-extends and clobbers the higher-order bits.
+		dest = (data[place] & 0xFF)
+				| (data[place + 1] & 0xFF) << 8
+				| (data[place + 2] & 0xFF) << 16
+				| (data[place + 3] & 0xFF) << 24;
+
+		this.setCurrentPlace(place + 4);
+		this.setDataRemained(this.getDataRemained() - 4);
+		return dest;
+	}
+
+	/**
+	 * Reads a little-endian unsigned 32-bit integer at the cursor and moves
+	 * the cursor four bytes forward.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk
+	 * @param dest
+	 *            ignored on input; overwritten with the decoded value
+	 * @return the decoded value, in the range 0 .. 0xFFFFFFFF
+	 * @throws ChmParsingException
+	 *             if fewer than four bytes remain
+	 */
+	private long unmarshalUInt32(byte[] data, long dest) {
+		ChmAssert.assertByteArrayNotNull(data);
+		if (4 > getDataRemained())
+			throw new ChmParsingException("4 > dataLenght");
+		final int place = this.getCurrentPlace();
+		// Long masks both prevent per-byte sign extension and keep the top
+		// bit of the fourth byte from turning the result negative.
+		dest = (data[place] & 0xFFL)
+				| (data[place + 1] & 0xFFL) << 8
+				| (data[place + 2] & 0xFFL) << 16
+				| (data[place + 3] & 0xFFL) << 24;
+
+		setDataRemained(this.getDataRemained() - 4);
+		this.setCurrentPlace(place + 4);
+		return dest;
+	}
+
+	/**
+	 * Parses the PMGL header fields out of {@code data} into the given header.
+	 * 
+	 * @param data
+	 *            raw bytes of the chunk; must hold at least
+	 *            {@code ChmConstants.CHM_PMGL_LEN} bytes
+	 * @param chmPmglHeader
+	 *            header instance that receives the parsed values
+	 * @throws ChmParsingException
+	 *             if {@code data} is too short or the signature read from it
+	 *             is not {@code ChmConstants.PMGL}
+	 */
+	// @Override
+	public void parse(byte[] data, ChmPmglHeader chmPmglHeader) {
+		if (data.length < ChmConstants.CHM_PMGL_LEN)
+			throw new ChmParsingException(ChmPmglHeader.class.getName()
+					+ " we only know how to deal with a 0x14 byte structures");
+
+		/* unmarshal fields */
+		chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
+				ChmConstants.CHM_SIGNATURE_LEN);
+		chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data,
+				chmPmglHeader.getFreeSpace()));
+		chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data,
+				chmPmglHeader.getUnknown0008()));
+		chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data,
+				chmPmglHeader.getBlockPrev()));
+		chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data,
+				chmPmglHeader.getBlockNext()));
+
+		/* check structure */
+		if (!new String(chmPmglHeader.getSignature()).equals(ChmConstants.PMGL))
+			throw new ChmParsingException(ChmPmglHeader.class.getName()
+					+ " pmgl != pmgl.signature");
+
+	}
+
+	public byte[] getSignature() {
+		return signature;
+	}
+
+	protected void setSignature(byte[] signature) {
+		this.signature = signature;
+	}
+
+	public long getUnknown0008() {
+		return unknown_0008;
+	}
+
+	protected void setUnknown0008(long unknown_0008) {
+		this.unknown_0008 = unknown_0008;
+	}
+
+	public int getBlockPrev() {
+		return block_prev;
+	}
+
+	protected void setBlockPrev(int block_prev) {
+		this.block_prev = block_prev;
+	}
+
+	public int getBlockNext() {
+		return block_next;
+	}
+
+	protected void setBlockNext(int block_next) {
+		this.block_next = block_next;
+	}
+
+	/**
+	 * Empty entry point; it performs no work.
+	 * 
+	 * @param args
+	 *            command-line arguments (unused)
+	 */
+	public static void main(String[] args) {
+
+	}
+}

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java Tue Jun  7 15:44:41 2011
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * Value holder for a single directory listing entry.
+ * <p>
+ * The format of a directory listing entry is as follows:
+ * 
+ * <pre>
+ * BYTE:   length of name
+ * BYTEs:  name (UTF-8 encoded)
+ * ENCINT: content section
+ * ENCINT: offset
+ * ENCINT: length
+ * </pre>
+ * 
+ * The offset is from the beginning of the content section the file is in,
+ * after the section has been decompressed (if appropriate). The length also
+ * refers to length of the file in the section after decompression. There are
+ * two kinds of file represented in the directory: user data and format related
+ * files. The files which are format-related have names which begin with '::',
+ * the user data files have names which begin with "/".
+ * 
+ */
+public class DirectoryListingEntry {
+	/* Length of the entry name */
+	private int name_length;
+	/* Entry name or directory name */
+	private String name;
+	/* Entry type */
+	private ChmCommons.EntryType entryType;
+	/* Entry offset */
+	private int offset;
+	/* Entry size */
+	private int length;
+
+	/**
+	 * Creates an entry with all fields left at their default values.
+	 */
+	public DirectoryListingEntry() {
+
+	}
+
+	/**
+	 * Creates an entry from the given values after passing them through
+	 * {@code ChmAssert.assertDirectoryListingEntry}.
+	 * 
+	 * @param name_length
+	 *            length of the entry name
+	 * @param name
+	 *            entry name
+	 * @param isCompressed
+	 *            entry type
+	 * @param offset
+	 *            entry offset
+	 * @param length
+	 *            entry size
+	 */
+	public DirectoryListingEntry(int name_length, String name,
+			ChmCommons.EntryType isCompressed, int offset, int length) {
+		ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed,
+				offset, length);
+		setNameLength(name_length);
+		setName(name);
+		setEntryType(isCompressed);
+		setOffset(offset);
+		setLength(length);
+	}
+
+	/**
+	 * Returns a multi-line textual representation of the entry, one field per
+	 * line.
+	 */
+	public String toString() {
+		final String eol = System.getProperty("line.separator");
+		StringBuilder text = new StringBuilder();
+		text.append("name_length:=").append(getNameLength()).append(eol);
+		text.append("name:=").append(getName()).append(eol);
+		text.append("entryType:=").append(getEntryType()).append(eol);
+		text.append("offset:=").append(getOffset()).append(eol);
+		text.append("length:=").append(getLength());
+		return text.toString();
+	}
+
+	/**
+	 * Returns the length of the entry name.
+	 * 
+	 * @return name length
+	 */
+	public int getNameLength() {
+		return name_length;
+	}
+
+	/**
+	 * Sets the length of the entry name.
+	 * 
+	 * @param name_length
+	 *            name length
+	 */
+	protected void setNameLength(int name_length) {
+		this.name_length = name_length;
+	}
+
+	/**
+	 * Returns the entry name.
+	 * 
+	 * @return entry name
+	 */
+	public String getName() {
+		return name;
+	}
+
+	/**
+	 * Sets the entry name.
+	 * 
+	 * @param name
+	 *            entry name
+	 */
+	protected void setName(String name) {
+		this.name = name;
+	}
+
+	/**
+	 * Returns ChmCommons.EntryType (COMPRESSED or UNCOMPRESSED)
+	 * 
+	 * @return ChmCommons.EntryType
+	 */
+	public ChmCommons.EntryType getEntryType() {
+		return entryType;
+	}
+
+	/**
+	 * Sets the entry type.
+	 * 
+	 * @param entryType
+	 *            entry type
+	 */
+	protected void setEntryType(ChmCommons.EntryType entryType) {
+		this.entryType = entryType;
+	}
+
+	/**
+	 * Returns the entry offset.
+	 * 
+	 * @return entry offset
+	 */
+	public int getOffset() {
+		return offset;
+	}
+
+	/**
+	 * Sets the entry offset.
+	 * 
+	 * @param offset
+	 *            entry offset
+	 */
+	protected void setOffset(int offset) {
+		this.offset = offset;
+	}
+
+	/**
+	 * Returns the entry size.
+	 * 
+	 * @return entry size
+	 */
+	public int getLength() {
+		return length;
+	}
+
+	/**
+	 * Sets the entry size.
+	 * 
+	 * @param length
+	 *            entry size
+	 */
+	protected void setLength(int length) {
+		this.length = length;
+	}
+
+	/**
+	 * Empty entry point; it performs no work.
+	 * 
+	 * @param args
+	 *            command-line arguments (unused)
+	 */
+	public static void main(String[] args) {
+	}
+}