You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/06/07 17:44:42 UTC
svn commit: r1133047 [1/3] - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm:
accessor/ assertion/ core/ exception/ lzx/
Author: mattmann
Date: Tue Jun 7 15:44:41 2011
New Revision: 1133047
URL: http://svn.apache.org/viewvc?rev=1133047&view=rev
Log:
- progress towards TIKA-245 Support of CHM Format (Oleg's patch, in parts, as suggested by Jukka)
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/assertion/ChmAssert.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmConstants.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmWrapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/exception/ChmParsingException.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmBlockInfo.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxState.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmAccessor.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.io.Serializable;
+
+/**
+ * Defines the contract shared by the chm structure accessors: each
+ * implementation fills an accessor object from a raw byte array.
+ *
+ * @param <T>
+ * the accessor type that receives the parsed values; implementations in
+ * this package pass themselves, e.g.
+ * {@code ChmItsfHeader implements ChmAccessor<ChmItsfHeader>}
+ */
+public interface ChmAccessor<T> extends Serializable {
+ /**
+ * Parses the given bytes and stores the decoded values in the supplied
+ * accessor.
+ *
+ * @param data
+ * raw bytes of the structure to parse (for {@code ChmItsfHeader} these
+ * are the bytes of the ITSF header, not the whole chm file)
+ * @param chmAccessor
+ * the accessor instance to populate
+ */
+ void parse(byte[] data, T chmAccessor);
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+
+/**
+ * Holds the directory listing entries of a chm file.
+ * <p>
+ * The entries are collected once, in the constructor, by walking the PMGL
+ * (listing) chunks of the directory. While collecting, the list positions of
+ * the entries whose names contain {@link ChmConstants#CONTROL_DATA} and
+ * {@link ChmConstants#RESET_TABLE} are remembered.
+ */
+public class ChmDirectoryListingSet {
+    private List<DirectoryListingEntry> dlel;
+    /* raw chm bytes; only held while the constructor is running */
+    private byte[] data;
+    /* read cursor inside the directory chunk currently being enumerated */
+    private int placeHolder = -1;
+    private long dataOffset = -1;
+    private int controlDataIndex = -1;
+    private int resetTableIndex = -1;
+
+    private boolean isNotControlDataFound = true;
+    private boolean isNotResetTableFound = true;
+
+    /**
+     * Constructs chm directory listing set and enumerates all listing
+     * entries.
+     *
+     * @param data
+     *            bytes of the chm file, must not be null
+     * @param chmItsHeader
+     *            parsed itsf header (supplies directory and data offsets)
+     * @param chmItspHeader
+     *            parsed itsp header (supplies block length and the range of
+     *            listing chunks)
+     */
+    public ChmDirectoryListingSet(byte[] data, ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        setDirectoryListingEntryList(new ArrayList<DirectoryListingEntry>());
+        ChmCommons.assertByteArrayNotNull(data);
+        setData(data);
+        enumerateChmDirectoryListingList(chmItsHeader, chmItspHeader);
+    }
+
+    /**
+     * Returns the listing entries followed by their count.
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("list:=").append(getDirectoryListingEntryList().toString())
+                .append(System.getProperty("line.separator"));
+        sb.append("number of list items:=").append(
+                getDirectoryListingEntryList().size());
+        return sb.toString();
+    }
+
+    /**
+     * Returns the list position of the control data entry
+     *
+     * @return control data index, or -1 if no such entry was found
+     */
+    public int getControlDataIndex() {
+        return controlDataIndex;
+    }
+
+    /**
+     * Sets control data index
+     *
+     * @param controlDataIndex
+     */
+    protected void setControlDataIndex(int controlDataIndex) {
+        this.controlDataIndex = controlDataIndex;
+    }
+
+    /**
+     * Returns the list position of the reset table entry
+     *
+     * @return reset table index, or -1 if no such entry was found
+     */
+    public int getResetTableIndex() {
+        return resetTableIndex;
+    }
+
+    /**
+     * Sets reset table index
+     *
+     * @param resetTableIndex
+     */
+    protected void setResetTableIndex(int resetTableIndex) {
+        this.resetTableIndex = resetTableIndex;
+    }
+
+    /**
+     * Gets place holder (the read cursor inside the current chunk)
+     *
+     * @return place holder
+     */
+    private int getPlaceHolder() {
+        return placeHolder;
+    }
+
+    /**
+     * Sets place holder (the read cursor inside the current chunk)
+     *
+     * @param placeHolder
+     */
+    private void setPlaceHolder(int placeHolder) {
+        this.placeHolder = placeHolder;
+    }
+
+    /**
+     * Enumerates chm directory listing entries of every listing chunk
+     * between the first and the last PMGL chunk named by the itsp header.
+     * <p>
+     * Chunk {@code i} occupies the bytes
+     * {@code [dirOffset + i * blockLength, dirOffset + (i + 1) * blockLength)}
+     * . The start is computed from the chunk number itself, so a first
+     * listing chunk other than 0 is located correctly as well.
+     * <p>
+     * Enumeration is best effort: a failure is reported on standard error
+     * and the entries collected so far are kept. The reference to the chm
+     * bytes is always released afterwards.
+     *
+     * @param chmItsHeader
+     *            chm itsf header
+     * @param chmItspHeader
+     *            chm itsp header
+     */
+    private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
+            ChmItspHeader chmItspHeader) {
+        try {
+            int startPmgl = chmItspHeader.getIndex_head();
+            int stopPmgl = chmItspHeader.getUnknown_0024();
+            int blockLength = (int) chmItspHeader.getBlock_len();
+            int dirOffset = (int) (chmItsHeader.getDirOffset() + chmItspHeader
+                    .getHeader_len());
+            setDataOffset(chmItsHeader.getDataOffset());
+
+            /* loops over all pmgls */
+            for (int i = startPmgl; i <= stopPmgl; i++) {
+                int chunkStart = dirOffset + i * blockLength;
+                byte[] dirChunk = Arrays.copyOfRange(getData(), chunkStart,
+                        chunkStart + blockLength);
+                enumerateOneSegment(dirChunk);
+            }
+        } catch (Exception e) {
+            /* deliberate best effort, see method comment */
+            e.printStackTrace();
+        } finally {
+            setData(null);
+        }
+    }
+
+    /**
+     * Remembers the list position the given entry is about to get if it is
+     * the first entry whose name contains {@link ChmConstants#CONTROL_DATA}.
+     *
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkControlData(DirectoryListingEntry dle) {
+        if (isNotControlDataFound
+                && dle.getName().contains(ChmConstants.CONTROL_DATA)) {
+            setControlDataIndex(getDirectoryListingEntryList().size());
+            isNotControlDataFound = false;
+        }
+    }
+
+    /**
+     * Remembers the list position the given entry is about to get if it is
+     * the first entry whose name contains {@link ChmConstants#RESET_TABLE}.
+     *
+     * @param dle
+     *            chm directory listing entry
+     */
+    private void checkResetTable(DirectoryListingEntry dle) {
+        if (isNotResetTableFound
+                && dle.getName().contains(ChmConstants.RESET_TABLE)) {
+            setResetTableIndex(getDirectoryListingEntryList().size());
+            isNotResetTableFound = false;
+        }
+    }
+
+    /**
+     * Enumerates chm directory listing entries in single chm segment.
+     * <p>
+     * Scanning starts at the first "::" or "/" marker of the chunk, whichever
+     * comes first. Entry names are decoded as UTF-8. Parsing is best effort:
+     * a failure (for example a truncated entry) is reported on standard error
+     * and ends the enumeration of this chunk; entries added before the
+     * failure are kept.
+     *
+     * @param dir_chunk
+     *            bytes of one directory chunk, null is ignored
+     */
+    private void enumerateOneSegment(byte[] dir_chunk) {
+        if (dir_chunk == null) {
+            return;
+        }
+        try {
+            int indexWorkData = ChmCommons.indexOf(dir_chunk,
+                    "::".getBytes("UTF-8"));
+            int indexUserData = ChmCommons.indexOf(dir_chunk,
+                    "/".getBytes("UTF-8"));
+
+            /*
+             * Start at the earlier marker. A negative index is treated as
+             * "marker not present" so that a chunk containing only one kind
+             * of marker is still enumerated.
+             */
+            if (indexUserData >= 0
+                    && (indexWorkData < 0 || indexUserData < indexWorkData)) {
+                setPlaceHolder(indexUserData);
+            } else {
+                setPlaceHolder(indexWorkData);
+            }
+
+            if (getPlaceHolder() > 0
+                    && dir_chunk[getPlaceHolder() - 1] != 's') {
+                do {
+                    if (dir_chunk[getPlaceHolder() - 1] > 0) {
+                        DirectoryListingEntry dle = new DirectoryListingEntry();
+
+                        // two cases:
+                        // 1. when dir_chunk[getPlaceHolder() - 1] == 0x73
+                        // 2. when dir_chunk[getPlaceHolder() + 1] == 0x2f
+                        doNameCheck(dir_chunk, dle);
+
+                        int nameStart = getPlaceHolder();
+                        int nameEnd = nameStart + dle.getNameLength();
+                        dle.setName(new String(Arrays.copyOfRange(dir_chunk,
+                                nameStart, nameEnd), "UTF-8"));
+                        checkControlData(dle);
+                        checkResetTable(dle);
+                        setPlaceHolder(nameEnd);
+
+                        /* Sets entry type */
+                        if (getPlaceHolder() < dir_chunk.length
+                                && dir_chunk[getPlaceHolder()] == 0) {
+                            dle.setEntryType(ChmCommons.EntryType.UNCOMPRESSED);
+                        } else {
+                            dle.setEntryType(ChmCommons.EntryType.COMPRESSED);
+                        }
+
+                        setPlaceHolder(getPlaceHolder() + 1);
+                        dle.setOffset(getEncint(dir_chunk));
+                        dle.setLength(getEncint(dir_chunk));
+                        getDirectoryListingEntryList().add(dle);
+                    } else {
+                        setPlaceHolder(getPlaceHolder() + 1);
+                    }
+                } while (hasNext(dir_chunk));
+            }
+        } catch (Exception e) {
+            /* deliberate best effort, see method comment */
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Determines the name length of the entry at the place holder and, where
+     * needed, moves the place holder to the first byte of the name:
+     * <ol>
+     * <li>the byte before the place holder is 0x73 ('s'): the length is
+     * {@code 0x73 & 0x21}. NOTE(review): this always evaluates to 0x21 (33);
+     * the intent of the mask is not documented - confirm.</li>
+     * <li>the byte after the place holder is 0x2f ('/'): the byte at the
+     * place holder is the length and the name starts one byte later.</li>
+     * <li>otherwise the byte before the place holder is the length.</li>
+     * </ol>
+     * The look-ahead in case 2 is not bounds checked; an
+     * ArrayIndexOutOfBoundsException ends up in the catch block of
+     * {@link #enumerateOneSegment(byte[])}.
+     *
+     * @param dir_chunk
+     * @param dle
+     */
+    private void doNameCheck(byte[] dir_chunk, DirectoryListingEntry dle) {
+        if (dir_chunk[getPlaceHolder() - 1] == 0x73) {
+            dle.setNameLength(dir_chunk[getPlaceHolder() - 1] & 0x21);
+        } else if (dir_chunk[getPlaceHolder() + 1] == 0x2f) {
+            dle.setNameLength(dir_chunk[getPlaceHolder()]);
+            setPlaceHolder(getPlaceHolder() + 1);
+        } else {
+            dle.setNameLength(dir_chunk[getPlaceHolder() - 1]);
+        }
+    }
+
+    /**
+     * Moves the place holder forward to the next entry marker: a '/' that is
+     * not followed by ':', or a "::" pair.
+     * <p>
+     * Both the current and the following byte are inspected, so scanning
+     * stops one byte before the end of the chunk instead of reading past it.
+     *
+     * @param dir_chunk
+     *
+     * @return true if a marker was found (the place holder points at it),
+     *         false if the end of the chunk was reached
+     */
+    private boolean hasNext(byte[] dir_chunk) {
+        while (getPlaceHolder() + 1 < dir_chunk.length) {
+            byte current = dir_chunk[getPlaceHolder()];
+            byte following = dir_chunk[getPlaceHolder() + 1];
+            if ((current == '/' && following != ':')
+                    || (current == ':' && following == ':')) {
+                return true;
+            }
+            setPlaceHolder(getPlaceHolder() + 1);
+        }
+        return false;
+    }
+
+    /**
+     * Reads a variable length encoded integer (ENCINT) at the place holder
+     * and moves the place holder behind it.
+     * <p>
+     * Every byte contributes its low 7 bits, most significant group first; a
+     * set high bit means that another byte follows. Values wider than 32 bits
+     * are truncated to their low 32 bits. If the chunk ends in the middle of
+     * a value an ArrayIndexOutOfBoundsException is thrown.
+     *
+     * @param data_chunk
+     *
+     * @return the decoded value, 0 if the place holder is already at or
+     *         behind the end of the chunk
+     */
+    private int getEncint(byte[] data_chunk) {
+        int value = 0;
+        if (getPlaceHolder() < data_chunk.length) {
+            byte ob;
+            do {
+                ob = data_chunk[getPlaceHolder()];
+                value = (value << 7) | (ob & 0x7f);
+                setPlaceHolder(getPlaceHolder() + 1);
+            } while (ob < 0); /* high bit set: continuation byte */
+        }
+        return value;
+    }
+
+    /**
+     * Does nothing; kept so the class signature stays unchanged.
+     *
+     * @param args
+     */
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Sets chm directory listing entry list
+     *
+     * @param dlel
+     *            chm directory listing entry list
+     */
+    public void setDirectoryListingEntryList(List<DirectoryListingEntry> dlel) {
+        this.dlel = dlel;
+    }
+
+    /**
+     * Returns chm directory listing entry list
+     *
+     * @return List<DirectoryListingEntry>
+     */
+    public List<DirectoryListingEntry> getDirectoryListingEntryList() {
+        return dlel;
+    }
+
+    /**
+     * Sets data
+     *
+     * @param data
+     */
+    private void setData(byte[] data) {
+        this.data = data;
+    }
+
+    /**
+     * Returns data
+     *
+     * @return the chm bytes, null once enumeration has finished
+     */
+    private byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Sets data offset
+     *
+     * @param dataOffset
+     */
+    private void setDataOffset(long dataOffset) {
+        this.dataOffset = dataOffset;
+    }
+
+    /**
+     * Returns data offset as reported by the itsf header
+     *
+     * @return dataOffset
+     */
+    public long getDataOffset() {
+        return dataOffset;
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItsfHeader.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,497 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Accessor for the ITSF header found at the beginning of a chm file.
+ * <p>
+ * Header layout (offsets in hex):
+ *
+ * <pre>
+ * 0000: char[4] 'ITSF'
+ * 0004: DWORD   3 (Version number)
+ * 0008: DWORD   Total header length, including header section table and
+ *               following data.
+ * 000C: DWORD   1 (unknown)
+ * 0010: DWORD   a timestamp
+ * 0014: DWORD   Windows Language ID
+ * 0018: GUID    {7C01FD10-7BAA-11D0-9E0C-00A0-C922-E6EC}
+ * 0028: GUID    {7C01FD11-7BAA-11D0-9E0C-00A0-C922-E6EC}
+ * </pre>
+ *
+ * Note: a GUID is $10 bytes, arranged as 1 DWORD, 2 WORDs, and 8 BYTEs.
+ * <p>
+ * The header is followed by the header section table, two entries of the
+ * form:
+ *
+ * <pre>
+ * 0000: QWORD   Offset of section from beginning of file
+ * 0008: QWORD   Length of section
+ * </pre>
+ *
+ * Following the header section table is 8 bytes of additional header data.
+ * In Version 2 files, this data is not there and the content section starts
+ * immediately after the directory.
+ * <p>
+ * All multi-byte values are read in little-endian byte order.
+ *
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Microsoft's
+ *      HTML Help (.chm) format (incomplete)</a>
+ */
+/* structure of ITSF headers */
+public class ChmItsfHeader implements ChmAccessor<ChmItsfHeader> {
+    private static final long serialVersionUID = 2215291838533213826L;
+    /* spelled out as bytes so the value does not depend on the platform charset */
+    private byte[] signature = new byte[] { 'I', 'T', 'S', 'F' }; /* 0 (ITSF) */
+    private int version; /* 4 */
+    private int header_len; /* 8 */
+    private int unknown_000c; /* c */
+    private long last_modified; /* 10 */
+    private long lang_id; /* 14 */
+    private byte[] dir_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 18 */
+    private byte[] stream_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 28 */
+    private long unknown_offset; /* 38 */
+    private long unknown_len; /* 40 */
+    private long dir_offset; /* 48 */
+    private long dir_len; /* 50 */
+    private long data_offset; /* 58 (Not present before V3) */
+
+    /* local usage */
+    private int dataRemained; /* bytes of the input not consumed yet */
+    private int currentPlace = 0; /* index of the next byte to consume */
+
+    /**
+     * Returns the values of the itsf header, separated by blanks, in header
+     * order. The two uuids are rendered with their byte values.
+     */
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append(new String(getSignature())).append(" ");
+        sb.append(getVersion()).append(" ");
+        sb.append(getHeaderLen()).append(" ");
+        sb.append(getUnknown_000c()).append(" ");
+        sb.append(getLastModified()).append(" ");
+        sb.append(getLangId()).append(" ");
+        /* appending a byte[] directly would only print its identity hash */
+        sb.append(java.util.Arrays.toString(getDir_uuid())).append(" ");
+        sb.append(java.util.Arrays.toString(getStream_uuid())).append(" ");
+        sb.append(getUnknownOffset()).append(" ");
+        sb.append(getUnknownLen()).append(" ");
+        sb.append(getDirOffset()).append(" ");
+        sb.append(getDirLen()).append(" ");
+        sb.append(getDataOffset()).append(" ");
+        return sb.toString();
+    }
+
+    /**
+     * Returns a signature of itsf header
+     *
+     * @return itsf header signature bytes
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets itsf header signature
+     *
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns itsf header version
+     *
+     * @return itsf version
+     */
+    public int getVersion() {
+        return version;
+    }
+
+    /**
+     * Sets itsf version
+     *
+     * @param version
+     */
+    protected void setVersion(int version) {
+        this.version = version;
+    }
+
+    /**
+     * Returns itsf header length
+     *
+     * @return length
+     */
+    public int getHeaderLen() {
+        return header_len;
+    }
+
+    /**
+     * Sets itsf header length
+     *
+     * @param header_len
+     */
+    protected void setHeaderLen(int header_len) {
+        this.header_len = header_len;
+    }
+
+    /**
+     * Returns unknown_000c value
+     *
+     * @return unknown_000c
+     */
+    public int getUnknown_000c() {
+        return unknown_000c;
+    }
+
+    /**
+     * Sets unknown_000c
+     *
+     * @param unknown_000c
+     */
+    protected void setUnknown_000c(int unknown_000c) {
+        this.unknown_000c = unknown_000c;
+    }
+
+    /**
+     * Returns last modified date of the chm file
+     *
+     * @return the raw timestamp DWORD of the header as an unsigned value
+     */
+    public long getLastModified() {
+        return last_modified;
+    }
+
+    /**
+     * Sets last modified date of the chm file
+     *
+     * @param last_modified
+     */
+    protected void setLastModified(long last_modified) {
+        this.last_modified = last_modified;
+    }
+
+    /**
+     * Returns language ID
+     *
+     * @return language_id
+     */
+    public long getLangId() {
+        return lang_id;
+    }
+
+    /**
+     * Sets language_id
+     *
+     * @param lang_id
+     */
+    protected void setLangId(long lang_id) {
+        this.lang_id = lang_id;
+    }
+
+    /**
+     * Returns directory uuid
+     *
+     * @return dir_uuid
+     */
+    public byte[] getDir_uuid() {
+        return dir_uuid;
+    }
+
+    /**
+     * Sets directory uuid
+     *
+     * @param dir_uuid
+     */
+    protected void setDir_uuid(byte[] dir_uuid) {
+        this.dir_uuid = dir_uuid;
+    }
+
+    /**
+     * Returns stream uuid
+     *
+     * @return stream_uuid
+     */
+    public byte[] getStream_uuid() {
+        return stream_uuid;
+    }
+
+    /**
+     * Sets stream uuid
+     *
+     * @param stream_uuid
+     */
+    protected void setStream_uuid(byte[] stream_uuid) {
+        this.stream_uuid = stream_uuid;
+    }
+
+    /**
+     * Returns unknown offset
+     *
+     * @return unknown_offset
+     */
+    public long getUnknownOffset() {
+        return unknown_offset;
+    }
+
+    /**
+     * Sets unknown offset
+     *
+     * @param unknown_offset
+     */
+    protected void setUnknownOffset(long unknown_offset) {
+        this.unknown_offset = unknown_offset;
+    }
+
+    /**
+     * Returns unknown length
+     *
+     * @return unknown_length
+     */
+    public long getUnknownLen() {
+        return unknown_len;
+    }
+
+    /**
+     * Sets unknown length
+     *
+     * @param unknown_len
+     */
+    protected void setUnknownLen(long unknown_len) {
+        this.unknown_len = unknown_len;
+    }
+
+    /**
+     * Returns directory offset
+     *
+     * @return directory_offset
+     */
+    public long getDirOffset() {
+        return dir_offset;
+    }
+
+    /**
+     * Sets directory offset
+     *
+     * @param dir_offset
+     */
+    protected void setDirOffset(long dir_offset) {
+        this.dir_offset = dir_offset;
+    }
+
+    /**
+     * Returns directory length
+     *
+     * @return directory_length
+     */
+    public long getDirLen() {
+        return dir_len;
+    }
+
+    /**
+     * Sets directory length
+     *
+     * @param dir_len
+     */
+    protected void setDirLen(long dir_len) {
+        this.dir_len = dir_len;
+    }
+
+    /**
+     * Returns data offset
+     *
+     * @return data_offset
+     */
+    public long getDataOffset() {
+        return data_offset;
+    }
+
+    /**
+     * Sets data offset
+     *
+     * @param data_offset
+     */
+    protected void setDataOffset(long data_offset) {
+        this.data_offset = data_offset;
+    }
+
+    /**
+     * Copies the first {@code count} bytes of the byte[] into the signature
+     * of the given header and consumes them.
+     *
+     * @param data
+     * @param chmItsfHeader
+     * @param count
+     */
+    private void unmarshalCharArray(byte[] data, ChmItsfHeader chmItsfHeader,
+            int count) {
+        ChmAssert.assertChmAccessorParameters(data, chmItsfHeader, count);
+        System.arraycopy(data, 0, chmItsfHeader.signature, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    /**
+     * Copies {@code count} bytes from the current place of the source byte[]
+     * to the dest byte[] and consumes them.
+     *
+     * @param data
+     * @param dest
+     * @param count
+     * @return dest
+     * @throws ChmParsingException
+     *             if fewer than {@code count} bytes remain
+     */
+    private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) {
+        if (count > this.getDataRemained())
+            throw new ChmParsingException("count > this.getDataRemained()");
+        System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+        return dest;
+    }
+
+    /**
+     * Reads 8 bytes at the current place as a little-endian 64 bit value and
+     * consumes them.
+     *
+     * @param data
+     * @param dest
+     *            ignored, only the return value carries the result
+     * @return the value read
+     * @throws ChmParsingException
+     *             if fewer than 8 bytes remain
+     */
+    private long unmarshalUint64(byte[] data, long dest) {
+        if (8 > this.getDataRemained())
+            throw new ChmParsingException("8 > this.getDataRemained()");
+
+        int start = this.getCurrentPlace();
+        long value = 0;
+        /* the last byte is the most significant one */
+        for (int i = 7; i >= 0; i--) {
+            value = (value << 8) | (data[start + i] & 0xffL);
+        }
+
+        this.setCurrentPlace(start + 8);
+        this.setDataRemained(this.getDataRemained() - 8);
+        return value;
+    }
+
+    /**
+     * Reads 4 bytes at the current place as a little-endian 32 bit value and
+     * consumes them. Every byte is masked with 0xff before it is combined;
+     * without the mask a byte of 0x80 or above would be sign extended and
+     * overwrite the more significant bytes.
+     *
+     * @param data
+     * @return the 32 bits read
+     * @throws ChmParsingException
+     *             if fewer than 4 bytes remain
+     */
+    private int readLittleEndianInt(byte[] data) {
+        ChmAssert.assertByteArrayNotNull(data);
+
+        if (4 > this.getDataRemained())
+            throw new ChmParsingException("4 > dataLenght");
+
+        int start = this.getCurrentPlace();
+        int value = (data[start] & 0xff)
+                | ((data[start + 1] & 0xff) << 8)
+                | ((data[start + 2] & 0xff) << 16)
+                | ((data[start + 3] & 0xff) << 24);
+
+        this.setCurrentPlace(start + 4);
+        this.setDataRemained(this.getDataRemained() - 4);
+        return value;
+    }
+
+    /**
+     * Reads a signed little-endian 32 bit value and consumes it.
+     *
+     * @param data
+     * @param dest
+     *            ignored, only the return value carries the result
+     * @return the value read
+     */
+    private int unmarshalInt32(byte[] data, int dest) {
+        return readLittleEndianInt(data);
+    }
+
+    /**
+     * Reads an unsigned little-endian 32 bit value and consumes it.
+     *
+     * @param data
+     * @param dest
+     *            ignored, only the return value carries the result
+     * @return the value read, always in the range 0..0xffffffff
+     */
+    private long unmarshalUInt32(byte[] data, long dest) {
+        return readLittleEndianInt(data) & 0xffffffffL;
+    }
+
+    /**
+     * Does nothing; kept so the class signature stays unchanged.
+     *
+     * @param args
+     */
+    public static void main(String[] args) {
+    }
+
+    /**
+     * Sets data remained to be processed
+     *
+     * @param dataRemained
+     */
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    /**
+     * Returns data remained
+     *
+     * @return data_remained
+     */
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    /**
+     * Sets current place in the byte[]
+     *
+     * @param currentPlace
+     */
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Returns current place in the byte[]
+     *
+     * @return current place
+     */
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    /**
+     * Parses the itsf header bytes into the given header object.
+     * <p>
+     * The read position of the header object is reset first, so the same
+     * object can be parsed into more than once. The data offset of a version
+     * 3 header is read from the header when its 8 bytes are part of
+     * {@code data}; otherwise, and for version 2, it is computed as directory
+     * offset plus directory length.
+     *
+     * @param data
+     *            header bytes; the length has to be between
+     *            {@link ChmConstants#CHM_ITSF_V2_LEN} and
+     *            {@link ChmConstants#CHM_ITSF_V3_LEN}
+     * @param chmItsfHeader
+     *            the header object to fill
+     * @throws ChmParsingException
+     *             if the length of {@code data}, the signature, the version
+     *             or the header length stored in the header is not
+     *             acceptable
+     */
+    // @Override
+    public void parse(byte[] data, ChmItsfHeader chmItsfHeader) {
+        if (data.length < ChmConstants.CHM_ITSF_V2_LEN
+                || data.length > ChmConstants.CHM_ITSF_V3_LEN)
+            throw new ChmParsingException(
+                    "we only know how to deal with the 0x58 and 0x60 byte structures");
+
+        chmItsfHeader.setCurrentPlace(0);
+        chmItsfHeader.setDataRemained(data.length);
+        chmItsfHeader.unmarshalCharArray(data, chmItsfHeader,
+                ChmConstants.CHM_SIGNATURE_LEN);
+        chmItsfHeader.setVersion(chmItsfHeader.unmarshalInt32(data,
+                chmItsfHeader.getVersion()));
+        chmItsfHeader.setHeaderLen(chmItsfHeader.unmarshalInt32(data,
+                chmItsfHeader.getHeaderLen()));
+        chmItsfHeader.setUnknown_000c(chmItsfHeader.unmarshalInt32(data,
+                chmItsfHeader.getUnknown_000c()));
+        chmItsfHeader.setLastModified(chmItsfHeader.unmarshalUInt32(data,
+                chmItsfHeader.getLastModified()));
+        chmItsfHeader.setLangId(chmItsfHeader.unmarshalUInt32(data,
+                chmItsfHeader.getLangId()));
+        chmItsfHeader.setDir_uuid(chmItsfHeader.unmarshalUuid(data,
+                chmItsfHeader.getDir_uuid(), 16));
+        chmItsfHeader.setStream_uuid(chmItsfHeader.unmarshalUuid(data,
+                chmItsfHeader.getStream_uuid(), 16));
+        chmItsfHeader.setUnknownOffset(chmItsfHeader.unmarshalUint64(data,
+                chmItsfHeader.getUnknownOffset()));
+        chmItsfHeader.setUnknownLen(chmItsfHeader.unmarshalUint64(data,
+                chmItsfHeader.getUnknownLen()));
+        chmItsfHeader.setDirOffset(chmItsfHeader.unmarshalUint64(data,
+                chmItsfHeader.getDirOffset()));
+        chmItsfHeader.setDirLen(chmItsfHeader.unmarshalUint64(data,
+                chmItsfHeader.getDirLen()));
+
+        if (!new String(chmItsfHeader.getSignature()).equals(ChmConstants.ITSF))
+            throw new ChmParsingException("seems not valid file");
+        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_2) {
+            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V2_LEN)
+                throw new ChmParsingException("something wrong with header");
+        } else if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3) {
+            if (chmItsfHeader.getHeaderLen() < ChmConstants.CHM_ITSF_V3_LEN)
+                throw new ChmParsingException("unknown v3 header lenght");
+        } else
+            throw new ChmParsingException("unsupported chm format");
+
+        /*
+         * now, if we have a V3 structure and its data offset field was
+         * handed in, unmarshal it, otherwise, compute it
+         */
+        if (chmItsfHeader.getVersion() == ChmConstants.CHM_VER_3
+                && chmItsfHeader.getDataRemained() >= 8)
+            chmItsfHeader.setDataOffset(chmItsfHeader.unmarshalUint64(data,
+                    chmItsfHeader.getDataOffset()));
+        else
+            chmItsfHeader.setDataOffset(chmItsfHeader.getDirOffset()
+                    + chmItsfHeader.getDirLen());
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmItspHeader.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,548 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Directory header The directory starts with a header; its format is as
+ * follows: 0000: char[4] 'ITSP' 0004: DWORD Version number 1 0008: DWORD Length
+ * of the directory header 000C: DWORD $0a (unknown) 0010: DWORD $1000 Directory
+ * chunk size 0014: DWORD "Density" of quickref section, usually 2 0018: DWORD
+ * Depth of the index tree - 1 there is no index, 2 if there is one level of
+ * PMGI chunks 001C: DWORD Chunk number of root index chunk, -1 if there is none
+ * (though at least one file has 0 despite there being no index chunk, probably
+ * a bug) 0020: DWORD Chunk number of first PMGL (listing) chunk 0024: DWORD
+ * Chunk number of last PMGL (listing) chunk 0028: DWORD -1 (unknown) 002C:
+ * DWORD Number of directory chunks (total) 0030: DWORD Windows language ID
+ * 0034: GUID {5D02926A-212E-11D0-9DF9-00A0C922E6EC} 0044: DWORD $54 (This is
+ * the length again) 0048: DWORD -1 (unknown) 004C: DWORD -1 (unknown) 0050:
+ * DWORD -1 (unknown)
+ *
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?show-translation-form=1}
+ *
+ */
+public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
+ // TODO: refactor all unmarshals
+ private static final long serialVersionUID = 1962394421998181341L;
+ private byte[] signature = new String(ChmConstants.ITSP).getBytes(); /*
+ * 0
+ * (ITSP
+ * )
+ */
+ private int version; /* 4 */
+ private int header_len; /* 8 */
+ private int unknown_000c; /* c */
+ private long block_len; /* 10 */
+ private int blockidx_intvl; /* 14 */
+ private int index_depth; /* 18 */
+ private int index_root; /* 1c */
+ private int index_head; /* 20 */
+ private int unknown_0024; /* 24 */
+ private long num_blocks; /* 28 */
+ private int unknown_002c; /* 2c */
+ private long lang_id; /* 30 */
+ private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
+ private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("[ signature:=" + new String(getSignature())
+ + System.getProperty("line.separator"));
+ sb.append("version:=\t" + getVersion()
+ + System.getProperty("line.separator"));
+ sb.append("header_len:=\t" + getHeader_len()
+ + System.getProperty("line.separator"));
+ sb.append("unknown_00c:=\t" + getUnknown_000c()
+ + System.getProperty("line.separator"));
+ sb.append("block_len:=\t" + getBlock_len() + " [directory chunk size]"
+ + System.getProperty("line.separator"));
+ sb.append("blockidx_intvl:=" + getBlockidx_intvl()
+ + ", density of quickref section, usually 2"
+ + System.getProperty("line.separator"));
+ sb.append("index_depth:=\t"
+ + getIndex_depth()
+ + ", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI chunk"
+ + System.getProperty("line.separator"));
+ sb.append("index_root:=\t" + getIndex_root()
+ + ", chunk number of root index chunk, -1 if there is none"
+ + System.getProperty("line.separator"));
+ sb.append("index_head:=\t" + getIndex_head()
+ + ", chunk number of first PMGL (listing) chunk"
+ + System.getProperty("line.separator"));
+ sb.append("unknown_0024:=\t" + getUnknown_0024()
+ + ", chunk number of last PMGL (listing) chunk"
+ + System.getProperty("line.separator"));
+ sb.append("num_blocks:=\t" + getNum_blocks() + ", -1 (unknown)"
+ + System.getProperty("line.separator"));
+ sb.append("unknown_002c:=\t" + getUnknown_002c()
+ + ", number of directory chunks (total)"
+ + System.getProperty("line.separator"));
+ sb.append("lang_id:=\t" + getLang_id() + " - "
+ + ChmCommons.getLanguage(getLang_id())
+ + System.getProperty("line.separator"));
+ sb.append("system_uuid:=" + getSystem_uuid()
+ + System.getProperty("line.separator"));
+ sb.append("unknown_0044:=" + getUnknown_0044() + " ]");
+ return sb.toString();
+ }
+
+ /**
+ * Copies 4 bits from data[]
+ *
+ * @param data
+ * @param chmItspHeader
+ * @param count
+ */
+ private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader,
+ int count) {
+ ChmAssert.assertByteArrayNotNull(data);
+ ChmAssert.assertChmAccessorNotNull(chmItspHeader);
+ this.setDataRemained(data.length);
+ System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ private int unmarshalInt32(byte[] data, int dataLenght, int dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > this.getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ this.setDataRemained(this.getDataRemained() - 4);
+ return dest;
+ }
+
+ private long unmarshalUInt32(byte[] data, int dataLenght, long dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > dataLenght)
+ throw new ChmParsingException("4 > dataLenght");
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest,
+ int count) {
+ System.arraycopy(data, this.getCurrentPlace(), dest, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ return dest;
+ }
+
+ /**
+ * Returns how many bytes remained
+ *
+ * @return int
+ */
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ /**
+ * Sets how many bytes remained
+ *
+ * @param dataRemained
+ */
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns a place holder
+ *
+ * @return current place
+ */
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ /**
+ * Sets current place
+ *
+ * @param currentPlace
+ */
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ /**
+ * Returns a signature of the header
+ *
+ * @return itsp signature
+ */
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ /**
+ * Sets itsp signature
+ *
+ * @param signature
+ */
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ /**
+ * Returns version of itsp header
+ *
+ * @return version
+ */
+ public int getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets a version of itsp header
+ *
+ * @param version
+ */
+ protected void setVersion(int version) {
+ this.version = version;
+ }
+
+ /**
+ * Returns header length
+ *
+ * @return header length
+ */
+ public int getHeader_len() {
+ return header_len;
+ }
+
+ /**
+ * Sets itsp header length
+ *
+ * @param header_len
+ */
+ protected void setHeader_len(int header_len) {
+ this.header_len = header_len;
+ }
+
+ /**
+ * Returns 000c unknown bytes
+ */
+ public int getUnknown_000c() {
+ return unknown_000c;
+ }
+
+ /**
+ * Sets 000c unknown bytes Unknown means here that those guys who cracked
+ * the chm format do not know what's it purposes for
+ *
+ * @param unknown_000c
+ */
+ protected void setUnknown_000c(int unknown_000c) {
+ this.unknown_000c = unknown_000c;
+ }
+
+ /**
+ * Returns block's length
+ *
+ * @return block_length
+ */
+ public long getBlock_len() {
+ return block_len;
+ }
+
+ /**
+ * Sets block length
+ *
+ * @param block_len
+ */
+ protected void setBlock_len(long block_len) {
+ this.block_len = block_len;
+ }
+
+ /**
+ * Returns block index interval
+ *
+ * @return blockidx_intvl
+ */
+ public int getBlockidx_intvl() {
+ return blockidx_intvl;
+ }
+
+ /**
+ * Sets block index interval
+ *
+ * @param blockidx_intvl
+ */
+ protected void setBlockidx_intvl(int blockidx_intvl) {
+ this.blockidx_intvl = blockidx_intvl;
+ }
+
+ /**
+ * Returns an index depth
+ *
+ * @return index_depth
+ */
+ public int getIndex_depth() {
+ return index_depth;
+ }
+
+ /**
+ * Sets an index depth
+ *
+ * @param index_depth
+ */
+ protected void setIndex_depth(int index_depth) {
+ this.index_depth = index_depth;
+ }
+
+ /**
+ * Returns index root
+ *
+ * @return index_root
+ */
+ public int getIndex_root() {
+ return index_root;
+ }
+
+ /**
+ * Sets an index root
+ *
+ * @param index_root
+ */
+ protected void setIndex_root(int index_root) {
+ this.index_root = index_root;
+ }
+
+ /**
+ * Returns an index head
+ *
+ * @return index_head
+ */
+ public int getIndex_head() {
+ return index_head;
+ }
+
+ /**
+ * Sets an index head
+ *
+ * @param index_head
+ */
+ protected void setIndex_head(int index_head) {
+ this.index_head = index_head;
+ }
+
+ /**
+ * Returns 0024 unknown bytes
+ *
+ * @return unknown_0024
+ */
+ public int getUnknown_0024() {
+ return unknown_0024;
+ }
+
+ /**
+ * Sets 0024 unknown bytes
+ *
+ * @param unknown_0024
+ */
+ protected void setUnknown_0024(int unknown_0024) {
+ this.unknown_0024 = unknown_0024;
+ }
+
+ /**
+ * Returns number of blocks
+ *
+ * @return num_blocks
+ */
+ public long getNum_blocks() {
+ return num_blocks;
+ }
+
+ /**
+ * Sets number of blocks containing in the chm file
+ *
+ * @param num_blocks
+ */
+ protected void setNum_blocks(long num_blocks) {
+ this.num_blocks = num_blocks;
+ }
+
+ /**
+ * Returns 002c unknown bytes
+ *
+ * @return unknown_002c
+ */
+ public int getUnknown_002c() {
+ return unknown_002c;
+ }
+
+ /**
+ * Sets 002c unknown bytes
+ *
+ * @param unknown_002c
+ */
+ protected void setUnknown_002c(int unknown_002c) {
+ this.unknown_002c = unknown_002c;
+ }
+
+ /**
+ * Returns language id
+ *
+ * @return lang_id
+ */
+ public long getLang_id() {
+ return lang_id;
+ }
+
+ /**
+ * Sets language id
+ *
+ * @param lang_id
+ */
+ protected void setLang_id(long lang_id) {
+ this.lang_id = lang_id;
+ }
+
+ /**
+ * Returns system uuid
+ *
+ * @return system_uuid
+ */
+ public byte[] getSystem_uuid() {
+ return system_uuid;
+ }
+
+ /**
+ * Sets system uuid
+ *
+ * @param system_uuid
+ */
+ protected void setSystem_uuid(byte[] system_uuid) {
+ this.system_uuid = system_uuid;
+ }
+
+ /**
+ * Returns 0044 unknown bytes
+ *
+ * @return unknown_0044
+ */
+ public byte[] getUnknown_0044() {
+ return unknown_0044;
+ }
+
+ /**
+ * Sets 0044 unknown bytes
+ *
+ * @param unknown_0044
+ */
+ protected void setUnknown_0044(byte[] unknown_0044) {
+ this.unknown_0044 = unknown_0044;
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmItspHeader chmItspHeader) {
+ /* we only know how to deal with the 0x58 and 0x60 byte structures */
+ if (data.length != ChmConstants.CHM_ITSP_V1_LEN)
+ throw new ChmParsingException(
+ "we only know how to deal with the 0x58 and 0x60 byte structures");
+
+ /* unmarshal common fields */
+ chmItspHeader.unmarshalCharArray(data, chmItspHeader,
+ ChmConstants.CHM_SIGNATURE_LEN);
+ // ChmCommons.unmarshalCharArray(data, chmItspHeader,
+ // ChmConstants.CHM_SIGNATURE_LEN);
+ chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getVersion()));
+ chmItspHeader
+ .setHeader_len(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getHeader_len()));
+ chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_000c()));
+ chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getBlock_len()));
+ chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getBlockidx_intvl()));
+ chmItspHeader
+ .setIndex_depth(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_depth()));
+ chmItspHeader
+ .setIndex_root(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_root()));
+ chmItspHeader
+ .setIndex_head(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getIndex_head()));
+ chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_0024()));
+ chmItspHeader
+ .setNum_blocks(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getNum_blocks()));
+ chmItspHeader.setUnknown_002c((chmItspHeader.unmarshalInt32(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_002c())));
+ chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data,
+ chmItspHeader.getDataRemained(), chmItspHeader.getLang_id()));
+ chmItspHeader
+ .setSystem_uuid(chmItspHeader.unmarshalUuid(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getSystem_uuid(),
+ ChmConstants.BYTE_ARRAY_LENGHT));
+ chmItspHeader
+ .setUnknown_0044(chmItspHeader.unmarshalUuid(data,
+ chmItspHeader.getDataRemained(),
+ chmItspHeader.getUnknown_0044(),
+ ChmConstants.BYTE_ARRAY_LENGHT));
+
+ /* Checks validity of the itsp header */
+ if (!new String(chmItspHeader.getSignature()).equals(ChmConstants.ITSP))
+ throw new ChmParsingException("seems not valid signature");
+
+ if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1)
+ throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
+
+ if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN)
+ throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcControlData.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ *
+ * ::DataSpace/Storage/<SectionName>/ControlData This file contains $20 bytes of
+ * information on the compression. The information is partially known: 0000:
+ * DWORD 6 (unknown) 0004: ASCII 'LZXC' Compression type identifier 0008: DWORD
+ * 2 (Possibly numeric code for LZX) 000C: DWORD The Huffman reset interval in
+ * $8000-byte blocks 0010: DWORD The window size in $8000-byte blocks 0014:
+ * DWORD unknown (sometimes 2, sometimes 1, sometimes 0) 0018: DWORD 0 (unknown)
+ * 001C: DWORD 0 (unknown)
+ *
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?page=2 }
+ *
+ */
+public class ChmLzxcControlData implements ChmAccessor<ChmLzxcControlData> {
+ private static final long serialVersionUID = -7897854774939631565L;
+ /* class' members */
+ private long size; /* 0 */
+ private byte[] signature = new String(ChmConstants.LZXC).getBytes(); /*
+ * 4
+ * (LZXC
+ * )
+ */
+ private long version; /* 8 */
+ private long resetInterval; /* c */
+ private long windowSize; /* 10 */
+ private long windowsPerReset; /* 14 */
+ private long unknown_18; /* 18 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ /**
+ * Returns a remained data
+ *
+ * @return dataRemained
+ */
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ /**
+ * Sets a remained data
+ *
+ * @param dataRemained
+ */
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns a place holder
+ *
+ * @return current_place
+ */
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ /**
+ * Sets a place holder
+ *
+ * @param current_place
+ */
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ /**
+ * Returns a size of control data
+ *
+ * @return size
+ */
+ public long getSize() {
+ return size;
+ }
+
+ /**
+ * Sets a size of control data
+ *
+ * @param size
+ */
+ protected void setSize(long size) {
+ this.size = size;
+ }
+
+ /**
+ * Returns a signature of control data block
+ *
+ * @return signature
+ */
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ /**
+ * Sets a signature of control data block
+ *
+ * @param signature
+ */
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ /**
+ * Returns a version of control data block
+ *
+ * @return version
+ */
+ public long getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets version of control data block
+ *
+ * @param version
+ */
+ protected void setVersion(long version) {
+ this.version = version;
+ }
+
+ /**
+ * Returns reset interval
+ *
+ * @return reset_interval
+ */
+ public long getResetInterval() {
+ return resetInterval;
+ }
+
+ /**
+ * Sets a reset interval
+ *
+ * @param resetInterval
+ */
+ protected void setResetInterval(long resetInterval) {
+ this.resetInterval = resetInterval;
+ }
+
+ /**
+ * Returns a window size
+ *
+ * @return window_size
+ */
+ public long getWindowSize() {
+ return windowSize;
+ }
+
+ /**
+ * Sets a window size
+ *
+ * @param window_size
+ */
+ protected void setWindowSize(long windowSize) {
+ this.windowSize = windowSize;
+ }
+
+ /**
+ * Returns windows per reset
+ *
+ * @return
+ */
+ public long getWindowsPerReset() {
+ return windowsPerReset;
+ }
+
+ /**
+ * Sets windows per reset
+ *
+ * @param windows_per_reset
+ */
+ protected void setWindowsPerReset(long windowsPerReset) {
+ this.windowsPerReset = windowsPerReset;
+ }
+
+ /**
+ * Returns unknown 18 bytes
+ *
+ * @return unknown_18
+ */
+ public long getUnknown_18() {
+ return unknown_18;
+ }
+
+ /**
+ * Sets unknown 18 bytes
+ *
+ * @param unknown_18
+ */
+ protected void setUnknown_18(long unknown_18) {
+ this.unknown_18 = unknown_18;
+ }
+
+ private long unmarshalUInt32(byte[] data, long dest) {
+ assert (data != null && data.length > 0);
+ if (4 > getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ private void unmarshalCharArray(byte[] data,
+ ChmLzxcControlData chmLzxcControlData, int count) {
+ ChmAssert.assertByteArrayNotNull(data);
+ ChmAssert.assertChmAccessorNotNull(chmLzxcControlData);
+ ChmAssert.assertPositiveInt(count);
+ System.arraycopy(data, 4, chmLzxcControlData.getSignature(), 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ /**
+ * Returns textual representation of ChmLzxcControlData
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("size(unknown):=" + this.getSize() + ", ");
+ sb.append("signature(Compression type identifier):="
+ + new String(this.getSignature()) + ", ");
+ sb.append("version(Possibly numeric code for LZX):="
+ + this.getVersion() + System.getProperty("line.separator"));
+ sb.append("resetInterval(The Huffman reset interval):="
+ + this.getResetInterval() + ", ");
+ sb.append("windowSize:=" + this.getWindowSize() + ", ");
+ sb.append("windowsPerReset(unknown (sometimes 2, sometimes 1, sometimes 0):="
+ + this.getWindowsPerReset() + ", ");
+ sb.append("unknown_18:=" + this.getUnknown_18()
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmLzxcControlData chmLzxcControlData) {
+ if (data == null || (data.length < ChmConstants.CHM_LZXC_MIN_LEN))
+ throw new ChmParsingException("we want at least 0x18 bytes");
+ chmLzxcControlData.setDataRemained(data.length);
+ chmLzxcControlData.setSize(unmarshalUInt32(data,
+ chmLzxcControlData.getSize()));
+ chmLzxcControlData.unmarshalCharArray(data, chmLzxcControlData,
+ ChmConstants.CHM_SIGNATURE_LEN);
+ chmLzxcControlData.setVersion(unmarshalUInt32(data,
+ chmLzxcControlData.getVersion()));
+ chmLzxcControlData.setResetInterval(unmarshalUInt32(data,
+ chmLzxcControlData.getResetInterval()));
+ chmLzxcControlData.setWindowSize(unmarshalUInt32(data,
+ chmLzxcControlData.getWindowSize()));
+ chmLzxcControlData.setWindowsPerReset(unmarshalUInt32(data,
+ chmLzxcControlData.getWindowsPerReset()));
+
+ if (data.length >= ChmConstants.CHM_LZXC_V2_LEN)
+ chmLzxcControlData.setUnknown_18(unmarshalUInt32(data,
+ chmLzxcControlData.getUnknown_18()));
+ else
+ chmLzxcControlData.setUnknown_18(0);
+
+ if (chmLzxcControlData.getVersion() == 2) {
+ chmLzxcControlData.setWindowSize(getWindowSize()
+ * ChmConstants.CHM_WINDOW_SIZE_BLOCK);
+ }
+
+ if (chmLzxcControlData.getWindowSize() == 0
+ || chmLzxcControlData.getResetInterval() == 0)
+ throw new ChmParsingException(
+ "window size / resetInterval should be more than zero");
+
+ if (chmLzxcControlData.getWindowSize() == 1)
+ throw new ChmParsingException(
+ "window size / resetInterval should be more than 1");
+
+ /* checks a signature */
+ if (!new String(chmLzxcControlData.getSignature())
+ .equals(ChmConstants.LZXC))
+ throw new ChmParsingException(
+ "the signature does not seem to be correct");
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmLzxcResetTable.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,353 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * LZXC reset table For ensuring a decompression. Reads the block named
+ * "::DataSpace/Storage/<SectionName>/Transform/{7FC28940-9D31-11D0-9B27-00A0C91E9C7C}/InstanceData/ResetTable"
+ * .
+ *
+ * {@link http
+ * ://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original
+ * /?page=2 }
+ *
+ */
+public class ChmLzxcResetTable implements ChmAccessor<ChmLzxcResetTable> {
+ private static final long serialVersionUID = -8209574429411707460L;
+ /* class members */
+ private long version; // 0000: DWORD 2 unknown (possibly a version number)
+ private long block_count; // 0004: DWORD Number of entries in reset table
+ private long unknown; // 0008: DWORD 8 unknown
+ private long table_offset; // 000C: DWORD $28 Length of table header (area
+ // before table entries)
+ private long uncompressed_len; // 0010: QWORD Uncompressed Length
+ private long compressed_len; // 0018: QWORD Compressed Length
+ private long block_len; // 0020: QWORD 0x8000 block size for locations below
+ private long[] block_address;
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ /**
+ * Returns block addresses
+ *
+ * @return block addresses
+ */
+ public long[] getBlockAddress() {
+ return block_address;
+ }
+
+ /**
+ * Sets block addresses
+ *
+ * @param block_address
+ */
+ public void setBlockAddress(long[] block_address) {
+ this.block_address = block_address;
+ }
+
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("version:=" + getVersion()
+ + System.getProperty("line.separator"));
+ sb.append("block_count:=" + getBlockCount()
+ + System.getProperty("line.separator"));
+ sb.append("unknown:=" + getUnknown()
+ + System.getProperty("line.separator"));
+ sb.append("table_offset:=" + getTableOffset()
+ + System.getProperty("line.separator"));
+ sb.append("uncompressed_len:=" + getUncompressedLen()
+ + System.getProperty("line.separator"));
+ sb.append("compressed_len:=" + getCompressedLen()
+ + System.getProperty("line.separator"));
+ sb.append("block_len:=" + getBlockLen()
+ + System.getProperty("line.separator"));
+ sb.append("block_addresses:=" + Arrays.toString(getBlockAddress()));
+ return sb.toString();
+ }
+
+ /**
+ * Enumerates chm block addresses
+ *
+ * @param data
+ *
+ * @return byte[] of addresses
+ */
+ private long[] enumerateBlockAddresses(byte[] data) {
+ ChmAssert.assertByteArrayNotNull(data);
+ /* we have limit of number of blocks to be extracted */
+ if (getBlockCount() > 5000)
+ setBlockCount(5000);
+
+ if (getBlockCount() < 0 && (getDataRemained() / 8) > 0)
+ setBlockCount(getDataRemained() / 8);
+
+ long[] addresses = new long[(int) getBlockCount()];
+ int rem = getDataRemained() / 8;
+ for (int i = 0; i < rem; i++) {
+ long num = -1;
+
+ try {
+ addresses[i] = unmarshalUint64(data, num);
+ } catch (Exception e) {
+ // System.err.println(e.getMessage());
+ }
+ }
+ return addresses;
+ }
+
+ /**
+ * Validates parameters such as byte[] and chm lzxc reset table
+ *
+ * @param data
+ * @param chmLzxcResetTable
+ *
+ * @return boolean
+ */
+ private boolean validateParamaters(byte[] data,
+ ChmLzxcResetTable chmLzxcResetTable) {
+ int goodParameter = 0;
+ ChmAssert.assertByteArrayNotNull(data);
+ ++goodParameter;
+ ChmAssert.assertChmAccessorNotNull(chmLzxcResetTable);
+ ++goodParameter;
+ return (goodParameter == 2);
+ }
+
+ private long unmarshalUInt32(byte[] data, long dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ private long unmarshalUint64(byte[] data, long dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ byte[] temp = new byte[8];
+ int i, j;// counters
+
+ for (i = 8, j = 7; i > 0; i--) {
+ if (data.length > this.getCurrentPlace()) {
+ temp[j--] = data[this.getCurrentPlace()];
+ this.setCurrentPlace(this.getCurrentPlace() + 1);
+ } else
+ throw new ChmParsingException(
+ "data is too small to calculate address block");
+ }
+ dest = new BigInteger(temp).longValue();
+ this.setDataRemained(this.getDataRemained() - 8);
+ return dest;
+ }
+
+ /**
+ * Returns the version
+ *
+ * @return - long
+ */
+ public long getVersion() {
+ return version;
+ }
+
+ /**
+ * Sets the version
+ *
+ * @param version
+ * - long
+ */
+ public void setVersion(long version) {
+ this.version = version;
+ }
+
+ /**
+ * Gets a block count
+ *
+ * @return - int
+ */
+ public long getBlockCount() {
+ return block_count;
+ }
+
+ /**
+ * Sets a block count
+ *
+ * @param block_count
+ * - long
+ */
+ public void setBlockCount(long block_count) {
+ this.block_count = block_count;
+ }
+
+ /**
+ * Gets unknown
+ *
+ * @return - long
+ */
+ public long getUnknown() {
+ return unknown;
+ }
+
+ /**
+ * Sets an unknown
+ *
+ * @param unknown
+ * - long
+ */
+ public void setUnknown(long unknown) {
+ this.unknown = unknown;
+ }
+
+ /**
+ * Gets a table offset
+ *
+ * @return - long
+ */
+ public long getTableOffset() {
+ return table_offset;
+ }
+
+ /**
+ * Sets a table offset
+ *
+ * @param table_offset
+ * - long
+ */
+ public void setTableOffset(long table_offset) {
+ this.table_offset = table_offset;
+ }
+
+ /**
+ * Gets uncompressed length
+ *
+ * @return - {@link BigInteger }
+ */
+ public long getUncompressedLen() {
+ return uncompressed_len;
+ }
+
+ /**
+ * Sets uncompressed length
+ *
+ * @param uncompressed_len
+ * - {@link BigInteger}
+ */
+ public void setUncompressedLen(long uncompressed_len) {
+ this.uncompressed_len = uncompressed_len;
+ }
+
+ /**
+ * Gets compressed length
+ *
+ * @return - {@link BigInteger}
+ */
+ public long getCompressedLen() {
+ return compressed_len;
+ }
+
+ /**
+ * Sets compressed length
+ *
+ * @param compressed_len
+ * - {@link BigInteger}
+ */
+ public void setCompressedLen(long compressed_len) {
+ this.compressed_len = compressed_len;
+ }
+
+ /**
+ * Gets a block length
+ *
+ * @return - {@link BigInteger}
+ */
+ public long getBlockLen() {
+ return block_len;
+ }
+
+ /**
+ * Sets a block length
+ *
+ * @param block_len
+ * - {@link BigInteger}
+ */
+ public void setBlockLlen(long block_len) {
+ this.block_len = block_len;
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmLzxcResetTable chmLzxcResetTable) {
+ setDataRemained(data.length);
+ if (validateParamaters(data, chmLzxcResetTable)) {
+ /* unmarshal fields */
+ chmLzxcResetTable.setVersion(unmarshalUInt32(data,
+ chmLzxcResetTable.getVersion()));
+ chmLzxcResetTable.setBlockCount(unmarshalUInt32(data,
+ chmLzxcResetTable.getBlockCount()));
+ chmLzxcResetTable.setUnknown(unmarshalUInt32(data,
+ chmLzxcResetTable.getUnknown()));
+ chmLzxcResetTable.setTableOffset(unmarshalUInt32(data,
+ chmLzxcResetTable.getTableOffset()));
+ chmLzxcResetTable.setUncompressedLen(unmarshalUint64(data,
+ chmLzxcResetTable.getUncompressedLen()));
+ chmLzxcResetTable.setCompressedLen(unmarshalUint64(data,
+ chmLzxcResetTable.getCompressedLen()));
+ chmLzxcResetTable.setBlockLlen(unmarshalUint64(data,
+ chmLzxcResetTable.getBlockLen()));
+ chmLzxcResetTable.setBlockAddress(enumerateBlockAddresses(data));
+ }
+
+ /* checks chmLzxcResetTable */
+ if (chmLzxcResetTable.getVersion() != ChmConstants.CHM_VER_2)
+ throw new ChmParsingException(
+ "does not seem currect version of chmLzxcResetTable");
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmgiHeader.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import java.util.Arrays;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Description Note: not always exists An index chunk has the following format:
+ * 0000: char[4] 'PMGI' 0004: DWORD Length of quickref/free area at end of
+ * directory chunk 0008: Directory index entries (to quickref/free area) The
+ * quickref area in an PMGI is the same as in an PMGL The format of a directory
+ * index entry is as follows: BYTE: length of name BYTEs: name (UTF-8 encoded)
+ * ENCINT: directory listing chunk which starts with name Encoded Integers aka
+ * ENCINT An ENCINT is a variable-length integer. The high bit of each byte
+ * indicates "continued to the next byte". Bytes are stored most significant to
+ * least significant. So, for example, $EA $15 is (((0xEA&0x7F)<<7)|0x15) =
+ * 0x3515.
+ *
+ * <p>
+ * Note: This class is not in use
+ *
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Microsoft's
+ *      HTML Help (.chm) format (incomplete)</a>
+ *
+ *
+ */
+public class ChmPmgiHeader implements ChmAccessor<ChmPmgiHeader> {
+    private static final long serialVersionUID = -2092282339894303701L;
+
+    /* 0: signature, pre-filled with the PMGI marker */
+    private byte[] signature = new String(ChmConstants.CHM_PMGI_MARKER)
+            .getBytes();
+    /* 4: free space; a long because the stored value is an unsigned DWORD */
+    private long free_space;
+
+    /* local usage: bytes still unread and the current read offset */
+    private int dataRemained;
+    private int currentPlace = 0;
+
+    private int getDataRemained() {
+        return dataRemained;
+    }
+
+    private void setDataRemained(int dataRemained) {
+        this.dataRemained = dataRemained;
+    }
+
+    private int getCurrentPlace() {
+        return currentPlace;
+    }
+
+    private void setCurrentPlace(int currentPlace) {
+        this.currentPlace = currentPlace;
+    }
+
+    /**
+     * Copies {@code count} signature bytes into the given header and advances
+     * the read offset by {@code count}.
+     * <p>
+     * The bytes are taken from the index that {@code ChmCommons.indexOf}
+     * reports for the PMGI marker. When that index is negative the header's
+     * signature is left untouched and a notice is written to
+     * {@code System.err}; the read offset is advanced all the same.
+     *
+     * @param data
+     *            raw bytes to read from
+     * @param chmPmgiHeader
+     *            header whose signature array receives the bytes
+     * @param count
+     *            number of signature bytes
+     */
+    private void unmarshalCharArray(byte[] data, ChmPmgiHeader chmPmgiHeader,
+            int count) {
+        ChmAssert.assertByteArrayNotNull(data);
+        ChmAssert.assertChmAccessorNotNull(chmPmgiHeader);
+        ChmAssert.assertPositiveInt(count);
+        this.setDataRemained(data.length);
+        int index = ChmCommons.indexOf(data,
+                ChmConstants.CHM_PMGI_MARKER.getBytes());
+        if (index >= 0) {
+            System.arraycopy(data, index, chmPmgiHeader.getSignature(), 0,
+                    count);
+        } else {
+            System.err.println(ChmPmgiHeader.class.getName()
+                    + " does not exist a PMGI, use PMGL instead");
+        }
+        this.setCurrentPlace(this.getCurrentPlace() + count);
+        this.setDataRemained(this.getDataRemained() - count);
+    }
+
+    /**
+     * Reads four bytes at the current read offset as a little-endian unsigned
+     * 32-bit integer and moves the read offset past them.
+     * <p>
+     * Each byte is masked with {@code 0xff} before it is shifted. Java bytes
+     * are signed, so without the mask any byte of 0x80 or above would
+     * sign-extend and overwrite all higher-order bits of the result.
+     *
+     * @param data
+     *            raw bytes to read from
+     * @param dest
+     *            ignored; kept so the call shape stays unchanged
+     * @return the value, in the range 0 to 2^32 - 1
+     * @throws ChmParsingException
+     *             if fewer than four bytes remain
+     */
+    private long unmarshalUInt32(byte[] data, long dest) {
+        ChmAssert.assertByteArrayNotNull(data);
+
+        if (4 > getDataRemained())
+            throw new ChmParsingException("4 > dataLenght");
+
+        final int place = this.getCurrentPlace();
+        final long value = (data[place] & 0xffL)
+                | (data[place + 1] & 0xffL) << 8
+                | (data[place + 2] & 0xffL) << 16
+                | (data[place + 3] & 0xffL) << 24;
+
+        setDataRemained(this.getDataRemained() - 4);
+        this.setCurrentPlace(place + 4);
+        return value;
+    }
+
+    /**
+     * Returns pmgi signature if exists
+     *
+     * @return signature
+     */
+    public byte[] getSignature() {
+        return signature;
+    }
+
+    /**
+     * Sets pmgi signature
+     *
+     * @param signature
+     */
+    protected void setSignature(byte[] signature) {
+        this.signature = signature;
+    }
+
+    /**
+     * Returns pmgi free space
+     *
+     * @return free_space
+     */
+    public long getFreeSpace() {
+        return free_space;
+    }
+
+    /**
+     * Sets pmgi free space
+     *
+     * @param free_space
+     */
+    protected void setFreeSpace(long free_space) {
+        this.free_space = free_space;
+    }
+
+    /**
+     * Returns textual representation of the pmgi header
+     */
+    @Override
+    public String toString() {
+        return new StringBuilder()
+                .append("signature:=").append(new String(getSignature()))
+                .append(", ")
+                .append("free space:=").append(getFreeSpace())
+                .append(System.getProperty("line.separator"))
+                .toString();
+    }
+
+    /**
+     * Reads the signature and the free-space value from {@code data} into the
+     * given header, then verifies the signature.
+     *
+     * @param data
+     *            raw bytes; must hold at least
+     *            {@code ChmConstants.CHM_PMGI_LEN} bytes
+     * @param chmPmgiHeader
+     *            header that receives the values
+     * @throws ChmParsingException
+     *             if {@code data} is too short or the header's signature does
+     *             not equal the PMGI marker
+     */
+    // @Override
+    public void parse(byte[] data, ChmPmgiHeader chmPmgiHeader) {
+        /* reject null before data.length is touched */
+        ChmAssert.assertByteArrayNotNull(data);
+
+        /* we only know how to deal with a 0x8 byte structures */
+        if (data.length < ChmConstants.CHM_PMGI_LEN)
+            throw new ChmParsingException(
+                    "we only know how to deal with a 0x8 byte structures");
+
+        /* restart at offset 0 so the same header object can be parsed again */
+        chmPmgiHeader.setCurrentPlace(0);
+
+        /* unmarshal fields */
+        chmPmgiHeader.unmarshalCharArray(data, chmPmgiHeader,
+                ChmConstants.CHM_SIGNATURE_LEN);
+        chmPmgiHeader.setFreeSpace(chmPmgiHeader.unmarshalUInt32(data,
+                chmPmgiHeader.getFreeSpace()));
+
+        /* check structure */
+        if (!Arrays.equals(chmPmgiHeader.getSignature(),
+                ChmConstants.CHM_PMGI_MARKER.getBytes()))
+            throw new ChmParsingException(
+                    "it does not seem to be valid a PMGI signature, check ChmItsp index_root if it was -1, means no PMGI, use PMGL insted");
+
+    }
+
+    /**
+     * Empty entry point; it performs no work.
+     *
+     * @param args
+     *            command-line arguments (not read)
+     */
+    public static void main(String[] args) {
+
+    }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmPmglHeader.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmConstants;
+import org.apache.tika.parser.chm.exception.ChmParsingException;
+
+/**
+ * Description There are two types of directory chunks -- index chunks, and
+ * listing chunks. The index chunk will be omitted if there is only one listing
+ * chunk. A listing chunk has the following format: 0000: char[4] 'PMGL' 0004:
+ * DWORD Length of free space and/or quickref area at end of directory chunk
+ * 0008: DWORD Always 0 000C: DWORD Chunk number of previous listing chunk when
+ * reading directory in sequence (-1 if this is the first listing chunk) 0010:
+ * DWORD Chunk number of next listing chunk when reading directory in sequence
+ * (-1 if this is the last listing chunk) 0014: Directory listing entries (to
+ * quickref area) Sorted by filename; the sort is case-insensitive The quickref
+ * area is written backwards from the end of the chunk. One quickref entry
+ * exists for every n entries in the file, where n is calculated as 1 + (1 <<
+ * quickref density). So for density = 2, n = 5 Chunklen-0002: WORD Number of
+ * entries in the chunk Chunklen-0004: WORD Offset of entry n from entry 0
+ * Chunklen-0008: WORD Offset of entry 2n from entry 0 Chunklen-000C: WORD
+ * Offset of entry 3n from entry 0 ... The format of a directory listing entry
+ * is as follows BYTE: length of name BYTEs: name (UTF-8 encoded) ENCINT:
+ * content section ENCINT: offset ENCINT: length The offset is from the
+ * beginning of the content section the file is in, after the section has been
+ * decompressed (if appropriate). The length also refers to length of the file
+ * in the section after decompression. There are two kinds of file represented
+ * in the directory: user data and format related files. The files which are
+ * format-related have names which begin with '::', the user data files have
+ * names which begin with "/".
+ *
+ * @see <a
+ *      href="http://translated.by/you/microsoft-s-html-help-chm-format-incomplete/original/?show-translation-form=1">Microsoft's
+ *      HTML Help (.chm) format (incomplete)</a>
+ *
+ * @author olegt
+ *
+ */
+public class ChmPmglHeader implements ChmAccessor<ChmPmglHeader> {
+ private static final long serialVersionUID = -6139486487475923593L;
+ private byte[] signature = new String(ChmConstants.PMGL).getBytes(); /*
+ * 0
+ * (PMGL
+ * )
+ */
+ private long free_space; /* 4 */
+ private long unknown_0008; /* 8 */
+ private int block_prev; /* c */
+ private int block_next; /* 10 */
+
+ /* local usage */
+ private int dataRemained;
+ private int currentPlace = 0;
+
+ private int getDataRemained() {
+ return dataRemained;
+ }
+
+ private void setDataRemained(int dataRemained) {
+ this.dataRemained = dataRemained;
+ }
+
+ private int getCurrentPlace() {
+ return currentPlace;
+ }
+
+ private void setCurrentPlace(int currentPlace) {
+ this.currentPlace = currentPlace;
+ }
+
+ public long getFreeSpace() {
+ return free_space;
+ }
+
+ public void setFreeSpace(long free_space) {
+ this.free_space = free_space;
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("signatute:=" + new String(getSignature()) + ", ");
+ sb.append("free space:=" + getFreeSpace() + ", ");
+ sb.append("unknown0008:=" + getUnknown0008() + ", ");
+ sb.append("prev block:=" + getBlockPrev() + ", ");
+ sb.append("next block:=" + getBlockNext()
+ + System.getProperty("line.separator"));
+ return sb.toString();
+ }
+
+ protected void unmarshalCharArray(byte[] data, ChmPmglHeader chmPmglHeader,
+ int count) {
+ ChmAssert.assertByteArrayNotNull(data);
+ this.setDataRemained(data.length);
+ System.arraycopy(data, 0, chmPmglHeader.signature, 0, count);
+ this.setCurrentPlace(this.getCurrentPlace() + count);
+ this.setDataRemained(this.getDataRemained() - count);
+ }
+
+ private int unmarshalInt32(byte[] data, int dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > this.getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ this.setDataRemained(this.getDataRemained() - 4);
+ return dest;
+ }
+
+ private long unmarshalUInt32(byte[] data, long dest) {
+ ChmAssert.assertByteArrayNotNull(data);
+ if (4 > getDataRemained())
+ throw new ChmParsingException("4 > dataLenght");
+ dest = data[this.getCurrentPlace()]
+ | data[this.getCurrentPlace() + 1] << 8
+ | data[this.getCurrentPlace() + 2] << 16
+ | data[this.getCurrentPlace() + 3] << 24;
+
+ setDataRemained(this.getDataRemained() - 4);
+ this.setCurrentPlace(this.getCurrentPlace() + 4);
+ return dest;
+ }
+
+ // @Override
+ public void parse(byte[] data, ChmPmglHeader chmPmglHeader) {
+ if (data.length < ChmConstants.CHM_PMGL_LEN)
+ throw new ChmParsingException(ChmPmglHeader.class.getName()
+ + " we only know how to deal with a 0x14 byte structures");
+
+ /* unmarshal fields */
+ chmPmglHeader.unmarshalCharArray(data, chmPmglHeader,
+ ChmConstants.CHM_SIGNATURE_LEN);
+ chmPmglHeader.setFreeSpace(chmPmglHeader.unmarshalUInt32(data,
+ chmPmglHeader.getFreeSpace()));
+ chmPmglHeader.setUnknown0008(chmPmglHeader.unmarshalUInt32(data,
+ chmPmglHeader.getUnknown0008()));
+ chmPmglHeader.setBlockPrev(chmPmglHeader.unmarshalInt32(data,
+ chmPmglHeader.getBlockPrev()));
+ chmPmglHeader.setBlockNext(chmPmglHeader.unmarshalInt32(data,
+ chmPmglHeader.getBlockNext()));
+
+ /* check structure */
+ if (!new String(chmPmglHeader.getSignature()).equals(ChmConstants.PMGL))
+ throw new ChmParsingException(ChmPmglHeader.class.getName()
+ + " pmgl != pmgl.signature");
+
+ }
+
+ public byte[] getSignature() {
+ return signature;
+ }
+
+ protected void setSignature(byte[] signature) {
+ this.signature = signature;
+ }
+
+ public long getUnknown0008() {
+ return unknown_0008;
+ }
+
+ protected void setUnknown0008(long unknown_0008) {
+ this.unknown_0008 = unknown_0008;
+ }
+
+ public int getBlockPrev() {
+ return block_prev;
+ }
+
+ protected void setBlockPrev(int block_prev) {
+ this.block_prev = block_prev;
+ }
+
+ public int getBlockNext() {
+ return block_next;
+ }
+
+ protected void setBlockNext(int block_next) {
+ this.block_next = block_next;
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+
+ }
+}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java?rev=1133047&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/DirectoryListingEntry.java Tue Jun 7 15:44:41 2011
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.chm.accessor;
+
+import org.apache.tika.parser.chm.assertion.ChmAssert;
+import org.apache.tika.parser.chm.core.ChmCommons;
+
+/**
+ * The format of a directory listing entry is as follows: BYTE: length of name
+ * BYTEs: name (UTF-8 encoded) ENCINT: content section ENCINT: offset ENCINT:
+ * length The offset is from the beginning of the content section the file is
+ * in, after the section has been decompressed (if appropriate). The length also
+ * refers to length of the file in the section after decompression. There are
+ * two kinds of file represented in the directory: user data and format related
+ * files. The files which are format-related have names which begin with '::',
+ * the user data files have names which begin with "/".
+ *
+ */
+public class DirectoryListingEntry {
+    /* Length of the entry name */
+    private int name_length;
+    /* Entry name or directory name */
+    private String name;
+    /* Entry type */
+    private ChmCommons.EntryType entryType;
+    /* Entry offset */
+    private int offset;
+    /* Entry size */
+    private int length;
+
+    /**
+     * Creates an entry with every field at its Java default value.
+     */
+    public DirectoryListingEntry() {
+
+    }
+
+    /**
+     * Creates an entry from the given values. The arguments are first handed
+     * to {@code ChmAssert.assertDirectoryListingEntry} and then stored through
+     * the setters.
+     *
+     * @param name_length
+     *            length of the entry name
+     * @param name
+     *            entry name
+     * @param isCompressed
+     *            entry type
+     * @param offset
+     *            entry offset
+     * @param length
+     *            entry size
+     */
+    public DirectoryListingEntry(int name_length, String name,
+            ChmCommons.EntryType isCompressed, int offset, int length) {
+        ChmAssert.assertDirectoryListingEntry(name_length, name, isCompressed,
+                offset, length);
+        setNameLength(name_length);
+        setName(name);
+        setEntryType(isCompressed);
+        setOffset(offset);
+        setLength(length);
+    }
+
+    /**
+     * Renders the five fields as {@code key:=value} pairs, separated by the
+     * platform line separator; nothing follows the last pair.
+     */
+    public String toString() {
+        final String eol = System.getProperty("line.separator");
+        return new StringBuilder()
+                .append("name_length:=").append(getNameLength()).append(eol)
+                .append("name:=").append(getName()).append(eol)
+                .append("entryType:=").append(getEntryType()).append(eol)
+                .append("offset:=").append(getOffset()).append(eol)
+                .append("length:=").append(getLength())
+                .toString();
+    }
+
+    /**
+     * @return length of the entry name
+     */
+    public int getNameLength() {
+        return this.name_length;
+    }
+
+    /**
+     * @param name_length
+     *            length of the entry name
+     */
+    protected void setNameLength(int name_length) {
+        this.name_length = name_length;
+    }
+
+    /**
+     * @return entry name
+     */
+    public String getName() {
+        return this.name;
+    }
+
+    /**
+     * @param name
+     *            entry name
+     */
+    protected void setName(String name) {
+        this.name = name;
+    }
+
+    /**
+     * @return entry type
+     */
+    public ChmCommons.EntryType getEntryType() {
+        return this.entryType;
+    }
+
+    /**
+     * @param entryType
+     *            entry type
+     */
+    protected void setEntryType(ChmCommons.EntryType entryType) {
+        this.entryType = entryType;
+    }
+
+    /**
+     * @return entry offset
+     */
+    public int getOffset() {
+        return this.offset;
+    }
+
+    /**
+     * @param offset
+     *            entry offset
+     */
+    protected void setOffset(int offset) {
+        this.offset = offset;
+    }
+
+    /**
+     * @return entry size
+     */
+    public int getLength() {
+        return this.length;
+    }
+
+    /**
+     * @param length
+     *            entry size
+     */
+    protected void setLength(int length) {
+        this.length = length;
+    }
+
+    /**
+     * Empty entry point; it performs no work.
+     */
+    public static void main(String[] args) {
+        // intentionally left blank
+    }
+}