You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/02/11 17:35:44 UTC
[tika] branch master updated: TIKA-2756 -- upgrade Jackcess and remove dependencies on commons-lang
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 888f51f TIKA-2756 -- upgrade Jackcess and remove dependencies on commons-lang
888f51f is described below
commit 888f51fe5ba7d735fecfa7d2aa4acb6fe8d69508
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Feb 11 12:35:32 2019 -0500
TIKA-2756 -- upgrade Jackcess and remove dependencies on commons-lang
---
tika-parsers/pom.xml | 17 +-
.../parser/microsoft/JackcessCompoundOleUtil.java | 253 -------
.../tika/parser/microsoft/JackcessExtractor.java | 14 +-
.../tika/parser/microsoft/JackcessOleUtil.java | 751 ---------------------
4 files changed, 10 insertions(+), 1025 deletions(-)
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index d4823dc..628359e 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -260,7 +260,7 @@
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
- <version>2.2.2</version>
+ <version>3.0.0</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
@@ -271,7 +271,7 @@
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess-encrypt</artifactId>
- <version>2.1.4</version>
+ <version>3.0.0</version>
<exclusions>
<exclusion>
<groupId>org.bouncycastle</groupId>
@@ -496,6 +496,11 @@
</exclusions>
</dependency>
<dependency>
+ <groupId>org.jdom</groupId>
+ <artifactId>jdom2</artifactId>
+ <version>2.0.6</version>
+ </dependency>
+ <dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.0.1-jre</version>
@@ -763,6 +768,7 @@
<groupId>org.apache.uima</groupId>
<artifactId>uimafit-core</artifactId>
<version>2.4.0</version>
+ <scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.apache.uima</groupId>
@@ -800,6 +806,7 @@
<groupId>org.apache.uima</groupId>
<artifactId>uimaj-core</artifactId>
<version>3.0.1</version>
+ <scope>provided</scope>
<exclusions>
<exclusion>
<groupId>javax.annotation</groupId>
@@ -807,12 +814,6 @@
</exclusion>
</exclusions>
</dependency>
-
- <dependency>
- <groupId>org.jdom</groupId>
- <artifactId>jdom2</artifactId>
- <version>2.0.6</version>
- </dependency>
<!--Jackson parse String to JSON-->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
deleted file mode 100644
index 4786cc8..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessCompoundOleUtil.java
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
-Copyright (c) 2013 James Ahlborn
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package org.apache.tika.parser.microsoft;
-
-import com.healthmarketscience.jackcess.RuntimeIOException;
-import com.healthmarketscience.jackcess.impl.ByteUtil;
-import com.healthmarketscience.jackcess.impl.CustomToStringStyle;
-import com.healthmarketscience.jackcess.util.MemFileChannel;
-import com.healthmarketscience.jackcess.util.OleBlob;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
-import java.net.URLEncoder;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * Temporary copy/paste from Jackcess to allow upgrade to POI 4.0.0.
- * This class will be removed once POI 4.0.0 is released and jackcess
- * updates to the most recent version of POI.
- * @deprecated -- this class will be removed in Tika >= 1.20
- */
-@Deprecated
-class JackcessCompoundOleUtil implements JackcessOleUtil.CompoundPackageFactory {
- private static final String ENTRY_NAME_CHARSET = "UTF-8";
- private static final String ENTRY_SEPARATOR = "/";
- private static final String CONTENTS_ENTRY = "CONTENTS";
-
- static {
- // force a poi class to be loaded to ensure that when this class is
- // loaded, we know that the poi classes are available
- POIFSFileSystem.class.getName();
- }
-
- public JackcessCompoundOleUtil() {
- }
-
- /**
- * Creates a nes CompoundContent for the given blob information.
- */
- public JackcessOleUtil.ContentImpl createCompoundPackageContent(
- JackcessOleUtil.OleBlobImpl blob, String prettyName, String className, String typeName,
- ByteBuffer blobBb, int dataBlockLen) {
- return new CompoundContentImpl(blob, prettyName, className, typeName,
- blobBb.position(), dataBlockLen);
- }
-
- /**
- * Gets a DocumentEntry from compound storage based on a fully qualified,
- * encoded entry name.
- *
- * @param entryName fully qualified, encoded entry name
- * @param dir root directory of the compound storage
- * @return the relevant DocumentEntry
- * @throws FileNotFoundException if the entry does not exist
- * @throws IOException if some other io error occurs
- */
- public static DocumentEntry getDocumentEntry(String entryName,
- DirectoryEntry dir)
- throws IOException {
- // split entry name into individual components and decode them
- List<String> entryNames = new ArrayList<String>();
- for (String str : entryName.split(ENTRY_SEPARATOR)) {
- if (str.length() == 0) {
- continue;
- }
- entryNames.add(decodeEntryName(str));
- }
-
- DocumentEntry entry = null;
- Iterator<String> iter = entryNames.iterator();
- while (iter.hasNext()) {
- org.apache.poi.poifs.filesystem.Entry tmpEntry = dir.getEntry(iter.next());
- if (tmpEntry instanceof DirectoryEntry) {
- dir = (DirectoryEntry) tmpEntry;
- } else if (!iter.hasNext() && (tmpEntry instanceof DocumentEntry)) {
- entry = (DocumentEntry) tmpEntry;
- } else {
- break;
- }
- }
-
- if (entry == null) {
- throw new FileNotFoundException("Could not find document " + entryName);
- }
-
- return entry;
- }
-
- private static String encodeEntryName(String name) {
- try {
- return URLEncoder.encode(name, ENTRY_NAME_CHARSET);
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
- }
-
- private static String decodeEntryName(String name) {
- try {
- return URLDecoder.decode(name, ENTRY_NAME_CHARSET);
- } catch (UnsupportedEncodingException e) {
- throw new RuntimeException(e);
- }
- }
-
- private static final class CompoundContentImpl
- extends JackcessOleUtil.EmbeddedPackageContentImpl
- implements OleBlob.CompoundContent {
- private POIFSFileSystem _fs;
-
- private CompoundContentImpl(
- JackcessOleUtil.OleBlobImpl blob, String prettyName, String className,
- String typeName, int position, int length) {
- super(blob, prettyName, className, typeName, position, length);
- }
-
- public OleBlob.ContentType getType() {
- return OleBlob.ContentType.COMPOUND_STORAGE;
- }
-
- private POIFSFileSystem getFileSystem() throws IOException {
- if (_fs == null) {
- _fs = new POIFSFileSystem(MemFileChannel.newChannel(getStream(), "r"));
- }
- return _fs;
- }
-
- public Iterator<Entry> iterator() {
- try {
- return getEntries(new ArrayList<Entry>(), getFileSystem().getRoot(),
- ENTRY_SEPARATOR).iterator();
- } catch (IOException e) {
- throw new RuntimeIOException(e);
- }
- }
-
- public EntryImpl getEntry(String entryName) throws IOException {
- return new EntryImpl(entryName,
- getDocumentEntry(entryName, getFileSystem().getRoot()));
- }
-
- public boolean hasContentsEntry() throws IOException {
- return getFileSystem().getRoot().hasEntry(CONTENTS_ENTRY);
- }
-
- public EntryImpl getContentsEntry() throws IOException {
- return getEntry(CONTENTS_ENTRY);
- }
-
- private List<Entry> getEntries(List<Entry> entries, DirectoryEntry dir,
- String prefix) {
- for (org.apache.poi.poifs.filesystem.Entry entry : dir) {
- if (entry instanceof DirectoryEntry) {
- // .. recurse into this directory
- getEntries(entries, (DirectoryEntry) entry, prefix + ENTRY_SEPARATOR);
- } else if (entry instanceof DocumentEntry) {
- // grab the entry name/detils
- DocumentEntry de = (DocumentEntry) entry;
- String entryName = prefix + encodeEntryName(entry.getName());
- entries.add(new EntryImpl(entryName, de));
- }
- }
- return entries;
- }
-
- @Override
- public void close() {
- ByteUtil.closeQuietly(_fs);
- _fs = null;
- super.close();
- }
-
-
-
- private final class EntryImpl implements OleBlob.CompoundContent.Entry {
- private final String _name;
- private final DocumentEntry _docEntry;
-
- private EntryImpl(String name, DocumentEntry docEntry) {
- _name = name;
- _docEntry = docEntry;
- }
-
- public OleBlob.ContentType getType() {
- return OleBlob.ContentType.UNKNOWN;
- }
-
- public String getName() {
- return _name;
- }
-
- public CompoundContentImpl getParent() {
- return CompoundContentImpl.this;
- }
-
- public JackcessOleUtil.OleBlobImpl getBlob() {
- return getParent().getBlob();
- }
-
- public long length() {
- return _docEntry.getSize();
- }
-
- public InputStream getStream() throws IOException {
- return new DocumentInputStream(_docEntry);
- }
-
- public void writeTo(OutputStream out) throws IOException {
- InputStream in = null;
- try {
- ByteUtil.copy(in = getStream(), out);
- } finally {
- ByteUtil.closeQuietly(in);
- }
- }
-
- @Override
- public String toString() {
- return CustomToStringStyle.valueBuilder(this)
- .append("name", _name)
- .append("length", length())
- .toString();
- }
- }
- }
-}
-
-
-
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 3a10346..1ae4ab8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -304,7 +304,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
private void handleOLE(Row row, String cName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
- OleBlob blob = getBlob(row, cName);
+ OleBlob blob = row.getBlob(cName);
//lifted shamelessly from Jackcess's OleBlobTest
if (blob == null)
return;
@@ -368,18 +368,6 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
}
}
- /*
- Temporary work around until POI 4.0.0 is released and jackcess upgrades
- This is copy/pasted from jackcess
- */
- private OleBlob getBlob(Row row, String cName) {
- byte[] bytes = row.getBytes(cName);
- if (bytes == null) {
- return null;
- }
- return JackcessOleUtil.parseBlob(bytes);
- }
-
private void handleCompoundContent(OleBlob.CompoundContent cc, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
InputStream is = null;
POIFSFileSystem fileSystem = null;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
deleted file mode 100644
index d93e6ab..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/JackcessOleUtil.java
+++ /dev/null
@@ -1,751 +0,0 @@
-/*
-Copyright (c) 2013 James Ahlborn
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package org.apache.tika.parser.microsoft;
-
-import java.io.ByteArrayInputStream;
-import java.io.Closeable;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.ByteBuffer;
-import java.nio.charset.Charset;
-import java.sql.Blob;
-import java.sql.SQLException;
-import java.sql.SQLFeatureNotSupportedException;
-import java.text.Normalizer;
-import java.util.EnumSet;
-import java.util.Set;
-import java.util.regex.Pattern;
-
-import com.healthmarketscience.jackcess.DataType;
-import com.healthmarketscience.jackcess.util.OleBlob;
-import static com.healthmarketscience.jackcess.util.OleBlob.*;
-
-import com.healthmarketscience.jackcess.impl.ByteUtil;
-import com.healthmarketscience.jackcess.impl.PageChannel;
-
-/**
- * Utility code for working with OLE data.
- * Temporary workaround until POI 4.0.0 is released and Jackcess is updated
- *
- *
- * @author James Ahlborn
- * @usage _advanced_class_
- * @deprecated this class will be removed in Tika >= 1.20
- */
-@Deprecated
-class JackcessOleUtil {
-
-
- /**
- * Interface used to allow optional inclusion of the poi library for working
- * with compound ole data.
- */
- interface CompoundPackageFactory
- {
- public ContentImpl createCompoundPackageContent(
- OleBlobImpl blob, String prettyName, String className, String typeName,
- ByteBuffer blobBb, int dataBlockLen);
- }
-
- private static final int PACKAGE_SIGNATURE = 0x1C15;
- private static final Charset OLE_CHARSET = Charset.forName("US-ASCII");
- private static final Charset OLE_UTF_CHARSET = Charset.forName("UTF-16LE");
- private static final byte[] COMPOUND_STORAGE_SIGNATURE =
- {(byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
- (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
- private static final String SIMPLE_PACKAGE_TYPE = "Package";
- private static final int PACKAGE_OBJECT_TYPE = 0x02;
- private static final int OLE_VERSION = 0x0501;
- private static final int OLE_FORMAT = 0x02;
- private static final int PACKAGE_STREAM_SIGNATURE = 0x02;
- private static final int PS_EMBEDDED_FILE = 0x030000;
- private static final int PS_LINKED_FILE = 0x010000;
- private static final Set<ContentType> WRITEABLE_TYPES = EnumSet.of(
- ContentType.LINK, ContentType.SIMPLE_PACKAGE, ContentType.OTHER);
- private static final byte[] NO_DATA = new byte[0];
- private static final int LINK_HEADER = 0x01;
- private static final byte[] PACKAGE_FOOTER = {
- 0x01, 0x05, 0x00, 0x00, 0x00, 0x00,
- 0x00, 0x00, 0x01, (byte)0xAD, 0x05, (byte)0xFE
- };
-
- // regex pattern which matches all the crazy extra stuff in unicode
- private static final Pattern UNICODE_ACCENT_PATTERN =
- Pattern.compile("[\\p{InCombiningDiacriticalMarks}\\p{IsLm}\\p{IsSk}]+");
-
- private static final CompoundPackageFactory COMPOUND_FACTORY;
-
- static {
- CompoundPackageFactory compoundFactory = null;
- try {
- compoundFactory = (CompoundPackageFactory)
- Class.forName("org.apache.tika.parser.microsoft.JackcessCompoundOleUtil")
- .newInstance();
- } catch(Throwable t) {
- // must not have poi, will load compound ole data as "other"
- }
- COMPOUND_FACTORY = compoundFactory;
- }
-
- /**
- * Parses an access database blob structure and returns an appropriate
- * OleBlob instance.
- */
- public static OleBlob parseBlob(byte[] bytes) {
- return new OleBlobImpl(bytes);
- }
-
- /**
- * Creates a new OlBlob instance using the given information.
- */
- public static OleBlob createBlob(Builder oleBuilder)
- throws IOException
- {
- try {
-
- if(!WRITEABLE_TYPES.contains(oleBuilder.getType())) {
- throw new IllegalArgumentException(
- "Cannot currently create ole values of type " +
- oleBuilder.getType());
- }
-
- long contentLen = oleBuilder.getContentLength();
- byte[] contentBytes = oleBuilder.getBytes();
- InputStream contentStream = oleBuilder.getStream();
- byte[] packageStreamHeader = NO_DATA;
- byte[] packageStreamFooter = NO_DATA;
-
- switch(oleBuilder.getType()) {
- case LINK:
- packageStreamHeader = writePackageStreamHeader(oleBuilder);
-
- // link "content" is file path
- contentBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
- contentLen = contentBytes.length;
- break;
-
- case SIMPLE_PACKAGE:
- packageStreamHeader = writePackageStreamHeader(oleBuilder);
- packageStreamFooter = writePackageStreamFooter(oleBuilder);
- break;
-
- case OTHER:
- // nothing more to do
- break;
- default:
- throw new RuntimeException("unexpected type " + oleBuilder.getType());
- }
-
- long payloadLen = packageStreamHeader.length + packageStreamFooter.length +
- contentLen;
- byte[] packageHeader = writePackageHeader(oleBuilder, payloadLen);
-
- long totalOleLen = packageHeader.length + PACKAGE_FOOTER.length +
- payloadLen;
- if(totalOleLen > DataType.OLE.getMaxSize()) {
- throw new IllegalArgumentException("Content size of " + totalOleLen +
- " is too large for ole column");
- }
-
- byte[] oleBytes = new byte[(int)totalOleLen];
- ByteBuffer bb = PageChannel.wrap(oleBytes);
- bb.put(packageHeader);
- bb.put(packageStreamHeader);
-
- if(contentLen > 0L) {
- if(contentBytes != null) {
- bb.put(contentBytes);
- } else {
- byte[] buf = new byte[8192];
- int numBytes = 0;
- while((numBytes = contentStream.read(buf)) >= 0) {
- bb.put(buf, 0, numBytes);
- }
- }
- }
-
- bb.put(packageStreamFooter);
- bb.put(PACKAGE_FOOTER);
-
- return parseBlob(oleBytes);
-
- } finally {
- ByteUtil.closeQuietly(oleBuilder.getStream());
- }
- }
-
- private static byte[] writePackageHeader(Builder oleBuilder,
- long contentLen) {
-
- byte[] prettyNameBytes = getZeroTermStrBytes(oleBuilder.getPrettyName());
- String className = oleBuilder.getClassName();
- String typeName = oleBuilder.getTypeName();
- if(className == null) {
- className = typeName;
- } else if(typeName == null) {
- typeName = className;
- }
- byte[] classNameBytes = getZeroTermStrBytes(className);
- byte[] typeNameBytes = getZeroTermStrBytes(typeName);
-
- int packageHeaderLen = 20 + prettyNameBytes.length + classNameBytes.length;
-
- int oleHeaderLen = 24 + typeNameBytes.length;
-
- byte[] headerBytes = new byte[packageHeaderLen + oleHeaderLen];
-
- ByteBuffer bb = PageChannel.wrap(headerBytes);
-
- // write outer package header
- bb.putShort((short)PACKAGE_SIGNATURE);
- bb.putShort((short)packageHeaderLen);
- bb.putInt(PACKAGE_OBJECT_TYPE);
- bb.putShort((short)prettyNameBytes.length);
- bb.putShort((short)classNameBytes.length);
- int prettyNameOff = bb.position() + 8;
- bb.putShort((short)prettyNameOff);
- bb.putShort((short)(prettyNameOff + prettyNameBytes.length));
- bb.putInt(-1);
- bb.put(prettyNameBytes);
- bb.put(classNameBytes);
-
- // put ole header
- bb.putInt(OLE_VERSION);
- bb.putInt(OLE_FORMAT);
- bb.putInt(typeNameBytes.length);
- bb.put(typeNameBytes);
- bb.putLong(0L);
- bb.putInt((int)contentLen);
-
- return headerBytes;
- }
-
- private static byte[] writePackageStreamHeader(Builder oleBuilder) {
-
- byte[] fileNameBytes = getZeroTermStrBytes(oleBuilder.getFileName());
- byte[] filePathBytes = getZeroTermStrBytes(oleBuilder.getFilePath());
-
- int headerLen = 6 + fileNameBytes.length + filePathBytes.length;
-
- if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
-
- headerLen += 8 + filePathBytes.length;
-
- } else {
-
- headerLen += 2;
- }
-
- byte[] headerBytes = new byte[headerLen];
- ByteBuffer bb = PageChannel.wrap(headerBytes);
- bb.putShort((short)PACKAGE_STREAM_SIGNATURE);
- bb.put(fileNameBytes);
- bb.put(filePathBytes);
-
- if(oleBuilder.getType() == ContentType.SIMPLE_PACKAGE) {
- bb.putInt(PS_EMBEDDED_FILE);
- bb.putInt(filePathBytes.length);
- bb.put(filePathBytes, 0, filePathBytes.length);
- bb.putInt((int) oleBuilder.getContentLength());
- } else {
- bb.putInt(PS_LINKED_FILE);
- bb.putShort((short)LINK_HEADER);
- }
-
- return headerBytes;
- }
-
- private static byte[] writePackageStreamFooter(Builder oleBuilder) {
-
- // note, these are _not_ zero terminated
- byte[] fileNameBytes = oleBuilder.getFileName().getBytes(OLE_UTF_CHARSET);
- byte[] filePathBytes = oleBuilder.getFilePath().getBytes(OLE_UTF_CHARSET);
-
- int footerLen = 12 + (filePathBytes.length * 2) + fileNameBytes.length;
-
- byte[] footerBytes = new byte[footerLen];
- ByteBuffer bb = PageChannel.wrap(footerBytes);
-
- bb.putInt(filePathBytes.length/2);
- bb.put(filePathBytes);
- bb.putInt(fileNameBytes.length/2);
- bb.put(fileNameBytes);
- bb.putInt(filePathBytes.length/2);
- bb.put(filePathBytes);
-
- return footerBytes;
- }
-
- /**
- * creates the appropriate ContentImpl for the given blob.
- */
- private static ContentImpl parseContent(OleBlobImpl blob)
- throws IOException
- {
- ByteBuffer bb = PageChannel.wrap(blob.getBytes());
-
- if((bb.remaining() < 2) || (bb.getShort() != PACKAGE_SIGNATURE)) {
- return new UnknownContentImpl(blob);
- }
-
- // read outer package header
- int headerSize = bb.getShort();
- /* int objType = */ bb.getInt();
- int prettyNameLen = bb.getShort();
- int classNameLen = bb.getShort();
- int prettyNameOff = bb.getShort();
- int classNameOff = bb.getShort();
- /* int objSize = */ bb.getInt();
- String prettyName = readStr(bb, prettyNameOff, prettyNameLen);
- String className = readStr(bb, classNameOff, classNameLen);
- bb.position(headerSize);
-
- // read ole header
- int oleVer = bb.getInt();
- /* int format = */ bb.getInt();
-
- if(oleVer != OLE_VERSION) {
- return new UnknownContentImpl(blob);
- }
-
- int typeNameLen = bb.getInt();
- String typeName = readStr(bb, bb.position(), typeNameLen);
- bb.getLong(); // unused
- int dataBlockLen = bb.getInt();
- int dataBlockPos = bb.position();
-
-
- if(SIMPLE_PACKAGE_TYPE.equalsIgnoreCase(typeName)) {
- return createSimplePackageContent(
- blob, prettyName, className, typeName, bb, dataBlockLen);
- }
-
- // if COMPOUND_FACTORY is null, the poi library isn't available, so just
- // load compound data as "other"
- if((COMPOUND_FACTORY != null) &&
- (bb.remaining() >= COMPOUND_STORAGE_SIGNATURE.length) &&
- ByteUtil.matchesRange(bb, bb.position(), COMPOUND_STORAGE_SIGNATURE)) {
- return COMPOUND_FACTORY.createCompoundPackageContent(
- blob, prettyName, className, typeName, bb, dataBlockLen);
- }
-
- // this is either some other "special" (as yet unhandled) format, or it is
- // simply an embedded file (or it is compound data and poi isn't available)
- return new OtherContentImpl(blob, prettyName, className,
- typeName, dataBlockPos, dataBlockLen);
- }
-
- private static ContentImpl createSimplePackageContent(
- OleBlobImpl blob, String prettyName, String className, String typeName,
- ByteBuffer blobBb, int dataBlockLen) {
-
- int dataBlockPos = blobBb.position();
- ByteBuffer bb = PageChannel.narrowBuffer(blobBb, dataBlockPos,
- dataBlockPos + dataBlockLen);
-
- int packageSig = bb.getShort();
- if(packageSig != PACKAGE_STREAM_SIGNATURE) {
- return new OtherContentImpl(blob, prettyName, className,
- typeName, dataBlockPos, dataBlockLen);
- }
-
- String fileName = readZeroTermStr(bb);
- String filePath = readZeroTermStr(bb);
- int packageType = bb.getInt();
-
- if(packageType == PS_EMBEDDED_FILE) {
-
- int localFilePathLen = bb.getInt();
- String localFilePath = readStr(bb, bb.position(), localFilePathLen);
- int dataLen = bb.getInt();
- int dataPos = bb.position();
- bb.position(dataLen + dataPos);
-
- // remaining strings are in "reverse" order (local file path, file name,
- // file path). these string usee a real utf charset, and therefore can
- // "fix" problems with ascii based names (so we prefer these strings to
- // the original strings we found)
- int strNum = 0;
- while(true) {
-
- int rem = bb.remaining();
- if(rem < 4) {
- break;
- }
-
- int strLen = bb.getInt();
- String remStr = readStr(bb, bb.position(), strLen * 2, OLE_UTF_CHARSET);
-
- switch(strNum) {
- case 0:
- localFilePath = remStr;
- break;
- case 1:
- fileName = remStr;
- break;
- case 2:
- filePath = remStr;
- break;
- default:
- // ignore
- }
-
- ++strNum;
- }
-
- return new SimplePackageContentImpl(
- blob, prettyName, className, typeName, dataPos, dataLen,
- fileName, filePath, localFilePath);
- }
-
- if(packageType == PS_LINKED_FILE) {
-
- bb.getShort(); //unknown
- String linkStr = readZeroTermStr(bb);
-
- return new LinkContentImpl(blob, prettyName, className, typeName,
- fileName, linkStr, filePath);
- }
-
- return new OtherContentImpl(blob, prettyName, className,
- typeName, dataBlockPos, dataBlockLen);
- }
-
- private static String readStr(ByteBuffer bb, int off, int len) {
- return readStr(bb, off, len, OLE_CHARSET);
- }
-
- private static String readZeroTermStr(ByteBuffer bb) {
- int off = bb.position();
- while(bb.hasRemaining()) {
- byte b = bb.get();
- if(b == 0) {
- break;
- }
- }
- int len = bb.position() - off;
- return readStr(bb, off, len);
- }
-
- private static String readStr(ByteBuffer bb, int off, int len,
- Charset charset) {
- String str = new String(bb.array(), off, len, charset);
- bb.position(off + len);
- if(str.charAt(str.length() - 1) == '\0') {
- str = str.substring(0, str.length() - 1);
- }
- return str;
- }
-
- private static byte[] getZeroTermStrBytes(String str) {
- // since we are converting to ascii, try to make "nicer" versions of crazy
- // chars (e.g. convert "u with an umlaut" to just "u"). this may not
- // ultimately help anything but it is what ms access does.
-
- // decompose complex chars into combos of char and accent
- str = Normalizer.normalize(str, Normalizer.Form.NFD);
- // strip the accents
- str = UNICODE_ACCENT_PATTERN.matcher(str).replaceAll("");
- // (re)normalize what is left
- str = Normalizer.normalize(str, Normalizer.Form.NFC);
-
- return (str + '\0').getBytes(OLE_CHARSET);
- }
-
-
- static final class OleBlobImpl implements OleBlob
- {
- private byte[] _bytes;
- private ContentImpl _content;
-
- private OleBlobImpl(byte[] bytes) {
- _bytes = bytes;
- }
-
- public void writeTo(OutputStream out) throws IOException {
- out.write(_bytes);
- }
-
- public Content getContent() throws IOException {
- if(_content == null) {
- _content = parseContent(this);
- }
- return _content;
- }
-
- public InputStream getBinaryStream() throws SQLException {
- return new ByteArrayInputStream(_bytes);
- }
-
- public InputStream getBinaryStream(long pos, long len)
- throws SQLException
- {
- return new ByteArrayInputStream(_bytes, fromJdbcOffset(pos), (int)len);
- }
-
- public long length() throws SQLException {
- return _bytes.length;
- }
-
- public byte[] getBytes() throws IOException {
- if(_bytes == null) {
- throw new IOException("blob is closed");
- }
- return _bytes;
- }
-
- public byte[] getBytes(long pos, int len) throws SQLException {
- return ByteUtil.copyOf(_bytes, fromJdbcOffset(pos), len);
- }
-
- public long position(byte[] pattern, long start) throws SQLException {
- int pos = ByteUtil.findRange(PageChannel.wrap(_bytes),
- fromJdbcOffset(start), pattern);
- return((pos >= 0) ? toJdbcOffset(pos) : pos);
- }
-
- public long position(Blob pattern, long start) throws SQLException {
- return position(pattern.getBytes(1L, (int)pattern.length()), start);
- }
-
- public OutputStream setBinaryStream(long position) throws SQLException {
- throw new SQLFeatureNotSupportedException();
- }
-
- public void truncate(long len) throws SQLException {
- throw new SQLFeatureNotSupportedException();
- }
-
- public int setBytes(long pos, byte[] bytes) throws SQLException {
- throw new SQLFeatureNotSupportedException();
- }
-
- public int setBytes(long pos, byte[] bytes, int offset, int lesn)
- throws SQLException {
- throw new SQLFeatureNotSupportedException();
- }
-
- public void free() {
- close();
- }
-
- public void close() {
- _bytes = null;
- ByteUtil.closeQuietly(_content);
- _content = null;
- }
-
- private static int toJdbcOffset(int off) {
- return off + 1;
- }
-
- private static int fromJdbcOffset(long off) {
- return (int)off - 1;
- }
-
- }
-
- static abstract class ContentImpl implements Content, Closeable
- {
- protected final OleBlobImpl _blob;
-
- protected ContentImpl(OleBlobImpl blob) {
- _blob = blob;
- }
-
- public OleBlobImpl getBlob() {
- return _blob;
- }
-
- protected byte[] getBytes() throws IOException {
- return getBlob().getBytes();
- }
-
- public void close() {
- // base does nothing
- }
-
- }
-
- static abstract class EmbeddedContentImpl extends ContentImpl
- implements EmbeddedContent
- {
- private final int _position;
- private final int _length;
-
- protected EmbeddedContentImpl(OleBlobImpl blob, int position, int length)
- {
- super(blob);
- _position = position;
- _length = length;
- }
-
- public long length() {
- return _length;
- }
-
- public InputStream getStream() throws IOException {
- return new ByteArrayInputStream(getBytes(), _position, _length);
- }
-
- public void writeTo(OutputStream out) throws IOException {
- out.write(getBytes(), _position, _length);
- }
- }
-
- static abstract class EmbeddedPackageContentImpl
- extends EmbeddedContentImpl
- implements PackageContent
- {
- private final String _prettyName;
- private final String _className;
- private final String _typeName;
-
- protected EmbeddedPackageContentImpl(
- OleBlobImpl blob, String prettyName, String className,
- String typeName, int position, int length)
- {
- super(blob, position, length);
- _prettyName = prettyName;
- _className = className;
- _typeName = typeName;
- }
-
- public String getPrettyName() {
- return _prettyName;
- }
-
- public String getClassName() {
- return _className;
- }
-
- public String getTypeName() {
- return _typeName;
- }
-
- }
-
- private static final class LinkContentImpl
- extends EmbeddedPackageContentImpl
- implements LinkContent
- {
- private final String _fileName;
- private final String _linkPath;
- private final String _filePath;
-
- private LinkContentImpl(OleBlobImpl blob, String prettyName,
- String className, String typeName,
- String fileName, String linkPath,
- String filePath)
- {
- super(blob, prettyName, className, typeName, -1, -1);
- _fileName = fileName;
- _linkPath = linkPath;
- _filePath = filePath;
- }
-
- public ContentType getType() {
- return ContentType.LINK;
- }
-
- public String getFileName() {
- return _fileName;
- }
-
- public String getLinkPath() {
- return _linkPath;
- }
-
- public String getFilePath() {
- return _filePath;
- }
-
- public InputStream getLinkStream() throws IOException {
- return new FileInputStream(getLinkPath());
- }
- }
-
- private static final class SimplePackageContentImpl
- extends EmbeddedPackageContentImpl
- implements SimplePackageContent
- {
- private final String _fileName;
- private final String _filePath;
- private final String _localFilePath;
-
- private SimplePackageContentImpl(OleBlobImpl blob, String prettyName,
- String className, String typeName,
- int position, int length,
- String fileName, String filePath,
- String localFilePath)
- {
- super(blob, prettyName, className, typeName, position, length);
- _fileName = fileName;
- _filePath = filePath;
- _localFilePath = localFilePath;
- }
-
- public ContentType getType() {
- return ContentType.SIMPLE_PACKAGE;
- }
-
- public String getFileName() {
- return _fileName;
- }
-
- public String getFilePath() {
- return _filePath;
- }
-
- public String getLocalFilePath() {
- return _localFilePath;
- }
-
- }
-
- private static final class OtherContentImpl
- extends EmbeddedPackageContentImpl
- implements OtherContent
- {
- private OtherContentImpl(
- OleBlobImpl blob, String prettyName, String className,
- String typeName, int position, int length)
- {
- super(blob, prettyName, className, typeName, position, length);
- }
-
- public ContentType getType() {
- return ContentType.OTHER;
- }
-
- }
-
- private static final class UnknownContentImpl extends ContentImpl
- {
- private UnknownContentImpl(OleBlobImpl blob) {
- super(blob);
- }
-
- public ContentType getType() {
- return ContentType.UNKNOWN;
- }
-
- }
-
- }