You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/22 13:48:49 UTC

[tika] branch master updated: TIKA-2909 -- trivial formatting updates and add entry to CHANGES.txt file

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new bc16d2e  TIKA-2909 -- trivial formatting updates and add entry to CHANGES.txt file
bc16d2e is described below

commit bc16d2e83b70d09167f4cf0a493e509ee1705320
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jul 22 09:48:37 2019 -0400

    TIKA-2909 -- trivial formatting updates and add entry to CHANGES.txt file
---
 CHANGES.txt                                        |    2 +
 .../apache/tika/parser/hwp/HwpStreamReader.java    |  263 ++---
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java | 1007 ++++++++++----------
 .../org/apache/tika/parser/hwp/HwpV5Parser.java    |   45 +-
 .../apache/tika/parser/hwp/HwpV5ParserTest.java    |   72 +-
 5 files changed, 682 insertions(+), 707 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index e3d3ea6..3072fd9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ Release 1.22 - ???
    * NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
      between 0xF000 and 0XF0000 will cause an exception.
 
+   * Add parser for HWP v5 files via SooMyung Lee (soomyung) (TIKA-2909).
+
    * Fix order of closing streams to avoid "Failed to close temporary resource"
      exception (TIKA-2908).
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
index badcf20..978b361 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -1,131 +1,134 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
-
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.hwp;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
-
-public class HwpStreamReader {
-	private InputStream input;
-	private byte[] buf;
-
-	public HwpStreamReader(InputStream inputStream) {
-		this.input = inputStream;
-		buf = new byte[4];
-	}
-
-	/**
-	 * More data to read ?
-	 * 
-	 * @return
-	 * @throws IOException
-	 */
-	public boolean available() throws IOException {
-		return input.available() > 0;
-	}
-
-	/**
-	 * unsigned 1 byte
-	 * 
-	 * @return
-	 * @throws IOException
-	 */
-	public short uint8() throws IOException {
-		int read = IOUtils.readFully(input, buf, 0, 1);
-
-		if (read == -1)
-			return -1;
-
-		return LittleEndian.getUByte(buf);
-	}
-
-	/**
-	 * unsigned 2 byte
-	 * 
-	 * @return
-	 * @throws IOException
-	 */
-	public int uint16() throws IOException {
-		int read = IOUtils.readFully(input, buf, 0, 2);
-
-		if (read == -1)
-			return -1;
-
-		if (read < 2)
-			throw new EOFException();
-
-		return LittleEndian.getUShort(buf);
-	}
-
-	/**
-	 * unsigned 2 byte array
-	 * 
-	 * @param i
-	 * @return
-	 * @throws IOException
-	 */
-	public int[] uint16(int i) throws IOException {
-		if (i <= 0)
-			throw new IllegalArgumentException();
-
-		byte[] buf = new byte[i * 2];
-		int read = IOUtils.readFully(input, buf, 0, i * 2);
-
-		if (read != i * 2)
-			throw new EOFException();
-		
-		int[] uints = new int[i];
-		for (int ii = 0; ii < i; ii++) {
-			uints[ii] = LittleEndian.getUShort(buf, ii * 2);
-		}
-
-		return uints;
-	}
-
-	/**
-	 * unsigned 4 byte
-	 * 
-	 * @return
-	 * @throws IOException
-	 */
-	public long uint32() throws IOException {
-		int read = IOUtils.readFully(input, buf, 0, 4);
-
-		if (read == -1)
-			return -1;
-
-		if (read < 4)
-			throw new EOFException();
-
-		return LittleEndian.getUInt(buf);
-	}
-
-	/**
-	 * ensure skip of n byte
-	 * 
-	 * @param n
-	 * @throws IOException
-	 */
-	public void ensureSkip(long n) throws IOException {
-		IOUtils.skipFully(input, n);
-	}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+
+public class HwpStreamReader { // reads little-endian unsigned values from an HWP stream
+    private InputStream input; // source of every read and skip below
+    private byte[] buf; // 4-byte scratch buffer reused by uint8(), uint16() and uint32()
+
+    public HwpStreamReader(InputStream inputStream) {
+        this.input = inputStream;
+        buf = new byte[4];
+    }
+
+    /**
+     * Reports whether the wrapped stream says more data can be read.
+     *
+     * @return true if {@code input.available()} is greater than zero
+     * @throws IOException if {@code input.available()} throws
+     */
+    public boolean available() throws IOException {
+        return input.available() > 0;
+    }
+
+    /**
+     * Reads one byte as an unsigned value.
+     *
+     * @return the value read, or -1 if {@code IOUtils.readFully} returned -1
+     * @throws IOException if reading from the stream fails
+     */
+    public short uint8() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 1);
+        // -1 from readFully is passed on as -1 (presumably end of stream -- confirm in POI docs)
+        if (read == -1) {
+            return -1;
+        }
+
+        return LittleEndian.getUByte(buf);
+    }
+
+    /**
+     * Reads two bytes as an unsigned little-endian value.
+     *
+     * @return the value read, or -1 if {@code IOUtils.readFully} returned -1
+     * @throws IOException if the read fails; EOFException if fewer than 2 bytes were read
+     */
+    public int uint16() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 2);
+
+        if (read == -1) {
+            return -1;
+        }
+        // a partial read (exactly 1 byte) is an error rather than a value
+        if (read < 2) {
+            throw new EOFException();
+        }
+        return LittleEndian.getUShort(buf);
+    }
+
+    /**
+     * Reads {@code i} consecutive unsigned 2-byte little-endian values.
+     *
+     * @param i number of values to read; IllegalArgumentException is thrown if not positive
+     * @return array of {@code i} values, in the order they were read
+     * @throws IOException if the read fails; EOFException if {@code i * 2} bytes were not read
+     */
+    public int[] uint16(int i) throws IOException {
+        if (i <= 0) {
+            throw new IllegalArgumentException();
+        }
+        byte[] buf = new byte[i * 2]; // local array; shadows the 4-byte field buf
+        int read = IOUtils.readFully(input, buf, 0, i * 2);
+        // unlike uint16(), a -1 result is not returned to the caller; it also ends up here
+        if (read != i * 2) {
+            throw new EOFException();
+        }
+        int[] uints = new int[i];
+        for (int ii = 0; ii < i; ii++) {
+            uints[ii] = LittleEndian.getUShort(buf, ii * 2);
+        }
+
+        return uints;
+    }
+
+    /**
+     * Reads four bytes as an unsigned little-endian value.
+     *
+     * @return the value read, or -1 if {@code IOUtils.readFully} returned -1
+     * @throws IOException if the read fails; EOFException if fewer than 4 bytes were read
+     */
+    public long uint32() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 4);
+
+        if (read == -1) {
+            return -1;
+        }
+        // a partial read (1 to 3 bytes) is an error rather than a value
+        if (read < 4) {
+            throw new EOFException();
+        }
+
+        return LittleEndian.getUInt(buf);
+    }
+
+    /**
+     * Skips {@code n} bytes by delegating to {@code IOUtils.skipFully}.
+     *
+     * @param n number of bytes to skip
+     * @throws IOException propagated from {@code IOUtils.skipFully}
+     */
+    public void ensureSkip(long n) throws IOException {
+        IOUtils.skipFully(input, n);
+    }
 }
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 9369873..625d9c5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -1,500 +1,507 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
-
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.hwp;
-
-import java.io.EOFException;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.security.InvalidKeyException;
-import java.security.Key;
-import java.security.NoSuchAlgorithmException;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.zip.Inflater;
-import java.util.zip.InflaterInputStream;
-
-import javax.crypto.Cipher;
-import javax.crypto.CipherInputStream;
-import javax.crypto.NoSuchPaddingException;
-import javax.crypto.spec.SecretKeySpec;
-
-import org.apache.poi.hpsf.NoPropertySetStreamException;
-import org.apache.poi.hpsf.Property;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DirectoryNode;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
-import org.apache.poi.util.LittleEndian;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.exception.UnsupportedFormatException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-
-public class HwpTextExtractorV5 {
-	protected static Logger log = LoggerFactory
-			.getLogger(HwpTextExtractorV5.class);
-
-	private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
-			.getBytes();
-
-	private static final int HWPTAG_BEGIN = 0x010;
-	
-	private static final int I = 1; // INLINE
-	private static final int C = 2; // CONTROL
-	private static final int X = 3; // EXTENDED
-	
-	private static final int[] HWP_CHAR_TYPE = new int[] { C, X, X, X, I, I, I, I, I, I, // 0-9
-			C, X, X, C, X, X, X, X, X, I, // 10-19
-			I, X, X, X, C, C, C, C, C, C, // 20-29
-			C, C }; // 30-31
-
-
-	/**
-	 * extract Text from HWP Stream.
-	 * 
-	 * @param source
-	 * @param writer
-	 * @return
-	 * @throws FileNotFoundException
-	 * @throws IOException
-	 * @throws SAXException 
-	 */
-	public void extract(InputStream source, Metadata metadata, XHTMLContentHandler xhtml)
-			throws FileNotFoundException, IOException,
-			TikaException, SAXException {
-		if (source == null || xhtml == null)
-			throw new IllegalArgumentException();
-
-		POIFSFileSystem fs = null;
-		try {
-			fs = new POIFSFileSystem(source);
-
-			DirectoryNode root = fs.getRoot();
-
-			extract0(root, metadata, xhtml);
-
-		} catch (IOException e) {
-			throw new TikaException(
-					"error occurred when parsing HWP Format, It may not HWP Format.", e);
-		} finally {
-			IOUtils.closeQuietly(fs);
-		}
-	}
-
-	private void extract0(DirectoryNode root, Metadata metadata, XHTMLContentHandler xhtml)
-			throws IOException, SAXException, TikaException {
-
-		Entry headerEntry = root.getEntry("FileHeader");
-		if (!headerEntry.isDocumentEntry())
-			throw new UnsupportedFormatException("cannot parse the File Header");
-
-		FileHeader header = getHeader(headerEntry);
-
-		if (header == null)
-			throw new UnsupportedFormatException("cannot parse the File Header");
-		if (header.encrypted)
-			throw new EncryptedDocumentException("document is encrypted");
-
-		parseSummaryInformation(root, metadata);
-		
-		if (header.viewtext) {
-			parseViewText(header, root, xhtml);
-		} else {
-			parseBodyText(header, root, xhtml);
-		}
-		
-	}
-
-	private void parseSummaryInformation(DirectoryNode root, Metadata metadata) throws TikaException {
-
-		try {
-			Entry summaryEntry = root.getEntry("\u0005HwpSummaryInformation");
-			
-			populateMatadata(summaryEntry, metadata);
-			
-		} catch (NoPropertySetStreamException | IOException e) {
-			throw new UnsupportedFormatException(
-					"cannot parse the Summary Information");
-		}
-		
-	}
-	
-	private void populateMatadata(Entry summaryEntry, Metadata metadata) throws IOException, NoPropertySetStreamException {
-		
-		DocumentInputStream summaryStream = new DocumentInputStream(
-				(DocumentEntry) summaryEntry);
-		
-		PropertySet ps = new PropertySet(summaryStream);
-		
-		Property[] props = ps.getProperties();
-		
-		for(Property prop : props) {
-			int propID = (int)prop.getID();
-			Object value = prop.getValue();
-			
-			switch(propID) {
-			case 2:
-				metadata.set(TikaCoreProperties.TITLE, (String)value);
-				break;
-			case 3:
-				metadata.set(OfficeOpenXMLCore.SUBJECT, (String)value);
-				break;
-			case 4:
-				metadata.set(TikaCoreProperties.CREATOR, (String)value);
-				break;
-			case 5:
-				metadata.set(Office.KEYWORDS, (String)value);
-				break;
-			case 6:
-				metadata.set(TikaCoreProperties.COMMENTS, (String)value);
-				break;
-			case 8:
-				metadata.set(TikaCoreProperties.MODIFIER, (String)value);
-				break;
-			case 12:
-				metadata.set(TikaCoreProperties.CREATED, (Date)value);
-				break;
-			case 13:
-				metadata.set(TikaCoreProperties.MODIFIED, (Date)value);
-				break;
-			case 14:
-				metadata.set(Office.PAGE_COUNT, (int)value);
-				break;
-			default:
-			}
-		}
-	}
-	
-	/**
-	 * extract the HWP File Header
-	 * 
-	 * @param fs
-	 * @return
-	 * @throws IOException
-	 */
-	private FileHeader getHeader(Entry headerEntry) throws IOException {
-		// confirm signature
-		byte[] header = new byte[256]; // the length of File header is 256
-		DocumentInputStream headerStream = new DocumentInputStream(
-				(DocumentEntry) headerEntry);
-		try {
-			int read = headerStream.read(header);
-			if (read != 256
-					|| !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(
-							header, 0, HWP_V5_SIGNATURE.length)))
-				return null;
-		} finally {
-			headerStream.close();
-		}
-
-		FileHeader fileHeader = new FileHeader();
-
-		// version. debug
-		fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(
-				header, 32));
-		long flags = LittleEndian.getUInt(header, 36);
-		log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));
-
-		fileHeader.compressed = (flags & 0x01) == 0x01;
-		fileHeader.encrypted = (flags & 0x02) == 0x02;
-		fileHeader.viewtext = (flags & 0x04) == 0x04;
-
-		return fileHeader;
-	}
-
-	/**
-	 * extract Text
-	 * 
-	 * @param writer
-	 * @param source
-	 * 
-	 * @return
-	 * @throws IOException
-	 * @throws SAXException 
-	 */
-	private void parseBodyText(FileHeader header, DirectoryNode root,
-			XHTMLContentHandler xhtml) throws IOException, SAXException {
-		// read BodyText
-		Entry bodyText = root.getEntry("BodyText");
-		if (bodyText == null || !bodyText.isDirectoryEntry())
-			throw new IOException("Invalid BodyText");
-
-		Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
-		while (iterator.hasNext()) {
-			Entry entry = iterator.next();
-			if (entry.getName().startsWith("Section")
-					&& entry instanceof DocumentEntry) {
-				log.debug("extract {}", entry.getName());
-
-				InputStream input = new DocumentInputStream(
-						(DocumentEntry) entry);
-				if (header.compressed)
-					input = new InflaterInputStream(input, new Inflater(true));
-
-				HwpStreamReader reader = new HwpStreamReader(input);
-
-				parse(reader, xhtml);
-				
-			} else {
-				log.warn("Unknown Entry '{}'({})", entry.getName(), entry);
-			}
-		}
-	}
-
-	/**
-	 * 텍스트 추출
-	 * 
-	 * @param writer
-	 * @param source
-	 * 
-	 * @return
-	 * @throws IOException
-	 */
-	private void parseViewText(FileHeader header, DirectoryNode root,
-			XHTMLContentHandler xhtml) throws IOException {
-		// read BodyText
-		Entry bodyText = root.getEntry("ViewText");
-		if (bodyText == null || !bodyText.isDirectoryEntry())
-			throw new IOException("Invalid ViewText");
-
-		Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
-		while (iterator.hasNext()) {
-			Entry entry = iterator.next();
-			if (entry.getName().startsWith("Section")
-					&& entry instanceof DocumentEntry) {
-				log.debug("extract {}", entry.getName());
-
-				InputStream input = new DocumentInputStream(
-						(DocumentEntry) entry);
-	
-				try {
-					Key key = readKey(input);
-					input = createDecryptStream(input, key);
-					if (header.compressed)
-						input = new InflaterInputStream(input, new Inflater(
-								true));
-
-					HwpStreamReader sectionStream = new HwpStreamReader(input);
-					parse(sectionStream, xhtml);
-				} catch (InvalidKeyException e) {
-					throw new IOException(e);
-				} catch (NoSuchAlgorithmException e) {
-					throw new IOException(e);
-				} catch (NoSuchPaddingException e) {
-					throw new IOException(e);
-				} catch (SAXException e) {
-					throw new IOException(e);
-				} finally {
-					IOUtils.closeQuietly(input);
-				}
-			} else {
-				log.warn("unknown Entry '{}'({})", entry.getName(), entry);
-			}
-		}
-	}
-
-	private Key readKey(InputStream input) throws IOException {
-		byte[] data = new byte[260];
-
-		if (IOUtils.readFully(input, data, 0, 4) != 4)// TAG,
-			throw new EOFException(); 
-
-		if (IOUtils.readFully(input, data, 0, 256) != 256)
-			throw new EOFException();
-
-		SRand srand = new SRand(LittleEndian.getInt(data));
-		byte xor = 0;
-		for (int i = 0, n = 0; i < 256; i++, n--) {
-			if (n == 0) {
-				xor = (byte) (srand.rand() & 0xFF);
-				n = (int) ((srand.rand() & 0xF) + 1);
-			}
-			if (i >= 4) {
-				data[i] = (byte) ((data[i]) ^ (xor));
-			}
-		}
-
-		int offset = 4 + (data[0] & 0xF); // 4 + (0~15) ?
-		byte[] key = Arrays.copyOfRange(data, offset, offset + 16);
-
-		SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
-		return secretKey;
-	}
-
-	public InputStream createDecryptStream(InputStream input, Key key)
-			throws IOException, NoSuchAlgorithmException,
-			NoSuchPaddingException, InvalidKeyException {
-		Cipher cipher = null;
-
-		cipher = Cipher.getInstance("AES/ECB/NoPadding");
-		cipher.init(Cipher.DECRYPT_MODE, key);
-
-		return new CipherInputStream(input, cipher);
-	}
-
-	/**
-	 * extract characters from Section stream
-	 * 
-	 * @param reader
-	 * @param writer
-	 * @throws IOException
-	 * @throws SAXException 
-	 */
-	private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
-			throws IOException, SAXException {
-		StringBuffer buf = new StringBuffer(1024);
-		TagInfo tag = new TagInfo();
-
-		while (true) {
-			if (!readTag(reader, tag))
-				break;
-
-			if (HWPTAG_BEGIN + 51 == tag.id) {
-				if (tag.length % 2 != 0)
-					throw new IOException("Invalid block size");
-
-				buf.setLength(0);
-				writeParaText(reader, tag.length, buf);
-
-				if (buf.length() > 0) {
-					buf.append('\n');
-					
-					xhtml.startElement("p");
-					xhtml.characters(buf.toString());
-					xhtml.endElement("p");
-				}
-			} else {
-				reader.ensureSkip(tag.length);
-			}
-		}
-	}
-
-
-	/**
-	 * transfer character stream of HWPTAG_PARA_TEXT to STRING
-	 * 
-	 * @param reader
-	 * @param datasize
-	 * @param buf
-	 * @throws IOException
-	 */
-	private void writeParaText(HwpStreamReader reader, long datasize,
-			StringBuffer buf) throws IOException {
-		int[] chars = reader.uint16((int) (datasize / 2));
-
-		for (int index = 0; index < chars.length; index++) {
-			int ch = chars[index];
-			if (ch < 32) {
-				if (ch == 9) { // tab, INLINE
-					buf.append('\t');
-					index += 7;
-				} else {
-					int type = HWP_CHAR_TYPE[ch];
-					if (I == type) { // INLINE
-						index += 7;
-					} else if (X == type) { // EXTENDED
-						index += 7;
-					} else if (C == type) { // CONTROL
-						buf.append(' ');
-					}
-				}
-			} else {
-				buf.append((char) ch);
-			}
-		}
-	}
-
-	private boolean readTag(HwpStreamReader reader, TagInfo tag)
-			throws IOException {
-		// see p.24 of hwp 5.0 format guide
-
-		long recordHeader = reader.uint32();
-		if (recordHeader == -1)
-			return false;
-
-		tag.id = recordHeader & 0x3FF;
-		tag.level = (recordHeader >> 10) & 0x3FF;
-		tag.length = (recordHeader >> 20) & 0xFFF;
-
-		// see p.24 of hwp 5.0 format guide
-		if (tag.length == 0xFFF)
-			tag.length = reader.uint32();
-
-		return true;
-	}
-
-	private static class SRand {
-		private int random_seed;
-
-		private SRand(int seed) {
-			random_seed = seed;
-		}
-
-		private int rand() {
-			random_seed = (random_seed * 214013 + 2531011) & 0xFFFFFFFF;
-			return (random_seed >> 16) & 0x7FFF;
-		}
-	}
-	
-	static class FileHeader {
-		HwpVersion version;
-		boolean compressed; // bit 0
-		boolean encrypted; // bit 1
-		boolean viewtext; // bit 2
-	}
-
-	static class TagInfo {
-		long id;
-		long level;
-		long length;
-	}
-
-	static class HwpVersion {
-		int m;
-		int n;
-		int p;
-		int r;
-
-		public String toString() {
-			return String.format("%d.%d.%d.%d", m, n, p, r);
-		}
-
-		public static HwpVersion parseVersion(long longVersion) {
-			HwpVersion version = new HwpVersion();
-			version.m = (int) ((longVersion & 0xFF000000L) >> 24);
-			version.n = (int) ((longVersion & 0x00FF0000L) >> 16);
-			version.p = (int) ((longVersion & 0x0000FF00L) >> 8);
-			version.r = (int) ((longVersion & 0x000000FFL));
-			return version;
-		}
-	}
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import javax.crypto.Cipher;
+import javax.crypto.CipherInputStream;
+import javax.crypto.NoSuchPaddingException;
+import javax.crypto.spec.SecretKeySpec;
+
+import java.io.EOFException;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.InvalidKeyException;
+import java.security.Key;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.UnsupportedFormatException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+public class HwpTextExtractorV5 {
+
+    protected static Logger LOG = LoggerFactory
+            .getLogger(HwpTextExtractorV5.class);
+    // Byte prefix that getHeader() requires at the start of the 256-byte FileHeader stream.
+    private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
+            .getBytes(); // NOTE(review): no charset argument, so the platform default is used
+    // Presumably the base that record tag ids are offset from -- TODO confirm in the format guide.
+    private static final int HWPTAG_BEGIN = 0x010;
+    // Character classes stored in HWP_CHAR_TYPE below.
+    private static final int I = 1; // INLINE
+    private static final int C = 2; // CONTROL
+    private static final int X = 3; // EXTENDED
+    // Class (I, C or X) for each character code 0-31; the array index is the code.
+    private static final int[] HWP_CHAR_TYPE = new int[]{
+            C, X, X, X, I, I, I, I, I, I, // 0-9
+            C, X, X, C, X, X, X, X, X, I, // 10-19
+            I, X, X, X, C, C, C, C, C, C, // 20-29
+            C, C}; // 30-31
+
+
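+    /**
+     * Extracts metadata and text from an HWP v5 stream.
+     *
+     * @param source   stream holding the document container; must not be null
+     * @param metadata passed on to receive the summary-information properties
+     * @param xhtml    content handler for the extracted text; must not be null
+     * @throws FileNotFoundException declared by the signature
+     * @throws IOException declared by the signature; IOExceptions caught here become TikaException
+     * @throws TikaException if an IOException occurs while opening or parsing the container
+     * @throws SAXException propagated from the parsing helpers
+     */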
+    public void extract(InputStream source, Metadata metadata, XHTMLContentHandler xhtml)
+            throws FileNotFoundException, IOException,
+            TikaException, SAXException {
+        if (source == null || xhtml == null)
+            throw new IllegalArgumentException();
+        // metadata is not null-checked here; it is passed straight through to extract0.
+        POIFSFileSystem fs = null;
+        try {
+            fs = new POIFSFileSystem(source);
+            // All parsing starts from the root directory of the container.
+            DirectoryNode root = fs.getRoot();
+            extract0(root, metadata, xhtml);
+            // Any IOException from opening or parsing is wrapped below, keeping the cause.
+        } catch (IOException e) {
+            throw new TikaException(
+                    "error occurred when parsing HWP Format, It may not HWP Format.", e);
+        } finally {
+            IOUtils.closeQuietly(fs);
+        }
+    }
+
+    private void extract0(DirectoryNode root, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        // Checks the FileHeader, fills metadata, then parses the ViewText or BodyText sections.
+        Entry headerEntry = root.getEntry("FileHeader");
+        if (!headerEntry.isDocumentEntry()) {
+            throw new UnsupportedFormatException("cannot parse the File Header");
+        }
+        // getHeader returns null when 256 bytes were not read or the signature does not match.
+        FileHeader header = getHeader(headerEntry);
+
+        if (header == null) {
+            throw new UnsupportedFormatException("cannot parse the File Header");
+        }
+
+        if (header.encrypted) {
+            throw new EncryptedDocumentException("document is encrypted");
+        }
+        // Summary metadata is read before any body text is emitted.
+        parseSummaryInformation(root, metadata);
+        // The viewtext flag selects which directory holds the section streams.
+        if (header.viewtext) {
+            parseViewText(header, root, xhtml);
+        } else {
+            parseBodyText(header, root, xhtml);
+        }
+
+    }
+
+    private void parseSummaryInformation(DirectoryNode root, Metadata metadata) throws TikaException {
+        // Looks up the "\u0005HwpSummaryInformation" entry and copies its properties to metadata.
+        try {
+            Entry summaryEntry = root.getEntry("\u0005HwpSummaryInformation");
+
+            populateMatadata(summaryEntry, metadata);
+            // NOTE(review): the caught exception is not passed on as the cause below.
+        } catch (NoPropertySetStreamException | IOException e) {
+            throw new UnsupportedFormatException(
+                    "cannot parse the Summary Information");
+        }
+
+    }
+
+    private void populateMatadata(Entry summaryEntry, Metadata metadata) throws IOException, NoPropertySetStreamException {
+        // Copies selected summary properties into metadata, selected by numeric property id.
+        DocumentInputStream summaryStream = new DocumentInputStream(
+                (DocumentEntry) summaryEntry);
+        // NOTE(review): summaryStream is not closed anywhere in this method.
+        PropertySet ps = new PropertySet(summaryStream);
+
+        Property[] props = ps.getProperties();
+
+        for (Property prop : props) {
+            int propID = (int) prop.getID();
+            Object value = prop.getValue();
+            // Ids not listed below are ignored; values are cast without a type check.
+            switch (propID) {
+                case 2:
+                    metadata.set(TikaCoreProperties.TITLE, (String) value);
+                    break;
+                case 3:
+                    metadata.set(OfficeOpenXMLCore.SUBJECT, (String) value);
+                    break;
+                case 4:
+                    metadata.set(TikaCoreProperties.CREATOR, (String) value);
+                    break;
+                case 5:
+                    metadata.set(Office.KEYWORDS, (String) value);
+                    break;
+                case 6:
+                    metadata.set(TikaCoreProperties.COMMENTS, (String) value);
+                    break;
+                case 8:
+                    metadata.set(TikaCoreProperties.MODIFIER, (String) value);
+                    break;
+                case 12:
+                    metadata.set(TikaCoreProperties.CREATED, (Date) value);
+                    break;
+                case 13:
+                    metadata.set(TikaCoreProperties.MODIFIED, (Date) value);
+                    break;
+                case 14:
+                    metadata.set(Office.PAGE_COUNT, (int) value);
+                    break;
+                default:
+            }
+        }
+    }
+
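+    /**
+     * Reads the 256-byte HWP file header and decodes its version and flag bits.
+     *
+     * @param headerEntry the "FileHeader" entry; it is cast to DocumentEntry
+     * @return the parsed header, or null if 256 bytes were not read or the signature differs
+     * @throws IOException if the header stream cannot be opened or read
+     */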
+    private FileHeader getHeader(Entry headerEntry) throws IOException {
+        // Read the header bytes and confirm they start with the HWP v5 signature.
+        byte[] header = new byte[256]; // the length of File header is 256
+        // NOTE(review): a single read() call is assumed to fill all 256 bytes -- confirm.
+        try (DocumentInputStream headerStream = new DocumentInputStream(
+                (DocumentEntry) headerEntry)) {
+            int read = headerStream.read(header);
+            if (read != 256
+                    || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(
+                    header, 0, HWP_V5_SIGNATURE.length)))
+                return null;
+        }
+
+        FileHeader fileHeader = new FileHeader();
+
+        // Version is the unsigned 32-bit value at offset 32; the flag word follows at offset 36.
+        fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(
+                header, 32));
+        long flags = LittleEndian.getUInt(header, 36);
+        LOG.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));
+        // NOTE(review): toBinaryString() output contains no spaces, so replace() above is a no-op.
+        fileHeader.compressed = (flags & 0x01) == 0x01;
+        fileHeader.encrypted = (flags & 0x02) == 0x02;
+        fileHeader.viewtext = (flags & 0x04) == 0x04;
+
+        return fileHeader;
+    }
+
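+    /**
+     * Extracts text from each "Section" document in the "BodyText" directory.
+     * Other entries are logged at warn level and skipped.
+     *
+     * @param header parsed file header; {@code compressed} enables inflation of each section
+     * @param root   root directory of the container
+     * @param xhtml  handler passed on to parse for each section
+     * @throws IOException if "BodyText" is not a directory entry, or parsing a section fails
+     * @throws SAXException propagated from parse
+     */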
+    private void parseBodyText(FileHeader header, DirectoryNode root,
+                               XHTMLContentHandler xhtml) throws IOException, SAXException {
+        // Locate the "BodyText" directory that holds the section streams.
+        Entry bodyText = root.getEntry("BodyText");
+        if (bodyText == null || !bodyText.isDirectoryEntry())
+            throw new IOException("Invalid BodyText");
+        // Entries are visited in whatever order getEntries() returns them.
+        Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
+        while (iterator.hasNext()) {
+            Entry entry = iterator.next();
+            if (entry.getName().startsWith("Section")
+                    && entry instanceof DocumentEntry) {
+                LOG.debug("extract {}", entry.getName());
+                // NOTE(review): input is not closed anywhere in this method.
+                InputStream input = new DocumentInputStream(
+                        (DocumentEntry) entry);
+                if (header.compressed)
+                    input = new InflaterInputStream(input, new Inflater(true));
+                // Inflater(true) selects nowrap mode, i.e. no zlib header or checksum is expected.
+                HwpStreamReader reader = new HwpStreamReader(input);
+
+                parse(reader, xhtml);
+
+            } else {
+                LOG.warn("Unknown Entry '{}'({})", entry.getName(), entry);
+            }
+        }
+    }
+
+    /**
+     * Extracts text from every "Section*" stream of the "ViewText" storage.
+     * <p>
+     * Each section stream starts with a key block that is consumed by
+     * {@link #readKey(InputStream)}; the remainder is decrypted through
+     * {@link #createDecryptStream(InputStream, Key)}, inflated as raw deflate
+     * when {@code header.compressed} is set, and then parsed. Entries that
+     * are not section documents are logged and ignored.
+     *
+     * @param header the parsed file header; only {@code compressed} is read
+     * @param root   the directory node that contains the "ViewText" storage
+     * @param xhtml  handler that receives the extracted paragraphs
+     * @throws IOException if "ViewText" is missing or not a directory, if a
+     *                     section cannot be read, or wrapping any cipher
+     *                     setup failure or {@link SAXException} raised while
+     *                     parsing
+     */
+    private void parseViewText(FileHeader header, DirectoryNode root,
+                               XHTMLContentHandler xhtml) throws IOException {
+        // read ViewText
+        Entry viewText = root.getEntry("ViewText");
+        if (viewText == null || !viewText.isDirectoryEntry()) {
+            throw new IOException("Invalid ViewText");
+        }
+
+        Iterator<Entry> iterator = ((DirectoryEntry) viewText).getEntries();
+        while (iterator.hasNext()) {
+            Entry entry = iterator.next();
+            if (entry.getName().startsWith("Section")
+                    && entry instanceof DocumentEntry) {
+                LOG.debug("extract {}", entry.getName());
+
+                Inflater inflater = null;
+                InputStream input = new DocumentInputStream(
+                        (DocumentEntry) entry);
+
+                try {
+                    Key key = readKey(input);
+                    input = createDecryptStream(input, key);
+                    if (header.compressed) {
+                        inflater = new Inflater(true);
+                        input = new InflaterInputStream(input, inflater);
+                    }
+
+                    HwpStreamReader sectionStream = new HwpStreamReader(input);
+                    parse(sectionStream, xhtml);
+                } catch (InvalidKeyException | NoSuchAlgorithmException
+                        | NoSuchPaddingException | SAXException e) {
+                    // The method signature only allows IOException, so keep
+                    // the original failure as the cause.
+                    throw new IOException(e);
+                } finally {
+                    IOUtils.closeQuietly(input);
+                    // InflaterInputStream.close() does not end an Inflater
+                    // that was passed in by the caller, so release it here.
+                    if (inflater != null) {
+                        inflater.end();
+                    }
+                }
+            } else {
+                LOG.warn("unknown Entry '{}'({})", entry.getName(), entry);
+            }
+        }
+    }
+
+    /**
+     * Derives the AES key for a ViewText section from the key block at the
+     * start of the section stream.
+     * <p>
+     * Consumes 4 + 256 bytes from {@code input}. The 256-byte block is
+     * de-obfuscated in place by XOR-ing bytes 4..255 with a byte stream
+     * produced by {@link SRand}, and 16 bytes of the result are used as the
+     * key.
+     *
+     * @param input section stream positioned at its first byte
+     * @return a 16-byte {@link SecretKeySpec} for the "AES" algorithm
+     * @throws IOException  if reading fails
+     * @throws EOFException if fewer than 4 + 256 bytes are available
+     */
+    private Key readKey(InputStream input) throws IOException {
+        byte[] data = new byte[260];
+
+        // The first 4 bytes are read only to advance the stream: the next
+        // read targets offset 0 again and overwrites them.
+        if (IOUtils.readFully(input, data, 0, 4) != 4) {// TAG,
+            throw new EOFException();
+        }
+
+        if (IOUtils.readFully(input, data, 0, 256) != 256) {
+            throw new EOFException();
+        }
+
+        // Seed comes from the block itself (presumably its first 4 bytes,
+        // little-endian -- confirm against LittleEndian.getInt(byte[])).
+        SRand srand = new SRand(LittleEndian.getInt(data));
+        byte xor = 0;
+        // n counts down the remaining positions for the current XOR byte.
+        // When it reaches 0 a new byte and a new run length of 1..16 are
+        // drawn, in that order. The counters advance over all 256 positions,
+        // but the first 4 bytes are left unmodified.
+        for (int i = 0, n = 0; i < 256; i++, n--) {
+            if (n == 0) {
+                xor = (byte) (srand.rand() & 0xFF);
+                n = (int) ((srand.rand() & 0xF) + 1);
+            }
+            if (i >= 4) {
+                data[i] = (byte) ((data[i]) ^ (xor));
+            }
+        }
+
+        // The low nibble of the first (unmodified) byte selects where the
+        // 16-byte key starts: offsets 4..19.
+        int offset = 4 + (data[0] & 0xF); // 4 + (0~15) ?
+        byte[] key = Arrays.copyOfRange(data, offset, offset + 16);
+
+        SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
+        return secretKey;
+    }
+
+    /**
+     * Wraps a stream so that everything read from it is decrypted with the
+     * given key using the "AES/ECB/NoPadding" transformation.
+     *
+     * @param input the encrypted stream
+     * @param key   the key used to initialise the cipher for decryption
+     * @return a stream yielding the decrypted bytes of {@code input}
+     * @throws NoSuchAlgorithmException if the transformation is unavailable
+     * @throws NoSuchPaddingException   if the padding scheme is unavailable
+     * @throws InvalidKeyException      if the cipher rejects {@code key}
+     */
+    public InputStream createDecryptStream(InputStream input, Key key)
+            throws NoSuchAlgorithmException,
+            NoSuchPaddingException, InvalidKeyException {
+        final Cipher decryptor = Cipher.getInstance("AES/ECB/NoPadding");
+        decryptor.init(Cipher.DECRYPT_MODE, key);
+        return new CipherInputStream(input, decryptor);
+    }
+
+    /**
+     * Walks the records of a section stream and emits one {@code <p>}
+     * element per non-empty paragraph-text record.
+     * <p>
+     * Records whose tag id is {@code HWPTAG_BEGIN + 51} are decoded with
+     * {@link #writeParaText(HwpStreamReader, long, StringBuffer)}; all other
+     * records are skipped by their declared length.
+     *
+     * @param reader reader over the section stream
+     * @param xhtml  handler that receives the paragraphs
+     * @throws IOException  if the stream cannot be read or a paragraph-text
+     *                      record has an odd length
+     * @throws SAXException if the handler rejects the generated events
+     */
+    private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        final TagInfo record = new TagInfo();
+        final StringBuffer text = new StringBuffer(1024);
+
+        while (readTag(reader, record)) {
+            if (record.id != HWPTAG_BEGIN + 51) {
+                // not paragraph text: jump over the payload
+                reader.ensureSkip(record.length);
+                continue;
+            }
+
+            // the payload is a sequence of 16-bit units, so it must be even
+            if (record.length % 2 != 0) {
+                throw new IOException("Invalid block size");
+            }
+
+            text.setLength(0);
+            writeParaText(reader, record.length, text);
+            if (text.length() == 0) {
+                continue;
+            }
+
+            text.append('\n');
+            xhtml.startElement("p");
+            xhtml.characters(text.toString());
+            xhtml.endElement("p");
+        }
+    }
+
+
+    /**
+     * Decodes the 16-bit units of a paragraph-text record into {@code buf}.
+     * <p>
+     * Units of 32 and above are appended as characters. Unit 9 is appended
+     * as a tab. Every other unit below 32 is classified through
+     * {@code HWP_CHAR_TYPE}: inline ({@code I}) and extended ({@code X})
+     * controls contribute no text, char controls ({@code C}) contribute a
+     * single space. Tab, inline and extended controls are followed by seven
+     * further units that are skipped.
+     *
+     * @param reader   reader positioned at the start of the record payload
+     * @param datasize payload length in bytes
+     * @param buf      buffer the decoded text is appended to
+     * @throws IOException if the payload cannot be read
+     */
+    private void writeParaText(HwpStreamReader reader, long datasize,
+                               StringBuffer buf) throws IOException {
+        final int[] units = reader.uint16((int) (datasize / 2));
+
+        int pos = 0;
+        while (pos < units.length) {
+            final int unit = units[pos];
+
+            if (unit >= 32) {
+                buf.append((char) unit);
+            } else if (unit == 9) { // tab, INLINE
+                buf.append('\t');
+                pos += 7;
+            } else {
+                final int type = HWP_CHAR_TYPE[unit];
+                if (type == I || type == X) { // INLINE or EXTENDED
+                    pos += 7;
+                } else if (type == C) { // CONTROL
+                    buf.append(' ');
+                }
+            }
+
+            pos++;
+        }
+    }
+
+    /**
+     * Reads the next record header from the stream into {@code tag}.
+     * <p>
+     * The 32-bit header packs the tag id in bits 0-9, the level in bits
+     * 10-19 and the length in bits 20-31. A length field of {@code 0xFFF}
+     * means the real length follows as a separate 32-bit value. See p.24 of
+     * the hwp 5.0 format guide.
+     *
+     * @param reader reader positioned at a record header
+     * @param tag    receives the decoded id, level and length
+     * @return {@code false} if the reader returned -1 instead of a header
+     * (presumably end of stream -- confirm against HwpStreamReader),
+     * {@code true} otherwise
+     * @throws IOException if the stream cannot be read
+     */
+    private boolean readTag(HwpStreamReader reader, TagInfo tag)
+            throws IOException {
+        final long packed = reader.uint32();
+        if (packed == -1) {
+            return false;
+        }
+
+        tag.id = packed & 0x3FF;
+        tag.level = (packed >> 10) & 0x3FF;
+        tag.length = (packed >> 20) & 0xFFF;
+
+        // all twelve length bits set: the length is stored in the next
+        // 32-bit value
+        if (tag.length == 0xFFF) {
+            tag.length = reader.uint32();
+        }
+
+        return true;
+    }
+
+    /**
+     * Linear congruential generator used to de-obfuscate the ViewText key
+     * block: {@code state = state * 214013 + 2531011}, returning bits 16-30
+     * of the new state.
+     */
+    private static class SRand {
+        private int state;
+
+        private SRand(int seed) {
+            this.state = seed;
+        }
+
+        /**
+         * Advances the generator.
+         *
+         * @return the next value, in the range 0..0x7FFF
+         */
+        private int rand() {
+            // int arithmetic already wraps modulo 2^32
+            state = state * 214013 + 2531011;
+            return (state >> 16) & 0x7FFF;
+        }
+    }
+
+    /**
+     * Values decoded from the 256-byte HWP v5 file header: the version and
+     * the three lowest bits of the property flags.
+     */
+    static class FileHeader {
+        // decoded from the unsigned 32-bit value at header offset 32
+        HwpVersion version;
+        // the booleans mirror bits of the unsigned 32-bit flags at offset 36
+        boolean compressed; // bit 0
+        boolean encrypted; // bit 1
+        boolean viewtext; // bit 2
+    }
+
+    /**
+     * Mutable holder for one decoded record header; filled in by
+     * {@code readTag}.
+     */
+    static class TagInfo {
+        // bits 0-9 of the record header
+        long id;
+        // bits 10-19 of the record header
+        long level;
+        // bits 20-31 of the record header, or the following 32-bit value
+        // when those bits are all set (0xFFF)
+        long length;
+    }
+
+    /**
+     * Four-part HWP version number, one component per byte of the 32-bit
+     * version value ({@code m} is the most significant byte, {@code r} the
+     * least significant).
+     */
+    static class HwpVersion {
+        int m;
+        int n;
+        int p;
+        int r;
+
+        /**
+         * Formats the version as {@code m.n.p.r}.
+         * <p>
+         * Plain concatenation is used so the digits are always ASCII,
+         * whatever the JVM's default locale is.
+         */
+        @Override
+        public String toString() {
+            return m + "." + n + "." + p + "." + r;
+        }
+
+        /**
+         * Splits a packed version value into its four byte-sized components.
+         *
+         * @param longVersion the version value; only its low 32 bits are used
+         * @return a new version with each component in the range 0..255
+         */
+        public static HwpVersion parseVersion(long longVersion) {
+            HwpVersion version = new HwpVersion();
+            version.m = (int) ((longVersion & 0xFF000000L) >> 24);
+            version.n = (int) ((longVersion & 0x00FF0000L) >> 16);
+            version.p = (int) ((longVersion & 0x0000FF00L) >> 8);
+            version.r = (int) ((longVersion & 0x000000FFL));
+            return version;
+        }
+    }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
index 98d724d..7461caa 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
@@ -1,6 +1,5 @@
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
-
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
@@ -33,36 +32,36 @@ import org.xml.sax.SAXException;
 
 public class HwpV5Parser extends AbstractParser {
 
-	private static final long serialVersionUID = 1L;
-	
-	private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-hwp-v5"));
+    private static final long serialVersionUID = 1L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-hwp-v5"));
     public static final String HWP_MIME_TYPE = "application/x-hwp-v5";
-    
+
     private HwpTextExtractorV5 extractor;
-    
+
     public HwpV5Parser() {
-    	extractor = new HwpTextExtractorV5();
+        extractor = new HwpTextExtractorV5();
     }
-    
-	@Override
-	public Set<MediaType> getSupportedTypes(ParseContext context) {
-		return SUPPORTED_TYPES;
-	}
 
-	@Override
-	public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
-			throws IOException, SAXException, TikaException {
-		
-		metadata.set(Metadata.CONTENT_TYPE, HWP_MIME_TYPE);
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        metadata.set(Metadata.CONTENT_TYPE, HWP_MIME_TYPE);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        
+
         try {
-        	extractor.extract(stream, metadata, xhtml);
-		} finally {
-			xhtml.endDocument();
-		}
-	}
+            extractor.extract(stream, metadata, xhtml);
+        } finally {
+            xhtml.endDocument();
+        }
+    }
 
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 0ed06f9..1902e49 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -1,7 +1,5 @@
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
-
-
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
@@ -20,73 +18,39 @@ package org.apache.tika.parser.hwp;
 
 import static org.junit.Assert.assertEquals;
 
-import java.io.InputStream;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.parser.Parser;
 import org.junit.Test;
-import org.xml.sax.ContentHandler;
 
 public class HwpV5ParserTest extends TikaTest {
 
-	@Test
+    @Test
     public void testHwpV5Parser() throws Exception {
-
-        try (InputStream input = HwpV5ParserTest.class.getResourceAsStream(
-                "/test-documents/test-documents-v5.hwp")) {
-            ContentHandler handler = new BodyContentHandler();
-            Metadata metadata = new Metadata();
-            new HwpV5Parser().parse(input, handler, metadata, new ParseContext());
-
+        for (Parser parser : new Parser[]{new HwpV5Parser(),
+                new AutoDetectParser()}) {
+            XMLResult result = getXML("test-documents-v5.hwp", parser);
+            Metadata metadata = result.metadata;
             assertEquals(
-                    "application/x-hwp-v5",
-                    metadata.get(Metadata.CONTENT_TYPE));
+                    "application/x-hwp-v5", metadata.get(Metadata.CONTENT_TYPE));
             assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
             assertEquals("SooMyung Lee", metadata.get(TikaCoreProperties.CREATOR));
-            
-            assertContains("Apache Tika", handler.toString());
+
+            assertContains("Apache Tika", result.xml.toString());
         }
     }
-	
-	@Test
-    public void testAutoDetectParser() throws Exception {
-	    AutoDetectParser parser = new AutoDetectParser();
-	    BodyContentHandler handler = new BodyContentHandler();
-	    Metadata metadata = new Metadata();
-	    try (InputStream stream = HwpV5ParserTest.class.getResourceAsStream("/test-documents/test-documents-v5.hwp")) {
-	        parser.parse(stream, handler, metadata);
-	        
-	        assertContains("Apache Tika", handler.toString());
-	        
-           assertEquals(
-                    "application/x-hwp-v5",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("SooMyung Lee", metadata.get(TikaCoreProperties.CREATOR));
-	    }
 
-    }
-	
-	@Test
+    @Test
     public void testDistributedHwp() throws Exception {
-	    AutoDetectParser parser = new AutoDetectParser();
-	    BodyContentHandler handler = new BodyContentHandler();
-	    Metadata metadata = new Metadata();
-	    try (InputStream stream = HwpV5ParserTest.class.getResourceAsStream("/test-documents/test-documents-v5-dist.hwp")) {
-	        parser.parse(stream, handler, metadata);
-	        
-	        assertContains("Apache Tika", handler.toString());
-	        
-           assertEquals(
-                    "application/x-hwp-v5",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("SooMyung Lee", metadata.get(TikaCoreProperties.CREATOR));
-	    }
-
+        XMLResult result = getXML("test-documents-v5-dist.hwp");
+        assertContains("Apache Tika", result.xml);
+
+        assertEquals(
+                "application/x-hwp-v5",
+                result.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Apache Tika", result.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("SooMyung Lee", result.metadata.get(TikaCoreProperties.CREATOR));
     }
 }