You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/22 13:26:34 UTC

[tika] branch master updated: Tika 2909 (#277)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c21ba30  Tika 2909 (#277)
c21ba30 is described below

commit c21ba30e2b8bf3b9482c833cba4b150a7e3758a7
Author: soomyung <so...@gmail.com>
AuthorDate: Mon Jul 22 22:26:29 2019 +0900

    Tika 2909 (#277)
    
    * add HWP-V5 Parser
    
    * add apache license at HwpV5Parser file
    
    * modify TestCase
    make AutoDetectParser detect HWP v5 Parser
    
    * modify hwp
    
    * mod
    
    * remove commented and not used codes
    
    * modify type casting bug
---
 tika-parsers/pom.xml                               |  17 +-
 .../apache/tika/parser/hwp/HwpStreamReader.java    | 131 ++++++
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java | 500 +++++++++++++++++++++
 .../org/apache/tika/parser/hwp/HwpV5Parser.java    |  68 +++
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../apache/tika/parser/hwp/HwpV5ParserTest.java    |  92 ++++
 .../test-documents/test-documents-v5-dist.hwp      | Bin 0 -> 19968 bytes
 .../resources/test-documents/test-documents-v5.hwp | Bin 0 -> 70144 bytes
 8 files changed, 807 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index bc0d79c..8ca6703 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -956,7 +956,7 @@
                   <pluginExecutionFilter>
                     <groupId>org.apache.felix</groupId>
                     <artifactId>maven-scr-plugin</artifactId>
-                    <version>${maven.scr.version}</version>
+                    <versionRange>${maven.scr.version}</versionRange>
                     <goals>
                       <goal>scr</goal>
                     </goals>
@@ -969,7 +969,7 @@
                   <pluginExecutionFilter>
                     <groupId>org.codehaus.gmaven</groupId>
                     <artifactId>groovy-maven-plugin</artifactId>
-                    <version>${groovy.maven.version}</version>
+                    <versionRange>${groovy.maven.version}</versionRange>
                     <goals>
                       <goal>execute</goal>
                     </goals>
@@ -978,6 +978,19 @@
                     <ignore />
                   </action>
                 </pluginExecution>
+                <pluginExecution>
+                	<pluginExecutionFilter>
+                		<groupId>org.sonatype.ossindex.maven</groupId>
+                		<artifactId>ossindex-maven-plugin</artifactId>
+                		<versionRange>[3.0.4,)</versionRange>
+                		<goals>
+                			<goal>audit</goal>
+                		</goals>
+                	</pluginExecutionFilter>
+                	<action>
+                		<ignore></ignore>
+                	</action>
+                </pluginExecution>
               </pluginExecutions>
             </lifecycleMappingMetadata>
           </configuration>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
new file mode 100644
index 0000000..badcf20
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+
+public class HwpStreamReader {
+	private final InputStream input;
+	private final byte[] buf; // scratch space shared by the fixed-width reads
+
+	public HwpStreamReader(InputStream inputStream) {
+		this.input = inputStream;
+		this.buf = new byte[4];
+	}
+
+	/**
+	 * Reports whether the wrapped stream's {@code available()} is positive.
+	 *
+	 * @return {@code true} if the wrapped stream reports readable bytes
+	 * @throws IOException if the wrapped stream fails
+	 */
+	public boolean available() throws IOException {
+		return input.available() > 0;
+	}
+
+	/**
+	 * Reads one byte and returns it as an unsigned value.
+	 *
+	 * @return the unsigned byte, or -1 if the read reported end of stream
+	 * @throws IOException if the wrapped stream fails
+	 */
+	public short uint8() throws IOException {
+		int read = IOUtils.readFully(input, buf, 0, 1);
+
+		if (read == -1)
+			return -1;
+
+		return LittleEndian.getUByte(buf);
+	}
+
+	/**
+	 * Reads an unsigned 2-byte little-endian value.
+	 *
+	 * @return the value, or -1 if the read reported end of stream
+	 * @throws IOException if the stream ends after the first byte or fails
+	 */
+	public int uint16() throws IOException {
+		int read = IOUtils.readFully(input, buf, 0, 2);
+
+		if (read == -1)
+			return -1;
+
+		if (read < 2)
+			throw new EOFException();
+
+		return LittleEndian.getUShort(buf);
+	}
+
+	/**
+	 * Reads {@code i} consecutive unsigned 2-byte little-endian values.
+	 *
+	 * @param i number of values; IllegalArgumentException if not positive
+	 * @return the values in stream order
+	 * @throws IOException if fewer than {@code i * 2} bytes can be read
+	 */
+	public int[] uint16(int i) throws IOException {
+		if (i <= 0)
+			throw new IllegalArgumentException();
+		if (i > Integer.MAX_VALUE / 2) // i * 2 would overflow int
+			throw new IOException("Too many uint16 values requested: " + i);
+
+		byte[] bytes = new byte[i * 2]; // own buffer; the 4-byte field is too small
+		int read = IOUtils.readFully(input, bytes, 0, bytes.length);
+		if (read != bytes.length)
+			throw new EOFException();
+
+		int[] uints = new int[i];
+		for (int ii = 0; ii < i; ii++) {
+			uints[ii] = LittleEndian.getUShort(bytes, ii * 2);
+		}
+		return uints;
+	}
+
+	/**
+	 * Reads an unsigned 4-byte little-endian value.
+	 *
+	 * @return the value, or -1 if the read reported end of stream
+	 * @throws IOException if the stream ends inside the value or fails
+	 */
+	public long uint32() throws IOException {
+		int read = IOUtils.readFully(input, buf, 0, 4);
+
+		if (read == -1)
+			return -1;
+
+		if (read < 4)
+			throw new EOFException();
+
+		return LittleEndian.getUInt(buf);
+	}
+
+	/**
+	 * Skips {@code n} bytes by delegating to {@code IOUtils.skipFully}.
+	 *
+	 * @param n number of bytes to skip
+	 * @throws IOException if the wrapped stream fails while skipping
+	 */
+	public void ensureSkip(long n) throws IOException {
+		IOUtils.skipFully(input, n);
+	}
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
new file mode 100644
index 0000000..9369873
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -0,0 +1,500 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.EOFException;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.InvalidKeyException;
+import java.security.Key;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+
+import javax.crypto.Cipher;
+import javax.crypto.CipherInputStream;
+import javax.crypto.NoSuchPaddingException;
+import javax.crypto.spec.SecretKeySpec;
+
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.UnsupportedFormatException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+public class HwpTextExtractorV5 {
+	protected static final Logger log = LoggerFactory
+			.getLogger(HwpTextExtractorV5.class);
+	// Signature at the start of FileHeader; charset is fixed, not platform default.
+	private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
+			.getBytes(java.nio.charset.StandardCharsets.US_ASCII);
+	// Base tag id; parse() treats HWPTAG_BEGIN + 51 as the paragraph-text record.
+	private static final int HWPTAG_BEGIN = 0x010;
+	// Character classes for code units below 32 (see HWP_CHAR_TYPE).
+	private static final int I = 1; // INLINE
+	private static final int C = 2; // CONTROL
+	private static final int X = 3; // EXTENDED
+	// Class of each code unit 0-31, indexed by the code unit value.
+	private static final int[] HWP_CHAR_TYPE = new int[] { C, X, X, X, I, I, I, I, I, I, // 0-9
+			C, X, X, C, X, X, X, X, X, I, // 10-19
+			I, X, X, X, C, C, C, C, C, C, // 20-29
+			C, C }; // 30-31
+
+
+	/**
+	 * Opens {@code source} as a POIFS file system and extracts the HWP
+	 * summary metadata and the document text from it.
+	 *
+	 * @param source stream with the HWP v5 document; must not be null
+	 * @param metadata receives the summary information properties
+	 * @param xhtml receives the extracted paragraphs; must not be null
+	 * @throws TikaException if an IOException occurs or the header is rejected
+	 * @throws SAXException if the content handler rejects an event
+	 */
+	public void extract(InputStream source, Metadata metadata, XHTMLContentHandler xhtml)
+			throws FileNotFoundException, IOException,
+			TikaException, SAXException {
+		if (source == null)
+			throw new IllegalArgumentException();
+		if (xhtml == null)
+			throw new IllegalArgumentException();
+
+		POIFSFileSystem poifs = null;
+		try {
+			poifs = new POIFSFileSystem(source);
+			extract0(poifs.getRoot(), metadata, xhtml);
+		} catch (IOException ioe) {
+			// every I/O failure is reported to the caller as a TikaException
+			throw new TikaException(
+					"error occurred when parsing HWP Format, It may not HWP Format.", ioe);
+		} finally {
+			// poifs is still null here when the constructor itself failed
+			IOUtils.closeQuietly(poifs);
+		}
+	}
+
+	/**
+	 * Validates the FileHeader stream, then extracts metadata and text.
+	 * Text comes from ViewText when the header's viewtext flag is set,
+	 * otherwise from BodyText.
+	 */
+	private void extract0(DirectoryNode root, Metadata metadata, XHTMLContentHandler xhtml)
+			throws IOException, SAXException, TikaException {
+		Entry fileHeaderEntry = root.getEntry("FileHeader");
+		if (!fileHeaderEntry.isDocumentEntry())
+			throw new UnsupportedFormatException("cannot parse the File Header");
+
+		FileHeader fileHeader = getHeader(fileHeaderEntry);
+		if (fileHeader == null) // getHeader found a wrong size or signature
+			throw new UnsupportedFormatException("cannot parse the File Header");
+		if (fileHeader.encrypted)
+			throw new EncryptedDocumentException("document is encrypted");
+		parseSummaryInformation(root, metadata);
+		if (!fileHeader.viewtext) {
+			parseBodyText(fileHeader, root, xhtml);
+		} else {
+			parseViewText(fileHeader, root, xhtml);
+		}
+	}
+
+	/** Copies the HwpSummaryInformation properties of {@code root} into {@code metadata}. */
+	private void parseSummaryInformation(DirectoryNode root, Metadata metadata) throws TikaException {
+		try {
+			Entry summaryEntry = root.getEntry("\u0005HwpSummaryInformation");
+			populateMatadata(summaryEntry, metadata);
+		} catch (NoPropertySetStreamException | IOException e) {
+			UnsupportedFormatException failure = new UnsupportedFormatException(
+					"cannot parse the Summary Information");
+			// attach the root cause so that it is not lost to the caller
+			failure.initCause(e);
+			throw failure;
+		}
+	}
+	
+	/**
+	 * Reads the summary property set, closing its stream afterwards, and
+	 * copies the known properties into {@code metadata}. A value that does not
+	 * have the expected Java type is skipped instead of causing a cast failure.
+	 */
+	private void populateMatadata(Entry summaryEntry, Metadata metadata) throws IOException, NoPropertySetStreamException {
+		try (DocumentInputStream summaryStream = new DocumentInputStream(
+				(DocumentEntry) summaryEntry)) {
+			PropertySet ps = new PropertySet(summaryStream);
+			for (Property prop : ps.getProperties()) {
+				int propID = (int) prop.getID();
+				Object value = prop.getValue();
+				switch (propID) {
+				case 2:
+					if (value instanceof String) metadata.set(TikaCoreProperties.TITLE, (String) value);
+					break;
+				case 3:
+					if (value instanceof String) metadata.set(OfficeOpenXMLCore.SUBJECT, (String) value);
+					break;
+				case 4:
+					if (value instanceof String) metadata.set(TikaCoreProperties.CREATOR, (String) value);
+					break;
+				case 5:
+					if (value instanceof String) metadata.set(Office.KEYWORDS, (String) value);
+					break;
+				case 6:
+					if (value instanceof String) metadata.set(TikaCoreProperties.COMMENTS, (String) value);
+					break;
+				case 8:
+					if (value instanceof String) metadata.set(TikaCoreProperties.MODIFIER, (String) value);
+					break;
+				case 12:
+					if (value instanceof Date) metadata.set(TikaCoreProperties.CREATED, (Date) value);
+					break;
+				case 13:
+					if (value instanceof Date) metadata.set(TikaCoreProperties.MODIFIED, (Date) value);
+					break;
+				case 14:
+					if (value instanceof Number) metadata.set(Office.PAGE_COUNT, ((Number) value).intValue());
+					break;
+				default: // other property ids are not mapped
+				}
+			}
+		}
+	}
+	
+	/**
+	 * Reads the 256-byte FileHeader stream, checks the HWP v5 signature and
+	 * decodes the version (offset 32) and the property flags (offset 36).
+	 *
+	 * @param headerEntry the "FileHeader" entry; must be a document entry
+	 * @return the decoded header, or {@code null} if the stream is shorter
+	 *         than 256 bytes or does not start with the signature
+	 * @throws IOException if the header stream cannot be read
+	 */
+	private FileHeader getHeader(Entry headerEntry) throws IOException {
+		byte[] header = new byte[256]; // the length of File header is 256
+		try (DocumentInputStream headerStream = new DocumentInputStream(
+				(DocumentEntry) headerEntry)) {
+			// readFully keeps reading until the buffer is full; a single
+			// read() is allowed to return fewer bytes than requested
+			int read = IOUtils.readFully(headerStream, header, 0, header.length);
+			if (read != header.length)
+				return null;
+		}
+
+		// confirm signature
+		byte[] signature = Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length);
+		if (!Arrays.equals(HWP_V5_SIGNATURE, signature))
+			return null;
+
+		FileHeader fileHeader = new FileHeader();
+		fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(
+				header, 32));
+		long flags = LittleEndian.getUInt(header, 36);
+		log.debug("Flags={}", Long.toBinaryString(flags));
+
+		fileHeader.compressed = (flags & 0x01) == 0x01;
+		fileHeader.encrypted = (flags & 0x02) == 0x02;
+		fileHeader.viewtext = (flags & 0x04) == 0x04;
+		return fileHeader;
+	}
+
+	/**
+	 * Extracts the text of every "Section*" stream in the BodyText directory,
+	 * inflating a stream first when the header marks the file as compressed.
+	 * @param header decoded file header; only {@code compressed} is read here
+	 * @param root root directory of the document
+	 * @param xhtml receives the extracted paragraphs
+	 * @throws IOException if BodyText is invalid or a section cannot be read
+	 * @throws SAXException if the content handler rejects an event
+	 */
+	private void parseBodyText(FileHeader header, DirectoryNode root,
+			XHTMLContentHandler xhtml) throws IOException, SAXException {
+		Entry bodyText = root.getEntry("BodyText");
+		if (bodyText == null || !bodyText.isDirectoryEntry())
+			throw new IOException("Invalid BodyText");
+
+		Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
+		while (iterator.hasNext()) {
+			Entry entry = iterator.next();
+			if (!entry.getName().startsWith("Section")
+					|| !(entry instanceof DocumentEntry)) {
+				log.warn("Unknown Entry '{}'({})", entry.getName(), entry);
+				continue;
+			}
+			log.debug("extract {}", entry.getName());
+			// ended below: closing an InflaterInputStream does not release a supplied Inflater
+			Inflater inflater = header.compressed ? new Inflater(true) : null;
+			InputStream input = new DocumentInputStream((DocumentEntry) entry);
+			try {
+				if (inflater != null)
+					input = new InflaterInputStream(input, inflater);
+				parse(new HwpStreamReader(input), xhtml);
+			} finally {
+				IOUtils.closeQuietly(input);
+				if (inflater != null)
+					inflater.end();
+			}
+		}
+	}
+
+	/**
+	 * Extracts the text of every "Section*" stream in the ViewText directory.
+	 * Each section starts with a key record (see readKey); the rest of the
+	 * stream is AES-decrypted and, when the header marks the file as
+	 * compressed, inflated before its records are parsed.
+	 *
+	 * @param header decoded file header; only {@code compressed} is read here
+	 * @param root root directory of the document
+	 * @param xhtml receives the extracted paragraphs
+	 * @throws IOException if ViewText is invalid, a section cannot be read or
+	 *         the cipher cannot be set up
+	 * @throws SAXException if the content handler rejects an event
+	 */
+	private void parseViewText(FileHeader header, DirectoryNode root,
+			XHTMLContentHandler xhtml) throws IOException, SAXException {
+		Entry viewText = root.getEntry("ViewText");
+		if (viewText == null || !viewText.isDirectoryEntry())
+			throw new IOException("Invalid ViewText");
+
+		Iterator<Entry> iterator = ((DirectoryEntry) viewText).getEntries();
+		while (iterator.hasNext()) {
+			Entry entry = iterator.next();
+			if (!entry.getName().startsWith("Section")
+					|| !(entry instanceof DocumentEntry)) {
+				log.warn("unknown Entry '{}'({})", entry.getName(), entry);
+				continue;
+			}
+			log.debug("extract {}", entry.getName());
+
+			// ended in the finally block below: closing an InflaterInputStream
+			// does not release an Inflater that was supplied by the caller
+			Inflater inflater = header.compressed ? new Inflater(true) : null;
+			InputStream input = new DocumentInputStream((DocumentEntry) entry);
+			try {
+				Key key = readKey(input);
+				input = createDecryptStream(input, key);
+				if (inflater != null)
+					input = new InflaterInputStream(input, inflater);
+
+				// a SAXException from the handler is propagated as is (not wrapped)
+				parse(new HwpStreamReader(input), xhtml);
+			} catch (InvalidKeyException | NoSuchAlgorithmException
+					| NoSuchPaddingException e) {
+				throw new IOException(e);
+			} finally {
+				IOUtils.closeQuietly(input);
+				if (inflater != null)
+					inflater.end();
+			}
+		}
+	}
+
+	/**
+	 * Reads and discards the 4-byte record tag, reads the 256-byte payload
+	 * that follows and derives the 16-byte AES key from that payload.
+	 */
+	private Key readKey(InputStream input) throws IOException {
+		byte[] record = new byte[260];
+		if (IOUtils.readFully(input, record, 0, 4) != 4) // TAG
+			throw new EOFException();
+		if (IOUtils.readFully(input, record, 0, 256) != 256)
+			throw new EOFException();
+		// bytes 4..255 are XOR-ed with a mask that changes after runs of 1-16
+		// bytes; the generator is seeded with the first 4 payload bytes
+		SRand generator = new SRand(LittleEndian.getInt(record));
+		byte mask = 0;
+		int remaining = 0;
+		for (int pos = 0; pos < 256; pos++) {
+			if (remaining == 0) {
+				mask = (byte) (generator.rand() & 0xFF);
+				remaining = (generator.rand() & 0xF) + 1;
+			}
+			remaining--;
+			if (pos >= 4)
+				record[pos] ^= mask;
+		}
+		int keyStart = 4 + (record[0] & 0xF); // 4 + (0~15)
+		return new SecretKeySpec(Arrays.copyOfRange(record, keyStart, keyStart + 16), "AES");
+	}
+
+	/**
+	 * Wraps {@code input} in a stream that decrypts it with AES/ECB/NoPadding.
+	 */
+	public InputStream createDecryptStream(InputStream input, Key key)
+			throws IOException, NoSuchAlgorithmException,
+			NoSuchPaddingException, InvalidKeyException {
+		Cipher aes = Cipher.getInstance("AES/ECB/NoPadding");
+		aes.init(Cipher.DECRYPT_MODE, key);
+		return new CipherInputStream(input, aes);
+	}
+
+	/**
+	 * Walks the records of one section stream and emits a {@code p} element
+	 * for every paragraph-text record (tag HWPTAG_BEGIN + 51) that yields
+	 * text; all other records are skipped.
+	 *
+	 * @param reader positioned at the first record header of the section
+	 * @param xhtml receives the paragraphs
+	 * @throws IOException if a record is truncated or has an odd text size
+	 * @throws SAXException if the content handler rejects an event
+	 */
+	private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
+			throws IOException, SAXException {
+		StringBuffer buf = new StringBuffer(1024);
+		TagInfo tag = new TagInfo();
+
+		while (readTag(reader, tag)) {
+			if (HWPTAG_BEGIN + 51 != tag.id) {
+				reader.ensureSkip(tag.length);
+				continue;
+			}
+			if (tag.length % 2 != 0)
+				throw new IOException("Invalid block size");
+			if (tag.length == 0)
+				continue; // empty text record; uint16(0) would throw IllegalArgumentException
+
+			buf.setLength(0);
+			writeParaText(reader, tag.length, buf);
+			if (buf.length() > 0) {
+				buf.append('\n');
+
+				xhtml.startElement("p");
+				xhtml.characters(buf.toString());
+				xhtml.endElement("p");
+			}
+		}
+	}
+
+
+	/**
+	 * Reads one paragraph-text payload and appends its text to {@code buf}.
+	 * Code units below 32 are special: a tab (9) is appended as '\t', CONTROL
+	 * codes are appended as a space, and INLINE or EXTENDED codes (the tab
+	 * included) are followed by 7 code units that are skipped.
+	 *
+	 * @param reader positioned at the start of the payload
+	 * @param datasize payload size in bytes
+	 * @param buf receives the decoded text
+	 * @throws IOException if the payload cannot be read completely
+	 */
+	private void writeParaText(HwpStreamReader reader, long datasize,
+			StringBuffer buf) throws IOException {
+		int[] units = reader.uint16((int) (datasize / 2));
+
+		for (int pos = 0; pos < units.length; pos++) {
+			int unit = units[pos];
+			if (unit >= 32) {
+				buf.append((char) unit);
+				continue;
+			}
+			if (unit == 9) { // tab, INLINE
+				buf.append('\t');
+				pos += 7;
+				continue;
+			}
+			int type = HWP_CHAR_TYPE[unit];
+			if (C == type) // CONTROL
+				buf.append(' ');
+			else if (I == type || X == type) // INLINE or EXTENDED
+				pos += 7;
+		}
+	}
+
+	/** Reads the next record header into {@code tag}; returns false at end of stream. */
+	private boolean readTag(HwpStreamReader reader, TagInfo tag)
+			throws IOException {
+		// see p.24 of hwp 5.0 format guide
+		long recordHeader = reader.uint32();
+		if (recordHeader == -1)
+			return false;
+
+		tag.id = recordHeader & 0x3FF;
+		tag.level = (recordHeader >> 10) & 0x3FF;
+		tag.length = (recordHeader >> 20) & 0xFFF;
+		if (tag.length == 0xFFF) { // the real size follows as its own uint32
+			tag.length = reader.uint32();
+			if (tag.length == -1) // end of stream instead of the size
+				throw new EOFException();
+		}
+		return true;
+	}
+
+	/** Linear congruential generator that readKey uses to unmask the key payload. */
+	private static class SRand {
+		private int state;
+		private SRand(int seed) {
+			state = seed;
+		}
+
+		private int rand() {
+			state = state * 214013 + 2531011; // int arithmetic wraps at 32 bits
+			return (state >> 16) & 0x7FFF; // bits 16-30 of the new state
+		}
+	}
+	
+	static class FileHeader { // values that getHeader decodes from the FileHeader stream
+		HwpVersion version; // parsed from the unsigned int at offset 32
+		boolean compressed; // bit 0 of the flags at offset 36
+		boolean encrypted; // bit 1 of the flags at offset 36
+		boolean viewtext; // bit 2 of the flags at offset 36
+	}
+
+	static class TagInfo { // one record header as filled in by readTag
+		long id; // lowest 10 bits of the record header
+		long level; // next 10 bits of the record header
+		long length; // top 12 bits, or the following uint32 when those bits are 0xFFF
+	}
+
+	/** Four-part HWP version, one byte per part. */
+	static class HwpVersion {
+		int m, n, p, r; // from the highest byte (m) to the lowest byte (r)
+
+		@Override
+		public String toString() {
+			return String.format("%d.%d.%d.%d", m, n, p, r);
+		}
+
+		/** Splits the low 32 bits of {@code longVersion} into the four parts. */
+		public static HwpVersion parseVersion(long longVersion) {
+			HwpVersion parsed = new HwpVersion();
+			parsed.m = (int) ((longVersion >> 24) & 0xFF);
+			parsed.n = (int) ((longVersion >> 16) & 0xFF);
+			parsed.p = (int) ((longVersion >> 8) & 0xFF);
+			parsed.r = (int) (longVersion & 0xFF);
+			return parsed;
+		}
+	}
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
new file mode 100644
index 0000000..98d724d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/** Parser for HWP v5 documents; the actual extraction is done by HwpTextExtractorV5. */
+public class HwpV5Parser extends AbstractParser {
+
+	private static final long serialVersionUID = 1L;
+
+	public static final String HWP_MIME_TYPE = "application/x-hwp-v5";
+	private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-hwp-v5"));
+
+	public HwpV5Parser() {
+		// no instance state: an extractor is created for each parse() call
+	}
+
+	@Override
+	public Set<MediaType> getSupportedTypes(ParseContext context) {
+		return SUPPORTED_TYPES;
+	}
+
+	/**
+	 * Sets the content type to {@link #HWP_MIME_TYPE} and writes the document to
+	 * {@code handler} as XHTML; endDocument is sent even if extraction fails.
+	 */
+	@Override
+	public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+			throws IOException, SAXException, TikaException {
+		metadata.set(Metadata.CONTENT_TYPE, HWP_MIME_TYPE);
+		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+		xhtml.startDocument();
+		try {
+			// local instance: HwpTextExtractorV5 is stateless and not Serializable
+			new HwpTextExtractorV5().extract(stream, metadata, xhtml);
+		} finally {
+			xhtml.endDocument();
+		}
+	}
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 330d473..6cdba30 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -28,6 +28,7 @@ org.apache.tika.parser.feed.FeedParser
 org.apache.tika.parser.font.AdobeFontMetricParser
 org.apache.tika.parser.font.TrueTypeParser
 org.apache.tika.parser.html.HtmlParser
+org.apache.tika.parser.hwp.HwpV5Parser
 org.apache.tika.parser.image.BPGParser
 org.apache.tika.parser.image.ImageParser
 org.apache.tika.parser.image.PSDParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
new file mode 100644
index 0000000..0ed06f9
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/** Tests {@link HwpV5Parser} with the two HWP v5 files under /test-documents. */
+public class HwpV5ParserTest extends TikaTest {
+
+	private static final String PLAIN_DOC = "/test-documents/test-documents-v5.hwp";
+	private static final String DIST_DOC = "/test-documents/test-documents-v5-dist.hwp";
+
+	/** Parses the plain document by calling {@link HwpV5Parser} directly. */
+	@Test
+	public void testHwpV5Parser() throws Exception {
+		try (InputStream input = HwpV5ParserTest.class.getResourceAsStream(PLAIN_DOC)) {
+			ContentHandler handler = new BodyContentHandler();
+			Metadata metadata = new Metadata();
+			new HwpV5Parser().parse(input, handler, metadata, new ParseContext());
+
+			assertSummary(metadata);
+			assertContains("Apache Tika", handler.toString());
+		}
+	}
+
+	/** Parses the plain document through {@link AutoDetectParser}. */
+	@Test
+	public void testAutoDetectParser() throws Exception {
+		assertAutoDetected(PLAIN_DOC);
+	}
+
+	/** Parses the "-dist" document through {@link AutoDetectParser}. */
+	@Test
+	public void testDistributedHwp() throws Exception {
+		assertAutoDetected(DIST_DOC);
+	}
+
+	/**
+	 * Runs AutoDetectParser on a classpath resource and checks the extracted
+	 * text first, then the content type and the summary metadata.
+	 *
+	 * @param resource classpath location of the document to parse
+	 */
+	private void assertAutoDetected(String resource) throws Exception {
+		AutoDetectParser parser = new AutoDetectParser();
+		BodyContentHandler handler = new BodyContentHandler();
+		Metadata metadata = new Metadata();
+		try (InputStream stream = HwpV5ParserTest.class.getResourceAsStream(resource)) {
+			parser.parse(stream, handler, metadata);
+
+			assertContains("Apache Tika", handler.toString());
+			assertSummary(metadata);
+		}
+	}
+
+	/**
+	 * Checks the content type, the title and the creator, in that order.
+	 * @param metadata metadata filled in by the parser under test
+	 */
+	private void assertSummary(Metadata metadata) {
+		assertEquals("application/x-hwp-v5", metadata.get(Metadata.CONTENT_TYPE));
+		assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+		assertEquals("SooMyung Lee", metadata.get(TikaCoreProperties.CREATOR));
+	}
+}
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp b/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp
new file mode 100644
index 0000000..b71fe94
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp differ
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp b/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp
new file mode 100644
index 0000000..7746b1f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp differ