You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/22 13:26:34 UTC
[tika] branch master updated: Tika 2909 (#277)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new c21ba30 Tika 2909 (#277)
c21ba30 is described below
commit c21ba30e2b8bf3b9482c833cba4b150a7e3758a7
Author: soomyung <so...@gmail.com>
AuthorDate: Mon Jul 22 22:26:29 2019 +0900
Tika 2909 (#277)
* add HWP-V5 Parser
* add apache license at HwpV5Parser file
* modify TestCase
make AutoDetectParser detect HWP v5 Parser
* modify hwp
* mod
* remove commented and not used codes
* modify type casting bug
---
tika-parsers/pom.xml | 17 +-
.../apache/tika/parser/hwp/HwpStreamReader.java | 131 ++++++
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 500 +++++++++++++++++++++
.../org/apache/tika/parser/hwp/HwpV5Parser.java | 68 +++
.../services/org.apache.tika.parser.Parser | 1 +
.../apache/tika/parser/hwp/HwpV5ParserTest.java | 92 ++++
.../test-documents/test-documents-v5-dist.hwp | Bin 0 -> 19968 bytes
.../resources/test-documents/test-documents-v5.hwp | Bin 0 -> 70144 bytes
8 files changed, 807 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index bc0d79c..8ca6703 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -956,7 +956,7 @@
<pluginExecutionFilter>
<groupId>org.apache.felix</groupId>
<artifactId>maven-scr-plugin</artifactId>
- <version>${maven.scr.version}</version>
+ <versionRange>${maven.scr.version}</versionRange>
<goals>
<goal>scr</goal>
</goals>
@@ -969,7 +969,7 @@
<pluginExecutionFilter>
<groupId>org.codehaus.gmaven</groupId>
<artifactId>groovy-maven-plugin</artifactId>
- <version>${groovy.maven.version}</version>
+ <versionRange>${groovy.maven.version}</versionRange>
<goals>
<goal>execute</goal>
</goals>
@@ -978,6 +978,19 @@
<ignore />
</action>
</pluginExecution>
+ <pluginExecution>
+ <pluginExecutionFilter>
+ <groupId>org.sonatype.ossindex.maven</groupId>
+ <artifactId>ossindex-maven-plugin</artifactId>
+ <versionRange>[3.0.4,)</versionRange>
+ <goals>
+ <goal>audit</goal>
+ </goals>
+ </pluginExecutionFilter>
+ <action>
+ <ignore></ignore>
+ </action>
+ </pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
new file mode 100644
index 0000000..badcf20
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Thin wrapper around an {@link InputStream} that decodes the little-endian
+ * unsigned integers used by HWP record streams.
+ *
+ * <p>The single-value readers share one scratch buffer, so an instance must
+ * not be used from several threads at once.</p>
+ */
+public class HwpStreamReader {
+    private final InputStream input;
+    /** Scratch space for the fixed-width readers; at most 4 bytes are used. */
+    private final byte[] buf = new byte[4];
+
+    /**
+     * @param inputStream stream to decode; it is neither buffered nor closed
+     *                    by this class
+     */
+    public HwpStreamReader(InputStream inputStream) {
+        this.input = inputStream;
+    }
+
+    /**
+     * Reports whether the underlying stream claims to have bytes available.
+     *
+     * @return {@code true} when {@link InputStream#available()} is positive;
+     *         that value is only an estimate, so {@code false} does not prove
+     *         end of stream
+     * @throws IOException if the underlying stream fails
+     */
+    public boolean available() throws IOException {
+        return input.available() > 0;
+    }
+
+    /**
+     * Reads one unsigned byte.
+     *
+     * @return the value (0-255), or -1 when the stream is already exhausted
+     * @throws IOException if the underlying stream fails
+     */
+    public short uint8() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 1);
+        if (read == -1) {
+            return -1;
+        }
+        return LittleEndian.getUByte(buf);
+    }
+
+    /**
+     * Reads one unsigned little-endian 16-bit value.
+     *
+     * @return the value (0-65535), or -1 when the stream is already exhausted
+     * @throws EOFException if the stream ends after only one byte
+     * @throws IOException  if the underlying stream fails
+     */
+    public int uint16() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 2);
+        if (read == -1) {
+            return -1;
+        }
+        if (read < 2) {
+            throw new EOFException();
+        }
+        return LittleEndian.getUShort(buf);
+    }
+
+    /**
+     * Reads {@code i} consecutive unsigned little-endian 16-bit values.
+     *
+     * @param i number of values to read; must be positive
+     * @return array of {@code i} values
+     * @throws IllegalArgumentException if {@code i} is zero or negative
+     * @throws EOFException             if the stream ends before all values
+     *                                  were read
+     * @throws IOException              if {@code i} is so large that the byte
+     *                                  count does not fit in an {@code int},
+     *                                  or if the underlying stream fails
+     */
+    public int[] uint16(int i) throws IOException {
+        if (i <= 0) {
+            throw new IllegalArgumentException("count must be positive: " + i);
+        }
+        // i * 2 would silently overflow and surface as an unchecked
+        // NegativeArraySizeException; report it as a malformed stream instead.
+        if (i > Integer.MAX_VALUE / 2) {
+            throw new IOException("Too many 16-bit values requested: " + i);
+        }
+
+        int byteCount = i * 2;
+        byte[] data = new byte[byteCount];
+        int read = IOUtils.readFully(input, data, 0, byteCount);
+        if (read != byteCount) {
+            throw new EOFException();
+        }
+
+        int[] uints = new int[i];
+        for (int index = 0; index < i; index++) {
+            uints[index] = LittleEndian.getUShort(data, index * 2);
+        }
+        return uints;
+    }
+
+    /**
+     * Reads one unsigned little-endian 32-bit value.
+     *
+     * @return the value (0-4294967295), or -1 when the stream is already
+     *         exhausted
+     * @throws EOFException if the stream ends after one to three bytes
+     * @throws IOException  if the underlying stream fails
+     */
+    public long uint32() throws IOException {
+        int read = IOUtils.readFully(input, buf, 0, 4);
+        if (read == -1) {
+            return -1;
+        }
+        if (read < 4) {
+            throw new EOFException();
+        }
+        return LittleEndian.getUInt(buf);
+    }
+
+    /**
+     * Skips exactly {@code n} bytes of the underlying stream.
+     *
+     * @param n number of bytes to skip
+     * @throws IOException if the bytes cannot be skipped
+     */
+    public void ensureSkip(long n) throws IOException {
+        IOUtils.skipFully(input, n);
+    }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
new file mode 100644
index 0000000..9369873
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -0,0 +1,500 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.EOFException;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.InvalidKeyException;
+import java.security.Key;
+import java.security.NoSuchAlgorithmException;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+
+import javax.crypto.Cipher;
+import javax.crypto.CipherInputStream;
+import javax.crypto.NoSuchPaddingException;
+import javax.crypto.spec.SecretKeySpec;
+
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.UnsupportedFormatException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+public class HwpTextExtractorV5 {
+ protected static Logger log = LoggerFactory
+         .getLogger(HwpTextExtractorV5.class);
+
+ // Leading bytes of the "FileHeader" stream. Encoded explicitly as ASCII so
+ // the comparison does not depend on the JVM's default charset.
+ private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
+         .getBytes(java.nio.charset.StandardCharsets.US_ASCII);
+
+ // Base value that record tag ids are expressed relative to.
+ private static final int HWPTAG_BEGIN = 0x010;
+
+ // Categories of the character codes below 32 inside paragraph text.
+ private static final int I = 1; // INLINE
+ private static final int C = 2; // CONTROL
+ private static final int X = 3; // EXTENDED
+
+ // Category of each character code 0..31, indexed by the code itself.
+ private static final int[] HWP_CHAR_TYPE = new int[] {
+         C, X, X, X, I, I, I, I, I, I, // 0-9
+         C, X, X, C, X, X, X, X, X, I, // 10-19
+         I, X, X, X, C, C, C, C, C, C, // 20-29
+         C, C }; // 30-31
+
+
+ /**
+  * Extracts metadata and text from an HWP v5 document.
+  *
+  * <p>The stream is opened as an OLE2 (POIFS) container. Any
+  * {@link IOException} raised while opening or reading it is rethrown
+  * wrapped in a {@link TikaException}. The file system is closed quietly
+  * before returning.</p>
+  *
+  * @param source   stream positioned at the start of the document
+  * @param metadata receives the summary-information properties
+  * @param xhtml    receives one {@code <p>} element per extracted paragraph
+  * @throws IllegalArgumentException if {@code source} or {@code xhtml} is null
+  * @throws TikaException            if the container cannot be read or is
+  *                                  not a supported HWP v5 document
+  * @throws SAXException             if the content handler fails
+  */
+ public void extract(InputStream source, Metadata metadata, XHTMLContentHandler xhtml)
+         throws FileNotFoundException, IOException,
+         TikaException, SAXException {
+     if (source == null) {
+         throw new IllegalArgumentException();
+     }
+     if (xhtml == null) {
+         throw new IllegalArgumentException();
+     }
+
+     POIFSFileSystem fileSystem = null;
+     try {
+         fileSystem = new POIFSFileSystem(source);
+         extract0(fileSystem.getRoot(), metadata, xhtml);
+     } catch (IOException e) {
+         throw new TikaException(
+                 "error occurred when parsing HWP Format, It may not HWP Format.", e);
+     } finally {
+         IOUtils.closeQuietly(fileSystem);
+     }
+ }
+
+ /**
+  * Validates the file header, copies the summary information into
+  * {@code metadata} and then extracts text from either the "ViewText" or
+  * the "BodyText" storage, depending on the header's view-text flag.
+  *
+  * @throws UnsupportedFormatException if the "FileHeader" entry is not a
+  *                                    document or cannot be parsed
+  * @throws EncryptedDocumentException if the header marks the document as
+  *                                    encrypted
+  */
+ private void extract0(DirectoryNode root, Metadata metadata, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+
+     Entry headerEntry = root.getEntry("FileHeader");
+     FileHeader header = headerEntry.isDocumentEntry() ? getHeader(headerEntry) : null;
+     if (header == null) {
+         throw new UnsupportedFormatException("cannot parse the File Header");
+     }
+     if (header.encrypted) {
+         throw new EncryptedDocumentException("document is encrypted");
+     }
+
+     parseSummaryInformation(root, metadata);
+
+     if (!header.viewtext) {
+         parseBodyText(header, root, xhtml);
+         return;
+     }
+     parseViewText(header, root, xhtml);
+ }
+
+ /**
+  * Reads the "\u0005HwpSummaryInformation" property set and copies the
+  * recognised properties into {@code metadata}.
+  *
+  * @param root     root directory of the OLE2 container
+  * @param metadata receives the properties
+  * @throws TikaException an {@link UnsupportedFormatException} when the
+  *                       entry is missing or is not a readable property
+  *                       set; the underlying failure is attached as cause
+  */
+ private void parseSummaryInformation(DirectoryNode root, Metadata metadata) throws TikaException {
+
+     try {
+         Entry summaryEntry = root.getEntry("\u0005HwpSummaryInformation");
+
+         populateMatadata(summaryEntry, metadata);
+
+     } catch (NoPropertySetStreamException | IOException e) {
+         // The exception is created from a message only, so the original
+         // failure is attached afterwards to keep it in the stack trace.
+         UnsupportedFormatException failure = new UnsupportedFormatException(
+                 "cannot parse the Summary Information");
+         failure.initCause(e);
+         throw failure;
+     }
+
+ }
+
+ /**
+  * Copies the recognised summary properties into {@code metadata}.
+  *
+  * <p>Property values come from the file, so each one is type-checked
+  * before use. A value of an unexpected type is ignored rather than
+  * aborting the whole parse with a {@link ClassCastException}.</p>
+  *
+  * @param summaryEntry the summary-information entry of the container
+  * @param metadata     receives the properties
+  * @throws IOException                  if the entry is not a document or
+  *                                      cannot be read
+  * @throws NoPropertySetStreamException if the entry is not a property set
+  */
+ private void populateMatadata(Entry summaryEntry, Metadata metadata) throws IOException, NoPropertySetStreamException {
+
+     if (!(summaryEntry instanceof DocumentEntry)) {
+         throw new IOException("Summary Information is not a document entry");
+     }
+
+     PropertySet ps;
+     // The stream was previously left open; close it once it has been read.
+     try (DocumentInputStream summaryStream = new DocumentInputStream(
+             (DocumentEntry) summaryEntry)) {
+         ps = new PropertySet(summaryStream);
+     }
+
+     for (Property prop : ps.getProperties()) {
+         int propID = (int) prop.getID();
+         Object value = prop.getValue();
+
+         switch (propID) {
+         case 2:
+             setText(metadata, TikaCoreProperties.TITLE, value);
+             break;
+         case 3:
+             setText(metadata, OfficeOpenXMLCore.SUBJECT, value);
+             break;
+         case 4:
+             setText(metadata, TikaCoreProperties.CREATOR, value);
+             break;
+         case 5:
+             setText(metadata, Office.KEYWORDS, value);
+             break;
+         case 6:
+             setText(metadata, TikaCoreProperties.COMMENTS, value);
+             break;
+         case 8:
+             setText(metadata, TikaCoreProperties.MODIFIER, value);
+             break;
+         case 12:
+             setDate(metadata, TikaCoreProperties.CREATED, value);
+             break;
+         case 13:
+             setDate(metadata, TikaCoreProperties.MODIFIED, value);
+             break;
+         case 14:
+             if (value instanceof Number) {
+                 metadata.set(Office.PAGE_COUNT, ((Number) value).intValue());
+             }
+             break;
+         default:
+         }
+     }
+ }
+
+ /** Stores {@code value} under {@code property} when it is a string or null. */
+ private static void setText(Metadata metadata,
+         org.apache.tika.metadata.Property property, Object value) {
+     if (value == null || value instanceof String) {
+         metadata.set(property, (String) value);
+     }
+ }
+
+ /** Stores {@code value} under {@code property} when it is a date or null. */
+ private static void setDate(Metadata metadata,
+         org.apache.tika.metadata.Property property, Object value) {
+     if (value == null || value instanceof Date) {
+         metadata.set(property, (Date) value);
+     }
+ }
+
+ /**
+  * Reads the 256-byte HWP file header and decodes its version and flags.
+  *
+  * @param headerEntry the "FileHeader" entry; must be a document entry
+  * @return the decoded header, or {@code null} when fewer than 256 bytes
+  *         were read or the signature does not match
+  * @throws IOException if the entry cannot be read
+  */
+ private FileHeader getHeader(Entry headerEntry) throws IOException {
+     byte[] header = new byte[256]; // the length of File header is 256
+     int read;
+     try (DocumentInputStream headerStream = new DocumentInputStream(
+             (DocumentEntry) headerEntry)) {
+         read = headerStream.read(header);
+     }
+
+     // confirm length and signature
+     if (read != 256) {
+         return null;
+     }
+     byte[] signature = Arrays.copyOfRange(header, 0, HWP_V5_SIGNATURE.length);
+     if (!Arrays.equals(HWP_V5_SIGNATURE, signature)) {
+         return null;
+     }
+
+     // version at offset 32, flag word at offset 36
+     long versionWord = LittleEndian.getUInt(header, 32);
+     long flags = LittleEndian.getUInt(header, 36);
+     log.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));
+
+     FileHeader fileHeader = new FileHeader();
+     fileHeader.version = HwpVersion.parseVersion(versionWord);
+     fileHeader.compressed = (flags & 0x01) != 0;
+     fileHeader.encrypted = (flags & 0x02) != 0;
+     fileHeader.viewtext = (flags & 0x04) != 0;
+     return fileHeader;
+ }
+
+ /**
+  * Extracts text from every "Section*" stream of the "BodyText" storage.
+  *
+  * <p>Each section stream is closed after it has been parsed. When the
+  * header marks the document as compressed, the raw-deflate
+  * {@link Inflater} is ended as well, because an inflater passed to
+  * {@link InflaterInputStream} is not released by closing the stream.</p>
+  *
+  * @param header decoded file header; only {@code compressed} is read
+  * @param root   root directory of the OLE2 container
+  * @param xhtml  receives the extracted paragraphs
+  * @throws IOException  if "BodyText" is not a directory or a section
+  *                      cannot be read
+  * @throws SAXException if the content handler fails
+  */
+ private void parseBodyText(FileHeader header, DirectoryNode root,
+         XHTMLContentHandler xhtml) throws IOException, SAXException {
+     // read BodyText
+     Entry bodyText = root.getEntry("BodyText");
+     if (bodyText == null || !bodyText.isDirectoryEntry()) {
+         throw new IOException("Invalid BodyText");
+     }
+
+     Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
+     while (iterator.hasNext()) {
+         Entry entry = iterator.next();
+         if (!entry.getName().startsWith("Section")
+                 || !(entry instanceof DocumentEntry)) {
+             log.warn("Unknown Entry '{}'({})", entry.getName(), entry);
+             continue;
+         }
+         log.debug("extract {}", entry.getName());
+
+         InputStream input = new DocumentInputStream((DocumentEntry) entry);
+         Inflater inflater = null;
+         try {
+             if (header.compressed) {
+                 inflater = new Inflater(true);
+                 input = new InflaterInputStream(input, inflater);
+             }
+             parse(new HwpStreamReader(input), xhtml);
+         } finally {
+             IOUtils.closeQuietly(input);
+             if (inflater != null) {
+                 inflater.end();
+             }
+         }
+     }
+ }
+
+ /**
+  * Extracts text from every "Section*" stream of the "ViewText" storage.
+  *
+  * <p>Each section starts with a key block (see {@code readKey}); the rest
+  * of the stream is AES-decrypted and, when the header marks the document
+  * as compressed, raw-deflate decompressed before being parsed. The stream
+  * is closed and the {@link Inflater} ended after each section.</p>
+  *
+  * @param header decoded file header; only {@code compressed} is read
+  * @param root   root directory of the OLE2 container
+  * @param xhtml  receives the extracted paragraphs
+  * @throws IOException if "ViewText" is not a directory, if a section
+  *                     cannot be read, or wrapping any cipher set-up
+  *                     failure or {@link SAXException} from the handler
+  */
+ private void parseViewText(FileHeader header, DirectoryNode root,
+         XHTMLContentHandler xhtml) throws IOException {
+     // read ViewText
+     Entry viewText = root.getEntry("ViewText");
+     if (viewText == null || !viewText.isDirectoryEntry()) {
+         throw new IOException("Invalid ViewText");
+     }
+
+     Iterator<Entry> iterator = ((DirectoryEntry) viewText).getEntries();
+     while (iterator.hasNext()) {
+         Entry entry = iterator.next();
+         if (!entry.getName().startsWith("Section")
+                 || !(entry instanceof DocumentEntry)) {
+             log.warn("unknown Entry '{}'({})", entry.getName(), entry);
+             continue;
+         }
+         log.debug("extract {}", entry.getName());
+
+         InputStream input = new DocumentInputStream((DocumentEntry) entry);
+         Inflater inflater = null;
+         try {
+             Key key = readKey(input);
+             input = createDecryptStream(input, key);
+             if (header.compressed) {
+                 inflater = new Inflater(true);
+                 input = new InflaterInputStream(input, inflater);
+             }
+
+             parse(new HwpStreamReader(input), xhtml);
+         } catch (InvalidKeyException | NoSuchAlgorithmException
+                 | NoSuchPaddingException | SAXException e) {
+             throw new IOException(e);
+         } finally {
+             IOUtils.closeQuietly(input);
+             // Closing the stream does not release an inflater that was
+             // supplied by the caller.
+             if (inflater != null) {
+                 inflater.end();
+             }
+         }
+     }
+ }
+
+ /**
+ * Derives the AES key of a "ViewText" section from the block at the start
+ * of its stream.
+ *
+ * Consumes 4 + 256 bytes from {@code input}: a 4-byte record header that
+ * is discarded, followed by 256 bytes of obfuscated key material.
+ *
+ * @param input section stream positioned at its first byte
+ * @return a 16-byte key labelled "AES"
+ * @throws EOFException if the stream ends before 260 bytes were read
+ * @throws IOException if the stream cannot be read
+ */
+ private Key readKey(InputStream input) throws IOException {
+ byte[] data = new byte[260];
+
+ // The record header is read only to advance the stream; the next read
+ // overwrites these four bytes.
+ if (IOUtils.readFully(input, data, 0, 4) != 4)// TAG,
+ throw new EOFException();
+
+ if (IOUtils.readFully(input, data, 0, 256) != 256)
+ throw new EOFException();
+
+ // The first four bytes (little-endian int) seed the generator and are
+ // themselves left untouched by the XOR pass below.
+ SRand srand = new SRand(LittleEndian.getInt(data));
+ byte xor = 0;
+ // n counts down the bytes remaining in the current run. Whenever it
+ // reaches 0 a new XOR byte and a new run length of 1..16 are drawn.
+ // The generator is advanced from i = 0 even though only i >= 4 is
+ // modified, so the order of the rand() calls must not change.
+ for (int i = 0, n = 0; i < 256; i++, n--) {
+ if (n == 0) {
+ xor = (byte) (srand.rand() & 0xFF);
+ n = (int) ((srand.rand() & 0xF) + 1);
+ }
+ if (i >= 4) {
+ data[i] = (byte) ((data[i]) ^ (xor));
+ }
+ }
+
+ // The low nibble of the first byte selects where the key starts:
+ // offset is 4..19, so the 16 key bytes always lie inside the block.
+ int offset = 4 + (data[0] & 0xF); // 4 + (0~15) ?
+ byte[] key = Arrays.copyOfRange(data, offset, offset + 16);
+
+ SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
+ return secretKey;
+ }
+
+ /**
+  * Wraps {@code input} in a stream that decrypts it with
+  * "AES/ECB/NoPadding" using {@code key}.
+  *
+  * @param input encrypted stream
+  * @param key   decryption key
+  * @return stream yielding the decrypted bytes
+  * @throws NoSuchAlgorithmException if the transformation is unavailable
+  * @throws NoSuchPaddingException   if the padding scheme is unavailable
+  * @throws InvalidKeyException      if the cipher rejects {@code key}
+  */
+ public InputStream createDecryptStream(InputStream input, Key key)
+         throws IOException, NoSuchAlgorithmException,
+         NoSuchPaddingException, InvalidKeyException {
+     Cipher decryptor = Cipher.getInstance("AES/ECB/NoPadding");
+     decryptor.init(Cipher.DECRYPT_MODE, key);
+     return new CipherInputStream(input, decryptor);
+ }
+
+ /**
+ * extract characters from Section stream
+ *
+ * @param reader
+ * @param writer
+ * @throws IOException
+ * @throws SAXException
+ */
+ private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ StringBuffer buf = new StringBuffer(1024);
+ TagInfo tag = new TagInfo();
+
+ while (true) {
+ if (!readTag(reader, tag))
+ break;
+
+ if (HWPTAG_BEGIN + 51 == tag.id) {
+ if (tag.length % 2 != 0)
+ throw new IOException("Invalid block size");
+
+ buf.setLength(0);
+ writeParaText(reader, tag.length, buf);
+
+ if (buf.length() > 0) {
+ buf.append('\n');
+
+ xhtml.startElement("p");
+ xhtml.characters(buf.toString());
+ xhtml.endElement("p");
+ }
+ } else {
+ reader.ensureSkip(tag.length);
+ }
+ }
+ }
+
+
+ /**
+  * Decodes the payload of a paragraph-text record into {@code buf}.
+  *
+  * <p>The payload is a sequence of 16-bit code units. Codes of 32 and
+  * above are appended as characters. Codes below 32 are looked up in
+  * {@code HWP_CHAR_TYPE}: inline and extended codes occupy eight code
+  * units in total and are skipped (a tab, code 9, is appended first),
+  * while control codes occupy one unit and are replaced by a space.</p>
+  *
+  * @param reader   stream positioned at the start of the payload
+  * @param datasize payload length in bytes
+  * @param buf      receives the decoded text
+  * @throws IOException if the payload is too large to buffer or the
+  *                     stream ends early
+  */
+ private void writeParaText(HwpStreamReader reader, long datasize,
+         StringBuffer buf) throws IOException {
+     long unitCount = datasize / 2;
+     if (unitCount <= 0) {
+         // An empty record has no text; asking the reader for zero values
+         // would raise an unchecked IllegalArgumentException.
+         return;
+     }
+     if (unitCount > Integer.MAX_VALUE / 2) {
+         // The reader buffers unitCount * 2 bytes in an int-sized array.
+         throw new IOException("Invalid block size");
+     }
+
+     int[] chars = reader.uint16((int) unitCount);
+
+     for (int index = 0; index < chars.length; index++) {
+         int ch = chars[index];
+         if (ch >= 32) {
+             buf.append((char) ch);
+             continue;
+         }
+
+         if (ch == 9) { // tab, INLINE
+             buf.append('\t');
+         }
+         int type = HWP_CHAR_TYPE[ch];
+         if (type == C) { // CONTROL: a single code unit
+             buf.append(' ');
+         } else if (type == I || type == X) {
+             // INLINE / EXTENDED: skip the seven trailing code units
+             index += 7;
+         }
+     }
+ }
+
+ /**
+  * Reads the next record header into {@code tag}.
+  *
+  * <p>The 32-bit header packs the tag id in bits 0-9, the level in bits
+  * 10-19 and the length in bits 20-31. A length field of 0xFFF means the
+  * real length follows as a separate 32-bit value (see p.24 of the hwp
+  * 5.0 format guide).</p>
+  *
+  * @param reader stream positioned at a record boundary
+  * @param tag    receives id, level and length
+  * @return {@code false} when the stream is exhausted at a record
+  *         boundary, {@code true} when a header was read
+  * @throws EOFException if the stream ends inside a header or before the
+  *                      extended length
+  * @throws IOException  if the stream cannot be read
+  */
+ private boolean readTag(HwpStreamReader reader, TagInfo tag)
+         throws IOException {
+     long recordHeader = reader.uint32();
+     if (recordHeader == -1) {
+         return false;
+     }
+
+     tag.id = recordHeader & 0x3FF;
+     tag.level = (recordHeader >> 10) & 0x3FF;
+     tag.length = (recordHeader >> 20) & 0xFFF;
+
+     if (tag.length == 0xFFF) {
+         long extendedLength = reader.uint32();
+         if (extendedLength == -1) {
+             // Without this check the EOF marker would be stored as a
+             // record length of -1.
+             throw new EOFException("Missing extended record length");
+         }
+         tag.length = extendedLength;
+     }
+
+     return true;
+ }
+
+ /**
+  * Linear congruential generator used to de-obfuscate the key block:
+  * {@code state = state * 214013 + 2531011} in wrapping 32-bit arithmetic,
+  * yielding bits 16-30 of the new state.
+  */
+ private static class SRand {
+     private int state;
+
+     private SRand(int seed) {
+         state = seed;
+     }
+
+     /** Advances the generator and returns a value in the range 0..32767. */
+     private int rand() {
+         // int arithmetic already wraps at 32 bits
+         state = state * 214013 + 2531011;
+         return (state >> 16) & 0x7FFF;
+     }
+ }
+
+ /** Decoded contents of the "FileHeader" stream. */
+ static class FileHeader {
+ // version number decoded from the 32-bit value at header offset 32
+ HwpVersion version;
+ // the flags below come from the 32-bit value at header offset 36
+ boolean compressed; // bit 0
+ boolean encrypted; // bit 1
+ boolean viewtext; // bit 2
+ }
+
+ /** Mutable holder for one record header, filled in by readTag. */
+ static class TagInfo {
+ // bits 0-9 of the record header
+ long id;
+ // bits 10-19 of the record header
+ long level;
+ // bits 20-31 of the record header, or the following 32-bit value
+ // when those bits are all set
+ long length;
+ }
+
+ /** Four-part version number packed one byte per part into 32 bits. */
+ static class HwpVersion {
+     int m;
+     int n;
+     int p;
+     int r;
+
+     /** Formats the version as {@code m.n.p.r}. */
+     @Override
+     public String toString() {
+         return String.format("%d.%d.%d.%d", m, n, p, r);
+     }
+
+     /**
+      * Splits a packed version into its four bytes, most significant
+      * byte first.
+      *
+      * @param longVersion packed version; only the low 32 bits are used
+      * @return the decoded version
+      */
+     public static HwpVersion parseVersion(long longVersion) {
+         HwpVersion parsed = new HwpVersion();
+         parsed.m = (int) ((longVersion >>> 24) & 0xFF);
+         parsed.n = (int) ((longVersion >>> 16) & 0xFF);
+         parsed.p = (int) ((longVersion >>> 8) & 0xFF);
+         parsed.r = (int) (longVersion & 0xFF);
+         return parsed;
+     }
+ }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
new file mode 100644
index 0000000..98d724d
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpV5Parser.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for HWP v5 documents ({@code application/x-hwp-v5}).
+ *
+ * <p>The parser keeps no instance state: a fresh
+ * {@link HwpTextExtractorV5} is created for every call. Holding the
+ * extractor in a field would make this class, which declares a
+ * {@code serialVersionUID}, fail Java serialization, because the extractor
+ * is not serializable.</p>
+ */
+public class HwpV5Parser extends AbstractParser {
+
+    private static final long serialVersionUID = 1L;
+
+    public static final String HWP_MIME_TYPE = "application/x-hwp-v5";
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("x-hwp-v5"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Sets the content type, then streams the document's paragraphs to
+     * {@code handler} as XHTML. The XHTML document is ended even when
+     * extraction fails.
+     *
+     * @param stream   the HWP v5 document
+     * @param handler  receives the XHTML SAX events
+     * @param metadata receives the content type and summary properties
+     * @param context  parse context (not consulted by this parser)
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        metadata.set(Metadata.CONTENT_TYPE, HWP_MIME_TYPE);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        try {
+            new HwpTextExtractorV5().extract(stream, metadata, xhtml);
+        } finally {
+            xhtml.endDocument();
+        }
+    }
+
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 330d473..6cdba30 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -28,6 +28,7 @@ org.apache.tika.parser.feed.FeedParser
org.apache.tika.parser.font.AdobeFontMetricParser
org.apache.tika.parser.font.TrueTypeParser
org.apache.tika.parser.html.HtmlParser
+org.apache.tika.parser.hwp.HwpV5Parser
org.apache.tika.parser.image.BPGParser
org.apache.tika.parser.image.ImageParser
org.apache.tika.parser.image.PSDParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
new file mode 100644
index 0000000..0ed06f9
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+
+
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.hwp;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests HWP v5 parsing, both through {@link HwpV5Parser} directly and
+ * through {@link AutoDetectParser}, for a regular and a distribution
+ * document.
+ */
+public class HwpV5ParserTest extends TikaTest {
+
+    private static final String HWP_V5 = "/test-documents/test-documents-v5.hwp";
+    private static final String HWP_V5_DIST = "/test-documents/test-documents-v5-dist.hwp";
+
+    @Test
+    public void testHwpV5Parser() throws Exception {
+        try (InputStream input = HwpV5ParserTest.class.getResourceAsStream(HWP_V5)) {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new HwpV5Parser().parse(input, handler, metadata, new ParseContext());
+
+            assertExpectedMetadata(metadata);
+            assertContains("Apache Tika", handler.toString());
+        }
+    }
+
+    @Test
+    public void testAutoDetectParser() throws Exception {
+        assertAutoDetected(HWP_V5);
+    }
+
+    @Test
+    public void testDistributedHwp() throws Exception {
+        assertAutoDetected(HWP_V5_DIST);
+    }
+
+    /** Parses the resource with auto-detection and checks text and metadata. */
+    private void assertAutoDetected(String resource) throws Exception {
+        AutoDetectParser parser = new AutoDetectParser();
+        BodyContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        try (InputStream stream = HwpV5ParserTest.class.getResourceAsStream(resource)) {
+            parser.parse(stream, handler, metadata);
+
+            assertContains("Apache Tika", handler.toString());
+            assertExpectedMetadata(metadata);
+        }
+    }
+
+    /** Checks the content type, title and creator shared by both test files. */
+    private void assertExpectedMetadata(Metadata metadata) {
+        assertEquals(
+                "application/x-hwp-v5",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("SooMyung Lee", metadata.get(TikaCoreProperties.CREATOR));
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp b/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp
new file mode 100644
index 0000000..b71fe94
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp differ
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp b/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp
new file mode 100644
index 0000000..7746b1f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp differ