You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ju...@apache.org on 2013/12/27 16:29:40 UTC
svn commit: r1553683 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/chm/ test/java/org/apache/tika/parser/chm/
Author: jukka
Date: Fri Dec 27 15:29:40 2013
New Revision: 1553683
URL: http://svn.apache.org/r1553683
Log:
TIKA-672: Proper error handling in the CHM parser
Simplify the CHMDocumentInformation class
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java Fri Dec 27 15:29:40 2013
@@ -32,44 +32,51 @@ import org.xml.sax.SAXException;
/**
* Extracts text and metadata from chm file
- *
*/
-public class CHMDocumentInformation {
- /* Class members */
- private ChmExtractor chmExtractor = null;
+class CHMDocumentInformation {
+
+ private final ChmExtractor chmExtractor;
/**
* Loads chm file as input stream and returns a new instance of chm doc info
*
- * @param is
- * InputStream
+ * @param stream chm input stream
+ */
+ public CHMDocumentInformation(InputStream stream)
+ throws TikaException, IOException {
+ this.chmExtractor = new ChmExtractor(stream);
+ }
+
+ /**
+ * Checks if an entry is a html or not.
*
- * @return chm document information
- * @throws TikaException
- * @throws IOException
+ * @param entry
+ * chm directory listing entry
+ *
+ * @return boolean
*/
- public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
- CHMDocumentInformation document = new CHMDocumentInformation();
- document.setChmExtractor(new ChmExtractor(is));
- return document;
+ private boolean isRightEntry(DirectoryListingEntry entry) {
+ return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
}
/**
- * Appends extracted data from chm listing entries
+ * Returns extracted text from chm file
+ *
+ * @return text
*
- * @return extracted content of chm
+ * @throws TikaException
*/
- private String getContent() {
+ public String getText() throws TikaException {
StringBuilder sb = new StringBuilder();
DirectoryListingEntry entry;
- for (Iterator<DirectoryListingEntry> it = getChmExtractor()
+ for (Iterator<DirectoryListingEntry> it = chmExtractor
.getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();)
{
try {
entry = it.next();
if (isRightEntry(entry)) {
- byte[][] tmp = getChmExtractor().extractChmEntry(entry);
+ byte[][] tmp = chmExtractor.extractChmEntry(entry);
if (tmp != null) {
sb.append(extract(tmp));
}
@@ -83,65 +90,6 @@ public class CHMDocumentInformation {
}
/**
- * Checks if an entry is a html or not.
- *
- * @param entry
- * chm directory listing entry
- *
- * @return boolean
- */
- private boolean isRightEntry(DirectoryListingEntry entry) {
- return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
- }
-
- /**
- * Returns chm extractor
- *
- * @return chmExtractor
- */
- private ChmExtractor getChmExtractor() {
- return chmExtractor;
- }
-
- /**
- * Sets a chm extractor
- *
- * @param chmExtractor
- */
- private void setChmExtractor(ChmExtractor chmExtractor) {
- this.chmExtractor = chmExtractor;
- }
-
- /**
- * Returns chm metadata
- *
- * @param metadata
- *
- * @throws TikaException
- * @throws IOException
- */
- public void getCHMDocInformation(Metadata metadata) throws TikaException,
- IOException {
- if (getChmExtractor() != null) {
- /* Checking if file is a chm, done during creating chmItsf header */
- metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
- } else {
- metadata.add(Metadata.CONTENT_TYPE, "unknown");
- }
- }
-
- /**
- * Returns extracted text from chm file
- *
- * @return text
- *
- * @throws TikaException
- */
- public String getText() throws TikaException {
- return getContent();
- }
-
- /**
* Extracts data from byte[][]
*
* @param byteObject
@@ -180,7 +128,4 @@ public class CHMDocumentInformation {
return wBuf.toString();
}
- public static void main(String[] args) {
-
- }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Fri Dec 27 15:29:40 2013
@@ -50,11 +50,10 @@ public class ChmParser extends AbstractP
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
- CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
+ CHMDocumentInformation chmInfo = new CHMDocumentInformation(stream);
// metadata
metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
- chmInfo.getCHMDocInformation(metadata);
// content
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java Fri Dec 27 15:29:40 2013
@@ -14,17 +14,13 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.tika.parser.chm;
-import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
-import java.io.IOException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
@@ -33,19 +29,11 @@ public class TestChmDocumentInformation
@Before
public void setUp() throws Exception {
- chmDoc = CHMDocumentInformation.load(
+ chmDoc = new CHMDocumentInformation(
new ByteArrayInputStream(TestParameters.chmData));
}
@Test
- public void testGetCHMDocInformation() throws TikaException, IOException {
- Metadata md = new Metadata();
- chmDoc.getCHMDocInformation(md);
- assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString()
- .trim());
- }
-
- @Test
public void testGetText() throws TikaException {
assertTrue(chmDoc.getText().contains(
"The TCard method accepts only numeric arguments"));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java Fri Dec 27 15:29:40 2013
@@ -14,10 +14,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.tika.parser.chm;
-import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
@@ -29,7 +27,6 @@ import java.util.concurrent.Executors;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
-import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
@@ -56,13 +53,10 @@ public class TestChmExtraction {
try {
stream = TestChmExtraction.class.getResourceAsStream(fileName);
- CHMDocumentInformation chmDocInfo = CHMDocumentInformation
- .load(stream);
- Metadata md = new Metadata();
+ CHMDocumentInformation chmDocInfo =
+ new CHMDocumentInformation(stream);
mutex.lock();
String text = chmDocInfo.getText();
- chmDocInfo.getCHMDocInformation(md);
- assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString().trim());
assertTrue(text.length() > 0);
} catch (Exception e) {
e.printStackTrace();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java Fri Dec 27 15:29:40 2013
@@ -27,7 +27,6 @@ import java.util.Iterator;
import java.util.List;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.core.ChmExtractor;
@@ -80,11 +79,8 @@ public class TestChmExtractor {
InputStream stream =
TestChmBlockInfo.class.getResourceAsStream(fileName);
try {
- CHMDocumentInformation chmDocInfo = CHMDocumentInformation.load(stream);
- Metadata md = new Metadata();
+ CHMDocumentInformation chmDocInfo = new CHMDocumentInformation(stream);
String text = chmDocInfo.getText();
- chmDocInfo.getCHMDocInformation(md);
- assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString().trim());
assertTrue(text.length() > 0);
} finally {
stream.close();