You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2013/12/27 16:29:40 UTC

svn commit: r1553683 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/chm/ test/java/org/apache/tika/parser/chm/

Author: jukka
Date: Fri Dec 27 15:29:40 2013
New Revision: 1553683

URL: http://svn.apache.org/r1553683
Log:
TIKA-672: Proper error handling in the CHM parser

Simplify the CHMDocumentInformation class

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/CHMDocumentInformation.java Fri Dec 27 15:29:40 2013
@@ -32,44 +32,51 @@ import org.xml.sax.SAXException;
 
 /**
  * Extracts text and metadata from chm file
- * 
  */
-public class CHMDocumentInformation {
-    /* Class members */
-    private ChmExtractor chmExtractor = null;
+class CHMDocumentInformation {
+
+    private final ChmExtractor chmExtractor;
 
     /**
      * Loads chm file as input stream and returns a new instance of chm doc info
      * 
-     * @param is
-     *            InputStream
+     * @param stream chm input stream
+     */
+    public CHMDocumentInformation(InputStream stream)
+            throws TikaException, IOException {
+        this.chmExtractor = new ChmExtractor(stream);
+    }
+
+    /**
+     * Checks if an entry is a html or not.
      * 
-     * @return chm document information
-     * @throws TikaException 
-     * @throws IOException 
+     * @param entry
+     *            chm directory listing entry
+     * 
+     * @return boolean
      */
-    public static CHMDocumentInformation load(InputStream is) throws TikaException, IOException {
-        CHMDocumentInformation document = new CHMDocumentInformation();
-        document.setChmExtractor(new ChmExtractor(is));
-        return document;
+    private boolean isRightEntry(DirectoryListingEntry entry) {
+        return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
     }
 
     /**
-     * Appends extracted data from chm listing entries
+     * Returns extracted text from chm file
+     * 
+     * @return text
      * 
-     * @return extracted content of chm
+     * @throws TikaException
      */
-    private String getContent() {
+    public String getText() throws TikaException {
         StringBuilder sb = new StringBuilder();
         DirectoryListingEntry entry;
         
-        for (Iterator<DirectoryListingEntry> it = getChmExtractor()
+        for (Iterator<DirectoryListingEntry> it = chmExtractor
                 .getChmDirList().getDirectoryListingEntryList().iterator(); it.hasNext();) 
         {
             try {
                 entry = it.next();
                 if (isRightEntry(entry)) {
-                    byte[][] tmp = getChmExtractor().extractChmEntry(entry);
+                    byte[][] tmp = chmExtractor.extractChmEntry(entry);
                     if (tmp != null) {
                         sb.append(extract(tmp));
                     }
@@ -83,65 +90,6 @@ public class CHMDocumentInformation {
     }
 
     /**
-     * Checks if an entry is a html or not.
-     * 
-     * @param entry
-     *            chm directory listing entry
-     * 
-     * @return boolean
-     */
-    private boolean isRightEntry(DirectoryListingEntry entry) {
-        return (entry.getName().endsWith(".html") || entry.getName().endsWith(".htm"));
-    }
-
-    /**
-     * Returns chm extractor
-     * 
-     * @return chmExtractor
-     */
-    private ChmExtractor getChmExtractor() {
-        return chmExtractor;
-    }
-
-    /**
-     * Sets a chm extractor
-     * 
-     * @param chmExtractor
-     */
-    private void setChmExtractor(ChmExtractor chmExtractor) {
-        this.chmExtractor = chmExtractor;
-    }
-
-    /**
-     * Returns chm metadata
-     * 
-     * @param metadata
-     * 
-     * @throws TikaException
-     * @throws IOException
-     */
-    public void getCHMDocInformation(Metadata metadata) throws TikaException,
-            IOException {
-        if (getChmExtractor() != null) {
-            /* Checking if file is a chm, done during creating chmItsf header */
-            metadata.add(Metadata.CONTENT_TYPE, "application/x-chm");
-        } else {
-            metadata.add(Metadata.CONTENT_TYPE, "unknown");
-        }
-    }
-
-    /**
-     * Returns extracted text from chm file
-     * 
-     * @return text
-     * 
-     * @throws TikaException
-     */
-    public String getText() throws TikaException {
-        return getContent();
-    }
-
-    /**
      * Extracts data from byte[][]
      * 
      * @param byteObject
@@ -180,7 +128,4 @@ public class CHMDocumentInformation {
         return wBuf.toString();
     }
 
-    public static void main(String[] args) {
-
-    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/chm/ChmParser.java Fri Dec 27 15:29:40 2013
@@ -50,11 +50,10 @@ public class ChmParser extends AbstractP
     public void parse(InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context) throws IOException,
             SAXException, TikaException {
-        CHMDocumentInformation chmInfo = CHMDocumentInformation.load(stream);
+        CHMDocumentInformation chmInfo = new CHMDocumentInformation(stream);
 
         // metadata
         metadata.set(Metadata.CONTENT_TYPE, "application/vnd.ms-htmlhelp");
-        chmInfo.getCHMDocInformation(metadata);
 
         // content
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmDocumentInformation.java Fri Dec 27 15:29:40 2013
@@ -14,17 +14,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.tika.parser.chm;
 
-import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.ByteArrayInputStream;
-import java.io.IOException;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -33,19 +29,11 @@ public class TestChmDocumentInformation 
 
     @Before
     public void setUp() throws Exception {
-        chmDoc = CHMDocumentInformation.load(
+        chmDoc = new CHMDocumentInformation(
                 new ByteArrayInputStream(TestParameters.chmData));
     }
 
     @Test
-    public void testGetCHMDocInformation() throws TikaException, IOException {
-        Metadata md = new Metadata();
-        chmDoc.getCHMDocInformation(md);
-        assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString()
-                .trim());
-    }
-
-    @Test
     public void testGetText() throws TikaException {
         assertTrue(chmDoc.getText().contains(
                 "The TCard method accepts only numeric arguments"));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java Fri Dec 27 15:29:40 2013
@@ -14,10 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.tika.parser.chm;
 
-import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
@@ -29,7 +27,6 @@ import java.util.concurrent.Executors;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
-import org.apache.tika.metadata.Metadata;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -56,13 +53,10 @@ public class TestChmExtraction {
                         try {
                             stream = TestChmExtraction.class.getResourceAsStream(fileName);
 
-                            CHMDocumentInformation chmDocInfo = CHMDocumentInformation
-                                    .load(stream);
-                            Metadata md = new Metadata();
+                            CHMDocumentInformation chmDocInfo =
+                                    new CHMDocumentInformation(stream);
                             mutex.lock();
                             String text = chmDocInfo.getText();
-                            chmDocInfo.getCHMDocInformation(md);
-                            assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString().trim());
                             assertTrue(text.length() > 0);
                         } catch (Exception e) {
                             e.printStackTrace();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java?rev=1553683&r1=1553682&r2=1553683&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java Fri Dec 27 15:29:40 2013
@@ -27,7 +27,6 @@ import java.util.Iterator;
 import java.util.List;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
 import org.apache.tika.parser.chm.core.ChmExtractor;
@@ -80,11 +79,8 @@ public class TestChmExtractor {
             InputStream stream =
                     TestChmBlockInfo.class.getResourceAsStream(fileName);
             try {
-                CHMDocumentInformation chmDocInfo = CHMDocumentInformation.load(stream);
-                Metadata md = new Metadata();
+                CHMDocumentInformation chmDocInfo = new CHMDocumentInformation(stream);
                 String text = chmDocInfo.getText();
-                chmDocInfo.getCHMDocInformation(md);
-                assertEquals(TestParameters.VP_CHM_MIME_TYPE, md.toString().trim());
                 assertTrue(text.length() > 0);
             } finally {
                 stream.close();