You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/08 23:22:11 UTC

svn commit: r742187 - in /lucene/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java src/test/resources/test-documents/testMSG.msg

Author: jukka
Date: Sun Feb  8 22:22:10 2009
New Revision: 742187

URL: http://svn.apache.org/viewvc?rev=742187&view=rev
Log:
TIKA-197: Microsoft Outlook (msg) files get parsed multiple times

Added a somewhat ugly marker flag to prevent Outlook documents from being parsed more than once. Added a test case for this fix.

Added:
    lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg   (with props)
Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Sun Feb  8 22:22:10 2009
@@ -25,6 +25,9 @@
   * Embedded text in MIDI files is now extracted. For example many karaoke
     files contain song lyrics embedded as MIDI text.
 
+  * The text content of Microsoft Outlook message files no longer appears as
+    multiple copies in the extracted text. (TIKA-197)
+
 See http://tinyurl.com/tika-0-3-changes for a list of all changes in Tika 0.3.
 
 The following people have contributed to Tika 0.3 by submitting or commenting

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Feb  8 22:22:10 2009
@@ -61,6 +61,7 @@
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
+        boolean outlookExtracted = false;
         POIFSFileSystem filesystem = new POIFSFileSystem(stream);
         Iterator<?> entries = filesystem.getRoot().getEntries();
         while (entries.hasNext()) {
@@ -92,7 +93,9 @@
                 for (String text : extractor.getAllText()) {
                     xhtml.element("p", text);
                 }
-            } else if (name.startsWith("__substg1.0_")) {
+            } else if (!outlookExtracted && name.startsWith("__substg1.0_")) {
+                // TODO: Find a cleaner mechanism than this flag for detecting Outlook documents
+                outlookExtracted = true;
                 setType(metadata, "application/vnd.ms-outlook");
                 new OutlookExtractor(filesystem).parse(xhtml, metadata);
             }

Modified: lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=742187&r1=742186&r2=742187&view=diff
==============================================================================
--- lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Feb  8 22:22:10 2009
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.InputStream;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import junit.framework.TestCase;
 
@@ -62,4 +64,33 @@
         assertTrue(content.contains("Messagerie et groupes de discussion"));
     }
 
+    /**
+     * Test case for TIKA-197: Outlook message text must be extracted only once.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+     */
+    public void testMultipleCopies() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG.msg");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = handler.toString();
+        Pattern pattern = Pattern.compile("From");
+        Matcher matcher = pattern.matcher(content);
+        assertTrue(matcher.find());
+        assertFalse(matcher.find());
+    }
+
 }

Added: lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg?rev=742187&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/src/test/resources/test-documents/testMSG.msg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream