You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/06/08 14:45:35 UTC
svn commit: r1133380 - in /tika/trunk/tika-parsers: ./
src/main/java/org/apache/tika/parser/microsoft/
src/test/java/org/apache/tika/parser/microsoft/
Author: nick
Date: Wed Jun 8 12:45:35 2011
New Revision: 1133380
URL: http://svn.apache.org/viewvc?rev=1133380&view=rev
Log:
TIKA-631 Apply Outlook extraction enhancement to better extract the HTML and RTF versions of the message body, using POI 3.8 beta 3
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1133380&r1=1133379&r2=1133380&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Jun 8 12:45:35 2011
@@ -35,7 +35,7 @@
<url>http://tika.apache.org/</url>
<properties>
- <poi.version>3.8-beta2</poi.version>
+ <poi.version>3.8-beta3</poi.version>
</properties>
<dependencies>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1133380&r1=1133379&r2=1133380&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Wed Jun 8 12:45:35 2011
@@ -28,6 +28,7 @@ import java.util.Set;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
import org.apache.poi.poifs.crypt.Decryptor;
+import org.apache.poi.poifs.crypt.EcmaDecryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
@@ -223,7 +224,7 @@ public class OfficeParser extends Abstra
break;
case ENCRYPTED:
EncryptionInfo info = new EncryptionInfo(filesystem);
- Decryptor d = new Decryptor(info);
+ Decryptor d = new EcmaDecryptor(info);
try {
if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1133380&r1=1133379&r2=1133380&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Jun 8 12:45:35 2011
@@ -39,6 +39,8 @@ import org.apache.tika.parser.html.HtmlP
import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
+import org.apache.tika.parser.txt.CharsetMatch;
+import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -52,7 +54,7 @@ public class OutlookExtractor extends Ab
super(context);
try {
- this.msg = new MAPIMessage(filesystem);
+ this.msg = new MAPIMessage(filesystem.getRoot());
} catch (IOException e) {
throw new TikaException("Failed to parse Outlook message", e);
}
@@ -65,27 +67,22 @@ public class OutlookExtractor extends Ab
// If the message contains strings that aren't stored
// as Unicode, try to sort out an encoding for them
- // TODO Use new method
- boolean hasNonUnicodeStrings = false;
- for(Chunk chunk : msg.getMainChunks().getAll()) {
- if(chunk instanceof StringChunk) {
- StringChunk sc = (StringChunk)chunk;
- if(sc.getType() == Types.ASCII_STRING) {
- hasNonUnicodeStrings = true;
- break;
- }
- }
- }
-
- if(hasNonUnicodeStrings) {
+ if(msg.has7BitEncodingStrings()) {
if(msg.getHeaders() != null) {
// There's normally something in the headers
msg.guess7BitEncoding();
} else {
// Nothing in the header, try encoding detection
// on the message body
- CharsetDetector detector = new CharsetDetector();
- // TODO detect and use this
+ StringChunk text = msg.getMainChunks().textBodyChunk;
+ if(text != null) {
+ CharsetDetector detector = new CharsetDetector();
+ detector.setText( text.getRawValue() );
+ CharsetMatch match = detector.detect();
+ if(match.getConfidence() > 35) {
+ msg.set7BitEncoding( match.getName() );
+ }
+ }
}
}
@@ -184,13 +181,14 @@ public class OutlookExtractor extends Ab
if(htmlChunk instanceof ByteChunk) {
data = ((ByteChunk)htmlChunk).getValue();
} else if(htmlChunk instanceof StringChunk) {
- // TODO Needs POI 3.8 beta 3
+ data = ((StringChunk)htmlChunk).getRawValue();
}
if(data != null) {
HtmlParser htmlParser = new HtmlParser();
htmlParser.parse(
new ByteArrayInputStream(data),
- xhtml, new Metadata(), new ParseContext()
+ new BodyContentHandler(xhtml),
+ new Metadata(), new ParseContext()
);
doneBody = true;
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1133380&r1=1133379&r2=1133380&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Wed Jun 8 12:45:35 2011
@@ -17,9 +17,15 @@
package org.apache.tika.parser.microsoft;
import java.io.InputStream;
+import java.io.StringWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+
import junit.framework.TestCase;
import org.apache.tika.metadata.Metadata;
@@ -132,5 +138,38 @@ public class OutlookParserTest extends T
assertTrue(content.contains("Streamlined Mail Experience"));
assertTrue(content.contains("Navigation Pane"));
}
+
+ public void testOutlookHTMLVersion() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+ InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG_chinese.msg");
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the expected markup (address, paragraph, list items)
+ String content = sw.toString();
+ assertTrue(content.contains("<dd>tests.chang@fengttt.com</dd>"));
+ assertTrue(content.contains("<p>Alfresco MSG format testing"));
+ assertTrue(content.contains("<li>1"));
+ assertTrue(content.contains("<li>2"));
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
+ }
}