You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/01 12:55:20 UTC
svn commit: r1177957 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/main/java/org/apache/tika/parser/rtf/
tika-parsers/src/test/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/rtf/
Author: mikemccand
Date: Sat Oct 1 10:55:20 2011
New Revision: 1177957
URL: http://svn.apache.org/viewvc?rev=1177957&view=rev
Log:
TIKA-632: extract hyperlinks from RTF docs
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Oct 1 10:55:20 2011
@@ -1,5 +1,10 @@
Apache Tika Change Log
======================
+Release 0.11 - Current Development
+
+ * TIKA-632: Hyperlinks in RTF documents are now extracted as an <a
+ href=...>...</a> element.
+
Release 0.10 - 09/25/2011
The most notable changes in Tika 0.10 over previous releases are:
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sat Oct 1 10:55:20 2011
@@ -198,13 +198,12 @@ public class OutlookExtractor extends Ab
MAPIRtfAttribute rtf = new MAPIRtfAttribute(
MAPIProperty.RTF_COMPRESSED, Types.BINARY, chunk.getValue()
);
- // Disabled pending a fix to TIKA-632
-// RTFParser rtfParser = new RTFParser();
-// rtfParser.parse(
-// new ByteArrayInputStream(rtf.getData()),
-// xhtml, new Metadata(), new ParseContext()
-// );
-// doneBody = true;
+ RTFParser rtfParser = new RTFParser();
+ rtfParser.parse(
+ new ByteArrayInputStream(rtf.getData()),
+ xhtml, new Metadata(), new ParseContext()
+ );
+ doneBody = true;
}
if(textChunk != null && !doneBody) {
xhtml.element("p", ((StringChunk)textChunk).getValue());
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Sat Oct 1 10:55:20 2011
@@ -94,7 +94,14 @@ final class TextExtractor {
private String nextMetaData;
private boolean inParagraph;
- private final StringBuilder headerBuffer = new StringBuilder();
+ // Non-zero if we are processing inside a field destination:
+ private int fieldState;
+
+ // Non-null if we've seen the url for a HYPERLINK but not yet
+ // its text:
+ private String pendingURL;
+
+ private final StringBuilder pendingBuffer = new StringBuilder();
// Used to process the sub-groups inside the upr
// group:
@@ -298,8 +305,8 @@ final class TextExtractor {
pushBytes();
}
- if (inHeader) {
- headerBuffer.append(ch);
+ if (inHeader || fieldState == 1) {
+ pendingBuffer.append(ch);
} else {
if (pendingCharCount == pendingChars.length) {
// Gradual but exponential growth:
@@ -534,7 +541,7 @@ final class TextExtractor {
// Decodes the buffered bytes in pendingBytes
// into UTF16 code units, and sends the characters
// to the out ContentHandler, if we are in the body,
- // else appends the characters to the headerBuffer
+ // else appends the characters to the pendingBuffer
private void pushBytes() throws IOException, SAXException, TikaException {
if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
@@ -552,8 +559,8 @@ final class TextExtractor {
final int pos = outputBuffer.position();
if (pos > 0) {
- if (inHeader) {
- headerBuffer.append(outputArray, 0, pos);
+ if (inHeader || fieldState == 1) {
+ pendingBuffer.append(outputArray, 0, pos);
} else {
lazyStartParagraph();
out.characters(outputArray, 0, pos);
@@ -571,8 +578,8 @@ final class TextExtractor {
final int pos = outputBuffer.position();
if (pos > 0) {
- if (inHeader) {
- headerBuffer.append(outputArray, 0, pos);
+ if (inHeader || fieldState == 1) {
+ pendingBuffer.append(outputArray, 0, pos);
} else {
lazyStartParagraph();
out.characters(outputArray, 0, pos);
@@ -973,6 +980,16 @@ final class TextExtractor {
} else if (equals("rdblquote")) {
// unicode RIGHT DOUBLE QUOTATION MARK
addOutputChar('\u201D');
+ } else if (equals("fldinst")) {
+ fieldState = 1;
+ groupState.ignore = false;
+ } else if (equals("fldrslt") && fieldState == 2) {
+ assert pendingURL != null;
+ lazyStartParagraph();
+ out.startElement("a", "href", pendingURL);
+ pendingURL = null;
+ fieldState = 3;
+ groupState.ignore = false;
}
}
@@ -997,10 +1014,10 @@ final class TextExtractor {
if (inHeader) {
if (nextMetaData != null) {
- metadata.add(nextMetaData, headerBuffer.toString());
+ metadata.add(nextMetaData, pendingBuffer.toString());
nextMetaData = null;
}
- headerBuffer.setLength(0);
+ pendingBuffer.setLength(0);
}
assert groupState.depth > 0;
@@ -1035,5 +1052,35 @@ final class TextExtractor {
}
groupState = outerGroupState;
assert groupStates.size() == groupState.depth;
+
+ if (fieldState == 1) {
+ String s = pendingBuffer.toString().trim();
+ pendingBuffer.setLength(0);
+ if (s.startsWith("HYPERLINK")) {
+ s = s.substring(9).trim();
+ // TODO: what other instructions can be in a
+ // HYPERLINK destination?
+ final boolean isLocalLink = s.indexOf("\\l ") != -1;
+ int idx = s.indexOf('"');
+ if (idx != -1) {
+ int idx2 = s.indexOf('"', 1+idx);
+ if (idx2 != -1) {
+ s = s.substring(1+idx, idx2);
+ }
+ }
+ pendingURL = (isLocalLink ? "#" : "") + s;
+ fieldState = 2;
+ } else {
+ fieldState = 0;
+ }
+
+ // TODO: we could process the other known field
+ // types. Right now, we will extract their text
+ // inlined, but fail to record them in metadata
+ // as a field value.
+ } else if (fieldState == 3) {
+ out.endElement("a");
+ fieldState = 0;
+ }
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sat Oct 1 10:55:20 2011
@@ -173,10 +173,7 @@ public class OutlookParserTest extends T
//assertEquals(2, content.split("<\\/body>").length); // TODO Fix
}
- /**
- * Disabled pending a fix for TIKA-632
- */
- public void DISABLEDtestOutlookHTMLfromRTF() throws Exception {
+ public void testOutlookHTMLfromRTF() throws Exception {
Parser parser = new AutoDetectParser();
Metadata metadata = new Metadata();
@@ -200,10 +197,9 @@ public class OutlookParserTest extends T
// As the HTML version should have been processed, ensure
// we got some of the links
String content = sw.toString().replaceAll("<p>\\s+","<p>");
-//System.err.println(content);
assertTrue(content.contains("<dd>New Outlook User</dd>"));
assertTrue(content.contains("designed <i>to help you"));
- assertTrue(content.contains("<p>Cached Exchange Mode"));
+ assertTrue(content.contains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>"));
// Link - check text around it, and the link itself
assertTrue(content.contains("sign up for a free subscription"));
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sat Oct 1 10:55:20 2011
@@ -288,6 +288,11 @@ public class RTFParserTest extends TikaT
assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
}
+ public void testHyperlink() throws Exception {
+ String content = getXML("testRTFHyperlink.rtf").xml;
+ assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
+ }
+
private Result getResult(String filename) throws Exception {
File file = getResourceAsFile("/test-documents/" + filename);