You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/01 12:55:20 UTC

svn commit: r1177957 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/main/java/org/apache/tika/parser/rtf/ tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/rtf/

Author: mikemccand
Date: Sat Oct  1 10:55:20 2011
New Revision: 1177957

URL: http://svn.apache.org/viewvc?rev=1177957&view=rev
Log:
TIKA-632: extract hyperlinks from RTF docs

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Oct  1 10:55:20 2011
@@ -1,5 +1,10 @@
 Apache Tika Change Log
 ======================
+Release 0.11 - Current Development
+
+ * TIKA-632: Hyperlinks in RTF documents are now extracted as an <a
+   href=...>...</a> element.
+
 Release 0.10 - 09/25/2011
 
 The most notable changes in Tika 0.10 over previous releases are:

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sat Oct  1 10:55:20 2011
@@ -198,13 +198,12 @@ public class OutlookExtractor extends Ab
               MAPIRtfAttribute rtf = new MAPIRtfAttribute(
                     MAPIProperty.RTF_COMPRESSED, Types.BINARY, chunk.getValue()
               );
-              // Disabled pending a fix to TIKA-632
-//              RTFParser rtfParser = new RTFParser();
-//              rtfParser.parse(
-//                    new ByteArrayInputStream(rtf.getData()),
-//                    xhtml, new Metadata(), new ParseContext()
-//              );
-//              doneBody = true;
+              RTFParser rtfParser = new RTFParser();
+              rtfParser.parse(
+                              new ByteArrayInputStream(rtf.getData()),
+                              xhtml, new Metadata(), new ParseContext()
+                              );
+              doneBody = true;
            }
            if(textChunk != null && !doneBody) {
               xhtml.element("p", ((StringChunk)textChunk).getValue());

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java Sat Oct  1 10:55:20 2011
@@ -94,7 +94,14 @@ final class TextExtractor {
     private String nextMetaData;
     private boolean inParagraph;
 
-    private final StringBuilder headerBuffer = new StringBuilder();
+    // Non-zero if we are processing inside a field destination:
+    private int fieldState;
+    
+    // Non-null if we've seen the url for a HYPERLINK but not yet
+    // its text:
+    private String pendingURL;
+
+    private final StringBuilder pendingBuffer = new StringBuilder();
 
     // Used to process the sub-groups inside the upr
     // group:
@@ -298,8 +305,8 @@ final class TextExtractor {
             pushBytes();
         }
 
-        if (inHeader) {
-            headerBuffer.append(ch);
+        if (inHeader || fieldState == 1) {
+            pendingBuffer.append(ch);
         } else {
             if (pendingCharCount == pendingChars.length) {
                 // Gradual but exponential growth:
@@ -534,7 +541,7 @@ final class TextExtractor {
     // Decodes the buffered bytes in pendingBytes
     // into UTF16 code units, and sends the characters
     // to the out ContentHandler, if we are in the body,
-    // else appends the characters to the headerBuffer
+    // else appends the characters to the pendingBuffer
     private void pushBytes() throws IOException, SAXException, TikaException {
         if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {
 
@@ -552,8 +559,8 @@ final class TextExtractor {
 
                 final int pos = outputBuffer.position();
                 if (pos > 0) {
-                    if (inHeader) {
-                        headerBuffer.append(outputArray, 0, pos);
+                    if (inHeader || fieldState == 1) {
+                        pendingBuffer.append(outputArray, 0, pos);
                     } else {
                         lazyStartParagraph();
                         out.characters(outputArray, 0, pos);
@@ -571,8 +578,8 @@ final class TextExtractor {
 
                 final int pos = outputBuffer.position();
                 if (pos > 0) {
-                    if (inHeader) {
-                        headerBuffer.append(outputArray, 0, pos);
+                    if (inHeader || fieldState == 1) {
+                        pendingBuffer.append(outputArray, 0, pos);
                     } else {
                         lazyStartParagraph();
                         out.characters(outputArray, 0, pos);
@@ -973,6 +980,16 @@ final class TextExtractor {
         } else if (equals("rdblquote")) {
             // unicode RIGHT DOUBLE QUOTATION MARK
             addOutputChar('\u201D');
+        } else if (equals("fldinst")) {
+            fieldState = 1;
+            groupState.ignore = false;
+        } else if (equals("fldrslt") && fieldState == 2) {
+            assert pendingURL != null;
+            lazyStartParagraph();
+            out.startElement("a", "href", pendingURL);
+            pendingURL = null;
+            fieldState = 3;
+            groupState.ignore = false;
         }
     }
 
@@ -997,10 +1014,10 @@ final class TextExtractor {
 
         if (inHeader) {
             if (nextMetaData != null) {
-                metadata.add(nextMetaData, headerBuffer.toString());
+                metadata.add(nextMetaData, pendingBuffer.toString());
                 nextMetaData = null;
             }
-            headerBuffer.setLength(0);
+            pendingBuffer.setLength(0);
         }
 
         assert groupState.depth > 0;
@@ -1035,5 +1052,35 @@ final class TextExtractor {
         }
         groupState = outerGroupState;
         assert groupStates.size() == groupState.depth;
+
+        if (fieldState == 1) {
+            String s = pendingBuffer.toString().trim();
+            pendingBuffer.setLength(0);
+            if (s.startsWith("HYPERLINK")) {
+                s = s.substring(9).trim();
+                // TODO: what other instructions can be in a
+                // HYPERLINK destination?
+                final boolean isLocalLink = s.indexOf("\\l ") != -1;
+                int idx = s.indexOf('"');
+                if (idx != -1) {
+                    int idx2 = s.indexOf('"', 1+idx);
+                    if (idx2 != -1) {
+                        s = s.substring(1+idx, idx2);
+                    }
+                }
+                pendingURL = (isLocalLink ? "#" : "") + s;
+                fieldState = 2;
+            } else {
+                fieldState = 0;
+            }
+
+            // TODO: we could process the other known field
+            // types.  Right now, we will extract their text
+            // inlined, but fail to record them in metadata
+            // as a field value.
+        } else if (fieldState == 3) {
+            out.endElement("a");
+            fieldState = 0;
+        }
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sat Oct  1 10:55:20 2011
@@ -173,10 +173,7 @@ public class OutlookParserTest extends T
         //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
     }
     
-    /**
-     * Disabled pending a fix for TIKA-632
-     */
-    public void DISABLEDtestOutlookHTMLfromRTF() throws Exception {
+    public void testOutlookHTMLfromRTF() throws Exception {
         Parser parser = new AutoDetectParser();
         Metadata metadata = new Metadata();
        
@@ -200,10 +197,9 @@ public class OutlookParserTest extends T
         // As the HTML version should have been processed, ensure
         //  we got some of the links
         String content = sw.toString().replaceAll("<p>\\s+","<p>");
-//System.err.println(content);
         assertTrue(content.contains("<dd>New Outlook User</dd>"));
         assertTrue(content.contains("designed <i>to help you"));
-        assertTrue(content.contains("<p>Cached Exchange Mode"));
+        assertTrue(content.contains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>"));
         
         // Link - check text around it, and the link itself
         assertTrue(content.contains("sign up for a free subscription"));

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1177957&r1=1177956&r2=1177957&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java Sat Oct  1 10:55:20 2011
@@ -288,6 +288,11 @@ public class RTFParserTest extends TikaT
         assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
     }
 
+    public void testHyperlink() throws Exception {
+        String content = getXML("testRTFHyperlink.rtf").xml;
+        assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
+    }
+
     private Result getResult(String filename) throws Exception {
         File file = getResourceAsFile("/test-documents/" + filename);