You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/06 20:20:52 UTC

tika git commit: TIKA-2029: add some content for links so that we don't generate bad html

Repository: tika
Updated Branches:
  refs/heads/master 23a11eff3 -> 95b2cd127


TIKA-2029: add some content for links so that we don't generate bad html such as <a href="http://tika.apache.org/"/>


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/95b2cd12
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/95b2cd12
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/95b2cd12

Branch: refs/heads/master
Commit: 95b2cd127346486cece4cb1450f444fd9bd54337
Parents: 23a11ef
Author: tballison <ta...@mitre.org>
Authored: Wed Jul 6 16:20:45 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jul 6 16:20:45 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java     | 6 +++++-
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java    | 3 ++-
 2 files changed, 7 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index d231a09..c3eafdc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -308,11 +308,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                         if (annotationlink.getAction() != null) {
                             PDAction action = annotationlink.getAction();
                             if (action instanceof PDActionURI) {
+                                //can't currently associate link to text.
+                                //for now, extract link and repeat the link as if it
+                                //were the visible text
                                 PDActionURI uri = (PDActionURI) action;
                                 String link = uri.getURI();
-                                if (link != null) {
+                                if (link != null && link.trim().length() > 0) {
                                     xhtml.startElement("div", "class", "annotation");
                                     xhtml.startElement("a", "href", link);
+                                    xhtml.characters(link);
                                     xhtml.endElement("a");
                                     xhtml.endElement("div");
                                 }

http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index be1f769..94b1548 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -387,7 +387,8 @@ public class PDFParserTest extends TikaTest {
     @Test
     public void testLinks() throws Exception {
         final XMLResult result = getXML("testPDFVarious.pdf");
-        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\">"+
+                "http://tika.apache.org/</a></div>", result.xml);
     }
 
     @Test