You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/06 20:20:52 UTC
tika git commit: TIKA-2029: add some content for links so that we don't generate bad html
Repository: tika
Updated Branches:
refs/heads/master 23a11eff3 -> 95b2cd127
TIKA-2029: add some content for links so that we don't generate bad html <a href="http://tika.apache.org/"/>
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/95b2cd12
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/95b2cd12
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/95b2cd12
Branch: refs/heads/master
Commit: 95b2cd127346486cece4cb1450f444fd9bd54337
Parents: 23a11ef
Author: tballison <ta...@mitre.org>
Authored: Wed Jul 6 16:20:45 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jul 6 16:20:45 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 +++++-
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 3 ++-
2 files changed, 7 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index d231a09..c3eafdc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -308,11 +308,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (annotationlink.getAction() != null) {
PDAction action = annotationlink.getAction();
if (action instanceof PDActionURI) {
+ //can't currently associate link to text.
+ //for now, extract link and repeat the link as if it
+ //were the visible text
PDActionURI uri = (PDActionURI) action;
String link = uri.getURI();
- if (link != null) {
+ if (link != null && link.trim().length() > 0) {
xhtml.startElement("div", "class", "annotation");
xhtml.startElement("a", "href", link);
+ xhtml.characters(link);
xhtml.endElement("a");
xhtml.endElement("div");
}
http://git-wip-us.apache.org/repos/asf/tika/blob/95b2cd12/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index be1f769..94b1548 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -387,7 +387,8 @@ public class PDFParserTest extends TikaTest {
@Test
public void testLinks() throws Exception {
final XMLResult result = getXML("testPDFVarious.pdf");
- assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+ assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\">"+
+ "http://tika.apache.org/</a></div>", result.xml);
}
@Test