You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/01/31 19:49:51 UTC
svn commit: r1563216 - in /pdfbox/branches/1.8: ./
pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
Author: lehmi
Date: Fri Jan 31 18:49:51 2014
New Revision: 1563216
URL: http://svn.apache.org/r1563216
Log:
PDFBOX-1860: don't escape formatting close tags as proposed by Cheng Leong
Modified:
pdfbox/branches/1.8/ (props changed)
pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
Propchange: pdfbox/branches/1.8/
------------------------------------------------------------------------------
Merged /pdfbox/trunk:r1563215
Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1563216&r1=1563215&r2=1563216&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Fri Jan 31 18:49:51 2014
@@ -215,7 +215,7 @@ public class PDFText2HTML extends PDFTex
@Override
protected void writeParagraphEnd() throws IOException
{
- writeString(fontState.clear());
+ super.writeString(fontState.clear()); // do not escape HTML
super.writeParagraphEnd();
}
Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java?rev=1563216&r1=1563215&r2=1563216&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Fri Jan 31 18:49:51 2014
@@ -30,16 +30,16 @@ import junit.framework.TestCase;
public class TestPDFText2HTML extends TestCase {
- private PDDocument createDocument() throws IOException {
+ private PDDocument createDocument(String title, PDFont font, String text) throws IOException {
PDDocument doc = new PDDocument();
+ doc.getDocumentInformation().setTitle(title);
PDPage page = new PDPage();
doc.addPage(page);
- PDFont font = PDType1Font.HELVETICA;
PDPageContentStream contentStream = new PDPageContentStream(doc, page);
contentStream.beginText();
contentStream.setFont(font, 12);
contentStream.moveTextPositionByAmount(100, 700);
- contentStream.drawString("<foo>");
+ contentStream.drawString(text);
contentStream.endText();
contentStream.close();
return doc;
@@ -47,15 +47,23 @@ public class TestPDFText2HTML extends Te
public void testEscapeTitle() throws IOException {
PDFTextStripper stripper = new PDFText2HTML("UTF-8");
- PDDocument doc = createDocument();
- doc.getDocumentInformation().setTitle("<script>\u3042");
+ PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
String text = stripper.getText(doc);
Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text);
assertTrue(m.find());
assertEquals("&lt;script&gt;&#12354;", m.group(1));
-
+
assertTrue(text.indexOf("&lt;foo&gt;") >= 0);
-
+ }
+
+ public void testStyle() throws IOException {
+ PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+ PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
+ String text = stripper.getText(doc);
+
+ Matcher bodyMatcher = Pattern.compile("<p>(.*?)</p>").matcher(text);
+ assertTrue("body p exists", bodyMatcher.find());
+ assertEquals("body p", "<b>&lt;bold&gt;</b>", bodyMatcher.group(1));
}
}