You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by le...@apache.org on 2014/01/31 19:49:51 UTC

svn commit: r1563216 - in /pdfbox/branches/1.8: ./ pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java

Author: lehmi
Date: Fri Jan 31 18:49:51 2014
New Revision: 1563216

URL: http://svn.apache.org/r1563216
Log:
PDFBOX-1860: don't escape formatting closing tags, as proposed by Cheng Leong

Modified:
    pdfbox/branches/1.8/   (props changed)
    pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
    pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java

Propchange: pdfbox/branches/1.8/
------------------------------------------------------------------------------
  Merged /pdfbox/trunk:r1563215

Modified: pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java?rev=1563216&r1=1563215&r2=1563216&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/main/java/org/apache/pdfbox/util/PDFText2HTML.java Fri Jan 31 18:49:51 2014
@@ -215,7 +215,7 @@ public class PDFText2HTML extends PDFTex
     @Override
     protected void writeParagraphEnd() throws IOException
     {
-        writeString(fontState.clear());
+        super.writeString(fontState.clear()); // do not escape HTML
         super.writeParagraphEnd();
     }
 

Modified: pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java?rev=1563216&r1=1563215&r2=1563216&view=diff
==============================================================================
--- pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java (original)
+++ pdfbox/branches/1.8/pdfbox/src/test/java/org/apache/pdfbox/util/TestPDFText2HTML.java Fri Jan 31 18:49:51 2014
@@ -30,16 +30,16 @@ import junit.framework.TestCase;
 
 public class TestPDFText2HTML extends TestCase {
 
-    private PDDocument createDocument() throws IOException {
+    private PDDocument createDocument(String title, PDFont font, String text) throws IOException {
         PDDocument doc = new PDDocument();
+        doc.getDocumentInformation().setTitle(title);
         PDPage page = new PDPage();
         doc.addPage(page);
-        PDFont font = PDType1Font.HELVETICA;
         PDPageContentStream contentStream = new PDPageContentStream(doc, page);
         contentStream.beginText();
         contentStream.setFont(font, 12);
         contentStream.moveTextPositionByAmount(100, 700);
-        contentStream.drawString("<foo>");
+        contentStream.drawString(text);
         contentStream.endText();
         contentStream.close();
         return doc;
@@ -47,15 +47,23 @@ public class TestPDFText2HTML extends Te
 
     public void testEscapeTitle() throws IOException {
         PDFTextStripper stripper = new PDFText2HTML("UTF-8");
-        PDDocument doc = createDocument();
-        doc.getDocumentInformation().setTitle("<script>\u3042");
+        PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
         String text = stripper.getText(doc);
        
         Matcher m = Pattern.compile("<title>(.*?)</title>").matcher(text);
         assertTrue(m.find());
         assertEquals("&lt;script&gt;&#12354;", m.group(1));
-        
+
         assertTrue(text.indexOf("&lt;foo&gt;") >= 0);
-        
+    }
+
+    public void testStyle() throws IOException {
+        PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+        PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
+        String text = stripper.getText(doc);
+
+        Matcher bodyMatcher = Pattern.compile("<p>(.*?)</p>").matcher(text);
+        assertTrue("body p exists", bodyMatcher.find());
+        assertEquals("body p", "<b>&lt;bold&gt;</b>", bodyMatcher.group(1));
     }
 }