You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/09/26 22:12:14 UTC

svn commit: r1627874 - in /pdfbox/trunk: pdfbox/src/main/java/org/apache/pdfbox/text/ pdfbox/src/main/java/org/apache/pdfbox/util/ pdfbox/src/test/java/org/apache/pdfbox/util/ tools/src/main/java/org/apache/pdfbox/tools/ tools/src/test/java/org/apache/pdfbox/tools/

Author: jahewson
Date: Fri Sep 26 20:12:13 2014
New Revision: 1627874

URL: http://svn.apache.org/r1627874
Log:
PDFBOX-2384: Removed encoding from PDFTextStripper

Removed:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PositionWrapper.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
    pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
    pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
    pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java Fri Sep 26 20:12:13 2014
@@ -71,15 +71,11 @@ public class TextNormalize
         return map;
     }
 
-    private String outputEncoding;
-
     /**
-     * 
-     * @param encoding The Encoding that the text will eventually be written as (or null)
+     * Constructor.
      */
-    public TextNormalize(String encoding)
+    public TextNormalize()
     {
-        outputEncoding = encoding;
     }
 
     /**
@@ -146,23 +142,16 @@ public class TextNormalize
     public String normalizeDiacritic(String str)
     {
         // Unicode contains special combining forms of the diacritic characters which we want to use
-        if (outputEncoding != null && outputEncoding.toUpperCase().startsWith("UTF"))
+        int codePoint = str.codePointAt(0);
+
+        // convert the characters not defined in the Unicode spec
+        if (DIACRITICS.containsKey(codePoint))
         {
-            Integer c = (int) str.charAt(0);
-            // convert the characters not defined in the Unicode spec
-            if (DIACRITICS.containsKey(c))
-            {
-                return DIACRITICS.get(c);
-            }
-            else
-            {
-                return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
-            }
+            return DIACRITICS.get(codePoint);
         }
         else
         {
-            return str;
+            return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
         }
     }
-
 }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java Fri Sep 26 20:12:13 2014
@@ -54,7 +54,7 @@ public class PDFHighlighter extends PDFT
      */
     public PDFHighlighter() throws IOException
     {
-        super(ENCODING);
+        super();
         super.setLineSeparator( "" );
         super.setWordSeparator( "" );
         super.setShouldSeparateByBeads( false );

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java Fri Sep 26 20:12:13 2014
@@ -47,11 +47,6 @@ public class PDFMarkedContentExtractor e
     private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
 
     /**
-     * encoding that text will be written in (or null).
-     */
-    protected String outputEncoding; 
-
-    /**
      * The normalizer is used to remove text ligatures/presentation forms
      * and to correct the direction of right to left text, such as Arabic and Hebrew.
      */
@@ -79,8 +74,7 @@ public class PDFMarkedContentExtractor e
         // todo: DP - Marked Content Point
         // todo: MP - Marked Content Point with Properties
 
-        this.outputEncoding = encoding;
-        this.normalize = new TextNormalize(this.outputEncoding);
+        this.normalize = new TextNormalize();
     }
 
     /**

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Fri Sep 26 20:12:13 2014
@@ -44,7 +44,6 @@ import org.apache.pdfbox.pdmodel.encrypt
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-import org.apache.pdfbox.text.PositionWrapper;
 import org.apache.pdfbox.text.TextNormalize;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.text.TextPositionComparator;
@@ -164,7 +163,6 @@ public class PDFTextStripper extends PDF
     private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
         new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
 
-    protected String outputEncoding;
     protected PDDocument document;
     protected Writer output;
 
@@ -187,20 +185,7 @@ public class PDFTextStripper extends PDF
      */
     public PDFTextStripper() throws IOException
     {
-        this(null);
-    }
-
-    /**
-     * Instantiate a new PDFTextStripper object. Will apply
-     * encoding-specific conversions to the output text.
-     *
-     * @param encoding The encoding that the output will be written in.
-     * @throws IOException If there is an error reading the properties.
-     */
-    public PDFTextStripper(String encoding) throws IOException
-    {
-        this.outputEncoding = encoding;
-        normalize = new TextNormalize(this.outputEncoding);
+        normalize = new TextNormalize();
     }
 
     /**
@@ -1887,4 +1872,113 @@ public class PDFTextStripper extends PDF
             return textPositions;
         }
     }
+
+    /**
+     * Wrapper of TextPosition that adds flags to track
+     * status as line start and paragraph start positions.
+     * <p>
+     * This is implemented as a wrapper since the TextPosition
+     * class doesn't provide complete access to its
+     * state fields to subclasses.  Also, conceptually TextPosition is
+     * immutable while these flags need to be set post-creation so
+     * it makes sense to put these flags in this separate class.
+     * </p>
+     * @author m.martinez@ll.mit.edu
+     */
+    private static final class PositionWrapper
+    {
+        private boolean isLineStart = false;
+        private boolean isParagraphStart = false;
+        private boolean isPageBreak = false;
+        private boolean isHangingIndent = false;
+        private boolean isArticleStart = false;
+
+        private TextPosition position = null;
+
+        /**
+         * Returns the underlying TextPosition object.
+         * @return the text position
+         */
+        public TextPosition getTextPosition()
+        {
+            return position;
+        }
+
+        public boolean isLineStart()
+        {
+            return isLineStart;
+        }
+
+        /**
+         * Sets the isLineStart() flag to true.
+         */
+        public void setLineStart()
+        {
+            this.isLineStart = true;
+        }
+
+
+        public boolean isParagraphStart()
+        {
+            return isParagraphStart;
+        }
+
+        /**
+         * Sets the isParagraphStart() flag to true.
+         */
+        public void setParagraphStart()
+        {
+            this.isParagraphStart = true;
+        }
+
+
+        public boolean isArticleStart()
+        {
+            return isArticleStart;
+        }
+
+
+        /**
+         * Sets the isArticleStart() flag to true.
+         */
+        public void setArticleStart()
+        {
+            this.isArticleStart = true;
+        }
+
+        public boolean isPageBreak()
+        {
+            return isPageBreak;
+        }
+
+        /**
+         * Sets the isPageBreak() flag to true.
+         */
+        public void setPageBreak()
+        {
+            this.isPageBreak = true;
+        }
+
+        public boolean isHangingIndent()
+        {
+            return isHangingIndent;
+        }
+
+        /**
+         * Sets the isHangingIndent() flag to true.
+         */
+        public void setHangingIndent()
+        {
+            this.isHangingIndent = true;
+        }
+
+        /**
+         * Constructs a PositionWrapper around the specified TextPosition object.
+         * @param position the text position
+         */
+        public PositionWrapper(TextPosition position)
+        {
+            this.position = position;
+        }
+    }
 }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java Fri Sep 26 20:12:13 2014
@@ -54,20 +54,6 @@ public class PDFTextStripperByArea exten
         super();
     }
 
-    /**
-     * Instantiate a new PDFTextStripperArea object. Will apply
-     * encoding-specific conversions to the output text.
-     * 
-     * @param encoding
-     *            The encoding that the output will be written in.
-     * @throws IOException
-     *             If there is an error reading the properties.
-     */
-    public PDFTextStripperByArea(String encoding) throws IOException
-    {
-        super(encoding);
-    }
-    
    /**
      * Add a new region to group text by.
      *

Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Fri Sep 26 20:12:13 2014
@@ -107,7 +107,7 @@ public class TestTextStripper extends Te
     public TestTextStripper( String name ) throws IOException
     {
         super( name );
-        stripper = new PDFTextStripper(encoding);
+        stripper = new PDFTextStripper();
         stripper.setLineSeparator("\n");
     }
 

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Fri Sep 26 20:12:13 2014
@@ -252,14 +252,14 @@ public class ExtractText
                     }
                 }
 
-                PDFTextStripper stripper = null;
+                PDFTextStripper stripper;
                 if(toHTML)
                 {
-                    stripper = new PDFText2HTML(encoding);
+                    stripper = new PDFText2HTML();
                 }
                 else
                 {
-                    stripper = new PDFTextStripper(encoding);
+                    stripper = new PDFTextStripper();
                 }
                 stripper.setForceParsing( force );
                 stripper.setSortByPosition( sort );

Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Fri Sep 26 20:12:13 2014
@@ -44,12 +44,11 @@ public class PDFText2HTML extends PDFTex
 
     /**
      * Constructor.
-     * @param encoding The encoding to be used
      * @throws IOException If there is an error during initialization.
      */
-    public PDFText2HTML(String encoding) throws IOException
+    public PDFText2HTML() throws IOException
     {
-        super(encoding);
+        super();
         setLineSeparator(LINE_SEPARATOR);
         setParagraphStart("<p>");
         setParagraphEnd("</p>"+ LINE_SEPARATOR);
@@ -73,11 +72,7 @@ public class PDFText2HTML extends PDFTex
                 + "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
         buf.append("<html><head>");
         buf.append("<title>" + escape(getTitle()) + "</title>\n");
-        if(outputEncoding != null)
-        {
-            buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset="
-                    + this.outputEncoding + "\">\n");
-        }
+        buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-16\">\n");
         buf.append("</head>\n");
         buf.append("<body>\n");
         super.writeString(buf.toString());

Modified: pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java (original)
+++ pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java Fri Sep 26 20:12:13 2014
@@ -47,7 +47,7 @@ public class TestPDFText2HTML extends Te
     }
 
     public void testEscapeTitle() throws IOException {
-        PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+        PDFTextStripper stripper = new PDFText2HTML();
         PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
         String text = stripper.getText(doc);
        
@@ -59,7 +59,7 @@ public class TestPDFText2HTML extends Te
     }
 
     public void testStyle() throws IOException {
-        PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+        PDFTextStripper stripper = new PDFText2HTML();
         PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
         String text = stripper.getText(doc);