You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ja...@apache.org on 2014/09/26 22:12:14 UTC
svn commit: r1627874 - in /pdfbox/trunk:
pdfbox/src/main/java/org/apache/pdfbox/text/
pdfbox/src/main/java/org/apache/pdfbox/util/
pdfbox/src/test/java/org/apache/pdfbox/util/
tools/src/main/java/org/apache/pdfbox/tools/
tools/src/test/java/org/apache/pdfbox/tools/
Author: jahewson
Date: Fri Sep 26 20:12:13 2014
New Revision: 1627874
URL: http://svn.apache.org/r1627874
Log:
PDFBOX-2384: Removed encoding from PDFTextStripper
Removed:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/PositionWrapper.java
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextNormalize.java Fri Sep 26 20:12:13 2014
@@ -71,15 +71,11 @@ public class TextNormalize
return map;
}
- private String outputEncoding;
-
/**
- *
- * @param encoding The Encoding that the text will eventually be written as (or null)
+ * Constructor.
*/
- public TextNormalize(String encoding)
+ public TextNormalize()
{
- outputEncoding = encoding;
}
/**
@@ -146,23 +142,16 @@ public class TextNormalize
public String normalizeDiacritic(String str)
{
// Unicode contains special combining forms of the diacritic characters which we want to use
- if (outputEncoding != null && outputEncoding.toUpperCase().startsWith("UTF"))
+ int codePoint = str.codePointAt(0);
+
+ // convert the characters not defined in the Unicode spec
+ if (DIACRITICS.containsKey(codePoint))
{
- Integer c = (int) str.charAt(0);
- // convert the characters not defined in the Unicode spec
- if (DIACRITICS.containsKey(c))
- {
- return DIACRITICS.get(c);
- }
- else
- {
- return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
- }
+ return DIACRITICS.get(codePoint);
}
else
{
- return str;
+ return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
}
}
-
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFHighlighter.java Fri Sep 26 20:12:13 2014
@@ -54,7 +54,7 @@ public class PDFHighlighter extends PDFT
*/
public PDFHighlighter() throws IOException
{
- super(ENCODING);
+ super();
super.setLineSeparator( "" );
super.setWordSeparator( "" );
super.setShouldSeparateByBeads( false );
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFMarkedContentExtractor.java Fri Sep 26 20:12:13 2014
@@ -47,11 +47,6 @@ public class PDFMarkedContentExtractor e
private Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
/**
- * encoding that text will be written in (or null).
- */
- protected String outputEncoding;
-
- /**
* The normalizer is used to remove text ligatures/presentation forms
* and to correct the direction of right to left text, such as Arabic and Hebrew.
*/
@@ -79,8 +74,7 @@ public class PDFMarkedContentExtractor e
// todo: DP - Marked Content Point
// todo: MP - Marked Content Point with Properties
- this.outputEncoding = encoding;
- this.normalize = new TextNormalize(this.outputEncoding);
+ this.normalize = new TextNormalize();
}
/**
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripper.java Fri Sep 26 20:12:13 2014
@@ -44,7 +44,6 @@ import org.apache.pdfbox.pdmodel.encrypt
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
-import org.apache.pdfbox.text.PositionWrapper;
import org.apache.pdfbox.text.TextNormalize;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;
@@ -164,7 +163,6 @@ public class PDFTextStripper extends PDF
private Map<String, TreeMap<Float, TreeSet<Float>>> characterListMapping =
new HashMap<String, TreeMap<Float, TreeSet<Float>>>();
- protected String outputEncoding;
protected PDDocument document;
protected Writer output;
@@ -187,20 +185,7 @@ public class PDFTextStripper extends PDF
*/
public PDFTextStripper() throws IOException
{
- this(null);
- }
-
- /**
- * Instantiate a new PDFTextStripper object. Will apply
- * encoding-specific conversions to the output text.
- *
- * @param encoding The encoding that the output will be written in.
- * @throws IOException If there is an error reading the properties.
- */
- public PDFTextStripper(String encoding) throws IOException
- {
- this.outputEncoding = encoding;
- normalize = new TextNormalize(this.outputEncoding);
+ normalize = new TextNormalize();
}
/**
@@ -1887,4 +1872,113 @@ public class PDFTextStripper extends PDF
return textPositions;
}
}
+
+ /**
+ * wrapper of TextPosition that adds flags to track
+ * status as linestart and paragraph start positions.
+ * <p>
+ * This is implemented as a wrapper since the TextPosition
+ * class doesn't provide complete access to its
+ * state fields to subclasses. Also, conceptually TextPosition is
+ * immutable while these flags need to be set post-creation so
+ * it makes sense to put these flags in this separate class.
+ * </p>
+ * @author m.martinez@ll.mit.edu
+ */
+ private static final class PositionWrapper
+ {
+ private boolean isLineStart = false;
+ private boolean isParagraphStart = false;
+ private boolean isPageBreak = false;
+ private boolean isHangingIndent = false;
+ private boolean isArticleStart = false;
+
+ private TextPosition position = null;
+
+ /**
+ * Returns the underlying TextPosition object.
+ * @return the text position
+ */
+ public TextPosition getTextPosition()
+ {
+ return position;
+ }
+
+ public boolean isLineStart()
+ {
+ return isLineStart;
+ }
+
+ /**
+ * Sets the isLineStart() flag to true.
+ */
+ public void setLineStart()
+ {
+ this.isLineStart = true;
+ }
+
+
+ public boolean isParagraphStart()
+ {
+ return isParagraphStart;
+ }
+
+ /**
+ * sets the isParagraphStart() flag to true.
+ */
+ public void setParagraphStart()
+ {
+ this.isParagraphStart = true;
+ }
+
+
+ public boolean isArticleStart()
+ {
+ return isArticleStart;
+ }
+
+
+ /**
+ * Sets the isArticleStart() flag to true.
+ */
+ public void setArticleStart()
+ {
+ this.isArticleStart = true;
+ }
+
+ public boolean isPageBreak()
+ {
+ return isPageBreak;
+ }
+
+ /**
+ * Sets the isPageBreak() flag to true.
+ */
+ public void setPageBreak()
+ {
+ this.isPageBreak = true;
+ }
+
+ public boolean isHangingIndent()
+ {
+ return isHangingIndent;
+ }
+
+ /**
+ * Sets the isHangingIndent() flag to true.
+ */
+ public void setHangingIndent()
+ {
+ this.isHangingIndent = true;
+ }
+
+ /**
+ * Constructs a PositionWrapper around the specified TextPosition object.
+ * @param position the text position
+ */
+ public PositionWrapper(TextPosition position)
+ {
+ this.position = position;
+ }
+ }
}
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/util/PDFTextStripperByArea.java Fri Sep 26 20:12:13 2014
@@ -54,20 +54,6 @@ public class PDFTextStripperByArea exten
super();
}
- /**
- * Instantiate a new PDFTextStripperArea object. Will apply
- * encoding-specific conversions to the output text.
- *
- * @param encoding
- * The encoding that the output will be written in.
- * @throws IOException
- * If there is an error reading the properties.
- */
- public PDFTextStripperByArea(String encoding) throws IOException
- {
- super(encoding);
- }
-
/**
* Add a new region to group text by.
*
Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java (original)
+++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/util/TestTextStripper.java Fri Sep 26 20:12:13 2014
@@ -107,7 +107,7 @@ public class TestTextStripper extends Te
public TestTextStripper( String name ) throws IOException
{
super( name );
- stripper = new PDFTextStripper(encoding);
+ stripper = new PDFTextStripper();
stripper.setLineSeparator("\n");
}
Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java Fri Sep 26 20:12:13 2014
@@ -252,14 +252,14 @@ public class ExtractText
}
}
- PDFTextStripper stripper = null;
+ PDFTextStripper stripper;
if(toHTML)
{
- stripper = new PDFText2HTML(encoding);
+ stripper = new PDFText2HTML();
}
else
{
- stripper = new PDFTextStripper(encoding);
+ stripper = new PDFTextStripper();
}
stripper.setForceParsing( force );
stripper.setSortByPosition( sort );
Modified: pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java (original)
+++ pdfbox/trunk/tools/src/main/java/org/apache/pdfbox/tools/PDFText2HTML.java Fri Sep 26 20:12:13 2014
@@ -44,12 +44,11 @@ public class PDFText2HTML extends PDFTex
/**
* Constructor.
- * @param encoding The encoding to be used
* @throws IOException If there is an error during initialization.
*/
- public PDFText2HTML(String encoding) throws IOException
+ public PDFText2HTML() throws IOException
{
- super(encoding);
+ super();
setLineSeparator(LINE_SEPARATOR);
setParagraphStart("<p>");
setParagraphEnd("</p>"+ LINE_SEPARATOR);
@@ -73,11 +72,7 @@ public class PDFText2HTML extends PDFTex
+ "\"http://www.w3.org/TR/html4/loose.dtd\">\n");
buf.append("<html><head>");
buf.append("<title>" + escape(getTitle()) + "</title>\n");
- if(outputEncoding != null)
- {
- buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset="
- + this.outputEncoding + "\">\n");
- }
+ buf.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=\"UTF-16\">\n");
buf.append("</head>\n");
buf.append("<body>\n");
super.writeString(buf.toString());
Modified: pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java?rev=1627874&r1=1627873&r2=1627874&view=diff
==============================================================================
--- pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java (original)
+++ pdfbox/trunk/tools/src/test/java/org/apache/pdfbox/tools/TestPDFText2HTML.java Fri Sep 26 20:12:13 2014
@@ -47,7 +47,7 @@ public class TestPDFText2HTML extends Te
}
public void testEscapeTitle() throws IOException {
- PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+ PDFTextStripper stripper = new PDFText2HTML();
PDDocument doc = createDocument("<script>\u3042", PDType1Font.HELVETICA, "<foo>");
String text = stripper.getText(doc);
@@ -59,7 +59,7 @@ public class TestPDFText2HTML extends Te
}
public void testStyle() throws IOException {
- PDFTextStripper stripper = new PDFText2HTML("UTF-8");
+ PDFTextStripper stripper = new PDFText2HTML();
PDDocument doc = createDocument("t", PDType1Font.HELVETICA_BOLD, "<bold>");
String text = stripper.getText(doc);