You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2021/05/03 15:56:45 UTC
svn commit: r1889452 -
/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
Author: tilman
Date: Mon May 3 15:56:45 2021
New Revision: 1889452
URL: http://svn.apache.org/viewvc?rev=1889452&view=rev
Log:
PDFBOX-5183: add suppressDuplicateOverlappingText getter / setter
Modified:
pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java?rev=1889452&r1=1889451&r2=1889452&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java Mon May 3 15:56:45 2021
@@ -40,7 +40,7 @@ import org.apache.pdfbox.contentstream.o
*/
public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
{
- private final boolean suppressDuplicateOverlappingText = true;
+ private boolean suppressDuplicateOverlappingText = true;
private final List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>();
private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<PDMarkedContent>();
private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
@@ -69,6 +69,28 @@ public class PDFMarkedContentExtractor e
}
/**
+ * @return the suppressDuplicateOverlappingText setting.
+ */
+ public boolean isSuppressDuplicateOverlappingText()
+ {
+ return suppressDuplicateOverlappingText;
+ }
+
+ /**
+ * By default the class will attempt to remove duplicate text that overlaps. Word, for example,
+ * paints the same character several times in order to make it look bold. When this is set to
+ * false, all text will be extracted, which means that certain sections will be duplicated, but
+ * performance will be better.
+ *
+ * @param suppressDuplicateOverlappingText The suppressDuplicateOverlappingText setting to set.
+ */
+ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
+ {
+ this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
+ }
+
+
+ /**
* This will determine if two floating point numbers are within a specified variance.
*
* @param first The first number to compare to.