You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pdfbox.apache.org by ti...@apache.org on 2021/05/03 15:56:45 UTC

svn commit: r1889452 - /pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java

Author: tilman
Date: Mon May  3 15:56:45 2021
New Revision: 1889452

URL: http://svn.apache.org/viewvc?rev=1889452&view=rev
Log:
PDFBOX-5183: add suppressDuplicateOverlappingText getter / setter

Modified:
    pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java

Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java
URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java?rev=1889452&r1=1889451&r2=1889452&view=diff
==============================================================================
--- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java (original)
+++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java Mon May  3 15:56:45 2021
@@ -40,7 +40,7 @@ import org.apache.pdfbox.contentstream.o
  */
 public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
 {
-    private final boolean suppressDuplicateOverlappingText = true;
+    private boolean suppressDuplicateOverlappingText = true;
     private final List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>();
     private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<PDMarkedContent>();
     private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>();
@@ -69,6 +69,28 @@ public class PDFMarkedContentExtractor e
     }
 
     /**
+     * @return true if duplicate overlapping text is suppressed (the default), false otherwise.
+     */
+    public boolean isSuppressDuplicateOverlappingText()
+    {
+        return suppressDuplicateOverlappingText;
+    }
+
+    /**
+     * By default the class will attempt to remove duplicate text that overlaps; for example, Word
+     * paints the same character several times in order to make it look bold. Setting this to false
+     * extracts all text, which means that certain sections will be duplicated, but better
+     * performance will be noticed.
+     *
+     * @param suppressDuplicateOverlappingText true to suppress duplicate overlapping text, else false.
+     */
+    public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
+    {
+        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
+    }
+
+
+    /**
      * This will determine if two floating point numbers are within a specified variance.
      *
      * @param first The first number to compare to.