You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "John Hewson (JIRA)" <ji...@apache.org> on 2014/10/11 00:17:33 UTC

[jira] [Closed] (PDFBOX-126) text extraction between bookmarks.

     [ https://issues.apache.org/jira/browse/PDFBOX-126?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

John Hewson closed PDFBOX-126.
------------------------------
    Resolution: Invalid

Closing as this is a "how to" question.

> text extraction between bookmarks.
> ----------------------------------
>
>                 Key: PDFBOX-126
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-126
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>
> [imported from SourceForge]
> http://sourceforge.net/tracker/index.php?group_id=78314&atid=552832&aid=1418344
> Originally submitted by srinivaskrishna on 2006-01-29 22:00.
> Hi,  
>  
> I am extracting text from a PDF file and saving it in a 
> text file. I am saving each bookmark in the PDF file as a 
> separate text file. Everything works fine up to a point. 
> But since I am using the start bookmark and end bookmark as 
> delimiters for PDFTextStripper, I am getting the text 
> content between those bookmarks, and since each bookmark 
> should be saved as an individual file, in this text 
> file I am also getting the first page of the end bookmark, which 
> I don't want. Can anybody suggest a solution to 
> avoid the above problem? For your reference, I am 
> pasting my code below.  
> ################################################## 
>  
> import java.io.*; 
> import java.lang.*; 
> import java.util.*; 
> import org.pdfbox.exceptions.InvalidPasswordException; 
> import org.pdfbox.pdmodel.PDDocument; 
> import 
> org.pdfbox.pdmodel.interactive.documentnavigation.outli
> ne.PDDocumentOutline; 
> import 
> org.pdfbox.pdmodel.interactive.documentnavigation.outli
> ne.PDOutlineItem; 
> import org.pdfbox.util.PDFText2HTML; 
> import org.pdfbox.util.PDFTextStripper; 
>  
> public class PdftoText{ 
> public static void main(String args[]){ 
> try{ 
>  
> File pdfFiles = new File
> (&quot;D:/Data/skrishna/bnpp/pdf/&quot;); 
> String[] children = pdfFiles.list(); 
> if(children == null){ 
> System.out.println(&quot;Directory is empty&quot;); 
> }else{ 
> for(int pdffile=0; 
> pdffile&lt;children.length;pdffile++){ 
> String pdffilename = children[pdffile]; 
> int index = pdffilename.indexOf(&quot;.pdf&quot;) ; 
> System.out.println(&quot;the index value is 
> #################################&quot;+index); 
>  
> String newfilename = pdffilename.substring(0,index); 
> System.out.println(&quot;Inside else part the new pdf 
> filename is &quot;+newfilename); 
> String filename = 
> &quot;D:/Data/skrishna/bnpp/pdf/&quot;; 
> filename+=pdffilename; 
>  
> int j =1; 
> Writer output = null; 
> PDDocument document = null; 
> document = PDDocument.load( filename ); 
> PDDocumentOutline root = document.getDocumentCatalog
> ().getDocumentOutline(); 
> PDOutlineItem item = root.getFirstChild(); 
> PDOutlineItem item1 = item.getNextSibling(); 
> while( item1 != null ){  
> System.out.println( &quot;Item:&quot; + item.getTitle
> () ); 
> System.out.println( &quot;Item1:&quot; + item1.getTitle
> () ); 
> output = new OutputStreamWriter(new FileOutputStream
> (&quot;D:/Data/skrishna/bnpp/text/&quot;+newfilename+&q
> uot;_&quot;+j+&quot;.txt&quot;)); 
>  
> PDFTextStripper stripper = null; 
> stripper = new PDFTextStripper(); 
> stripper.setStartBookmark(item); 
> stripper.setWordSeparator(&quot; &quot;); 
> stripper.setLineSeparator(&quot;\n&quot;); 
> stripper.setPageSeparator(&quot;\n\n\n\n&quot;); 
> stripper.setEndBookmark(item1); 
> stripper.writeText( document, output ); 
> j++; 
> item = item.getNextSibling(); 
> item1 = item1.getNextSibling(); 
> } 
>  
> PDOutlineItem child = item.getFirstChild(); 
> PDOutlineItem child1 = new PDOutlineItem(); 
> while( child != null ){ 
> child1 = child;  
> child = child.getNextSibling(); 
> } 
> System.out.println( &quot;Item:&quot; + item.getTitle
> () ); 
> System.out.println( &quot;Item1:&quot; + 
> child1.getTitle() ); 
> output = new OutputStreamWriter(new FileOutputStream( 
> &quot;D:/Data/skrishna/bnpp/text/&quot;+newfilename+&qu
> ot;_&quot;+j+&quot;.txt&quot;)); 
> PDFTextStripper stripper = null; 
> stripper = new PDFTextStripper(); 
> stripper.setWordSeparator(&quot; &quot;); 
> stripper.setLineSeparator(&quot;\n&quot;); 
> stripper.setPageSeparator(&quot;\n\n\n\n&quot;); 
> stripper.setStartBookmark(item); 
> stripper.setEndBookmark(child1); 
> stripper.writeText( document, output ); 
> output.close();  
> document.close(); 
> }//End of for 
> }//End of If Else 
> }catch(Exception e){ 
> System.out.println(e); 
> } 
> } 
> } 
> ################################################### 
>  
> So please let me know the solution, as this is 
> urgently needed.  
>  
> Thanks in advance.  
> Srinivas



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)