You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "John Hewson (JIRA)" <ji...@apache.org> on 2014/10/11 00:17:33 UTC
[jira] [Closed] (PDFBOX-126) text extraction between bookmarks.
[ https://issues.apache.org/jira/browse/PDFBOX-126?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
John Hewson closed PDFBOX-126.
------------------------------
Resolution: Invalid
Closing as this is a "how to" question.
> text extraction between bookmarks.
> ----------------------------------
>
> Key: PDFBOX-126
> URL: https://issues.apache.org/jira/browse/PDFBOX-126
> Project: PDFBox
> Issue Type: Bug
> Components: Text extraction
>
> [imported from SourceForge]
> http://sourceforge.net/tracker/index.php?group_id=78314&atid=552832&aid=1418344
> Originally submitted by srinivaskrishna on 2006-01-29 22:00.
> Hi,
>
> I am extracting text from a PDF file and saving it in a
> text file. I am saving each bookmark in the PDF file as a
> separate text file. Everything works fine up to a point.
> But as I am using the start bookmark and end bookmark as
> delimiters for PDFTextStripper, I am getting the text
> content between those bookmarks, and since each bookmark
> should be saved as an individual file, in this text
> file I am also getting the first page of the end bookmark,
> which I don't want. Can anybody suggest a solution to
> avoid this problem? For your reference I am
> pasting my code below.
> ##################################################
>
> import java.io.*;
> import java.lang.*;
> import java.util.*;
> import org.pdfbox.exceptions.InvalidPasswordException;
> import org.pdfbox.pdmodel.PDDocument;
> import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
> import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
> import org.pdfbox.util.PDFText2HTML;
> import org.pdfbox.util.PDFTextStripper;
>
> public class PdftoText{
> public static void main(String args[]){
> try{
>
> File pdfFiles = new File
> ("D:/Data/skrishna/bnpp/pdf/");
> String[] children = pdfFiles.list();
> if(children == null){
> System.out.println("Directory is empty");
> }else{
> for(int pdffile=0;
> pdffile<children.length;pdffile++){
> String pdffilename = children[pdffile];
> int index = pdffilename.indexOf(".pdf") ;
> System.out.println("the index value is
> #################################"+index);
>
> String newfilename = pdffilename.substring(0,index);
> System.out.println("Inside else part the new pdf
> filename is "+newfilename);
> String filename =
> "D:/Data/skrishna/bnpp/pdf/";
> filename+=pdffilename;
>
> int j =1;
> Writer output = null;
> PDDocument document = null;
> document = PDDocument.load( filename );
> PDDocumentOutline root = document.getDocumentCatalog
> ().getDocumentOutline();
> PDOutlineItem item = root.getFirstChild();
> PDOutlineItem item1 = item.getNextSibling();
> while( item1 != null ){
> System.out.println( "Item:" + item.getTitle
> () );
> System.out.println( "Item1:" + item1.getTitle
> () );
> output = new OutputStreamWriter(new FileOutputStream
> ("D:/Data/skrishna/bnpp/text/"+newfilename+
> "_"+j+".txt"));
>
> PDFTextStripper stripper = null;
> stripper = new PDFTextStripper();
> stripper.setStartBookmark(item);
> stripper.setWordSeparator(" ");
> stripper.setLineSeparator("\n");
> stripper.setPageSeparator("\n\n\n\n");
> stripper.setEndBookmark(item1);
> stripper.writeText( document, output );
> j++;
> item = item.getNextSibling();
> item1 = item1.getNextSibling();
> }
>
> PDOutlineItem child = item.getFirstChild();
> PDOutlineItem child1 = new PDOutlineItem();
> while( child != null ){
> child1 = child;
> child = child.getNextSibling();
> }
> System.out.println( "Item:" + item.getTitle
> () );
> System.out.println( "Item1:" +
> child1.getTitle() );
> output = new OutputStreamWriter(new FileOutputStream(
> "D:/Data/skrishna/bnpp/text/"+newfilename+
> "_"+j+".txt"));
> PDFTextStripper stripper = null;
> stripper = new PDFTextStripper();
> stripper.setWordSeparator(" ");
> stripper.setLineSeparator("\n");
> stripper.setPageSeparator("\n\n\n\n");
> stripper.setStartBookmark(item);
> stripper.setEndBookmark(child1);
> stripper.writeText( document, output );
> output.close();
> document.close();
> }//End of for
> }//End of If Else
> }catch(Exception e){
> System.out.println(e);
> }
> }
> }
> ###################################################
>
> So please let me know the solution, as I need this
> very urgently.
>
> Thanks in advance.
> Srinivas
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)