You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Brian Carrier (JIRA)" <ji...@apache.org> on 2008/11/17 23:06:44 UTC
[jira] Commented: (PDFBOX-178) splitting some words randomnly
[ https://issues.apache.org/jira/browse/PDFBOX-178?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12648374#action_12648374 ]
Brian Carrier commented on PDFBOX-178:
--------------------------------------
I just ran this file against the trunk, and the extra spaces no longer appear in the word "version". It must have been fixed at some point.
> splitting some words randomnly
> ------------------------------
>
> Key: PDFBOX-178
> URL: https://issues.apache.org/jira/browse/PDFBOX-178
> Project: PDFBox
> Issue Type: Bug
> Components: Text extraction
>
> [imported from SourceForge]
> http://sourceforge.net/tracker/index.php?group_id=78314&atid=552832&aid=1506499
> Originally submitted by nobody on 2006-06-14 23:57.
> I am using PDFBox to extract text, mainly via
> PDFTextStripper.
> Please check the main file "ExtractText" below.
> import java.io.FileOutputStream;
> import java.io.FileInputStream;
> import java.io.InputStream;
> import java.io.OutputStreamWriter;
> import java.io.Writer;
> import org.pdfbox.exceptions.InvalidPasswordException;
> import org.pdfbox.pdmodel.PDDocument;
> import org.pdfbox.util.PDFText2HTML;
> import org.pdfbox.util.PDFTextStripper;
> import org.pdfbox.pdfparser.PDFParser;
> public class ExtractText
> {
> public static final String DEFAULT_ENCODING =
> null;
> private static final String PASSWORD = "-
> password";
> private static final String ENCODING = "-encoding";
> private static final String CONSOLE = "-console";
> private static final String START_PAGE = "-
> startPage";
> private static final String END_PAGE = "-endPage";
> private static final String SORT = "-sort";
> private static final String HTML = "-html"; //
> jjb - added simple HTML output
> public ExtractText()
> {
> }
> public static void main( String[] args ) throws
> Exception
> {
> boolean toConsole = false;
> boolean toHTML = false;
> boolean sort = false;
> String password = "";
> String encoding = DEFAULT_ENCODING;
> String pdfFile = null;
> String textFile = null;
> String parsertx = "";
> int startPage = 1;
> int endPage = Integer.MAX_VALUE;
> for( int i=0; i<args.length; i++ )
> {
> if( args[i].equals( PASSWORD ) )
> {
> i++;
> if( i >= args.length )
> {
> usage();
> }
> password = args[i];
> }
> else if( args[i].equals( ENCODING ) )
> {
> i++;
> if( i >= args.length )
> {
> usage();
> }
> encoding = args[i];
> }
> else if( args[i].equals( START_PAGE ) )
> {
> i++;
> if( i >= args.length )
> {
> usage();
> }
> startPage = Integer.parseInt( args
> [i] );
> }
> else if( args[i].equals( HTML ) )
> {
> toHTML = true;
> }
> else if( args[i].equals( SORT ) )
> {
> sort = true;
> }
> else if( args[i].equals( END_PAGE ) )
> {
> i++;
> if( i >= args.length )
> {
> usage();
> }
> endPage = Integer.parseInt( args[i] );
> }
> else if( args[i].equals( CONSOLE ) )
> {
> toConsole = true;
> }
> else
> {
> if( pdfFile == null )
> {
> // pdfFile = args[i];
>
> }
> else
> {
> //textFile = args[i];
> }
> }
> }
> pdfFile = "E:\\examples\\resources\\text.pdf";
> if( pdfFile == null )
> {
> usage();
> }
> if( textFile == null && pdfFile.length() >4 )
> {
> //textFile = pdfFile.substring( 0,
> pdfFile.length() -4 ) + ".txt";
> }
>
> Writer output = null;
> PDDocument document = null;
> try
> {
> document = PDDocument.load( pdfFile );
> //document.print();
> if( document.isEncrypted() )
> {
> try
> {
> document.decrypt( password );
> }
> catch( InvalidPasswordException e )
> {
> if( args.length == 4 )//they
> supplied the wrong password
> {
> System.err.println( "Error:
> The supplied password is incorrect." );
> System.exit( 2 );
> }
> else
> {
> //they didn't suppply a
> password and the default of "" was wrong.
> System.err.println( "Error:
> The document is encrypted." );
> usage();
> }
> }
> }
> if( toConsole )
> {
> output = new OutputStreamWriter(
> System.out );
> }
> else
> {
> if( encoding != null )
> {
> output = new OutputStreamWriter(
> new FileOutputStream(
> textFile ), encoding );
> }
> else
> {
> //use default encoding
> // output = new OutputStreamWriter
> (new FileOutputStream( textFile ) );
> output = new OutputStreamWriter
> ( System.out );
> }
> }
> PDFTextStripper stripper = null;
> if(toHTML)
> {
> stripper = new PDFText2HTML();
> }
> else
> {
> stripper = new PDFTextStripper();
> }
> stripper.setSortByPosition( sort );
> stripper.setStartPage( startPage );
> stripper.setEndPage( endPage );
> parsertx = stripper.writeText( document,
> output );
> System.out.println("==========>>parsed
> text:"+parsertx);
>
> }
> finally
> {
> if( output != null )
> {
> output.close();
> }
> if( document != null )
> {
> document.close();
> }
> }
> }
> private static void usage()
> {
> System.err.println( "Usage: java
> org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]
> \n" +
> " -password <password> Password
> to decrypt document\n" +
> " -encoding <output encoding> (ISO-8859-
> 1,UTF-16BE,UTF-16LE,...)\n" +
> " -console Send text
> to console instead of file\n" +
> " -html Output in
> HTML format instead of raw text\n" +
> " -sort Sort the
> text before writing\n" +
> " -startPage <number> The first
> page to start extraction(1 based)\n" +
> " -endPage <number> The last
> page to extract(inclusive)\n" +
> " <PDF file> The PDF
> document to use\n" +
> " [Text File] The file
> to write the text to\n"
> );
> System.exit( 1 );
> }
> }
> Please check the attached input file, text.pdf.
> The output I got is pasted below. As you can see in the
> second line, the word "version" is split ("ver sion"),
> and there are many more words like this. Please tell me
> how to fix this, or where I can get the JAR file and the
> main class that will parse the PDF and return the text
> as a string.
> Aspose.Pdf.Kit for Java ProductOverview
> Aspose.Pdf.Kit for Java is java ver sion of A
> spose.Pdf.Kit which developed purely in java for the
> manipulation of Pdf documents. I t provides rich
> set of features like document updation managing
> security settings updating meta information text
> and images extraction and form fields management
> etc. that enables you to perform almost al l
> powerful operations on your already existing Pdf
> documents. Product Description Aspose.Pdf.Kit is
> a n amazing product by Aspose Pt y Ltd. tha t
> lets its users especially developers to manage their
> Pdf documents programatically. Aspose.Pdf.Kit
> provides a ve ry simple Application Programming
> Interface (API) f or developers that i s very eas
> y to learn and use. Aspose.Pdf.Kit for Java is
> implemented in purely java and has the same power
> of Aspose.Pdf.Kit. It can be u sed under any platform
> if onl y installed JVM(see details at System
> Requirements). Aspose.Pdf.Kit for Java can be
> integrated with any kind of application either it's
> a n Web Application or a Java Application.
> Aspose.Pdf.Kit for Java is surprisingly fast and
> light weight. It processes Pdf documents
> efficiently leading towards the better performance
> of dev eloper's custom applications. Aspose.Pdf.Kit
> for Java is t he first choice of most of our
> clients for pr ocessing Pdf documents because of
> its affor dable price superb performance and
> qui ck Aspose Support. Document Features Two
> Pdf documents can be Concatenated together New
> Pages can be Appended to an existing Pdf document
> One Pdf document can be Splitted into two Pdf
> documents Several pages can be Extracted as a new
> pdf document Security Features Security settings
> of the Pdf document can be modified easily Pdf
> documents can be Encrypted and Decrypted with Owner's
> Password and User's Password Owner can change
> password and reset security settings Form Field
> Features You can re ad all For m Fields of
> the Pdf d ocuments including their Names and
> Values Form Field can be f illed programatically
> through the spe cified Field Name Specified Field
> or all the Form Fields of the Pdf docu ments
> can be Flattened Form Fields can be bound to any
> Data Source easily Extraction and Utility
> Features Show and modify a Pdf document Meta
> Information like Title Autho r etc. Add a
> Watermark or Logo at specified position on every
> page Extract Texts and Images from Pdf document
> easily Count the Words in Pdf document Adhere
> Text or Image to the page of the Pdf document
> Thanking you in anticipation,
> Anitha
> [attachment on SourceForge]
> http://sourceforge.net/tracker/download.php?group_id=78314&atid=552832&aid=1506499&file_id=181702
> text.pdf (application/pdf), 14789 bytes
> input file for parsing
> [comment on SourceForge]
> Originally sent by nobody.
> Logged In: NO
> Hi, are there any updates on this issue? Can anyone please help me out? I am not sure whether this has been fixed in the latest release. Please email me at enigma_ka@yahoo.co.in.
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.