You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@pdfbox.apache.org by "Franklin (JIRA)" <ji...@apache.org> on 2011/06/17 10:04:47 UTC

[jira] [Created] (PDFBOX-1039) Arabic Text Extraction using PDFTextStripper working partially

Arabic Text Extraction using PDFTextStripper working partially
--------------------------------------------------------------

                 Key: PDFBOX-1039
                 URL: https://issues.apache.org/jira/browse/PDFBOX-1039
             Project: PDFBox
          Issue Type: Bug
          Components: Text extraction
    Affects Versions: 1.5.0
         Environment: Windows XP, Java 1.6
            Reporter: Franklin
         Attachments: TestPDFCreator.pdf, TestWord.pdf

I have been trying to extract the contents of PDF file (so as to index it with lucene). The PDF file contains arabic.

Both PDF files contain the exact same information. The strange thing is that PDFTextStripper extracts the data from one file correctly (it gives proper Arabic) but not from the other (it gives only question marks ???? or [][][][][]).

Below is the code being used

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
 
/**
 * Small reproduction program for PDFBOX-1039: extracts the text of two PDF
 * files with {@link PDFTextStripper} and prints the result to standard output.
 */
public class TesExtraction {

	/**
	 * Extracts the text of pages 1 to 5 of the given PDF file.
	 *
	 * @param fileName path of the PDF file to read
	 * @return the extracted text, or {@code null} if the file does not exist,
	 *         cannot be opened, or an error occurs while parsing/extracting
	 *         (a diagnostic is printed to {@code System.err} in those cases)
	 */
	static String pdftoText(String fileName) {
		File file = new File(fileName);
		if (!file.isFile()) {
			System.err.println("File " + fileName + " does not exist.");
			return null;
		}

		FileInputStream input = null;
		COSDocument cosDoc = null;
		PDDocument pdDoc = null;
		String parsedText = null;
		try {
			PDFParser parser;
			try {
				input = new FileInputStream(file);
				parser = new PDFParser(input);
			} catch (IOException e) {
				System.err.println("Unable to open PDF Parser. " + e.getMessage());
				return null;
			}

			parser.parse();
			cosDoc = parser.getDocument();
			// NOTE(review): the encoding argument is kept from the original report;
			// presumably it only affects stream/HTML output, not getText() - confirm.
			PDFTextStripper pdfStripper = new PDFTextStripper("CP-1252");
			pdDoc = new PDDocument(cosDoc);
			pdfStripper.setStartPage(1);
			pdfStripper.setEndPage(5);
			parsedText = pdfStripper.getText(pdDoc);
		} catch (Exception e) {
			// Print the exception itself: getMessage() may be null, and the
			// original concatenation lacked a separator after the sentence.
			System.err.println("An exception occured in parsing the PDF Document. " + e);
		} finally {
			// Runs on every exit path above, including the early "return null".
			closeDocument(pdDoc, cosDoc);
			closeStream(input);
		}
		return parsedText;
	}

	/**
	 * Releases the parsed document. The PDDocument wraps the COSDocument, so it
	 * is closed in preference; the COSDocument is closed directly only when no
	 * PDDocument was created for it.
	 */
	private static void closeDocument(PDDocument pdDoc, COSDocument cosDoc) {
		try {
			if (pdDoc != null) {
				pdDoc.close();
			} else if (cosDoc != null) {
				cosDoc.close();
			}
		} catch (Exception e) {
			System.err.println("Unable to close the PDF document. " + e);
		}
	}

	/** Closes the input stream, reporting (but not propagating) any failure. */
	private static void closeStream(FileInputStream input) {
		if (input == null) {
			return;
		}
		try {
			input.close();
		} catch (IOException e) {
			System.err.println("Unable to close the input file. " + e);
		}
	}

	/**
	 * Runs the extraction on the two sample files attached to the issue.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args)
	{
		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestWord.pdf"));
		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestPDFCreator.pdf"));
	}
 
}

NOTE: Where can I upload the pdf files ?
 

--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira

[jira] [Updated] (PDFBOX-1039) Arabic Text Extraction using PDFTextStripper working partially

Posted by "Franklin (JIRA)" <ji...@apache.org>.

     [ https://issues.apache.org/jira/browse/PDFBOX-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Franklin updated PDFBOX-1039:
-----------------------------

    Attachment: TestPDFCreator.pdf
                TestWord.pdf

These are the PDF files used.

The TestWord.pdf file works
The TestPDFCreator.pdf fails

> Arabic Text Extraction using PDFTextStripper working partially
> --------------------------------------------------------------
>
>                 Key: PDFBOX-1039
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-1039
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 1.5.0
>         Environment: Windows XP, Java 1.6
>            Reporter: Franklin
>              Labels: arabic, textExtraction
>         Attachments: TestPDFCreator.pdf, TestWord.pdf
>
>   Original Estimate: 168h
>  Remaining Estimate: 168h
>
> I have been trying to extract the contents of PDF file (so as to index it with lucene). The PDF file contains arabic.
> Both PDF files contain the exact same information. The strange thing is PDFTextStripper extract data from one file correctly(gives proper arabic) but not from the other(gives complete question marks ???? or [][][][][]  )
> Below is the code being used
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import org.apache.pdfbox.cos.COSDocument;
> import org.apache.pdfbox.pdfparser.PDFParser;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.util.PDFTextStripper;
>  
> public class TesExtraction {
>  
> 	// Extract text from PDF Document
> 	static String pdftoText(String fileName) {
> 		PDFParser parser;
> 		String parsedText = null;;
> 		PDFTextStripper pdfStripper = null;
> 		PDDocument pdDoc = null;
> 		COSDocument cosDoc = null;
> 		File file = new File(fileName);
> 		if (!file.isFile()) {
> 			System.err.println("File " + fileName + " does not exist.");
> 			return null;
> 		}
> 		try 
> 		{
> 			parser = new PDFParser(new FileInputStream(file));
> 		} catch (IOException e) {
> 			System.err.println("Unable to open PDF Parser. " + e.getMessage());
> 			return null;
> 		}
> 		try 
> 		{
> 			parser.parse();
> 			cosDoc = parser.getDocument();
> 			pdfStripper = new PDFTextStripper("CP-1252");
> 			pdDoc = new PDDocument(cosDoc);
> 			pdfStripper.setStartPage(1);
> 			pdfStripper.setEndPage(5);
> 			parsedText = pdfStripper.getText(pdDoc);
> 		} catch (Exception e) {
> 			System.err
> 					.println("An exception occured in parsing the PDF Document."
> 							+ e.getMessage());
> 		} finally {
> 			try {
> 				if (cosDoc != null)
> 					cosDoc.close();
> 				if (pdDoc != null)
> 					pdDoc.close();
> 			} catch (Exception e) {
> 				e.printStackTrace();
> 			}
> 		}
> 		return parsedText;
> 	}
> 	public static void main(String args[])
> 	{
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestWord.pdf"));
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestPDFCreator.pdf"));
> 	}
>  
> }
> NOTE: Where can I upload the pdf files ?
>  

--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira

[jira] [Resolved] (PDFBOX-1039) Arabic Text Extraction using PDFTextStripper working partially

Posted by "Andreas Lehmkühler (JIRA)" <ji...@apache.org>.

     [ https://issues.apache.org/jira/browse/PDFBOX-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Andreas Lehmkühler resolved PDFBOX-1039.
----------------------------------------

    Resolution: Not A Problem
      Assignee: Andreas Lehmkühler

Everything works as expected from the PDFBox point of view. 

I'm afraid one can't extract the text from this kind of PDF. The font uses a built-in encoding. It just numbers all characters from 0 to 5 without any mapping to readable characters.

Try to extract the text using the acrobat reader (mark text, copy and paste it) and you'll get the same result.

> Arabic Text Extraction using PDFTextStripper working partially
> --------------------------------------------------------------
>
>                 Key: PDFBOX-1039
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-1039
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 1.5.0
>         Environment: Windows XP, Java 1.6
>            Reporter: Franklin
>            Assignee: Andreas Lehmkühler
>              Labels: arabic, textExtraction
>         Attachments: TestPDFCreator.pdf, TestWord.pdf
>
>   Original Estimate: 168h
>  Remaining Estimate: 168h
>
> I have been trying to extract the contents of PDF file (so as to index it with lucene). The PDF file contains arabic.
> Both PDF files contain the exact same information. The strange thing is PDFTextStripper extract data from one file correctly(gives proper arabic) but not from the other(gives complete question marks ???? or [][][][][]  )
> Below is the code being used
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import org.apache.pdfbox.cos.COSDocument;
> import org.apache.pdfbox.pdfparser.PDFParser;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.util.PDFTextStripper;
>  
> public class TesExtraction {
>  
> 	// Extract text from PDF Document
> 	static String pdftoText(String fileName) {
> 		PDFParser parser;
> 		String parsedText = null;;
> 		PDFTextStripper pdfStripper = null;
> 		PDDocument pdDoc = null;
> 		COSDocument cosDoc = null;
> 		File file = new File(fileName);
> 		if (!file.isFile()) {
> 			System.err.println("File " + fileName + " does not exist.");
> 			return null;
> 		}
> 		try 
> 		{
> 			parser = new PDFParser(new FileInputStream(file));
> 		} catch (IOException e) {
> 			System.err.println("Unable to open PDF Parser. " + e.getMessage());
> 			return null;
> 		}
> 		try 
> 		{
> 			parser.parse();
> 			cosDoc = parser.getDocument();
> 			pdfStripper = new PDFTextStripper("CP-1252");
> 			pdDoc = new PDDocument(cosDoc);
> 			pdfStripper.setStartPage(1);
> 			pdfStripper.setEndPage(5);
> 			parsedText = pdfStripper.getText(pdDoc);
> 		} catch (Exception e) {
> 			System.err
> 					.println("An exception occured in parsing the PDF Document."
> 							+ e.getMessage());
> 		} finally {
> 			try {
> 				if (cosDoc != null)
> 					cosDoc.close();
> 				if (pdDoc != null)
> 					pdDoc.close();
> 			} catch (Exception e) {
> 				e.printStackTrace();
> 			}
> 		}
> 		return parsedText;
> 	}
> 	public static void main(String args[])
> 	{
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestWord.pdf"));
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestPDFCreator.pdf"));
> 	}
>  
> }
> NOTE: Where can I upload the pdf files ?
>  

--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira

[jira] [Commented] (PDFBOX-1039) Arabic Text Extraction using PDFTextStripper working partially

Posted by "Franklin (JIRA)" <ji...@apache.org>.

    [ https://issues.apache.org/jira/browse/PDFBOX-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13051651#comment-13051651 ] 

Franklin commented on PDFBOX-1039:
----------------------------------

Thanks Andreas,
  Is there any way I can handle this myself?

  Is there also any way I can detect this?

Regards,
Franklin

> Arabic Text Extraction using PDFTextStripper working partially
> --------------------------------------------------------------
>
>                 Key: PDFBOX-1039
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-1039
>             Project: PDFBox
>          Issue Type: Bug
>          Components: Text extraction
>    Affects Versions: 1.5.0
>         Environment: Windows XP, Java 1.6
>            Reporter: Franklin
>            Assignee: Andreas Lehmkühler
>              Labels: arabic, textExtraction
>         Attachments: TestPDFCreator.pdf, TestWord.pdf
>
>   Original Estimate: 168h
>  Remaining Estimate: 168h
>
> I have been trying to extract the contents of PDF file (so as to index it with lucene). The PDF file contains arabic.
> Both PDF files contain the exact same information. The strange thing is PDFTextStripper extract data from one file correctly(gives proper arabic) but not from the other(gives complete question marks ???? or [][][][][]  )
> Below is the code being used
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import org.apache.pdfbox.cos.COSDocument;
> import org.apache.pdfbox.pdfparser.PDFParser;
> import org.apache.pdfbox.pdmodel.PDDocument;
> import org.apache.pdfbox.util.PDFTextStripper;
>  
> public class TesExtraction {
>  
> 	// Extract text from PDF Document
> 	static String pdftoText(String fileName) {
> 		PDFParser parser;
> 		String parsedText = null;;
> 		PDFTextStripper pdfStripper = null;
> 		PDDocument pdDoc = null;
> 		COSDocument cosDoc = null;
> 		File file = new File(fileName);
> 		if (!file.isFile()) {
> 			System.err.println("File " + fileName + " does not exist.");
> 			return null;
> 		}
> 		try 
> 		{
> 			parser = new PDFParser(new FileInputStream(file));
> 		} catch (IOException e) {
> 			System.err.println("Unable to open PDF Parser. " + e.getMessage());
> 			return null;
> 		}
> 		try 
> 		{
> 			parser.parse();
> 			cosDoc = parser.getDocument();
> 			pdfStripper = new PDFTextStripper("CP-1252");
> 			pdDoc = new PDDocument(cosDoc);
> 			pdfStripper.setStartPage(1);
> 			pdfStripper.setEndPage(5);
> 			parsedText = pdfStripper.getText(pdDoc);
> 		} catch (Exception e) {
> 			System.err
> 					.println("An exception occured in parsing the PDF Document."
> 							+ e.getMessage());
> 		} finally {
> 			try {
> 				if (cosDoc != null)
> 					cosDoc.close();
> 				if (pdDoc != null)
> 					pdDoc.close();
> 			} catch (Exception e) {
> 				e.printStackTrace();
> 			}
> 		}
> 		return parsedText;
> 	}
> 	public static void main(String args[])
> 	{
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestWord.pdf"));
> 		System.out.println(pdftoText("C:\\LuceneTest\\Data\\TestPDFCreator.pdf"));
> 	}
>  
> }
> NOTE: Where can I upload the pdf files ?
>  

--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira