You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "DURGA DEEP (JIRA)" <ji...@apache.org> on 2008/08/26 21:22:44 UTC
[jira] Updated: (PDFBOX-372) java.io.IOException: Error: expected
hex character and not :32
[ https://issues.apache.org/jira/browse/PDFBOX-372?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
DURGA DEEP updated PDFBOX-372:
------------------------------
Attachment: Webmail02.pdf
Trying to extract the contents with the following code gives me the error below.
// Method-body fragment: parses a PDF from the stream `isr`, extracts its text
// into `contents`, attempts to render the first page into `image`, and
// returns the extracted text.
// NOTE(review): `isr`, `image`, and `LOGGER` are declared outside this
// excerpt, as is the enclosing method signature.
String contents = "";
/** PDFTextStripper Object. **/
PDFTextStripper pdftextstrip = null;
/** A new PDFParser Instance. **/
PDFParser pdfp = null;
/** PDDocument Object. **/
PDDocument pdfDocument = null;
/** The document metadata. **/
// NOTE(review): assigned below but never read within this excerpt.
PDDocumentInformation pdfDocumentInfo = null;
try {
pdfp = new PDFParser(isr);
// pdfp.parse() is not thread safe; ensure only
// one PDFConverter is calling it at a time
// otherwise chance of one thread getting stuck at
// org.pdfbox.cos.COSNumber.<clinit>(COSNumber.java:49)
synchronized (PDFConverter.class) {
// This will parse the stream and create the PDF document.
pdfp.parse();
}
pdfDocument = pdfp.getPDDocument();
pdfDocumentInfo = pdfDocument.getDocumentInformation();
pdftextstrip = new PDFTextStripper();
// Text extraction. The stack trace quoted in this report passes through
// PDFTextStripper.getText, i.e. the reported IOException surfaces here.
contents = pdftextstrip.getText(pdfDocument);
// Image conversion is best-effort: any failure is logged at WARNING and
// the already-extracted text is still returned.
try {
// convert first page to image object.
// NOTE(review): get(0) assumes the document has at least one page; a
// failure here is caught by the catch (Exception) below.
PDPage firstPage
= (PDPage)
pdfDocument.getDocumentCatalog().getAllPages().get(0);
image = firstPage.convertToImage();
} catch (Exception ex) {
if (LOGGER.isLoggable(Level.WARNING)) {
String msg = "Unable to convert PDF to image: ";
LOGGER.log(Level.WARNING, msg + ex);
}
}
} catch (IOException ioe) {
// Parse/extraction failure: the message is logged only at FINEST, the
// stack trace goes to stderr, and the error is rethrown as IssException.
// NOTE(review): the original IOException is not passed to IssException,
// so the cause is not attached to the rethrown exception.
String msg = "Error parsing the inputstream";
if (LOGGER.isLoggable(Level.FINEST)) {
LOGGER.log(Level.FINEST, msg+ioe.getMessage());
}
ioe.printStackTrace();
throw new IssException(msg,
IssException.Reason.INDEX_DOCUMENT_FAILURE);
} finally {
// Cleanup runs on both success and failure paths.
// NOTE(review): presumably releases cached font resources - confirm
// against the PDFont implementation.
PDFont.clearResources();
try {
// pdfDocument is still null if parsing failed before it was assigned.
if (null != pdfDocument) {
pdfDocument.close();
}
} catch (IOException ioe) {
// Close failures are logged at FINEST and otherwise ignored.
String msg = "Unable to close pdfDocument";
if (LOGGER.isLoggable(Level.FINEST)) {
LOGGER.log(Level.FINEST, msg+ioe.getMessage());
}
} finally {
// The input stream is closed in a nested finally so that it is attempted
// even when closing the document throws.
try {
if (isr != null) {
isr.close();
}
} catch (IOException ioe) {
// Close failures are logged at FINEST and otherwise ignored.
String msg = "Unable to close the Input Stream";
if (LOGGER.isLoggable(Level.FINEST)) {
LOGGER.log(Level.FINEST, msg+ioe.getMessage());
}
}
}
}
// Reached only when no IssException was thrown above.
return contents;
> java.io.IOException: Error: expected hex character and not :32
> ---------------------------------------------------------------
>
> Key: PDFBOX-372
> URL: https://issues.apache.org/jira/browse/PDFBOX-372
> Project: PDFBox
> Issue Type: Bug
> Components: Text extraction
> Affects Versions: 0.7.3
> Environment: Solaris OS JDK 6
> Reporter: DURGA DEEP
> Fix For: 0.7.3
>
> Attachments: Webmail02.pdf
>
>
> Unable to parse the following PDF Attachment.
> java.io.IOException: Error: expected hex character and not :32
> at org.fontbox.cmap.CMapParser.parseNextToken(CMapParser.java:283)
> at org.fontbox.cmap.CMapParser.parse(CMapParser.java:105)
> at org.pdfbox.pdmodel.font.PDFont.parseCmap(PDFont.java:535)
> at org.pdfbox.pdmodel.font.PDFont.encode(PDFont.java:387)
> at org.pdfbox.util.PDFStreamEngine.showString(PDFStreamEngine.java:325)
> at org.pdfbox.util.operator.ShowText.process(ShowText.java:64)
> at org.pdfbox.util.PDFStreamEngine.processOperator(PDFStreamEngine.java:452)
> at org.pdfbox.util.PDFStreamEngine.processSubStream(PDFStreamEngine.java:215)
> at org.pdfbox.util.PDFStreamEngine.processStream(PDFStreamEngine.java:174)
> at org.pdfbox.util.PDFTextStripper.processPage(PDFTextStripper.java:336)
> at org.pdfbox.util.PDFTextStripper.processPages(PDFTextStripper.java:259)
> at org.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:216)
> at org.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:149)
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.