You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@pdfbox.apache.org by "Andreas Lehmkühler (Jira)" <ji...@apache.org> on 2022/10/26 06:17:00 UTC
[jira] [Updated] (PDFBOX-5532) COSString field non-ascii characters

     [ https://issues.apache.org/jira/browse/PDFBOX-5532?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Andreas Lehmkühler updated PDFBOX-5532:
---------------------------------------
    Description: 
 
Hello,

I am reading a PDF document, but the COSString tokens are being retrieved with non-ASCII characters. What could be the cause? I am using version pdfbox-2.0.24.jar.

This would be an example of the pdf document parsed:
{code}
COSInt\{50} 
COSInt\{0} 
PDFOperator\{Td} 
COSString\{åÅÕãÁâ@} 
PDFOperator\{Tj} 
COSFloat\{770.18} 
COSInt\{0} 
PDFOperator\{Td} 
COSString\{×–Ž–©@} 
PDFOperator\{Tj} 
COSFloat\{520.21} 
COSInt\{0}
{code}


Java function:

{code}
 /**
  * Replaces the string operand of every {@code Tj} operator whose decoded value equals
  * {@code searchString} with {@code replacement}, then rewrites each page's content stream
  * from the (possibly modified) token list.
  *
  * <p>NOTE(review): {@code COSString.getString()} returns the operand's bytes decoded as a
  * plain string; it does not apply the encoding of the font selected in the content stream.
  * For fonts with a custom encoding the decoded value will therefore look like arbitrary
  * non-ASCII characters and will not equal the visible text - confirm against the PDF
  * specification's description of text-showing operators and font encodings.
  *
  * @param document     the document whose pages are scanned and rewritten in place
  * @param searchString the exact decoded operand value to look for
  * @param replacement  the text whose {@code COSString} bytes overwrite a matching operand
  * @return the same {@code document} instance that was passed in
  * @throws IOException if parsing a page's content stream or writing the new one fails
  */
 public static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
     PDPageTree pages = document.getDocumentCatalog().getPages();
     for (PDPage page : pages) {
         PDFStreamParser parser = new PDFStreamParser(page);
         parser.parse();
         List<Object> tokens = parser.getTokens();

         // Start at 1: a Tj at index 0 has no preceding operand to inspect.
         for (int j = 1; j < tokens.size(); j++) {
             Object next = tokens.get(j);
             if (!(next instanceof Operator)) {
                 continue;
             }
             Operator op = (Operator) next;
             if (!"Tj".equals(op.getName())) {
                 continue;
             }

             // Only treat the preceding token as the operand when it really is a string;
             // a malformed stream must not abort the whole document with a ClassCastException.
             Object operand = tokens.get(j - 1);
             if (!(operand instanceof COSString)) {
                 continue;
             }
             COSString previous = (COSString) operand;
             String string = previous.getString();
             System.out.println("previous:=" + string);

             if (string.equals(searchString)) {
                 previous.setValue(new COSString(replacement).getBytes());
             }
         }

         // now that the tokens are updated we will replace the page content stream.
         PDStream updatedStream = new PDStream(document);
         // try-with-resources closes the stream even if writeTokens throws.
         try (OutputStream out = updatedStream.createOutputStream()) {
             ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
             tokenWriter.writeTokens(tokens);
         }
         page.setContents(updatedStream);
     }
     return document;
 }
{code}


  was:
 
Hello,

I am reading a pdf document but in the COSString field non-ascii characters are being retrieved. What can be the motive? I am using version pdfbox-2.0.24.jar

This would be an example of the pdf document parsed:

COSInt\{50} 
COSInt\{0} 
PDFOperator\{Td} 
COSString\{åÅÕãÁâ@} 
PDFOperator\{Tj} 
COSFloat\{770.18} 
COSInt\{0} 
PDFOperator\{Td} 
COSString\{×–Ž–©@} 
PDFOperator\{Tj} 
COSFloat\{520.21} 
COSInt\{0}


Function java:

 /**
  * Replaces the string operand of each {@code Tj} operator whose decoded value equals
  * {@code searchString} with {@code replacement}, then replaces every page's content
  * stream with one written from the token list.
  *
  * @param document     the document whose pages are scanned and rewritten in place
  * @param searchString the exact value a {@code Tj} operand's {@code getString()} must equal
  * @param replacement  the text whose {@code COSString} bytes overwrite a matching operand
  * @return the same {@code document} instance that was passed in
  * @throws IOException declared by the method; presumably raised while parsing or writing a stream
  */
 public static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
		 	    
		    PDPageTree pages = document.getDocumentCatalog().getPages();
		    for (PDPage page : pages) {
		    			    	
		        // Parse this page's content stream into a flat list of operands and operators.
		        PDFStreamParser parser = new PDFStreamParser(page);
		        parser.parse();
		        List tokens = parser.getTokens();
		        for (int j = 0; j < tokens.size(); j++) {
		            Object next = tokens.get(j);
		           
		            if (next instanceof Operator) {
		                Operator op = (Operator) next;
		             
		                 // Tj operator: the token immediately before it is taken as its string operand.
		                 if (op.getName().equals("Tj")) {
		                    // Unchecked index and cast: fails if j is 0 or the preceding token is not a COSString.
		                    COSString previous = (COSString) tokens.get(j - 1);		                 
		                    String string = previous.getString();
		                    // Debug output of the decoded operand.
		                    System.out.println("previous:=" + string);
		                                
		                
		                    // Exact match only; the operand's bytes are overwritten in place.
		                    if (string.equals(searchString)){
		                    	 COSString sx = new COSString(replacement);		
		                    	previous.setValue(sx.getBytes());
		                    	
		                    }
		                }
		            }
		        }
		        // now that the tokens are updated we will replace the page content stream.
		        PDStream updatedStream = new PDStream(document);
		        OutputStream out = updatedStream.createOutputStream();
		        ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
		        tokenWriter.writeTokens(tokens);
		        page.setContents(updatedStream);
		        // NOTE(review): out is not closed if writeTokens throws; try-with-resources would cover that.
		        out.close();
		        
		        
		    }
		    return document;
		}
	 


> COSString field non-ascii characters
> ------------------------------------
>
>                 Key: PDFBOX-5532
>                 URL: https://issues.apache.org/jira/browse/PDFBOX-5532
>             Project: PDFBox
>          Issue Type: Bug
>            Reporter: David
>            Priority: Major
>
>  
> Hello,
> I am reading a PDF document, but the COSString tokens are being retrieved with non-ASCII characters. What could be the cause? I am using version pdfbox-2.0.24.jar.
> This would be an example of the pdf document parsed:
> {code}
> COSInt\{50} 
> COSInt\{0} 
> PDFOperator\{Td} 
> COSString\{åÅÕãÁâ@} 
> PDFOperator\{Tj} 
> COSFloat\{770.18} 
> COSInt\{0} 
> PDFOperator\{Td} 
> COSString\{×–Ž–©@} 
> PDFOperator\{Tj} 
> COSFloat\{520.21} 
> COSInt\{0}
> {code}
> Java function:
> {code}
>  public static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException { // replaces Tj operands equal to searchString, rewrites each page's content stream, returns the same document
> 		 	    
> 		    PDPageTree pages = document.getDocumentCatalog().getPages();
> 		    for (PDPage page : pages) {
> 		    			    	
> 		        PDFStreamParser parser = new PDFStreamParser(page); // parses this page's content stream
> 		        parser.parse();
> 		        List tokens = parser.getTokens(); // flat list of operands and operators
> 		        for (int j = 0; j < tokens.size(); j++) {
> 		            Object next = tokens.get(j);
> 		           
> 		            if (next instanceof Operator) {
> 		                Operator op = (Operator) next;
> 		             
> 		                 if (op.getName().equals("Tj")) { // the token right before Tj is taken as its string operand
> 		                    COSString previous = (COSString) tokens.get(j - 1);		                  // unchecked: fails if j is 0 or the token is not a COSString
> 		                    String string = previous.getString();
> 		                    System.out.println("previous:=" + string); // debug output of the decoded operand
> 		                                
> 		                
> 		                    if (string.equals(searchString)){ // exact match only
> 		                    	 COSString sx = new COSString(replacement);		
> 		                    	previous.setValue(sx.getBytes()); // overwrite the operand's bytes in place
> 		                    	
> 		                    }
> 		                }
> 		            }
> 		        }
> 		        // now that the tokens are updated we will replace the page content stream.
> 		        PDStream updatedStream = new PDStream(document);
> 		        OutputStream out = updatedStream.createOutputStream();
> 		        ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
> 		        tokenWriter.writeTokens(tokens);
> 		        page.setContents(updatedStream);
> 		        out.close(); // NOTE(review): not reached if writeTokens throws
> 		        
> 		        
> 		    }
> 		    return document;
> 		}
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@pdfbox.apache.org
For additional commands, e-mail: dev-help@pdfbox.apache.org