You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "Troy Witthoeft (Closed) (JIRA)" <ji...@apache.org> on 2011/11/08 19:19:51 UTC

[jira] [Closed] (TIKA-679) Proposal for PRT Parser

     [ https://issues.apache.org/jira/browse/TIKA-679?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Troy Witthoeft closed TIKA-679.
-------------------------------

    Resolution: Fixed
    
> Proposal for PRT Parser
> -----------------------
>
>                 Key: TIKA-679
>                 URL: https://issues.apache.org/jira/browse/TIKA-679
>             Project: Tika
>          Issue Type: Improvement
>          Components: mime, parser
>            Reporter: Troy Witthoeft
>            Priority: Minor
>              Labels: CAD, Mime, Parser, Prt, Tika
>         Attachments: PRTParser.patch, PRTParser.patch, PRTParser.patch, PRTParserTest.patch, TikaTest.prt, TikaTest2.prt, TikaTest2.prt.txt
>
>   Original Estimate: 672h
>  Remaining Estimate: 672h
>
> It would be nice if Tika had support for prt CAD files.
> A preliminary prt text extractor has been created.
> Any assistance further developing this code is appreciated.
> {code:title=PRTParser.java|borderStyle=solid}
> package org.apache.tika.parser.prt;
> import java.io.BufferedInputStream;
> import java.io.BufferedReader;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
> import java.io.UnsupportedEncodingException;
> import java.nio.charset.Charset;
> import java.util.Collections;
> import java.util.Set;
> import org.apache.poi.util.IOUtils;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> /**
>  * Description: PRT (CAD Drawing) parser. This is a very basic parser.   
>  * Searches for specific byte prefix, and outputs text from note entities.
>  * Does not support special characters.
>  */
>  
> public class PRTParser implements Parser {
>     private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("prt"));
>     public static final String PRT_MIME_TYPE = "application/prt";
>         	
>     public Set<MediaType> getSupportedTypes(ParseContext context) {
>         return SUPPORTED_TYPES;
>         }
> 		
>     public void parse(
> 		InputStream stream, ContentHandler handler,
> 		Metadata metadata, ParseContext context)
> 		throws IOException, SAXException, TikaException {
> 		XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
> 		
> 		int[] prefix = new int[] {227, 63};  				//Looking for a prefix set of bytes {E3, 3F} 
> 		int pos = 0;										//position inside the prefix
> 		int read;
> 		while( (read = stream.read()) > -1) {					// stream.read() moves to the next byte, and returns an integer value of the byte.  a value of -1 signals the EOF
> 			if(read == prefix[pos]) {								// is the last byte read the same as the first byte in the prefix?
> 				pos++;													
> 					if(pos == prefix.length) {								//Are we at the last position of the prefix?
> 						stream.skip(11);										// skip the 11 bytes of the prefix which can vary.
> 						int lengthbyte = stream.read();							// Set the next byte equal to the length of text in the user input field, see PRT schema
> 						stream.skip(1);											
> 						byte[] text = new byte[lengthbyte];						// a new byte array called text is created.  It should contain an array of integer values of the user inputted text.
> 						IOUtils.readFully(stream, text);						
> 						String str = new String(text, 0, text.length, "Cp437");	// Cp437 turn it into a string, but does not remove null termination, assumes it's found to be MS-DOS Encoding
> 						str = str.replace("\u03C6","\u00D8");					// Note: Substitute CP437's lowercase "phi" for Nordic "O with slash" to represent diameter symbol. 
> 						metadata.add("Content",str);
> 						xhtml.startElement("p");	
> 						xhtml.characters(str);
> 						xhtml.endElement("p");
> 						pos = 0; 							
> 					}
> 			} 
> 			else {
> 				//Did not find the prefix. Reset the position counter.
> 				pos = 0;
> 			}
> 		}
> 	//Reached the end of file
> 	//System.out.println("Finished searching the file");	
> 	}
> 		
> 	/**
>     * @deprecated This method will be removed in Apache Tika 1.0.
>     */
>     public void parse(
>                    InputStream stream, ContentHandler handler, Metadata metadata)
>                    throws IOException, SAXException, TikaException {
>                 parse(stream, handler, metadata, new ParseContext());
>     }
> }{code}   
>  

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators: https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira