You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/07/04 23:41:50 UTC
svn commit: r1142817 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/prt/
main/java/org/apache/tika/parser/prt/PRTParser.java
test/java/org/apache/tika/parser/prt/
test/java/org/apache/tika/parser/prt/PRTParserTest.java
Author: nick
Date: Mon Jul 4 21:41:50 2011
New Revision: 1142817
URL: http://svn.apache.org/viewvc?rev=1142817&view=rev
Log:
TIKA-679 CADKey PRT parser
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1142817&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Mon Jul 4 21:41:50 2011
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.prt;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.util.zip.InflaterOutputStream;
+
+import org.apache.poi.util.IOUtils;
+import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs the text of view names and note entries, and
+ * records the creation date from the file header when one is present.
+ *
+ * The text records are located heuristically, by scanning the stream
+ * for the byte patterns described in the "Text types" comment below.
+ */
+public class PRTParser extends AbstractParser {
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    /**
+     * Returns the single media type handled by this parser.
+     *
+     * @param context the parse context (not used)
+     * @return a set containing only application/x-prt
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /** Longest text record, in bytes, that is accepted as plausible. */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+
+    /*
+     * Text types:
+     *   00 00 00 00 f0 [3b]f sz sz TEXT  *view name*
+     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
+     *   (anything) e0 3f sz sz TEXT  *view name*
+     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT  *note entries*
+     *
+     * Note - all text is null terminated
+     */
+
+    /**
+     * Extracts the creation date and any view name / note text from a
+     * CADKey PRT stream, emitting each piece of text as an XHTML
+     * paragraph.
+     *
+     * @param stream   the PRT document to read
+     * @param handler  receives the generated XHTML SAX events
+     * @param metadata receives the content type and, if found, the dates
+     * @param context  the parse context (not used)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException declared for the parser contract
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+            ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        Last5 l5 = new Last5();
+        int read;
+
+        // Try to get the creation date, which is YYYYMMDDhhmm
+        // and sits directly after the first 30 bytes of the file
+        byte[] header = new byte[30];
+        IOUtils.readFully(stream, header);
+        byte[] date = new byte[12];
+        IOUtils.readFully(stream, date);
+
+        String dateStr = new String(date, "ASCII");
+        if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
+            // Reformat as YYYY-MM-DDThh:mm:00 (the file holds no seconds)
+            String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
+                "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
+                dateStr.substring(10, 12) + ":00";
+            metadata.set(Metadata.CREATION_DATE, formattedDate);
+            metadata.set(Metadata.DATE, formattedDate);
+        }
+        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+        // Open the document only once the metadata is complete, and
+        // before the first paragraph is written
+        xhtml.startDocument();
+
+        // Now look for text
+        while( (read = stream.read()) > -1) {
+            if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
+                int nread = stream.read();
+                if(nread == 0x3f || nread == 0xbf) {
+                    // Looks promising, check back for a suitable value
+                    if(read == 0xe3 && nread == 0x3f) {
+                        if(l5.is33()) {
+                            // Bingo, note text
+                            handleNoteText(stream, xhtml);
+                        }
+                    } else if(l5.is00()) {
+                        // Likely view name
+                        handleViewName(read, nread, stream, xhtml, l5);
+                    }
+                }
+            } else {
+                // Only non-marker bytes are remembered for the look-back
+                l5.record(read);
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Validates the bytes that follow a note marker and, if they look
+     * right, outputs the note text.
+     *
+     * Expects 10 bytes in the range 0x00-0x0f, then 0x1f, then a
+     * little endian 16 bit length, then the text itself.
+     */
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        // Ensure we have the right padding text
+        int read;
+        for(int i=0; i<10; i++) {
+            read = stream.read();
+            if(read < 0 || read > 0x0f) {
+                // Wrong (or end of stream), false detection
+                return;
+            }
+        }
+        read = stream.read();
+        if(read != 0x1f) {
+            // Wrong, false detection
+            return;
+        }
+
+        int length = LittleEndian.readUShort(stream);
+        if(length <= MAX_SANE_TEXT_LENGTH) {
+            // Length sanity check passed (handleText rejects empty text)
+            handleText(length, stream, xhtml);
+        }
+    }
+
+    /**
+     * Outputs the view name that follows a view name marker, if the
+     * following bytes look like one.
+     *
+     * The 16 bit value after the marker is either the text length, or
+     * zero when the marker is followed by 8 bytes of padding and then
+     * the length.
+     *
+     * @param typeA first marker byte (currently unused)
+     * @param typeB second marker byte (currently unused)
+     * @param l5    look-back buffer, updated when the bytes consumed here
+     *              turn out not to be a length
+     */
+    private void handleViewName(int typeA, int typeB, InputStream stream,
+            XHTMLContentHandler xhtml, Last5 l5)
+            throws IOException, SAXException, TikaException {
+        // Is it 8 byte zero padded?
+        int maybeLength = LittleEndian.readUShort(stream);
+        if(maybeLength == 0) {
+            // Check the next 6 bytes too
+            for(int i=0; i<6; i++) {
+                int read = stream.read();
+                if(read < 0 || read > 0x0f) {
+                    // Wrong (or end of stream), false detection
+                    return;
+                }
+            }
+
+            byte[] b2 = new byte[2];
+            IOUtils.readFully(stream, b2);
+            int length = LittleEndian.getUShort(b2);
+            if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+                // Length sanity check passed
+                handleText(length, stream, xhtml);
+            } else {
+                // Was probably something else
+                l5.record(b2[0]);
+                l5.record(b2[1]);
+            }
+        } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+            // Looks like it's straight into the text
+            handleText(maybeLength, stream, xhtml);
+        }
+    }
+
+    /**
+     * Reads a null terminated string of the given total length (the
+     * terminator included) and outputs it as a paragraph. Nothing is
+     * output if the length is not positive, if the stream ends before
+     * the whole string has been read, or if the last byte is not null.
+     *
+     * @param length number of bytes to read, including the terminator
+     */
+    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        if(length < 1) {
+            // No room even for the terminator, so there is no text here
+            return;
+        }
+
+        byte[] str = new byte[length];
+        if(IOUtils.readFully(stream, str) < length) {
+            // Stream ended early; the zero filled tail of the buffer
+            //  must not be mistaken for a terminator
+            return;
+        }
+        if(str[length-1] != 0) {
+            // Not properly null terminated, must be wrong
+            return;
+        }
+
+        // TODO Is this the right character set?
+        String text = new String(str, 0, length-1, "UTF-8");
+
+        xhtml.startElement("p");
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Remembers the 5 most recently recorded bytes. Before 5 bytes
+     * have been recorded the unused slots hold zero.
+     */
+    private static class Last5 {
+        private final byte[] data = new byte[5];
+        private int pos = 0;
+
+        /** Stores a byte, overwriting the oldest one held. */
+        private void record(int b) {
+            data[pos] = (byte)b;
+            pos = (pos + 1) % data.length;
+        }
+
+        /** Are all 5 remembered bytes equal to the given value? */
+        private boolean isAll(int value) {
+            for(byte b : data) {
+                if(b != value) return false;
+            }
+            return true;
+        }
+
+        /** Were the last 5 bytes all 0x33? */
+        private boolean is33() {
+            return isAll(0x33);
+        }
+
+        /** Were the last 5 bytes all 0x00? */
+        private boolean is00() {
+            return isAll(0x00);
+        }
+    }
+}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1142817&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java Mon Jul 4 21:41:50 2011
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Checks that {@link PRTParser} extracts the content type, the
+ * creation date and the expected text from the sample CADKey file.
+ */
+public class PRTParserTest extends TestCase {
+    private static final String TEST_FILE = "/test-documents/testCADKEY.prt";
+
+    /**
+     * Parses the sample PRT file and verifies its metadata, view names
+     * and note text.
+     */
+    public void testPRTParser() throws Exception {
+        InputStream input = PRTParserTest.class.getResourceAsStream(TEST_FILE);
+        // Fail with a clear message rather than a NullPointerException
+        //  if the sample file is missing from the classpath
+        assertNotNull("Test document " + TEST_FILE + " not found", input);
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new PRTParser().parse(input, handler, metadata);
+
+            assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+            // Both date properties are populated from the header date
+            assertEquals("2011-06-20T16:54:00",
+                    metadata.get(Metadata.DATE));
+            assertEquals("2011-06-20T16:54:00",
+                    metadata.get(Metadata.CREATION_DATE));
+
+            String contents = handler.toString();
+
+            // View names
+            assertContains("Front View", contents);
+            assertContains("Back View", contents);
+            assertContains("Bottom View", contents);
+            assertContains("Right View", contents);
+            assertContains("Left View", contents);
+            //assertContains("Isometric View", contents); // Can't detect yet
+            assertContains("Axonometric View", contents);
+
+            // Note entries
+            assertContains("You've managed to extract all the text!", contents);
+            assertContains("This is more text", contents);
+            assertContains("Text Inside a PRT file", contents);
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Asserts that the haystack contains the needle, reporting the
+     * whole haystack on failure.
+     */
+    public void assertContains(String needle, String haystack) {
+        assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
+    }
+}
Re: svn commit: r1142817 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/prt/ main/java/org/apache/tika/parser/prt/PRTParser.java
test/java/org/apache/tika/parser/prt/ test/java/org/apache/tika/parser/prt/PRTParserTest.java
Posted by Jukka Zitting <ju...@gmail.com>.
Hi,
On Mon, Jul 4, 2011 at 11:41 PM, <ni...@apache.org> wrote:
> Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
> URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1142817&view=auto
> ==============================================================================
> --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (added)
> +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Mon Jul 4 21:41:50 2011
> @@ -0,0 +1,208 @@
> +package org.apache.tika.parser.prt;
That one needs the Apache license header.
BR,
Jukka Zitting