You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/24 15:44:43 UTC
[1/4] tika git commit: fix indents and whitespace
Repository: tika
Updated Branches:
refs/heads/2.x 1ce93ed9e -> cd12917fa
fix indents and whitespace
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/76744261
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/76744261
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/76744261
Branch: refs/heads/2.x
Commit: 767442614756b51b64427e663a2af1f6b6ac0bff
Parents: 1ce93ed
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:06:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:06:33 2016 -0400
----------------------------------------------------------------------
.../org/apache/tika/parser/prt/PRTParser.java | 554 ++++++++++---------
1 file changed, 279 insertions(+), 275 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/76744261/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 92e3503..24418b0 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -1,275 +1,279 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.prt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-
-/**
- * A basic text extracting parser for the CADKey PRT (CAD Drawing)
- * format. It outputs text from note entries.
- */
-
-public class PRTParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = 4659638314375035178L;
-
- private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
- public static final String PRT_MIME_TYPE = "application/x-prt";
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- /**
- * How long do we allow a text run to claim to be, before we
- * decide we're confused and it's not really text after all?
- */
- private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
-
- /*
- * Text types:
- * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
- * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
- * (anything) e0 3f sz sz TEXT *view name*
- * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
- *
- * Note - all text is null terminated
- */
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- Last5 l5 = new Last5();
- int read;
-
- // Try to get the creation date, which is YYYYMMDDhhmm
- byte[] header = new byte[30];
- IOUtils.readFully(stream, header);
- byte[] date = new byte[12];
- IOUtils.readFully(stream, date);
-
- String dateStr = new String(date, US_ASCII);
- if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
- String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
- "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
- dateStr.substring(10, 12) + ":00";
- metadata.set(TikaCoreProperties.CREATED, formattedDate);
- // TODO Metadata.DATE is used as modified, should it be here?
- metadata.set(Metadata.DATE, formattedDate);
- }
- metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
-
- // The description, if set, is the next up-to-500 bytes
- byte[] desc = new byte[500];
- IOUtils.readFully(stream, desc);
- String description = extractText(desc, true);
- if(description.length() > 0) {
- metadata.set(TikaCoreProperties.DESCRIPTION, description);
- }
-
- // Now look for text
- while( (read = stream.read()) > -1) {
- if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
- int nread = stream.read();
- if(nread == 0x3f || nread == 0xbf) {
- // Looks promising, check back for a suitable value
- if(read == 0xe3 && nread == 0x3f) {
- if(l5.is33()) {
- // Bingo, note text
- handleNoteText(stream, xhtml);
- }
- } else if(l5.is00()) {
- // Likely view name
- handleViewName(read, nread, stream, xhtml, l5);
- }
- }
- } else {
- l5.record(read);
- }
- }
- }
-
- private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- // Ensure we have the right padding text
- int read;
- for(int i=0; i<10; i++) {
- read = stream.read();
- if(read >= 0 && read <= 0x0f) {
- // Promising
- } else {
- // Wrong, false detection
- return;
- }
- }
- read = stream.read();
- if(read != 0x1f) {
- // Wrong, false detection
- return;
- }
-
- int length = EndianUtils.readUShortLE(stream);
- if(length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
- handleText(length, stream, xhtml);
- }
- }
-
- private void handleViewName(int typeA, int typeB, InputStream stream,
- XHTMLContentHandler xhtml, Last5 l5)
- throws IOException, SAXException, TikaException {
- // Is it 8 byte zero padded?
- int maybeLength = EndianUtils.readUShortLE(stream);
- if(maybeLength == 0) {
- // Check the next 6 bytes too
- for(int i=0; i<6; i++) {
- int read = stream.read();
- if(read >= 0 && read <= 0x0f) {
- // Promising
- } else {
- // Wrong, false detection
- return;
- }
- }
-
- byte[] b2 = new byte[2];
- IOUtils.readFully(stream, b2);
- int length = EndianUtils.getUShortLE(b2);
- if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
- handleText(length, stream, xhtml);
- } else {
- // Was probably something else
- l5.record(b2[0]);
- l5.record(b2[1]);
- }
- } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
- // Looks like it's straight into the text
- handleText(maybeLength, stream, xhtml);
- }
- }
-
- private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
- throws IOException, SAXException, TikaException {
- byte[] str = new byte[length];
- IOUtils.readFully(stream, str);
- if(str[length-1] != 0) {
- // Not properly null terminated, must be wrong
- return;
- }
-
- String text = extractText(str, false);
-
- xhtml.startElement("p");
- xhtml.characters(text);
- xhtml.endElement("p");
- }
-
- /**
- * Does our best to turn the bytes into text
- */
- private String extractText(byte[] data, boolean trim) throws TikaException {
- // The text is always stored null terminated, but sometimes
- // may have extra null padding too
- int length = data.length - 1;
- if(trim) {
- for(int i=0; i<data.length; i++) {
- if(data[i] == 0) {
- length = i;
- break;
- }
- }
- }
-
- // We believe that the text is basically stored as CP437
- // That said, there are a few characters slightly wrong for that...
- String text;
- try {
- text = new String(data, 0, length, "cp437");
- } catch(UnsupportedEncodingException e) {
- throw new TikaException("JVM Broken, core codepage CP437 missing!");
- }
-
- // Fix up the known character issues
- text = text.replace("\u03C6","\u00D8");
-
- // All done, as best as we can!
- return text;
- }
-
- /**
- * Provides a view on the previous 5 bytes
- */
- private static class Last5 {
- byte[] data = new byte[5];
- int pos = 0;
-
- private void record(int b) {
- data[pos] = (byte)b;
- pos++;
- if(pos >= data.length) {
- pos = 0;
- }
- }
-
- private byte[] get() {
- byte[] ret = new byte[5];
- for(int i=0; i<ret.length; i++) {
- int p = pos - i;
- if(p < 0) { p += ret.length; }
- ret[i] = data[p];
- }
- return ret;
- }
-
- private boolean is33() {
- byte[] last5 = get();
- for(byte b : last5) {
- if(b != 0x33) return false;
- }
- return true;
- }
-
- private boolean is00() {
- byte[] last5 = get();
- for(byte b : last5) {
- if(b != 0x00) return false;
- }
- return true;
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 4659638314375035178L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+ public static final String PRT_MIME_TYPE = "application/x-prt";
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * How long do we allow a text run to claim to be, before we
+ * decide we're confused and it's not really text after all?
+ */
+ private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+
+ /*
+ * Text types:
+ * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
+ * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
+ * (anything) e0 3f sz sz TEXT *view name*
+ * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
+ *
+ * Note - all text is null terminated
+ */
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ Last5 l5 = new Last5();
+ int read;
+
+ // Try to get the creation date, which is YYYYMMDDhhmm
+ byte[] header = new byte[30];
+ IOUtils.readFully(stream, header);
+ byte[] date = new byte[12];
+ IOUtils.readFully(stream, date);
+
+ String dateStr = new String(date, US_ASCII);
+ if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+ String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) +
+ "-" + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
+ dateStr.substring(10, 12) + ":00";
+ metadata.set(TikaCoreProperties.CREATED, formattedDate);
+ // TODO Metadata.DATE is used as modified, should it be here?
+ metadata.set(Metadata.DATE, formattedDate);
+ }
+ metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(stream, desc);
+ String description = extractText(desc, true);
+ if (description.length() > 0) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, description);
+ }
+
+ // Now look for text
+ while ((read = stream.read()) > -1) {
+ if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+ int nread = stream.read();
+ if (nread == 0x3f || nread == 0xbf) {
+ // Looks promising, check back for a suitable value
+ if (read == 0xe3 && nread == 0x3f) {
+ if (l5.is33()) {
+ // Bingo, note text
+ handleNoteText(stream, xhtml);
+ }
+ } else if (l5.is00()) {
+ // Likely view name
+ handleViewName(read, nread, stream, xhtml, l5);
+ }
+ }
+ } else {
+ l5.record(read);
+ }
+ }
+ }
+
+ private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ // Ensure we have the right padding text
+ int read;
+ for (int i = 0; i < 10; i++) {
+ read = stream.read();
+ if (read >= 0 && read <= 0x0f) {
+ // Promising
+ } else {
+ // Wrong, false detection
+ return;
+ }
+ }
+ read = stream.read();
+ if (read != 0x1f) {
+ // Wrong, false detection
+ return;
+ }
+
+ int length = EndianUtils.readUShortLE(stream);
+ if (length <= MAX_SANE_TEXT_LENGTH) {
+ // Length sanity check passed
+ handleText(length, stream, xhtml);
+ }
+ }
+
+ private void handleViewName(int typeA, int typeB, InputStream stream,
+ XHTMLContentHandler xhtml, Last5 l5)
+ throws IOException, SAXException, TikaException {
+ // Is it 8 byte zero padded?
+ int maybeLength = EndianUtils.readUShortLE(stream);
+ if (maybeLength == 0) {
+ // Check the next 6 bytes too
+ for (int i = 0; i < 6; i++) {
+ int read = stream.read();
+ if (read >= 0 && read <= 0x0f) {
+ // Promising
+ } else {
+ // Wrong, false detection
+ return;
+ }
+ }
+
+ byte[] b2 = new byte[2];
+ IOUtils.readFully(stream, b2);
+ int length = EndianUtils.getUShortLE(b2);
+ if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+ // Length sanity check passed
+ handleText(length, stream, xhtml);
+ } else {
+ // Was probably something else
+ l5.record(b2[0]);
+ l5.record(b2[1]);
+ }
+ } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+ // Looks like it's straight into the text
+ handleText(maybeLength, stream, xhtml);
+ }
+ }
+
+ private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ byte[] str = new byte[length];
+ IOUtils.readFully(stream, str);
+ if (str[length - 1] != 0) {
+ // Not properly null terminated, must be wrong
+ return;
+ }
+
+ String text = extractText(str, false);
+
+ xhtml.startElement("p");
+ xhtml.characters(text);
+ xhtml.endElement("p");
+ }
+
+ /**
+ * Does our best to turn the bytes into text
+ */
+ private String extractText(byte[] data, boolean trim) throws TikaException {
+ // The text is always stored null terminated, but sometimes
+ // may have extra null padding too
+ int length = data.length - 1;
+ if (trim) {
+ for (int i = 0; i < data.length; i++) {
+ if (data[i] == 0) {
+ length = i;
+ break;
+ }
+ }
+ }
+
+ // We believe that the text is basically stored as CP437
+ // That said, there are a few characters slightly wrong for that...
+ String text;
+ try {
+ text = new String(data, 0, length, "cp437");
+ } catch (UnsupportedEncodingException e) {
+ throw new TikaException("JVM Broken, core codepage CP437 missing!");
+ }
+
+ // Fix up the known character issues
+ text = text.replace("\u03C6", "\u00D8");
+
+ // All done, as best as we can!
+ return text;
+ }
+
+ /**
+ * Provides a view on the previous 5 bytes
+ */
+ private static class Last5 {
+ byte[] data = new byte[5];
+ int pos = 0;
+
+ private void record(int b) {
+ data[pos] = (byte) b;
+ pos++;
+ if (pos >= data.length) {
+ pos = 0;
+ }
+ }
+
+ private byte[] get() {
+ byte[] ret = new byte[5];
+ for (int i = 0; i < ret.length; i++) {
+ int p = pos - i;
+ if (p < 0) {
+ p += ret.length;
+ }
+ ret[i] = data[p];
+ }
+ return ret;
+ }
+
+ private boolean is33() {
+ byte[] last5 = get();
+ for (byte b : last5) {
+ if (b != 0x33) return false;
+ }
+ return true;
+ }
+
+ private boolean is00() {
+ byte[] last5 = get();
+ for (byte b : last5) {
+ if (b != 0x00) return false;
+ }
+ return true;
+ }
+ }
+}
[4/4] tika git commit: TIKA-2020 -- remove 3 parameter parse() and
simplify CAD tests
Posted by ta...@apache.org.
TIKA-2020 -- remove 3 parameter parse() and simplify CAD tests
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cd12917f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cd12917f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cd12917f
Branch: refs/heads/2.x
Commit: cd12917fad98e9164e79b5026a551b1eb58f516c
Parents: 0c71b2f
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:43:44 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:43:44 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 17 ++++++++++----
.../org/apache/tika/parser/AbstractParser.java | 24 --------------------
.../apache/tika/parser/prt/PRTParserTest.java | 2 +-
3 files changed, 13 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1c93618..6a6bca1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,13 +1,20 @@
-Release 2.0 - Future Development
+Release 2.0 - ???
- * Upgrade rome to 1.5.1 (TIKA-1820)
+ MAJOR CHANGES
- * The default LoadErrorHandler is now WARN, to alert you to missing
- parser classes and their dependencies. To keep the old behaviour,
- set your LoadErrorHandler to IGNORE. (TIKA-1805)
+ * Remove 3 parameter parse option from AbstractParser (TIKA-2020).
+ Clients will now need to include a ParseContext.
* (Something about more specific parser bundles, plus an overall one)
+ MINOR CHANGES
+
+ * Upgrade rome to 1.5.1 (TIKA-1820)
+
+ * The default LoadErrorHandler is now WARN, to alert you to missing
+ parser classes and their dependencies. To keep the old behaviour,
+ set your LoadErrorHandler to IGNORE. (TIKA-1805)
+
Release 1.14 - ???
* Add mime types, mime magic and/or globs for:
http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index a4c7719..c478c75 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -16,14 +16,6 @@
*/
package org.apache.tika.parser;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
/**
* Abstract base class for new parsers. This method implements the old
* deprecated parse method so subclasses won't have to.
@@ -38,22 +30,6 @@ public abstract class AbstractParser implements Parser {
private static final long serialVersionUID = 7186985395903074255L;
/**
- * Calls the
- * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
- * method with an empty {@link ParseContext}. This method exists as a
- * leftover from Tika 0.x when the three-argument parse() method still
- * existed in the {@link Parser} interface. No new code should call this
- * method anymore, it's only here for backwards compatibility.
- *
- * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method instead
- */
- public void parse(
- InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, SAXException, TikaException {
- parse(stream, handler, metadata, new ParseContext());
- }
-
- /**
* Convenience method for creating ParserProxy instances
* with the current class' ClassLoader
*
http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
index de870ed..3de7d75 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
@@ -29,7 +29,7 @@ public class PRTParserTest extends TikaTest {
*/
@Test
public void testPRTParserBasics() throws Exception {
- XMLResult r = getXML("testCADKey.prt", new PRTParser());
+ XMLResult r = getXML("testCADKEY.prt", new PRTParser());
Metadata metadata = r.metadata;
String contents = r.xml;
assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
[3/4] tika git commit: TIKA-2020,
remove 3 parameter parse() and simplify CAD tests
Posted by ta...@apache.org.
TIKA-2020, remove 3 parameter parse() and simplify CAD tests
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0c71b2ff
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0c71b2ff
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0c71b2ff
Branch: refs/heads/2.x
Commit: 0c71b2ffc97a3907a541fdd164ba79302f5c0637
Parents: 6bb6827
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:13:54 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:13:54 2016 -0400
----------------------------------------------------------------------
.../apache/tika/parser/dwg/DWGParserTest.java | 372 +++++++++----------
.../apache/tika/parser/prt/PRTParserTest.java | 214 +++++------
2 files changed, 271 insertions(+), 315 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 321d715..ee3e767 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -1,202 +1,170 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.dwg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.apache.tika.TikaTest.assertContains;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class DWGParserTest {
-
- @Test
- public void testDWG2000Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2000.dwg");
- testParserAlt(input);
- }
-
- @Test
- public void testDWG2004Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2004.dwg");
- testParser(input);
- }
-
- @Test
- public void testDWG2004ParserNoHeaderAddress() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2004_no_header.dwg");
- testParserNoHeader(input);
- }
-
- @Test
- public void testDWG2007Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2007.dwg");
- testParser(input);
- }
-
- @Test
- public void testDWG2010Parser() throws Exception {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010.dwg");
- testParser(input);
- }
-
- @Test
- public void testDWG2010CustomPropertiesParser() throws Exception {
- // Check that standard parsing works
- InputStream testInput = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010_custom_props.dwg");
- testParser(testInput);
-
- // Check that custom properties with alternate padding work
- try (InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWG2010_custom_props.dwg")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata, null);
-
- assertEquals("valueforcustomprop1",
- metadata.get("customprop1"));
- assertEquals("valueforcustomprop2",
- metadata.get("customprop2"));
- }
- }
-
- @Test
- public void testDWGMechParser() throws Exception {
- String[] types = new String[] {
- "6", "2004", "2004DX", "2005", "2006",
- "2007", "2008", "2009", "2010", "2011"
- };
- for (String type : types) {
- InputStream input = DWGParserTest.class.getResourceAsStream(
- "/test-documents/testDWGmech"+type+".dwg");
- testParserAlt(input);
- }
- }
-
- @SuppressWarnings("deprecation")
- private void testParser(InputStream input) throws Exception {
- try {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
-
- assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
- assertEquals("The quick brown fox jumps over the lazy dog",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(Metadata.SUBJECT));
- assertEquals("Nevin Nollop",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Pangram, fox, dog",
- metadata.get(TikaCoreProperties.KEYWORDS));
- assertEquals("Lorem ipsum",
- metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
- assertEquals("http://www.alfresco.com",
- metadata.get(TikaCoreProperties.RELATION));
-
- // Check some of the old style metadata too
- assertEquals("The quick brown fox jumps over the lazy dog",
- metadata.get(Metadata.TITLE));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(Metadata.SUBJECT));
-
- String content = handler.toString();
- assertContains("The quick brown fox jumps over the lazy dog", content);
- assertContains("Gym class", content);
- assertContains("www.alfresco.com", content);
- } finally {
- input.close();
- }
- }
-
- @SuppressWarnings("deprecation")
- private void testParserNoHeader(InputStream input) throws Exception {
- try {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
-
- assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
- assertNull(metadata.get(TikaCoreProperties.TITLE));
- assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
- assertNull(metadata.get(Metadata.SUBJECT));
- assertNull(metadata.get(TikaCoreProperties.CREATOR));
- assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
- assertNull(metadata.get(TikaCoreProperties.COMMENTS));
- assertNull(metadata.get(TikaCoreProperties.RELATION));
-
- String content = handler.toString();
- assertEquals("", content);
- } finally {
- input.close();
- }
- }
-
- @SuppressWarnings("deprecation")
- private void testParserAlt(InputStream input) throws Exception {
- try {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
-
- assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
- assertEquals("Test Title",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test Subject",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Test Subject",
- metadata.get(Metadata.SUBJECT));
- assertEquals("My Author",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("My keyword1, MyKeyword2",
- metadata.get(TikaCoreProperties.KEYWORDS));
- assertEquals("This is a comment",
- metadata.get(TikaCoreProperties.COMMENTS));
- assertEquals("bejanpol",
- metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("bejanpol",
- metadata.get(Metadata.LAST_AUTHOR));
- assertEquals("http://mycompany/drawings",
- metadata.get(TikaCoreProperties.RELATION));
- assertEquals("MyCustomPropertyValue",
- metadata.get("MyCustomProperty"));
-
- String content = handler.toString();
- assertContains("This is a comment", content);
- assertContains("mycompany", content);
- } finally {
- input.close();
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class DWGParserTest extends TikaTest {
+
+ @Test
+ public void testDWG2000Parser() throws Exception {
+ testParserAlt("testDWG2000.dwg");
+ }
+
+ @Test
+ public void testDWG2004Parser() throws Exception {
+ testParser("testDWG2004.dwg");
+ }
+
+ @Test
+ public void testDWG2004ParserNoHeaderAddress() throws Exception {
+ testParserNoHeader("testDWG2004_no_header.dwg");
+ }
+
+ @Test
+ public void testDWG2007Parser() throws Exception {
+ testParser("testDWG2007.dwg");
+ }
+
+ @Test
+ public void testDWG2010Parser() throws Exception {
+ testParser("testDWG2010.dwg");
+ }
+
+ @Test
+ public void testDWG2010CustomPropertiesParser() throws Exception {
+ // Check that standard parsing works
+ testParser("testDWG2010_custom_props.dwg");
+
+ // Check that custom properties with alternate padding work
+
+ XMLResult r = getXML("testDWG2010_custom_props.dwg");
+ assertEquals("valueforcustomprop1",
+ r.metadata.get("customprop1"));
+ assertEquals("valueforcustomprop2",
+ r.metadata.get("customprop2"));
+ }
+
+ @Test
+ public void testDWGMechParser() throws Exception {
+ String[] types = new String[]{
+ "6", "2004", "2004DX", "2005", "2006",
+ "2007", "2008", "2009", "2010", "2011"
+ };
+ for (String type : types) {
+ testParserAlt("testDWGmech" + type + ".dwg");
+ }
+ }
+
+ private void testParser(String testFileName) throws Exception {
+ XMLResult r = getXML(testFileName, new DWGParser());
+ Metadata metadata = r.metadata;
+ String content = r.xml;
+
+
+ assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("Nevin Nollop",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Pangram, fox, dog",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("Lorem ipsum",
+ metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
+ assertEquals("http://www.alfresco.com",
+ metadata.get(TikaCoreProperties.RELATION));
+
+ // Check some of the old style metadata too
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(Metadata.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+
+ assertContains("The quick brown fox jumps over the lazy dog", content);
+ assertContains("Gym class", content);
+ assertContains("www.alfresco.com", content);
+ }
+
+ @SuppressWarnings("deprecation")
+ private void testParserNoHeader(String testFileName) throws Exception {
+ try (InputStream input = getResourceAsStream("/test-documents/" + testFileName)) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DWGParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+ assertNull(metadata.get(TikaCoreProperties.TITLE));
+ assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertNull(metadata.get(Metadata.SUBJECT));
+ assertNull(metadata.get(TikaCoreProperties.CREATOR));
+ assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
+ assertNull(metadata.get(TikaCoreProperties.COMMENTS));
+ assertNull(metadata.get(TikaCoreProperties.RELATION));
+
+ String content = handler.toString();
+ assertEquals("", content);
+ }
+ }
+
+ private void testParserAlt(String testFileName) throws Exception {
+ XMLResult r = getXML(testFileName, new DWGParser());
+ Metadata metadata = r.metadata;
+ String content = r.xml;
+
+ assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals("Test Title",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test Subject",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Test Subject",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("My Author",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("My keyword1, MyKeyword2",
+ metadata.get(TikaCoreProperties.KEYWORDS));
+ assertEquals("This is a comment",
+ metadata.get(TikaCoreProperties.COMMENTS));
+ assertEquals("bejanpol",
+ metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("bejanpol",
+ metadata.get(Metadata.LAST_AUTHOR));
+ assertEquals("http://mycompany/drawings",
+ metadata.get(TikaCoreProperties.RELATION));
+ assertEquals("MyCustomPropertyValue",
+ metadata.get("MyCustomProperty"));
+ assertContains("This is a comment", content);
+ assertContains("mycompany", content);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
index 155512c..de870ed 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
@@ -1,113 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.prt;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PRTParserTest extends TikaTest {
- /**
- * Try with a simple file
- */
- @Test
- public void testPRTParserBasics() throws Exception {
- try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new PRTParser().parse(input, handler, metadata);
-
- assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
- // This file has a date
- assertEquals("2011-06-20T16:54:00",
- metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2011-06-20T16:54:00",
- metadata.get(Metadata.CREATION_DATE));
- // But no description
- assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
-
- String contents = handler.toString();
-
- assertContains("Front View", contents);
- assertContains("Back View", contents);
- assertContains("Bottom View", contents);
- assertContains("Right View", contents);
- assertContains("Left View", contents);
- //assertContains("Isometric View", contents); // Can't detect yet
- assertContains("Axonometric View", contents);
-
- assertContains("You've managed to extract all the text!", contents);
- assertContains("This is more text", contents);
- assertContains("Text Inside a PRT file", contents);
- }
- }
-
- /**
- * Now a more complex one
- */
- @Test
- public void testPRTParserComplex() throws Exception {
- try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new PRTParser().parse(input, handler, metadata);
-
- assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
- // File has both a date and a description
- assertEquals("1997-04-01T08:59:00",
- metadata.get(Metadata.DATE));
- assertEquals("1997-04-01T08:59:00",
- metadata.get(Metadata.CREATION_DATE));
- assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
- metadata.get(TikaCoreProperties.DESCRIPTION));
-
- String contents = handler.toString();
-
- assertContains("ITEM", contents);
- assertContains("REQ.", contents);
- assertContains("DESCRIPTION", contents);
- assertContains("MAT'L", contents);
- assertContains("TOLERANCES UNLESS", contents);
- assertContains("FRACTIONS", contents);
- assertContains("ANGLES", contents);
- assertContains("Acme Corporation", contents);
-
- assertContains("DATE", contents);
- assertContains("CHANGE", contents);
- assertContains("DRAWN BY", contents);
- assertContains("SCALE", contents);
- assertContains("TIKA TEST DRAWING", contents);
- assertContains("TIKA LETTERS", contents);
- assertContains("5.82", contents);
- assertContains("112" + '\u00b0', contents); // Degrees
- assertContains("TIKA TEST LETTER", contents);
- assertContains("17.11", contents);
- assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
- assertContains("Diameter", contents);
- assertContains("The Apache Tika toolkit", contents);
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class PRTParserTest extends TikaTest {
+ /**
+ * Try with a simple file. The resource name must keep the upper-case "KEY" of the test document (testCADKEY.prt).
+ */
+ @Test
+ public void testPRTParserBasics() throws Exception {
+ XMLResult r = getXML("testCADKEY.prt", new PRTParser());
+ Metadata metadata = r.metadata;
+ String contents = r.xml;
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // This file has a date
+ assertEquals("2011-06-20T16:54:00",
+ metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2011-06-20T16:54:00",
+ metadata.get(Metadata.CREATION_DATE));
+ // But no description
+ assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ assertContains("Front View", contents);
+ assertContains("Back View", contents);
+ assertContains("Bottom View", contents);
+ assertContains("Right View", contents);
+ assertContains("Left View", contents);
+ //assertContains("Isometric View", contents); // Can't detect yet
+ assertContains("Axonometric View", contents);
+
+ assertContains("You've managed to extract all the text!", contents);
+ assertContains("This is more text", contents);
+ assertContains("Text Inside a PRT file", contents);
+
+ }
+
+ /**
+ * Now a more complex one: checks the content type, the date, the description and the extracted drawing text
+ */
+ @Test
+ public void testPRTParserComplex() throws Exception {
+
+ XMLResult r = getXML("testCADKEY2.prt", new PRTParser());
+ Metadata metadata = r.metadata;
+ String contents = r.xml;
+ assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+ // File has both a date and a description
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.DATE));
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.CREATION_DATE));
+ assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+
+ assertContains("ITEM", contents);
+ assertContains("REQ.", contents);
+ assertContains("DESCRIPTION", contents);
+ assertContains("MAT'L", contents);
+ assertContains("TOLERANCES UNLESS", contents);
+ assertContains("FRACTIONS", contents);
+ assertContains("ANGLES", contents);
+ assertContains("Acme Corporation", contents);
+
+ assertContains("DATE", contents);
+ assertContains("CHANGE", contents);
+ assertContains("DRAWN BY", contents);
+ assertContains("SCALE", contents);
+ assertContains("TIKA TEST DRAWING", contents);
+ assertContains("TIKA LETTERS", contents);
+ assertContains("5.82", contents);
+ assertContains("112" + '\u00b0', contents); // Degrees
+ assertContains("TIKA TEST LETTER", contents);
+ assertContains("17.11", contents);
+ assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
+ assertContains("Diameter", contents);
+ assertContains("The Apache Tika toolkit", contents);
+ }
+}
[2/4] tika git commit: add startDocument and endDocument() to
PRTParser so that it works with the ToXMLHandler
Posted by ta...@apache.org.
add startDocument() and endDocument() to PRTParser so that it works with the ToXMLHandler
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6bb6827e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6bb6827e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6bb6827e
Branch: refs/heads/2.x
Commit: 6bb6827e0dfd833c99e2dc0e568e5f06a6b5a6d6
Parents: 7674426
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:12:18 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:12:18 2016 -0400
----------------------------------------------------------------------
.../src/main/java/org/apache/tika/parser/prt/PRTParser.java | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/6bb6827e/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 24418b0..0c9689f 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -77,7 +77,7 @@ public class PRTParser extends AbstractParser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
Last5 l5 = new Last5();
int read;
-
+ xhtml.startDocument();
// Try to get the creation date, which is YYYYMMDDhhmm
byte[] header = new byte[30];
IOUtils.readFully(stream, header);
@@ -123,6 +123,7 @@ public class PRTParser extends AbstractParser {
l5.record(read);
}
}
+ xhtml.endDocument();
}
private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)