You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/24 15:44:43 UTC

[1/4] tika git commit: fix indents and whitespace

Repository: tika
Updated Branches:
  refs/heads/2.x 1ce93ed9e -> cd12917fa


fix indents and whitespace


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/76744261
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/76744261
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/76744261

Branch: refs/heads/2.x
Commit: 767442614756b51b64427e663a2af1f6b6ac0bff
Parents: 1ce93ed
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:06:33 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:06:33 2016 -0400

----------------------------------------------------------------------
 .../org/apache/tika/parser/prt/PRTParser.java   | 554 ++++++++++---------
 1 file changed, 279 insertions(+), 275 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/76744261/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 92e3503..24418b0 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -1,275 +1,279 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.prt;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-import static java.nio.charset.StandardCharsets.US_ASCII;
-
-/**
- * A basic text extracting parser for the CADKey PRT (CAD Drawing)
- *  format. It outputs text from note entries.
- */
-
-public class PRTParser extends AbstractParser {
-
-    /** Serial version UID */
-    private static final long serialVersionUID = 4659638314375035178L;
-
-    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
-    public static final String PRT_MIME_TYPE = "application/x-prt";
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-       return SUPPORTED_TYPES;
-    }
-
-    /**
-     * How long do we allow a text run to claim to be, before we
-     * decide we're confused and it's not really text after all?
-     */
-    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
-    
-    /*
-     * Text types:
-     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
-     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
-     *   (anything)  e0 3f sz sz TEXT    *view name*
-     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries* 
-     *   
-     *  Note - all text is null terminated
-     */
-      
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, 
-          ParseContext context) throws IOException, SAXException, TikaException {
-       
-       XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-       Last5 l5 = new Last5();
-       int read;
-       
-       // Try to get the creation date, which is YYYYMMDDhhmm
-       byte[] header = new byte[30];
-       IOUtils.readFully(stream, header);
-       byte[] date = new byte[12];
-       IOUtils.readFully(stream, date);
-       
-       String dateStr = new String(date, US_ASCII);
-       if(dateStr.startsWith("19") || dateStr.startsWith("20")) {
-          String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) +
-             "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" +
-             dateStr.substring(10, 12) + ":00";
-          metadata.set(TikaCoreProperties.CREATED, formattedDate);
-          // TODO Metadata.DATE is used as modified, should it be here?
-          metadata.set(Metadata.DATE, formattedDate);
-       }
-       metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
-       
-       // The description, if set, is the next up-to-500 bytes
-       byte[] desc = new byte[500];
-       IOUtils.readFully(stream, desc);
-       String description = extractText(desc, true);
-       if(description.length() > 0) {
-          metadata.set(TikaCoreProperties.DESCRIPTION, description);
-       }
-       
-       // Now look for text
-       while( (read = stream.read()) > -1) {
-          if(read == 0xe0 || read == 0xe3 || read == 0xf0) {
-             int nread = stream.read();
-             if(nread == 0x3f || nread == 0xbf) {
-                // Looks promising, check back for a suitable value
-                if(read == 0xe3 && nread == 0x3f) {
-                   if(l5.is33()) {
-                      // Bingo, note text
-                      handleNoteText(stream, xhtml);
-                   }
-                } else if(l5.is00()) {
-                   // Likely view name
-                   handleViewName(read, nread, stream, xhtml, l5);
-                }
-             }
-          } else {
-             l5.record(read);
-          }
-       }
-    }
-    
-    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) 
-    throws IOException, SAXException, TikaException {
-       // Ensure we have the right padding text
-       int read;
-       for(int i=0; i<10; i++) {
-          read = stream.read();
-          if(read >= 0 && read <= 0x0f) {
-             // Promising
-          } else {
-             // Wrong, false detection
-             return;
-          }
-       }
-       read = stream.read();
-       if(read != 0x1f) {
-          // Wrong, false detection
-          return;
-       }
-       
-       int length = EndianUtils.readUShortLE(stream);
-       if(length <= MAX_SANE_TEXT_LENGTH) {
-          // Length sanity check passed
-          handleText(length, stream, xhtml);
-       }
-    }
-    
-    private void handleViewName(int typeA, int typeB, InputStream stream, 
-          XHTMLContentHandler xhtml, Last5 l5) 
-    throws IOException, SAXException, TikaException {
-       // Is it 8 byte zero padded?
-       int maybeLength = EndianUtils.readUShortLE(stream);
-       if(maybeLength == 0) {
-          // Check the next 6 bytes too
-          for(int i=0; i<6; i++) {
-             int read = stream.read();
-             if(read >= 0 && read <= 0x0f) {
-                // Promising
-             } else {
-                // Wrong, false detection
-                return;
-             }
-          }
-          
-          byte[] b2 = new byte[2];
-          IOUtils.readFully(stream, b2);
-          int length = EndianUtils.getUShortLE(b2);
-          if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
-             // Length sanity check passed
-             handleText(length, stream, xhtml);
-          } else {
-             // Was probably something else
-             l5.record(b2[0]);
-             l5.record(b2[1]);
-          }
-       } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
-          // Looks like it's straight into the text
-          handleText(maybeLength, stream, xhtml);
-       }
-    }
-    
-    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) 
-    throws IOException, SAXException, TikaException {
-       byte[] str = new byte[length];
-       IOUtils.readFully(stream, str);
-       if(str[length-1] != 0) {
-          // Not properly null terminated, must be wrong
-          return;
-       }
-       
-       String text = extractText(str, false);
-       
-       xhtml.startElement("p");
-       xhtml.characters(text);
-       xhtml.endElement("p");
-    }
-    
-    /**
-     * Does our best to turn the bytes into text
-     */
-    private String extractText(byte[] data, boolean trim) throws TikaException {
-       // The text is always stored null terminated, but sometimes
-       //  may have extra null padding too
-       int length = data.length - 1;
-       if(trim) {
-          for(int i=0; i<data.length; i++) {
-             if(data[i] == 0) {
-                length = i;
-                break;
-             }
-          }
-       }
-       
-       // We believe that the text is basically stored as CP437
-       // That said, there are a few characters slightly wrong for that...
-       String text;
-       try {
-          text = new String(data, 0, length, "cp437");
-       } catch(UnsupportedEncodingException e) {
-          throw new TikaException("JVM Broken, core codepage CP437 missing!");
-       }
-       
-       // Fix up the known character issues
-       text = text.replace("\u03C6","\u00D8");
-
-       // All done, as best as we can!
-       return text;
-    }
-    
-    /**
-     * Provides a view on the previous 5 bytes
-     */
-    private static class Last5 {
-       byte[] data = new byte[5];
-       int pos = 0;
-       
-       private void record(int b) {
-          data[pos] = (byte)b;
-          pos++;
-          if(pos >= data.length) {
-             pos = 0;
-          }
-       }
-       
-       private byte[] get() {
-          byte[] ret = new byte[5];
-          for(int i=0; i<ret.length; i++) {
-             int p = pos - i;
-             if(p < 0) { p += ret.length; }
-             ret[i] = data[p];
-          }
-          return ret;
-       }
-       
-       private boolean is33() {
-          byte[] last5 = get();
-          for(byte b : last5) {
-             if(b != 0x33) return false;
-          }
-          return true;
-       }
-       
-       private boolean is00() {
-          byte[] last5 = get();
-          for(byte b : last5) {
-             if(b != 0x00) return false;
-          }
-          return true;
-       }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * A basic text extracting parser for the CADKey PRT (CAD Drawing)
+ * format. It outputs text from note entries.
+ */
+
+public class PRTParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 4659638314375035178L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt"));
+    public static final String PRT_MIME_TYPE = "application/x-prt";
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
+    private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+    
+    /*
+     * Text types:
+     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
+     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
+     *   (anything)  e0 3f sz sz TEXT    *view name*
+     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries* 
+     *   
+     *  Note - all text is null terminated
+     */
+
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        Last5 l5 = new Last5();
+        int read;
+
+        // Try to get the creation date, which is YYYYMMDDhhmm
+        byte[] header = new byte[30];
+        IOUtils.readFully(stream, header);
+        byte[] date = new byte[12];
+        IOUtils.readFully(stream, date);
+
+        String dateStr = new String(date, US_ASCII);
+        if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
+            String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) +
+                    "-" + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
+                    dateStr.substring(10, 12) + ":00";
+            metadata.set(TikaCoreProperties.CREATED, formattedDate);
+            // TODO Metadata.DATE is used as modified, should it be here?
+            metadata.set(Metadata.DATE, formattedDate);
+        }
+        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+
+        // The description, if set, is the next up-to-500 bytes
+        byte[] desc = new byte[500];
+        IOUtils.readFully(stream, desc);
+        String description = extractText(desc, true);
+        if (description.length() > 0) {
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+        }
+
+        // Now look for text
+        while ((read = stream.read()) > -1) {
+            if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
+                int nread = stream.read();
+                if (nread == 0x3f || nread == 0xbf) {
+                    // Looks promising, check back for a suitable value
+                    if (read == 0xe3 && nread == 0x3f) {
+                        if (l5.is33()) {
+                            // Bingo, note text
+                            handleNoteText(stream, xhtml);
+                        }
+                    } else if (l5.is00()) {
+                        // Likely view name
+                        handleViewName(read, nread, stream, xhtml, l5);
+                    }
+                }
+            } else {
+                l5.record(read);
+            }
+        }
+    }
+
+    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        // Ensure we have the right padding text
+        int read;
+        for (int i = 0; i < 10; i++) {
+            read = stream.read();
+            if (read >= 0 && read <= 0x0f) {
+                // Promising
+            } else {
+                // Wrong, false detection
+                return;
+            }
+        }
+        read = stream.read();
+        if (read != 0x1f) {
+            // Wrong, false detection
+            return;
+        }
+
+        int length = EndianUtils.readUShortLE(stream);
+        if (length <= MAX_SANE_TEXT_LENGTH) {
+            // Length sanity check passed
+            handleText(length, stream, xhtml);
+        }
+    }
+
+    private void handleViewName(int typeA, int typeB, InputStream stream,
+                                XHTMLContentHandler xhtml, Last5 l5)
+            throws IOException, SAXException, TikaException {
+        // Is it 8 byte zero padded?
+        int maybeLength = EndianUtils.readUShortLE(stream);
+        if (maybeLength == 0) {
+            // Check the next 6 bytes too
+            for (int i = 0; i < 6; i++) {
+                int read = stream.read();
+                if (read >= 0 && read <= 0x0f) {
+                    // Promising
+                } else {
+                    // Wrong, false detection
+                    return;
+                }
+            }
+
+            byte[] b2 = new byte[2];
+            IOUtils.readFully(stream, b2);
+            int length = EndianUtils.getUShortLE(b2);
+            if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
+                // Length sanity check passed
+                handleText(length, stream, xhtml);
+            } else {
+                // Was probably something else
+                l5.record(b2[0]);
+                l5.record(b2[1]);
+            }
+        } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+            // Looks like it's straight into the text
+            handleText(maybeLength, stream, xhtml);
+        }
+    }
+
+    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        byte[] str = new byte[length];
+        IOUtils.readFully(stream, str);
+        if (str[length - 1] != 0) {
+            // Not properly null terminated, must be wrong
+            return;
+        }
+
+        String text = extractText(str, false);
+
+        xhtml.startElement("p");
+        xhtml.characters(text);
+        xhtml.endElement("p");
+    }
+
+    /**
+     * Does our best to turn the bytes into text
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+        // The text is always stored null terminated, but sometimes
+        //  may have extra null padding too
+        int length = data.length - 1;
+        if (trim) {
+            for (int i = 0; i < data.length; i++) {
+                if (data[i] == 0) {
+                    length = i;
+                    break;
+                }
+            }
+        }
+
+        // We believe that the text is basically stored as CP437
+        // That said, there are a few characters slightly wrong for that...
+        String text;
+        try {
+            text = new String(data, 0, length, "cp437");
+        } catch (UnsupportedEncodingException e) {
+            throw new TikaException("JVM Broken, core codepage CP437 missing!");
+        }
+
+        // Fix up the known character issues
+        text = text.replace("\u03C6", "\u00D8");
+
+        // All done, as best as we can!
+        return text;
+    }
+
+    /**
+     * Provides a view on the previous 5 bytes
+     */
+    private static class Last5 {
+        byte[] data = new byte[5];
+        int pos = 0;
+
+        private void record(int b) {
+            data[pos] = (byte) b;
+            pos++;
+            if (pos >= data.length) {
+                pos = 0;
+            }
+        }
+
+        private byte[] get() {
+            byte[] ret = new byte[5];
+            for (int i = 0; i < ret.length; i++) {
+                int p = pos - i;
+                if (p < 0) {
+                    p += ret.length;
+                }
+                ret[i] = data[p];
+            }
+            return ret;
+        }
+
+        private boolean is33() {
+            byte[] last5 = get();
+            for (byte b : last5) {
+                if (b != 0x33) return false;
+            }
+            return true;
+        }
+
+        private boolean is00() {
+            byte[] last5 = get();
+            for (byte b : last5) {
+                if (b != 0x00) return false;
+            }
+            return true;
+        }
+    }
+}


[4/4] tika git commit: TIKA-2020 -- remove 3 parameter parse() and simplify CAD tests

Posted by ta...@apache.org.
TIKA-2020 -- remove 3 parameter parse() and simplify CAD tests


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cd12917f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cd12917f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cd12917f

Branch: refs/heads/2.x
Commit: cd12917fad98e9164e79b5026a551b1eb58f516c
Parents: 0c71b2f
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:43:44 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:43:44 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     | 17 ++++++++++----
 .../org/apache/tika/parser/AbstractParser.java  | 24 --------------------
 .../apache/tika/parser/prt/PRTParserTest.java   |  2 +-
 3 files changed, 13 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 1c93618..6a6bca1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,13 +1,20 @@
-Release 2.0 - Future Development
+Release 2.0 - ???
 
-  * Upgrade rome to 1.5.1 (TIKA-1820)
+  MAJOR CHANGES
 
-  * The default LoadErrorHandler is now WARN, to alert you to missing
-    parser classes and their dependencies. To keep the old behaviour,
-    set your LoadErrorHandler to IGNORE. (TIKA-1805)
+  * Remove 3 parameter parse option from AbstractParser (TIKA-2020).
+    Clients will now need to include a ParseContext.
 
   * (Something about more specific parser bundles, plus an overall one)
 
+  MINOR CHANGES
+
+    * Upgrade rome to 1.5.1 (TIKA-1820)
+
+    * The default LoadErrorHandler is now WARN, to alert you to missing
+      parser classes and their dependencies. To keep the old behaviour,
+      set your LoadErrorHandler to IGNORE. (TIKA-1805)
+
 Release 1.14 - ???
 
   * Add mime types, mime magic and/or globs for:

http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index a4c7719..c478c75 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -16,14 +16,6 @@
  */
 package org.apache.tika.parser;
 
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 /**
  * Abstract base class for new parsers. This method implements the old
  * deprecated parse method so subclasses won't have to.
@@ -38,22 +30,6 @@ public abstract class AbstractParser implements Parser {
     private static final long serialVersionUID = 7186985395903074255L;
 
     /**
-     * Calls the
-     * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
-     * method with an empty {@link ParseContext}. This method exists as a
-     * leftover from Tika 0.x when the three-argument parse() method still
-     * existed in the {@link Parser} interface. No new code should call this
-     * method anymore, it's only here for backwards compatibility.
-     *
-     * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method instead
-     */
-    public void parse(
-            InputStream stream, ContentHandler handler, Metadata metadata)
-            throws IOException, SAXException, TikaException {
-        parse(stream, handler, metadata, new ParseContext());
-    }
-    
-    /**
      * Convenience method for creating ParserProxy instances
      * with the current class' ClassLoader
      * 

http://git-wip-us.apache.org/repos/asf/tika/blob/cd12917f/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
index de870ed..3de7d75 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
@@ -29,7 +29,7 @@ public class PRTParserTest extends TikaTest {
      */
     @Test
     public void testPRTParserBasics() throws Exception {
-        XMLResult r = getXML("testCADKey.prt", new PRTParser());
+        XMLResult r = getXML("testCADKEY.prt", new PRTParser());
         Metadata metadata = r.metadata;
         String contents = r.xml;
         assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));


[3/4] tika git commit: TIKA-2020, remove 3 parameter parse() and simplify CAD tests

Posted by ta...@apache.org.
TIKA-2020, remove 3 parameter parse() and simplify CAD tests


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0c71b2ff
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0c71b2ff
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0c71b2ff

Branch: refs/heads/2.x
Commit: 0c71b2ffc97a3907a541fdd164ba79302f5c0637
Parents: 6bb6827
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:13:54 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:13:54 2016 -0400

----------------------------------------------------------------------
 .../apache/tika/parser/dwg/DWGParserTest.java   | 372 +++++++++----------
 .../apache/tika/parser/prt/PRTParserTest.java   | 214 +++++------
 2 files changed, 271 insertions(+), 315 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 321d715..ee3e767 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -1,202 +1,170 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.dwg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.apache.tika.TikaTest.assertContains;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class DWGParserTest {
-  
-    @Test
-    public void testDWG2000Parser() throws Exception {
-        InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2000.dwg");
-        testParserAlt(input);
-    }
-
-    @Test
-    public void testDWG2004Parser() throws Exception {
-        InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2004.dwg");
-        testParser(input);
-    }
-
-    @Test
-    public void testDWG2004ParserNoHeaderAddress() throws Exception {
-        InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2004_no_header.dwg");
-        testParserNoHeader(input);
-    }
-
-    @Test
-    public void testDWG2007Parser() throws Exception {
-        InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2007.dwg");
-        testParser(input);
-    }
-
-    @Test
-    public void testDWG2010Parser() throws Exception {
-        InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2010.dwg");
-        testParser(input);
-    }
-    
-    @Test
-    public void testDWG2010CustomPropertiesParser() throws Exception {
-        // Check that standard parsing works
-        InputStream testInput = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2010_custom_props.dwg");
-        testParser(testInput);
-        
-        // Check that custom properties with alternate padding work
-        try (InputStream input = DWGParserTest.class.getResourceAsStream(
-                "/test-documents/testDWG2010_custom_props.dwg")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata, null);
-
-            assertEquals("valueforcustomprop1",
-                    metadata.get("customprop1"));
-            assertEquals("valueforcustomprop2",
-                    metadata.get("customprop2"));
-        }
-    }
-
-    @Test
-    public void testDWGMechParser() throws Exception {
-        String[] types = new String[] {
-              "6", "2004", "2004DX", "2005", "2006",
-              "2007", "2008", "2009", "2010", "2011"
-        };
-        for (String type : types) {
-           InputStream input = DWGParserTest.class.getResourceAsStream(
-                   "/test-documents/testDWGmech"+type+".dwg");
-           testParserAlt(input);
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParser(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("The quick brown fox jumps over the lazy dog", 
-                    metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Gym class featuring a brown fox and lazy dog",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("Gym class featuring a brown fox and lazy dog",
-                    metadata.get(Metadata.SUBJECT));
-            assertEquals("Nevin Nollop",
-                    metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Pangram, fox, dog",
-                    metadata.get(TikaCoreProperties.KEYWORDS));
-            assertEquals("Lorem ipsum",
-                    metadata.get(TikaCoreProperties.COMMENTS).substring(0,11));
-            assertEquals("http://www.alfresco.com",
-                    metadata.get(TikaCoreProperties.RELATION));
-            
-            // Check some of the old style metadata too
-            assertEquals("The quick brown fox jumps over the lazy dog", 
-                  metadata.get(Metadata.TITLE));
-            assertEquals("Gym class featuring a brown fox and lazy dog",
-                  metadata.get(Metadata.SUBJECT));
-
-            String content = handler.toString();
-            assertContains("The quick brown fox jumps over the lazy dog", content);
-            assertContains("Gym class", content);
-            assertContains("www.alfresco.com", content);
-        } finally {
-            input.close();
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParserNoHeader(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-            
-            assertNull(metadata.get(TikaCoreProperties.TITLE));
-            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertNull(metadata.get(Metadata.SUBJECT));
-            assertNull(metadata.get(TikaCoreProperties.CREATOR));
-            assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
-            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
-            assertNull(metadata.get(TikaCoreProperties.RELATION));
-
-            String content = handler.toString();
-            assertEquals("", content);
-        } finally {
-            input.close();
-        }
-    }
-
-    @SuppressWarnings("deprecation")
-    private void testParserAlt(InputStream input) throws Exception {
-        try {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
-
-            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
-
-            assertEquals("Test Title", 
-                    metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Test Subject",
-                    metadata.get(TikaCoreProperties.DESCRIPTION));
-            assertEquals("Test Subject",
-                    metadata.get(Metadata.SUBJECT));
-            assertEquals("My Author",
-                    metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("My keyword1, MyKeyword2",
-                    metadata.get(TikaCoreProperties.KEYWORDS));
-            assertEquals("This is a comment",
-                    metadata.get(TikaCoreProperties.COMMENTS));
-            assertEquals("bejanpol",
-                    metadata.get(TikaCoreProperties.MODIFIER));
-            assertEquals("bejanpol",
-                    metadata.get(Metadata.LAST_AUTHOR));
-            assertEquals("http://mycompany/drawings",
-                    metadata.get(TikaCoreProperties.RELATION));
-            assertEquals("MyCustomPropertyValue",
-                  metadata.get("MyCustomProperty"));
-
-            String content = handler.toString();
-            assertContains("This is a comment", content);
-            assertContains("mycompany", content);
-        } finally {
-            input.close();
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class DWGParserTest extends TikaTest {
+
+    @Test
+    public void testDWG2000Parser() throws Exception {
+        testParserAlt("testDWG2000.dwg");
+    }
+
+    @Test
+    public void testDWG2004Parser() throws Exception {
+        testParser("testDWG2004.dwg");
+    }
+
+    @Test
+    public void testDWG2004ParserNoHeaderAddress() throws Exception {
+        testParserNoHeader("testDWG2004_no_header.dwg");
+    }
+
+    @Test
+    public void testDWG2007Parser() throws Exception {
+        testParser("testDWG2007.dwg");
+    }
+
+    @Test
+    public void testDWG2010Parser() throws Exception {
+        testParser("testDWG2010.dwg");
+    }
+
+    @Test
+    public void testDWG2010CustomPropertiesParser() throws Exception {
+        // Check that standard parsing works
+        testParser("testDWG2010_custom_props.dwg");
+
+        // Check that custom properties with alternate padding work
+
+        XMLResult r = getXML("testDWG2010_custom_props.dwg");
+        assertEquals("valueforcustomprop1",
+                r.metadata.get("customprop1"));
+        assertEquals("valueforcustomprop2",
+                r.metadata.get("customprop2"));
+    }
+
+    @Test
+    public void testDWGMechParser() throws Exception {
+        String[] types = new String[]{
+                "6", "2004", "2004DX", "2005", "2006",
+                "2007", "2008", "2009", "2010", "2011"
+        };
+        for (String type : types) {
+            testParserAlt("testDWGmech" + type + ".dwg");
+        }
+    }
+
+    private void testParser(String testFileName) throws Exception {
+        XMLResult r = getXML(testFileName, new DWGParser());
+        Metadata metadata = r.metadata;
+        String content = r.xml;
+
+
+        assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals("The quick brown fox jumps over the lazy dog",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("Nevin Nollop",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Pangram, fox, dog",
+                metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("Lorem ipsum",
+                metadata.get(TikaCoreProperties.COMMENTS).substring(0, 11));
+        assertEquals("http://www.alfresco.com",
+                metadata.get(TikaCoreProperties.RELATION));
+
+        // Check some of the old style metadata too
+        assertEquals("The quick brown fox jumps over the lazy dog",
+                metadata.get(Metadata.TITLE));
+        assertEquals("Gym class featuring a brown fox and lazy dog",
+                metadata.get(Metadata.SUBJECT));
+
+        assertContains("The quick brown fox jumps over the lazy dog", content);
+        assertContains("Gym class", content);
+        assertContains("www.alfresco.com", content);
+    }
+
+    @SuppressWarnings("deprecation")
+    private void testParserNoHeader(String testFileName) throws Exception {
+        try (InputStream input = getResourceAsStream("/test-documents/" + testFileName)) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DWGParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+            assertNull(metadata.get(TikaCoreProperties.TITLE));
+            assertNull(metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertNull(metadata.get(Metadata.SUBJECT));
+            assertNull(metadata.get(TikaCoreProperties.CREATOR));
+            assertNull(metadata.get(TikaCoreProperties.KEYWORDS));
+            assertNull(metadata.get(TikaCoreProperties.COMMENTS));
+            assertNull(metadata.get(TikaCoreProperties.RELATION));
+
+            String content = handler.toString();
+            assertEquals("", content);
+        }
+    }
+
+    private void testParserAlt(String testFileName) throws Exception {
+        XMLResult r = getXML(testFileName, new DWGParser());
+        Metadata metadata = r.metadata;
+        String content = r.xml;
+
+        assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals("Test Title",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Test Subject",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Test Subject",
+                metadata.get(Metadata.SUBJECT));
+        assertEquals("My Author",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("My keyword1, MyKeyword2",
+                metadata.get(TikaCoreProperties.KEYWORDS));
+        assertEquals("This is a comment",
+                metadata.get(TikaCoreProperties.COMMENTS));
+        assertEquals("bejanpol",
+                metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("bejanpol",
+                metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("http://mycompany/drawings",
+                metadata.get(TikaCoreProperties.RELATION));
+        assertEquals("MyCustomPropertyValue",
+                metadata.get("MyCustomProperty"));
+        assertContains("This is a comment", content);
+        assertContains("mycompany", content);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/0c71b2ff/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
index 155512c..de870ed 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
@@ -1,113 +1,101 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.prt;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PRTParserTest extends TikaTest {
-    /**
-     * Try with a simple file
-     */
-    @Test
-    public void testPRTParserBasics() throws Exception {
-       try (InputStream input = getResourceAsStream("/test-documents/testCADKEY.prt")) {
-          Metadata metadata = new Metadata();
-          ContentHandler handler = new BodyContentHandler();
-          new PRTParser().parse(input, handler, metadata);
-
-          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
-          // This file has a date
-          assertEquals("2011-06-20T16:54:00",
-                  metadata.get(TikaCoreProperties.CREATED));
-          assertEquals("2011-06-20T16:54:00",
-                  metadata.get(Metadata.CREATION_DATE));
-          // But no description
-          assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
-
-          String contents = handler.toString();
-
-          assertContains("Front View", contents);
-          assertContains("Back View", contents);
-          assertContains("Bottom View", contents);
-          assertContains("Right View", contents);
-          assertContains("Left View", contents);
-          //assertContains("Isometric View", contents); // Can't detect yet
-          assertContains("Axonometric View", contents);
-
-          assertContains("You've managed to extract all the text!", contents);
-          assertContains("This is more text", contents);
-          assertContains("Text Inside a PRT file", contents);
-       }
-    }
-
-    /**
-     * Now a more complex one
-     */
-    @Test
-    public void testPRTParserComplex() throws Exception {
-       try (InputStream input = getResourceAsStream("/test-documents/testCADKEY2.prt")) {
-          Metadata metadata = new Metadata();
-          ContentHandler handler = new BodyContentHandler();
-          new PRTParser().parse(input, handler, metadata);
-
-          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
-
-          // File has both a date and a description
-          assertEquals("1997-04-01T08:59:00",
-                  metadata.get(Metadata.DATE));
-          assertEquals("1997-04-01T08:59:00",
-                  metadata.get(Metadata.CREATION_DATE));
-          assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
-                  metadata.get(TikaCoreProperties.DESCRIPTION));
-
-          String contents = handler.toString();
-
-          assertContains("ITEM", contents);
-          assertContains("REQ.", contents);
-          assertContains("DESCRIPTION", contents);
-          assertContains("MAT'L", contents);
-          assertContains("TOLERANCES UNLESS", contents);
-          assertContains("FRACTIONS", contents);
-          assertContains("ANGLES", contents);
-          assertContains("Acme Corporation", contents);
-
-          assertContains("DATE", contents);
-          assertContains("CHANGE", contents);
-          assertContains("DRAWN BY", contents);
-          assertContains("SCALE", contents);
-          assertContains("TIKA TEST DRAWING", contents);
-          assertContains("TIKA LETTERS", contents);
-          assertContains("5.82", contents);
-          assertContains("112" + '\u00b0', contents); // Degrees
-          assertContains("TIKA TEST LETTER", contents);
-          assertContains("17.11", contents);
-          assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
-          assertContains("Diameter", contents);
-          assertContains("The Apache Tika toolkit", contents);
-       }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.prt;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class PRTParserTest extends TikaTest {
+    /**
+     * Try with a simple file
+     */
+    @Test
+    public void testPRTParserBasics() throws Exception {
+        XMLResult r = getXML("testCADKey.prt", new PRTParser());
+        Metadata metadata = r.metadata;
+        String contents = r.xml;
+        assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+        // This file has a date
+        assertEquals("2011-06-20T16:54:00",
+                metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-06-20T16:54:00",
+                metadata.get(Metadata.CREATION_DATE));
+        // But no description
+        assertEquals(null, metadata.get(TikaCoreProperties.DESCRIPTION));
+
+        assertContains("Front View", contents);
+        assertContains("Back View", contents);
+        assertContains("Bottom View", contents);
+        assertContains("Right View", contents);
+        assertContains("Left View", contents);
+        //assertContains("Isometric View", contents); // Can't detect yet
+        assertContains("Axonometric View", contents);
+
+        assertContains("You've managed to extract all the text!", contents);
+        assertContains("This is more text", contents);
+        assertContains("Text Inside a PRT file", contents);
+
+    }
+
+    /**
+     * Now a more complex one
+     */
+    @Test
+    public void testPRTParserComplex() throws Exception {
+
+        XMLResult r = getXML("testCADKEY2.prt", new PRTParser());
+        Metadata metadata = r.metadata;
+        String contents = r.xml;
+        assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+        // File has both a date and a description
+        assertEquals("1997-04-01T08:59:00",
+                metadata.get(Metadata.DATE));
+        assertEquals("1997-04-01T08:59:00",
+                metadata.get(Metadata.CREATION_DATE));
+        assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+
+        assertContains("ITEM", contents);
+        assertContains("REQ.", contents);
+        assertContains("DESCRIPTION", contents);
+        assertContains("MAT'L", contents);
+        assertContains("TOLERANCES UNLESS", contents);
+        assertContains("FRACTIONS", contents);
+        assertContains("ANGLES", contents);
+        assertContains("Acme Corporation", contents);
+
+        assertContains("DATE", contents);
+        assertContains("CHANGE", contents);
+        assertContains("DRAWN BY", contents);
+        assertContains("SCALE", contents);
+        assertContains("TIKA TEST DRAWING", contents);
+        assertContains("TIKA LETTERS", contents);
+        assertContains("5.82", contents);
+        assertContains("112" + '\u00b0', contents); // Degrees
+        assertContains("TIKA TEST LETTER", contents);
+        assertContains("17.11", contents);
+        assertContains('\u00d8' + "\ufffd2.000", contents); // Diameter
+        assertContains("Diameter", contents);
+        assertContains("The Apache Tika toolkit", contents);
+    }
+}


[2/4] tika git commit: add startDocument and endDocument() to PRTParser so that it works with the ToXMLHandler

Posted by ta...@apache.org.
add startDocument and endDocument() to PRTParser so that it works with the ToXMLHandler


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/6bb6827e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/6bb6827e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/6bb6827e

Branch: refs/heads/2.x
Commit: 6bb6827e0dfd833c99e2dc0e568e5f06a6b5a6d6
Parents: 7674426
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 24 11:12:18 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 24 11:12:18 2016 -0400

----------------------------------------------------------------------
 .../src/main/java/org/apache/tika/parser/prt/PRTParser.java       | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/6bb6827e/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index 24418b0..0c9689f 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -77,7 +77,7 @@ public class PRTParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         Last5 l5 = new Last5();
         int read;
-
+        xhtml.startDocument();
         // Try to get the creation date, which is YYYYMMDDhhmm
         byte[] header = new byte[30];
         IOUtils.readFully(stream, header);
@@ -123,6 +123,7 @@ public class PRTParser extends AbstractParser {
                 l5.record(read);
             }
         }
+        xhtml.endDocument();
     }
 
     private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)