You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/09 17:23:11 UTC

svn commit: r635273 - in /incubator/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java src/main/resources/mime/tika-mimetypes.xml src/main/resources/tika-config.xml

Author: jukka
Date: Sun Mar  9 09:23:06 2008
New Revision: 635273

URL: http://svn.apache.org/viewvc?rev=635273&view=rev
Log:
TIKA-127: Add support for Visio files

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Mar  9 09:23:06 2008
@@ -2,25 +2,27 @@
 
 Unreleased changes (0.2-incubating)
 
-1. TIKA-109 - WordParser fails on some Word files (Dave Meikle)
+1.  TIKA-109 - WordParser fails on some Word files (Dave Meikle)
 
-2. TIKA-105 - Excel parser implementation based on POI's Event API
-              (Niall Pemberton)
+2.  TIKA-105 - Excel parser implementation based on POI's Event API
+               (Niall Pemberton)
 
-3. TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting)
+3.  TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting)
 
-4. TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting)
+4.  TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting)
 
-5. TIKA-115 - Tika package with all the dependencies (Jukka Zitting)
+5.  TIKA-115 - Tika package with all the dependencies (Jukka Zitting)
 
-6. TIKA-97  - Tika GUI (Jukka Zitting)
+6.  TIKA-97  - Tika GUI (Jukka Zitting)
 
-7. TIKA-96  - Tika CLI (Jukka Zitting)
+7.  TIKA-96  - Tika CLI (Jukka Zitting)
 
-8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
+8.  TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
 
-9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
+9.  TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
               (Jukka Zitting)
+
+10. TIKA-127 - Add support for Visio files (Jukka Zitting)
 
 
 Release 0.1-incubating - 12/27/2007

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar  9 09:23:06 2008
@@ -21,6 +21,7 @@
 import java.util.Date;
 import java.util.Iterator;
 
+import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.MarkUnsupportedException;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
@@ -72,6 +73,8 @@
                 setType(metadata, "application/vnd.ms-powerpoint");
             } else if ("Workbook".equals(name)) {
                 setType(metadata, "application/vnd.ms-excel");
+            } else if ("VisioDocument".equals(name)) {
+                setType(metadata, "application/vnd.visio");
             }
         }
     }
@@ -109,6 +112,13 @@
             } else if ("Workbook".equals(name)) {
                 setType(metadata, "application/vnd.ms-excel");
                 new ExcelExtractor().parse(filesystem, xhtml);
+            } else if ("VisioDocument".equals(name)) {
+                setType(metadata, "application/vnd.visio");
+                VisioTextExtractor extractor =
+                    new VisioTextExtractor(filesystem);
+                for (String text : extractor.getAllText()) {
+                    xhtml.element("p", text);
+                }
             }
         }
 

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Mar  9 09:23:06 2008
@@ -160,6 +160,14 @@
                 </magic>
         </mime-type>
 
+        <!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
+        <mime-type type="application/vnd.visio">
+                <glob pattern="*.vsd"/>
+                <glob pattern="*.vst"/>
+                <glob pattern="*.vsw"/>
+                <glob pattern="*.vss"/>
+        </mime-type>
+
 	<mime-type type="application/vnd.ms-powerpoint">
 		<glob pattern="*.ppz" />
 		<glob pattern="*.ppt" />

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Mar  9 09:23:06 2008
@@ -32,6 +32,7 @@
                 <mime>application/msword</mime>
                 <mime>application/vnd.ms-excel</mime>
                 <mime>application/vnd.ms-powerpoint</mime>
+                <mime>application/vnd.visio</mime>
         </parser>
 
         <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">