You are viewing a plain-text version of this content. The canonical link to the original message was not preserved in this copy.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/03/09 17:23:11 UTC
svn commit: r635273 - in /incubator/tika/trunk: CHANGES.txt
src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
src/main/resources/mime/tika-mimetypes.xml src/main/resources/tika-config.xml
Author: jukka
Date: Sun Mar 9 09:23:06 2008
New Revision: 635273
URL: http://svn.apache.org/viewvc?rev=635273&view=rev
Log:
TIKA-127: Add support for Visio files
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Mar 9 09:23:06 2008
@@ -2,25 +2,27 @@
Unreleased changes (0.2-incubating)
-1. TIKA-109 - WordParser fails on some Word files (Dave Meikle)
+1. TIKA-109 - WordParser fails on some Word files (Dave Meikle)
-2. TIKA-105 - Excel parser implementation based on POI's Event API
- (Niall Pemberton)
+2. TIKA-105 - Excel parser implementation based on POI's Event API
+ (Niall Pemberton)
-3. TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting)
+3. TIKA-116 - Streaming parser for OpenDocument files (Jukka Zitting)
-4. TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting)
+4. TIKA-117 - Drop JDOM and Jaxen dependencies (Jukka Zitting)
-5. TIKA-115 - Tika package with all the dependencies (Jukka Zitting)
+5. TIKA-115 - Tika package with all the dependencies (Jukka Zitting)
-6. TIKA-97 - Tika GUI (Jukka Zitting)
+6. TIKA-97 - Tika GUI (Jukka Zitting)
-7. TIKA-96 - Tika CLI (Jukka Zitting)
+7. TIKA-96 - Tika CLI (Jukka Zitting)
-8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
+8. TIKA-112 - Use Commons IO 1.4 (Jukka Zitting)
-9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
+9. TIKA-126 - Add Parser.parse(InputStream, Metadata) for metadata extraction
(Jukka Zitting)
+
+10. TIKA-127 - Add support for Visio files (Jukka Zitting)
Release 0.1-incubating - 12/27/2007
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Mar 9 09:23:06 2008
@@ -21,6 +21,7 @@
import java.util.Date;
import java.util.Iterator;
+import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.NoPropertySetStreamException;
@@ -72,6 +73,8 @@
setType(metadata, "application/vnd.ms-powerpoint");
} else if ("Workbook".equals(name)) {
setType(metadata, "application/vnd.ms-excel");
+ } else if ("VisioDocument".equals(name)) {
+ setType(metadata, "application/vnd.visio");
}
}
}
@@ -109,6 +112,13 @@
} else if ("Workbook".equals(name)) {
setType(metadata, "application/vnd.ms-excel");
new ExcelExtractor().parse(filesystem, xhtml);
+ } else if ("VisioDocument".equals(name)) {
+ setType(metadata, "application/vnd.visio");
+ VisioTextExtractor extractor =
+ new VisioTextExtractor(filesystem);
+ for (String text : extractor.getAllText()) {
+ xhtml.element("p", text);
+ }
}
}
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Mar 9 09:23:06 2008
@@ -160,6 +160,14 @@
</magic>
</mime-type>
+ <!-- http://www.iana.org/assignments/media-types/application/vnd.visio -->
+ <mime-type type="application/vnd.visio">
+ <glob pattern="*.vsd"/>
+ <glob pattern="*.vst"/>
+ <glob pattern="*.vsw"/>
+ <glob pattern="*.vss"/>
+ </mime-type>
+
<mime-type type="application/vnd.ms-powerpoint">
<glob pattern="*.ppz" />
<glob pattern="*.ppt" />
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=635273&r1=635272&r2=635273&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Mar 9 09:23:06 2008
@@ -32,6 +32,7 @@
<mime>application/msword</mime>
<mime>application/vnd.ms-excel</mime>
<mime>application/vnd.ms-powerpoint</mime>
+ <mime>application/vnd.visio</mime>
</parser>
<parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">