You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2010/11/23 15:16:47 UTC
svn commit: r1038124 - in /jackrabbit/trunk: jackrabbit-core/pom.xml
jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
jackrabbit-parent/pom.xml
Author: jukka
Date: Tue Nov 23 14:16:46 2010
New Revision: 1038124
URL: http://svn.apache.org/viewvc?rev=1038124&view=rev
Log:
JCR-2817: Upgrade to Tika 0.8
Also fix the backwards-compatibility settings so that the MsPowerPoint*Text*Extractor class name is recognized alongside MsPowerPointExtractor, as discussed on the users@ mailing list.
Modified:
jackrabbit/trunk/jackrabbit-core/pom.xml
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
jackrabbit/trunk/jackrabbit-parent/pom.xml
Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?rev=1038124&r1=1038123&r2=1038124&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Tue Nov 23 14:16:46 2010
@@ -233,18 +233,6 @@ org.apache.jackrabbit.test.api.Shareable
</exclusion>
</exclusions>
</dependency>
- <!-- JCR-2502: Force a PDFBox upgrade until Tika 0.7 is out -->
- <dependency>
- <groupId>org.apache.pdfbox</groupId>
- <artifactId>pdfbox</artifactId>
- <version>1.0.0</version>
- <exclusions>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java?rev=1038124&r1=1038123&r2=1038124&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java Tue Nov 23 14:16:46 2010
@@ -77,10 +77,14 @@ class JackrabbitParser implements Parser
InputStream stream =
JackrabbitParser.class.getResourceAsStream("tika-config.xml");
try {
- try {
- parser = new AutoDetectParser(new TikaConfig(stream));
- } finally {
- stream.close();
+ if (stream != null) {
+ try {
+ parser = new AutoDetectParser(new TikaConfig(stream));
+ } finally {
+ stream.close();
+ }
+ } else {
+ parser = new AutoDetectParser();
}
} catch (Exception e) {
// Should never happen
@@ -98,80 +102,70 @@ class JackrabbitParser implements Parser
* @param classes configured list of text extractor classes
*/
public void setTextFilterClasses(String classes) {
- Map<String, Parser> parsers = new HashMap<String, Parser>();
+ Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
while (tokenizer.hasMoreTokens()) {
String name = tokenizer.nextToken();
if (name.equals(
"org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
- parsers.put("text/html", new HtmlParser());
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
+ parsers.put(MediaType.text("html"), new HtmlParser());
+ } else if (name.equals("org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
Parser parser = new OfficeParser();
- parsers.put("application/vnd.ms-excel", parser);
- parsers.put("application/msexcel", parser);
- parsers.put("application/excel", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
- parsers.put("application/vnd.ms-outlook", new OfficeParser());
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.MsPowerPointExtractor")) {
+ parsers.put(MediaType.application("vnd.ms-excel"), parser);
+ parsers.put(MediaType.application("msexcel"), parser);
+ parsers.put(MediaType.application("excel"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
+ parsers.put(MediaType.application("vnd.ms-outlook"), new OfficeParser());
+ } else if (name.equals("org.apache.jackrabbit.extractor.MsPowerPointExtractor")
+ || name.equals("org.apache.jackrabbit.extractor.MsPowerPointTextExtractor")) {
Parser parser = new OfficeParser();
- parsers.put("application/vnd.ms-powerpoint", parser);
- parsers.put("application/mspowerpoint", parser);
- parsers.put("application/powerpoint", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
+ parsers.put(MediaType.application("vnd.ms-powerpoint"), parser);
+ parsers.put(MediaType.application("mspowerpoint"), parser);
+ parsers.put(MediaType.application("powerpoint"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
Parser parser = new OfficeParser();
- parsers.put("application/vnd.ms-word", parser);
- parsers.put("application/msword", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.MsTextExtractor")) {
+ parsers.put(MediaType.application("vnd.ms-word"), parser);
+ parsers.put(MediaType.application("msword"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.MsTextExtractor")) {
Parser parser = new OfficeParser();
- parsers.put("application/vnd.ms-word", parser);
- parsers.put("application/msword", parser);
- parsers.put("application/vnd.ms-powerpoint", parser);
- parsers.put("application/mspowerpoint", parser);
- parsers.put("application/vnd.ms-excel", parser);
- parsers.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", parser);
- parsers.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", parser);
- parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
+ parsers.put(MediaType.application("vnd.ms-word"), parser);
+ parsers.put(MediaType.application("msword"), parser);
+ parsers.put(MediaType.application("vnd.ms-powerpoint"), parser);
+ parsers.put(MediaType.application("mspowerpoint"), parser);
+ parsers.put(MediaType.application("vnd.ms-excel"), parser);
+ parsers.put(MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"), parser);
+ parsers.put(MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"), parser);
+ parsers.put(MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
Parser parser = new OpenDocumentParser();
- parsers.put("application/vnd.oasis.opendocument.database", parser);
- parsers.put("application/vnd.oasis.opendocument.formula", parser);
- parsers.put("application/vnd.oasis.opendocument.graphics", parser);
- parsers.put("application/vnd.oasis.opendocument.presentation", parser);
- parsers.put("application/vnd.oasis.opendocument.spreadsheet", parser);
- parsers.put("application/vnd.oasis.opendocument.text", parser);
- parsers.put("application/vnd.sun.xml.calc", parser);
- parsers.put("application/vnd.sun.xml.draw", parser);
- parsers.put("application/vnd.sun.xml.impress", parser);
- parsers.put("application/vnd.sun.xml.writer", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.PdfTextExtractor")) {
- parsers.put("application/pdf", new PDFParser());
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.PlainTextExtractor")) {
- parsers.put("text/plain", new TXTParser());
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.PngTextExtractor")) {
+ parsers.put(MediaType.application("vnd.oasis.opendocument.database"), parser);
+ parsers.put(MediaType.application("vnd.oasis.opendocument.formula"), parser);
+ parsers.put(MediaType.application("vnd.oasis.opendocument.graphics"), parser);
+ parsers.put(MediaType.application("vnd.oasis.opendocument.presentation"), parser);
+ parsers.put(MediaType.application("vnd.oasis.opendocument.spreadsheet"), parser);
+ parsers.put(MediaType.application("vnd.oasis.opendocument.text"), parser);
+ parsers.put(MediaType.application("vnd.sun.xml.calc"), parser);
+ parsers.put(MediaType.application("vnd.sun.xml.draw"), parser);
+ parsers.put(MediaType.application("vnd.sun.xml.impress"), parser);
+ parsers.put(MediaType.application("vnd.sun.xml.writer"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.PdfTextExtractor")) {
+ parsers.put(MediaType.application("pdf"), new PDFParser());
+ } else if (name.equals("org.apache.jackrabbit.extractor.PlainTextExtractor")) {
+ parsers.put(MediaType.TEXT_PLAIN, new TXTParser());
+ } else if (name.equals("org.apache.jackrabbit.extractor.PngTextExtractor")) {
Parser parser = new ImageParser();
- parsers.put("image/png", parser);
- parsers.put("image/apng", parser);
- parsers.put("image/mng", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.RTFTextExtractor")) {
+ parsers.put(MediaType.image("png"), parser);
+ parsers.put(MediaType.image("apng"), parser);
+ parsers.put(MediaType.image("mng"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.RTFTextExtractor")) {
Parser parser = new RTFParser();
- parsers.put("application/rtf", parser);
- parsers.put("text/rtf", parser);
- } else if (name.equals(
- "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
+ parsers.put(MediaType.application("rtf"), parser);
+ parsers.put(MediaType.text("rtf"), parser);
+ } else if (name.equals("org.apache.jackrabbit.extractor.XMLTextExtractor")) {
Parser parser = new XMLParser();
- parsers.put("application/xml", parser);
- parsers.put("text/xml", parser);
+ parsers.put(MediaType.APPLICATION_XML, parser);
+ parsers.put(MediaType.text("xml"), parser);
} else {
logger.warn("Ignoring unknown text extractor class: {}", name);
}
Modified: jackrabbit/trunk/jackrabbit-parent/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-parent/pom.xml?rev=1038124&r1=1038123&r2=1038124&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-parent/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-parent/pom.xml Tue Nov 23 14:16:46 2010
@@ -44,7 +44,7 @@
<slf4j.version>1.5.11</slf4j.version>
<logback.version>0.9.20</logback.version>
<jetty.version>6.1.22</jetty.version>
- <tika.version>0.7</tika.version>
+ <tika.version>0.8</tika.version>
</properties>
<build>