You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:02 UTC
svn commit: r762789 - in /jackrabbit/trunk:
jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/
jackrabbit-text-extractors/
jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/
Author: jukka
Date: Tue Apr 7 14:25:43 2009
New Revision: 762789
URL: http://svn.apache.org/viewvc?rev=762789&view=rev
Log:
JCR-1887: msoffice text extractor for office 2007 files
Applied patch by Philipp Koch.
Added:
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (with props)
Modified:
jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties
jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
Modified: jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties?rev=762789&r1=762788&r2=762789&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (original)
+++ jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties Tue Apr 7 14:25:43 2009
@@ -35,6 +35,7 @@
dir=application/x-director
dms=application/octet-stream
doc=application/msword
+docx=application/vnd.openxmlformats-officedocument.wordprocessingml.document
dvi=application/x-dvi
dxr=application/x-director
ecma=text/qhtml
@@ -93,6 +94,7 @@
pnm=image/x-portable-anymap
ppm=image/x-portable-pixmap
ppt=application/vnd.ms-powerpoint
+pptx=application/vnd.openxmlformats-officedocument.presentationml.presentation
ps=application/postscript
qhtml=text/qhtml
qt=video/quicktime
@@ -142,6 +144,7 @@
wrl=model/vrml
xbm=image/x-xbitmap
xls=application/vnd.ms-excel
+xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
xml=text/xml
xpm=image/x-xpixmap
xwd=image/x-xwindowdump
Modified: jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/pom.xml?rev=762789&r1=762788&r2=762789&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/pom.xml Tue Apr 7 14:25:43 2009
@@ -61,13 +61,28 @@
</build>
<dependencies>
- <dependency>
+ <dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
+ <version>3.5-beta3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>openxml4j</artifactId>
+ <version>1.0-beta</version>
+ </dependency>
+
+ <!-- not sure if this is required at runtime
+ <dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>ooxml-schemas</artifactId>
+ <version>1.0</version>
</dependency>
+ -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
+ <version>3.5-beta3</version>
</dependency>
<dependency>
<groupId>pdfbox</groupId>
Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762789&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr 7 14:25:43 2009
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.poi.extractor.ExtractorFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Microsoft Office documents: Word, PowerPoint and Excel, including the OpenXML (docx, pptx, xlsx) content types.
+ */
+public class MsTextExtractor extends AbstractTextExtractor {
+
+ /**
+ * Logger instance used to report extraction failures.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(MsTextExtractor.class);
+
+ /**
+ * Force loading of dependent class (<code>ExtractorFactory</code>).
+ */
+ static {
+ ExtractorFactory.class.getName();
+ }
+
+ /**
+ * Creates a new <code>MsTextExtractor</code> instance, passing the supported content types to the superclass.
+ */
+ public MsTextExtractor() {
+ super(new String[]{"application/vnd.ms-word",
+ "application/msword",
+ "application/vnd.ms-powerpoint",
+ "application/mspowerpoint",
+ "application/vnd.ms-excel",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"});
+ }
+
+ //-------------------------------------------------------< TextExtractor >
+
+ /**
+ * {@inheritDoc}
+ * Logs a warning and returns an empty reader if an error occurred extracting text
+ * from the document. The stream is always closed; type and encoding are not used.
+ */
+ public Reader extractText(InputStream stream,
+ String type,
+ String encoding) throws IOException {
+ try {
+ String text = ExtractorFactory.createExtractor(stream).getText();
+ return new StringReader(text);
+ } catch (Exception e) {
+ logger.warn("Failed to extract Microsoft Document text content", e);
+ return new StringReader("");
+ } finally {
+ stream.close();
+ }
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native