You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/04/08 06:11:02 UTC

svn commit: r762789 - in /jackrabbit/trunk: jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/ jackrabbit-text-extractors/ jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/

Author: jukka
Date: Tue Apr  7 14:25:43 2009
New Revision: 762789

URL: http://svn.apache.org/viewvc?rev=762789&view=rev
Log:
JCR-1887: msoffice text extractor for office 2007 files

Applied patch by Philipp Koch.

Added:
    jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java   (with props)
Modified:
    jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties
    jackrabbit/trunk/jackrabbit-text-extractors/pom.xml

Modified: jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties?rev=762789&r1=762788&r2=762789&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties (original)
+++ jackrabbit/trunk/jackrabbit-jcr-server/src/main/resources/org/apache/jackrabbit/server/io/mimetypes.properties Tue Apr  7 14:25:43 2009
@@ -35,6 +35,7 @@
 dir=application/x-director
 dms=application/octet-stream
 doc=application/msword
+docx=application/vnd.openxmlformats-officedocument.wordprocessingml.document
 dvi=application/x-dvi
 dxr=application/x-director
 ecma=text/qhtml
@@ -93,6 +94,7 @@
 pnm=image/x-portable-anymap
 ppm=image/x-portable-pixmap
 ppt=application/vnd.ms-powerpoint
+pptx=application/vnd.openxmlformats-officedocument.presentationml.presentation
 ps=application/postscript
 qhtml=text/qhtml
 qt=video/quicktime
@@ -142,6 +144,7 @@
 wrl=model/vrml
 xbm=image/x-xbitmap
 xls=application/vnd.ms-excel
+xlsx=application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
 xml=text/xml
 xpm=image/x-xpixmap
 xwd=image/x-xwindowdump

Modified: jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/pom.xml?rev=762789&r1=762788&r2=762789&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/pom.xml Tue Apr  7 14:25:43 2009
@@ -61,13 +61,28 @@
   </build>
 
   <dependencies>
-    <dependency>
+  <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi</artifactId>
+      <version>3.5-beta3</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>openxml4j</artifactId>
+      <version>1.0-beta</version>
+    </dependency>
+
+    <!-- not sure if this is required at runtime
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>ooxml-schemas</artifactId>
+      <version>1.0</version>
     </dependency>
+    -->
     <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-scratchpad</artifactId>
+      <version>3.5-beta3</version>
     </dependency>
     <dependency>
       <groupId>pdfbox</groupId>

Added: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762789&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (added)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr  7 14:25:43 2009
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.extractor;
+
+import org.apache.poi.extractor.ExtractorFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
+/**
+ * Text extractor for Microsoft Office documents (Word, PowerPoint and Excel, including the OOXML variants), delegating to POI's <code>ExtractorFactory</code>.
+ */
+public class MsTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Logger used to report extraction failures.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(MsTextExtractor.class);
+
+    /**
+     * Force loading of the POI <code>ExtractorFactory</code> class this extractor depends on.
+     */
+    static {
+        ExtractorFactory.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsTextExtractor</code> instance, passing the Word, PowerPoint and Excel content types listed below to the superclass.
+     */
+    public MsTextExtractor() {
+        super(new String[]{"application/vnd.ms-word", 
+                           "application/msword",
+                           "application/vnd.ms-powerpoint",
+                           "application/mspowerpoint",
+                           "application/vnd.ms-excel",
+                           "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                           "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                           "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"});
+    }
+
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * {@inheritDoc}
+     * Returns an empty reader if an error occurred while extracting text; the failure is
+     * logged as a warning. The stream is always closed; type and encoding are not used here.
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            String text = ExtractorFactory.createExtractor(stream).getText();
+            return new StringReader(text);
+        } catch (Exception e) {
+            logger.warn("Failed to extract Microsoft Document text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
+
+}

Propchange: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native