You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by re...@apache.org on 2016/09/16 01:36:59 UTC

[08/22] cxf git commit: Updating TikaContentExtractor to support the embedded attachments

Updating TikaContentExtractor to support the embedded attachments


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/cc2341a4
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/cc2341a4
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/cc2341a4

Branch: refs/heads/master-jaxrs-2.1
Commit: cc2341a453a8edd467d83fdeb2c09ea62aee0ffa
Parents: 9810a84
Author: Sergey Beryozkin <sb...@gmail.com>
Authored: Thu Sep 15 11:21:46 2016 +0100
Committer: Sergey Beryozkin <sb...@gmail.com>
Committed: Thu Sep 15 11:21:46 2016 +0100

----------------------------------------------------------------------
 .../ext/search/tika/TikaContentExtractor.java   | 40 +++++++++++++++-----
 1 file changed, 31 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/cc2341a4/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index fd3511a..e4d1918 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.ToTextContentHandler;
@@ -47,6 +48,13 @@ public class TikaContentExtractor {
     private final Detector detector;
     
     /**
+     * Create new Tika-based content extractor using AutoDetectParser.  
+     */
+    public TikaContentExtractor() {
+        this(new AutoDetectParser(), false);
+    }
+    
+    /**
      * Create new Tika-based content extractor using the provided parser instance.  
      * @param parser parser instance
      */
@@ -159,9 +167,6 @@ public class TikaContentExtractor {
         if (in == null) {
             return null;
         }
-        if (context == null) {
-            context = new ParseContext();
-        }
         final Metadata metadata = new Metadata();            
         
         try {
@@ -171,20 +176,37 @@ public class TikaContentExtractor {
                 mediaType = MediaType.parse(mtHint.toString());
             } else if (detector != null && in.markSupported()) {
                 mediaType = detector.detect(in, metadata);
-            } 
+            }
+            if (mediaType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
+            }
             
             Parser parser = null;
-            for (Parser p : parsers) {
-                if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
-                    continue;
+            if (parsers.size() == 1) {
+                parser = parsers.get(0);
+            } else {
+                for (Parser p : parsers) {
+                    if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
+                        continue;
+                    }
+                    parser = p;
+                    break;
                 }
-                parser = p;
-                break;
             }
             if (parser == null) {
                 return null;
             }
             
+            if (context == null) {
+                context = new ParseContext();
+            }
+            if (context.get(Parser.class) == null) {
+                // to process the embedded attachments
+                context.set(Parser.class, 
+                            parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
+            }
+            
+            
             try {
                 parser.parse(in, handler, metadata, context);
             } catch (Exception ex) {