You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by re...@apache.org on 2016/09/16 01:36:59 UTC
[08/22] cxf git commit: Updating TikaContentExtractor to support the embedded attachments
Updating TikaContentExtractor to support the embedded attachments
Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/cc2341a4
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/cc2341a4
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/cc2341a4
Branch: refs/heads/master-jaxrs-2.1
Commit: cc2341a453a8edd467d83fdeb2c09ea62aee0ffa
Parents: 9810a84
Author: Sergey Beryozkin <sb...@gmail.com>
Authored: Thu Sep 15 11:21:46 2016 +0100
Committer: Sergey Beryozkin <sb...@gmail.com>
Committed: Thu Sep 15 11:21:46 2016 +0100
----------------------------------------------------------------------
.../ext/search/tika/TikaContentExtractor.java | 40 +++++++++++++++-----
1 file changed, 31 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cxf/blob/cc2341a4/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index fd3511a..e4d1918 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -36,6 +36,7 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ToTextContentHandler;
@@ -47,6 +48,13 @@ public class TikaContentExtractor {
private final Detector detector;
/**
+ * Create new Tika-based content extractor using AutoDetectParser.
+ */
+ public TikaContentExtractor() {
+ this(new AutoDetectParser(), false);
+ }
+
+ /**
* Create new Tika-based content extractor using the provided parser instance.
* @param parser parser instance
*/
@@ -159,9 +167,6 @@ public class TikaContentExtractor {
if (in == null) {
return null;
}
- if (context == null) {
- context = new ParseContext();
- }
final Metadata metadata = new Metadata();
try {
@@ -171,20 +176,37 @@ public class TikaContentExtractor {
mediaType = MediaType.parse(mtHint.toString());
} else if (detector != null && in.markSupported()) {
mediaType = detector.detect(in, metadata);
- }
+ }
+ if (mediaType != null) {
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
+ }
Parser parser = null;
- for (Parser p : parsers) {
- if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
- continue;
+ if (parsers.size() == 1) {
+ parser = parsers.get(0);
+ } else {
+ for (Parser p : parsers) {
+ if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
+ continue;
+ }
+ parser = p;
+ break;
}
- parser = p;
- break;
}
if (parser == null) {
return null;
}
+ if (context == null) {
+ context = new ParseContext();
+ }
+ if (context.get(Parser.class) == null) {
+ // to process the embedded attachments
+ context.set(Parser.class,
+ parser instanceof AutoDetectParser ? parser : new AutoDetectParser());
+ }
+
+
try {
parser.parse(in, handler, metadata, context);
} catch (Exception ex) {