You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by se...@apache.org on 2016/09/08 17:27:21 UTC
cxf git commit: Updating Tika extractor to accept a media type hint
Repository: cxf
Updated Branches:
refs/heads/master 4427f7790 -> f5a1c14e4
Updating Tika extractor to accept a media type hint
Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/f5a1c14e
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/f5a1c14e
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/f5a1c14e
Branch: refs/heads/master
Commit: f5a1c14e47ee1bb041f3687ea7b27b58e21112bf
Parents: 4427f77
Author: Sergey Beryozkin <sb...@gmail.com>
Authored: Thu Sep 8 18:26:55 2016 +0100
Committer: Sergey Beryozkin <sb...@gmail.com>
Committed: Thu Sep 8 18:26:55 2016 +0100
----------------------------------------------------------------------
.../ext/search/tika/TikaContentExtractor.java | 133 ++++++++++++-------
.../search/tika/TikaLuceneContentExtractor.java | 25 ++--
2 files changed, 100 insertions(+), 58 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cxf/blob/f5a1c14e/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 1d2d30a..fd3511a 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -88,93 +88,103 @@ public class TikaContentExtractor {
* the detector could be run against input stream in order to ensure that parser supports this
* type of content.
* @param in input stream to extract the content and metadata from
- * @return the extracted content or null if extraction is not possible or was unsuccessful
+ * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
*/
public TikaContent extract(final InputStream in) {
- return extract(in, true);
+ return extract(in, (javax.ws.rs.core.MediaType)null);
}
/**
- * Extract the metadata only from the input stream. Depending on media type validation,
- * the detector could be run against input stream in order to ensure that parser supports this
- * type of content.
- * @param in input stream to extract the metadata from
- * @return the extracted content or null if extraction is not possible or was unsuccessful
+ * Extract the content and metadata from the input stream with a media type hint.
+ * @param in input stream to extract the content and metadata from
+ * @param mt JAX-RS MediaType of the stream content
+ * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
*/
- public TikaContent extractMetadata(final InputStream in) {
- return extract(in, false);
+ public TikaContent extract(final InputStream in, javax.ws.rs.core.MediaType mt) {
+ return extract(in, new ToTextContentHandler(), mt);
}
/**
- * Extract the metadata only from the input stream. Depending on media type validation,
+ * Extract the content and metadata from the input stream. Depending on media type validation,
* the detector could be run against input stream in order to ensure that parser supports this
* type of content.
- * @param in input stream to extract the metadata from
- * @return the extracted metadata converted to SearchBean or null if extraction is not possible
+ * @param in input stream to extract the content and metadata from
+ * @param handler custom ContentHandler
+ * @return the extracted content and metadata or null if extraction is not possible
* or was unsuccessful
*/
- public SearchBean extractMetadataToSearchBean(final InputStream in) {
- TikaContent tc = extractMetadata(in);
- if (tc == null) {
- return null;
- }
- Metadata metadata = tc.getMetadata();
- SearchBean bean = new SearchBean();
- for (final String property: metadata.names()) {
- bean.set(property, metadata.get(property));
- }
- return bean;
+ public TikaContent extract(final InputStream in, final ContentHandler handler) {
+ return extract(in, handler, (javax.ws.rs.core.MediaType)null);
}
+
/**
- * Extract the content and metadata from the input stream. Depending on media type validation,
- * the detector could be run against input stream in order to ensure that parser supports this
- * type of content.
- * @param in input stream to extract the metadata from
+ * Extract the content and metadata from the input stream with a media type hint.
+ * @param in input stream to extract the content and metadata from
* @param handler custom ContentHandler
- * @return the extracted metadata converted to SearchBean or null if extraction is not possible
+ * @param mt JAX-RS MediaType of the stream content
+ * @return the extracted content and metadata or null if extraction is not possible
* or was unsuccessful
*/
- public TikaContent extract(final InputStream in, final ContentHandler handler) {
- return extract(in, handler, null);
+ public TikaContent extract(final InputStream in, final ContentHandler handler,
+ javax.ws.rs.core.MediaType mt) {
+ return extract(in, handler, mt, (ParseContext)null);
}
+
/**
* Extract the content and metadata from the input stream. Depending on media type validation,
* the detector could be run against input stream in order to ensure that parser supports this
* type of content.
- * @param in input stream to extract the metadata from
+ * @param in input stream to extract the content and metadata from
* @param handler custom ContentHandler
* @param context custom context
- * @return the extracted metadata converted to SearchBean or null if extraction is not possible
+ * @return the extracted content and metadata or null if extraction is not possible
* or was unsuccessful
*/
public TikaContent extract(final InputStream in, final ContentHandler handler, ParseContext context) {
+ return extract(in, handler, (javax.ws.rs.core.MediaType)null, context);
+ }
+
+ /**
+ * Extract the content and metadata from the input stream with a media type hint.
+ * If the hint is not null it is used as is, otherwise the detector (if any) may be run.
+ * @param in input stream to extract the content and metadata from
+ * @param handler custom ContentHandler
+ * @param mtHint JAX-RS MediaType of the stream content
+ * @param context custom context
+ * @return the extracted content and metadata or null if extraction is not possible
+ * or was unsuccessful
+ */
+ public TikaContent extract(final InputStream in, final ContentHandler handler,
+ javax.ws.rs.core.MediaType mtHint, ParseContext context) {
if (in == null) {
return null;
}
+ if (context == null) {
+ context = new ParseContext();
+ }
+ final Metadata metadata = new Metadata();
try {
- final Metadata metadata = new Metadata();
// Try to validate that input stream media type is supported by the parser
MediaType mediaType = null;
+ if (mtHint != null) {
+ mediaType = MediaType.parse(mtHint.toString());
+ } else if (detector != null && in.markSupported()) {
+ mediaType = detector.detect(in, metadata);
+ }
+
Parser parser = null;
for (Parser p : parsers) {
- if (detector != null && in.markSupported()) {
- mediaType = detector.detect(in, metadata);
- if (mediaType != null && p.getSupportedTypes(context).contains(mediaType)) {
- parser = p;
- break;
- }
- } else {
- parser = p;
- break;
+ if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
+ continue;
}
+ parser = p;
+ break;
}
if (parser == null) {
return null;
}
- if (context == null) {
- context = new ParseContext();
- }
+
try {
parser.parse(in, handler, metadata, context);
} catch (Exception ex) {
@@ -199,12 +209,39 @@ public class TikaContentExtractor {
return null;
}
+ /**
+ * Extract the metadata only from the input stream. Depending on media type validation,
+ * the detector could be run against input stream in order to ensure that parser supports this
+ * type of content.
+ * @param in input stream to extract the metadata from
+ * @return the extracted metadata or null if extraction is not possible or was unsuccessful
+ */
+ public TikaContent extractMetadata(final InputStream in) {
+ return extract(in, (ContentHandler)null);
+ }
- TikaContent extract(final InputStream in, boolean extractContent) {
- final ToTextContentHandler handler = extractContent ? new ToTextContentHandler() : null;
- return extract(in, handler, null);
+ /**
+ * Extract the metadata only from the input stream. Depending on media type validation,
+ * the detector could be run against input stream in order to ensure that parser supports this
+ * type of content.
+ * @param in input stream to extract the metadata from
+ * @return the extracted metadata converted to SearchBean or null if extraction is not possible
+ * or was unsuccessful
+ */
+ public SearchBean extractMetadataToSearchBean(final InputStream in) {
+ TikaContent tc = extractMetadata(in);
+ if (tc == null) {
+ return null;
+ }
+ Metadata metadata = tc.getMetadata();
+ SearchBean bean = new SearchBean();
+ for (final String property: metadata.names()) {
+ bean.set(property, metadata.get(property));
+ }
+ return bean;
}
+
/**
* Extracted content, metadata and media type container
*/
http://git-wip-us.apache.org/repos/asf/cxf/blob/f5a1c14e/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
index 0e526de..009546c 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
@@ -25,6 +25,7 @@ import java.util.List;
import javax.ws.rs.ext.ParamConverterProvider;
import org.apache.cxf.common.util.StringUtils;
+import org.apache.cxf.jaxrs.ext.search.ParamConverterUtils;
import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
@@ -37,9 +38,7 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
-
-import static org.apache.cxf.jaxrs.ext.search.ParamConverterUtils.getString;
-import static org.apache.cxf.jaxrs.ext.search.ParamConverterUtils.getValue;
+import org.apache.tika.sax.ToTextContentHandler;
public class TikaLuceneContentExtractor {
private final LuceneDocumentMetadata defaultDocumentMetadata;
@@ -171,7 +170,8 @@ public class TikaLuceneContentExtractor {
boolean extractContent,
boolean extractMetadata) {
- TikaContent content = extractor.extract(in, extractContent);
+ TikaContent content =
+ extractor.extract(in, extractContent ? new ToTextContentHandler() : null);
if (content == null) {
return null;
@@ -214,20 +214,25 @@ public class TikaLuceneContentExtractor {
if (type != null) {
if (Number.class.isAssignableFrom(type)) {
if (Double.class.isAssignableFrom(type)) {
- return new DoubleField(name, getValue(Double.class, provider, value), Store.YES);
+ return new DoubleField(name,
+ ParamConverterUtils.getValue(Double.class, provider, value), Store.YES);
} else if (Float.class.isAssignableFrom(type)) {
- return new FloatField(name, getValue(Float.class, provider, value), Store.YES);
+ return new FloatField(name,
+ ParamConverterUtils.getValue(Float.class, provider, value), Store.YES);
} else if (Long.class.isAssignableFrom(type)) {
- return new LongField(name, getValue(Long.class, provider, value), Store.YES);
+ return new LongField(name,
+ ParamConverterUtils.getValue(Long.class, provider, value), Store.YES);
} else if (Integer.class.isAssignableFrom(type) || Byte.class.isAssignableFrom(type)) {
- return new IntField(name, getValue(Integer.class, provider, value), Store.YES);
+ return new IntField(name,
+ ParamConverterUtils.getValue(Integer.class, provider, value), Store.YES);
}
} else if (Date.class.isAssignableFrom(type)) {
- final Date date = getValue(Date.class, provider, value);
+ final Date date = ParamConverterUtils.getValue(Date.class, provider, value);
Field field = null;
if (date != null) {
- field = new StringField(name, getString(Date.class, provider, date), Store.YES);
+ field = new StringField(name,
+ ParamConverterUtils.getString(Date.class, provider, date), Store.YES);
} else {
field = new StringField(name, value, Store.YES);
}