You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by se...@apache.org on 2016/09/08 17:27:21 UTC
cxf git commit: Updating Tika extractor to accept a media type hint

Repository: cxf
Updated Branches:
  refs/heads/master 4427f7790 -> f5a1c14e4


Updating Tika extractor to accept a media type hint


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/f5a1c14e
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/f5a1c14e
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/f5a1c14e

Branch: refs/heads/master
Commit: f5a1c14e47ee1bb041f3687ea7b27b58e21112bf
Parents: 4427f77
Author: Sergey Beryozkin <sb...@gmail.com>
Authored: Thu Sep 8 18:26:55 2016 +0100
Committer: Sergey Beryozkin <sb...@gmail.com>
Committed: Thu Sep 8 18:26:55 2016 +0100

----------------------------------------------------------------------
 .../ext/search/tika/TikaContentExtractor.java   | 133 ++++++++++++-------
 .../search/tika/TikaLuceneContentExtractor.java |  25 ++--
 2 files changed, 100 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/f5a1c14e/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index 1d2d30a..fd3511a 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -88,93 +88,103 @@ public class TikaContentExtractor {
      * the detector could be run against input stream in order to ensure that parser supports this
      * type of content. 
      * @param in input stream to extract the content and metadata from  
-     * @return the extracted content or null if extraction is not possible or was unsuccessful
+     * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
      */
     public TikaContent extract(final InputStream in) {
-        return extract(in, true);
+        return extract(in, (javax.ws.rs.core.MediaType)null);
     }
     
     /**
-     * Extract the metadata only from the input stream. Depending on media type validation,
-     * the detector could be run against input stream in order to ensure that parser supports this
-     * type of content. 
-     * @param in input stream to extract the metadata from  
-     * @return the extracted content or null if extraction is not possible or was unsuccessful
+     * Extract the content and metadata from the input stream with a media type hint. 
+     * @param in input stream to extract the content and metadata from  
+     * @param mt JAX-RS MediaType of the stream content
+     * @return the extracted content and metadata or null if extraction is not possible or was unsuccessful
      */
-    public TikaContent extractMetadata(final InputStream in) {
-        return extract(in, false);
+    public TikaContent extract(final InputStream in, javax.ws.rs.core.MediaType mt) {
+        return extract(in, new ToTextContentHandler(), mt);
     }
     
     /**
-     * Extract the metadata only from the input stream. Depending on media type validation,
+     * Extract the content and metadata from the input stream. Depending on media type validation,
      * the detector could be run against input stream in order to ensure that parser supports this
      * type of content. 
-     * @param in input stream to extract the metadata from  
-     * @return the extracted metadata converted to SearchBean or null if extraction is not possible 
+     * @param in input stream to extract the content and metadata from 
+     * @param handler custom ContentHandler 
+     * @return the extracted content and metadata or null if extraction is not possible 
      *         or was unsuccessful
      */
-    public SearchBean extractMetadataToSearchBean(final InputStream in) {
-        TikaContent tc = extractMetadata(in);
-        if (tc == null) {
-            return null;
-        }
-        Metadata metadata = tc.getMetadata();
-        SearchBean bean = new SearchBean();
-        for (final String property: metadata.names()) {
-            bean.set(property, metadata.get(property));
-        }
-        return bean;
+    public TikaContent extract(final InputStream in, final ContentHandler handler) {
+        return extract(in, handler, (javax.ws.rs.core.MediaType)null);
     }
+    
     /**
-     * Extract the content and metadata from the input stream. Depending on media type validation,
-     * the detector could be run against input stream in order to ensure that parser supports this
-     * type of content. 
-     * @param in input stream to extract the metadata from 
+     * Extract the content and metadata from the input stream with a media type hint. 
+     * @param in input stream to extract the content and metadata from 
      * @param handler custom ContentHandler 
-     * @return the extracted metadata converted to SearchBean or null if extraction is not possible 
+     * @param mt JAX-RS MediaType of the stream content
+     * @return the extracted content and metadata or null if extraction is not possible 
      *         or was unsuccessful
      */
-    public TikaContent extract(final InputStream in, final ContentHandler handler) {
-        return extract(in, handler, null);
+    public TikaContent extract(final InputStream in, final ContentHandler handler, 
+                               javax.ws.rs.core.MediaType mt) {
+        return extract(in, handler, mt, (ParseContext)null);
     }
+    
     /**
      * Extract the content and metadata from the input stream. Depending on media type validation,
      * the detector could be run against input stream in order to ensure that parser supports this
      * type of content. 
-     * @param in input stream to extract the metadata from 
+     * @param in input stream to extract the content and metadata from 
      * @param handler custom ContentHandler
      * @param context custom context 
-     * @return the extracted metadata converted to SearchBean or null if extraction is not possible 
+     * @return the extracted content and metadata or null if extraction is not possible 
      *         or was unsuccessful
      */
     public TikaContent extract(final InputStream in, final ContentHandler handler, ParseContext context) {
+        return extract(in, handler, (javax.ws.rs.core.MediaType)null, context);
+    }
+    
+    /**
+     * Extract the content and metadata from the input stream with a media type hint
+     * and a custom parse context.
+     * @param in input stream to extract the content and metadata from 
+     * @param handler custom ContentHandler
+     * @param mtHint JAX-RS MediaType of the stream content
+     * @param context custom context 
+     * @return the extracted content and metadata or null if extraction is not possible 
+     *         or was unsuccessful
+     */
+    public TikaContent extract(final InputStream in, final ContentHandler handler, 
+                               javax.ws.rs.core.MediaType mtHint, ParseContext context) {    
         if (in == null) {
             return null;
         }
+        if (context == null) {
+            context = new ParseContext();
+        }
+        final Metadata metadata = new Metadata();            
         
         try {
-            final Metadata metadata = new Metadata();            
             // Try to validate that input stream media type is supported by the parser
             MediaType mediaType = null;
+            if (mtHint != null) {
+                mediaType = MediaType.parse(mtHint.toString());
+            } else if (detector != null && in.markSupported()) {
+                mediaType = detector.detect(in, metadata);
+            } 
+            
             Parser parser = null;
             for (Parser p : parsers) {
-                if (detector != null && in.markSupported()) {
-                    mediaType = detector.detect(in, metadata);
-                    if (mediaType != null && p.getSupportedTypes(context).contains(mediaType)) {
-                        parser = p;
-                        break;
-                    }
-                } else {
-                    parser = p;
-                    break;
+                if (mediaType != null && !p.getSupportedTypes(context).contains(mediaType)) {
+                    continue;
                 }
+                parser = p;
+                break;
             }
             if (parser == null) {
                 return null;
             }
-            if (context == null) {
-                context = new ParseContext();
-            }
+            
             try {
                 parser.parse(in, handler, metadata, context);
             } catch (Exception ex) {
@@ -199,12 +209,39 @@ public class TikaContentExtractor {
      
         return null;
     }
+    /**
+     * Extract the metadata only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the metadata from  
+     * @return the extracted metadata or null if extraction is not possible or was unsuccessful
+     */
+    public TikaContent extractMetadata(final InputStream in) {
+        return extract(in, (ContentHandler)null);
+    }
     
-    TikaContent extract(final InputStream in, boolean extractContent) {
-        final ToTextContentHandler handler = extractContent ? new ToTextContentHandler() : null;
-        return extract(in, handler, null);
+    /**
+     * Extract the metadata only from the input stream. Depending on media type validation,
+     * the detector could be run against input stream in order to ensure that parser supports this
+     * type of content. 
+     * @param in input stream to extract the metadata from  
+     * @return the extracted metadata converted to SearchBean or null if extraction is not possible 
+     *         or was unsuccessful
+     */
+    public SearchBean extractMetadataToSearchBean(final InputStream in) {
+        TikaContent tc = extractMetadata(in);
+        if (tc == null) {
+            return null;
+        }
+        Metadata metadata = tc.getMetadata();
+        SearchBean bean = new SearchBean();
+        for (final String property: metadata.names()) {
+            bean.set(property, metadata.get(property));
+        }
+        return bean;
     }
     
+    
     /**
      * Extracted content, metadata and media type container
      */

http://git-wip-us.apache.org/repos/asf/cxf/blob/f5a1c14e/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
index 0e526de..009546c 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
@@ -25,6 +25,7 @@ import java.util.List;
 import javax.ws.rs.ext.ParamConverterProvider;
 
 import org.apache.cxf.common.util.StringUtils;
+import org.apache.cxf.jaxrs.ext.search.ParamConverterUtils;
 import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoubleField;
@@ -37,9 +38,7 @@ import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.Parser;
-
-import static org.apache.cxf.jaxrs.ext.search.ParamConverterUtils.getString;
-import static org.apache.cxf.jaxrs.ext.search.ParamConverterUtils.getValue;
+import org.apache.tika.sax.ToTextContentHandler;
 
 public class TikaLuceneContentExtractor {
     private final LuceneDocumentMetadata defaultDocumentMetadata;    
@@ -171,7 +170,8 @@ public class TikaLuceneContentExtractor {
                                 boolean extractContent, 
                                 boolean extractMetadata) {
         
-        TikaContent content = extractor.extract(in, extractContent);
+        TikaContent content = 
+            extractor.extract(in, extractContent ? new ToTextContentHandler() : null);
         
         if (content == null) {
             return null;
@@ -214,20 +214,25 @@ public class TikaLuceneContentExtractor {
         if (type != null) {
             if (Number.class.isAssignableFrom(type)) {
                 if (Double.class.isAssignableFrom(type)) {
-                    return new DoubleField(name, getValue(Double.class, provider, value), Store.YES);
+                    return new DoubleField(name, 
+                        ParamConverterUtils.getValue(Double.class, provider, value), Store.YES);
                 } else if (Float.class.isAssignableFrom(type)) {
-                    return new FloatField(name, getValue(Float.class, provider, value), Store.YES);
+                    return new FloatField(name, 
+                        ParamConverterUtils.getValue(Float.class, provider, value), Store.YES);
                 } else if (Long.class.isAssignableFrom(type)) {
-                    return new LongField(name, getValue(Long.class, provider, value), Store.YES);
+                    return new LongField(name, 
+                        ParamConverterUtils.getValue(Long.class, provider, value), Store.YES);
                 } else if (Integer.class.isAssignableFrom(type) || Byte.class.isAssignableFrom(type)) {
-                    return new IntField(name, getValue(Integer.class, provider, value), Store.YES);
+                    return new IntField(name, 
+                        ParamConverterUtils.getValue(Integer.class, provider, value), Store.YES);
                 }
             } else if (Date.class.isAssignableFrom(type)) {
-                final Date date = getValue(Date.class, provider, value);                
+                final Date date = ParamConverterUtils.getValue(Date.class, provider, value);                
                 Field field = null;
                 
                 if (date != null) {
-                    field = new StringField(name, getString(Date.class, provider, date), Store.YES);
+                    field = new StringField(name, 
+                                            ParamConverterUtils.getString(Date.class, provider, date), Store.YES);
                 } else {
                     field = new StringField(name, value, Store.YES); 
                 }