You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by re...@apache.org on 2014/06/14 20:14:10 UTC
git commit: CXF-5549: Introduce Tika Search Visitor

Repository: cxf
Updated Branches:
  refs/heads/master a9264b0a5 -> ca2de0d7e


CXF-5549: Introduce Tika Search Visitor


Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/ca2de0d7
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/ca2de0d7
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/ca2de0d7

Branch: refs/heads/master
Commit: ca2de0d7efa24e380fdb0bbee45d4c383134a207
Parents: a9264b0
Author: reta <dr...@gmail.com>
Authored: Sat Jun 14 14:13:52 2014 -0400
Committer: reta <dr...@gmail.com>
Committed: Sat Jun 14 14:13:52 2014 -0400

----------------------------------------------------------------------
 parent/pom.xml                                  |  13 ++-
 rt/rs/extensions/search/pom.xml                 |  23 +++++
 .../ext/search/tika/TikaContentExtractor.java   |  83 ++++++++++++++++
 .../search/tika/TikaContentExtractorTest.java   |  98 +++++++++++++++++++
 .../test/resources/files/testPDF.Encrypted.pdf  | Bin 0 -> 34911 bytes
 .../search/src/test/resources/files/testPDF.pdf | Bin 0 -> 34824 bytes
 6 files changed, 216 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/parent/pom.xml
----------------------------------------------------------------------
diff --git a/parent/pom.xml b/parent/pom.xml
index 17191ac..9bd5ac9 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -196,6 +196,7 @@
         <cxf.dom4j.bundle.version>1.6.1_5</cxf.dom4j.bundle.version>
         <cxf.jdom.bundle.version>1.1_4</cxf.jdom.bundle.version>
         <cxf.olingo.version>1.2.0</cxf.olingo.version>
+        <cxf.tika.version>1.5</cxf.tika.version>
         <cxf.checkstyle.extension />
         <cxf.jaxb.context.class />
         <cxf.spring.validation.mode>VALIDATION_AUTO</cxf.spring.validation.mode>
@@ -1709,7 +1710,17 @@
                 <artifactId>swagger-jaxrs_2.10</artifactId>
                 <version>${cxf.swagger.version}</version>
             </dependency>
-
+			<dependency>
+                <groupId>org.apache.tika</groupId>
+                <artifactId>tika-core</artifactId>
+                <version>${cxf.tika.version}</version>
+			</dependency>
+			<dependency>
+                <groupId>org.apache.tika</groupId>
+                <artifactId>tika-parsers</artifactId>
+                <version>${cxf.tika.version}</version>
+			</dependency>
+			
         </dependencies>
     </dependencyManagement>
     <profiles>

http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/pom.xml
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/pom.xml b/rt/rs/extensions/search/pom.xml
index 1eca985..47e9371 100644
--- a/rt/rs/extensions/search/pom.xml
+++ b/rt/rs/extensions/search/pom.xml
@@ -71,6 +71,22 @@
 		    <optional>true</optional>
 		</dependency>
         <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <optional>true</optional>
+            <exclusions>
+            	<exclusion>
+            		<groupId>org.apache.poi</groupId>
+            		<artifactId>poi-ooxml</artifactId>            	
+            	</exclusion>       
+            </exclusions>
+        </dependency>
+        <dependency>
             <groupId>org.apache.lucene</groupId>
             <artifactId>lucene-analyzers-common</artifactId>
             <version>${cxf.lucene.version}</version>
@@ -92,6 +108,13 @@
             <artifactId>hibernate-entitymanager</artifactId>
             <version>${hibernate.em.version}</version>
             <scope>test</scope>
+            <!-- Conflicts with Apache Tika dependencies -->
+            <exclusions>
+            	<exclusion>
+            		<groupId>xml-apis</groupId>
+	                <artifactId>xml-apis</artifactId>            	
+            	</exclusion>
+            </exclusions>
         </dependency>
         <dependency>
             <groupId>hsqldb</groupId>

http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
new file mode 100644
index 0000000..258917e
--- /dev/null
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.xml.sax.SAXException;
+
+import org.apache.cxf.common.logging.LogUtils;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.ToTextContentHandler;
+
+public class TikaContentExtractor { // Builds a Lucene Document from a stream via Tika; only types PDFParser reports as supported are handled.
+    private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class);
+    // The parser is fixed to PDFParser; the detector identifies the media type of each incoming stream.
+    private final PDFParser parser;
+    private final DefaultDetector detector;
+    // Creates the detector and the parser once; extract() reuses them on every call.
+    public TikaContentExtractor() {
+        detector = new DefaultDetector();
+        parser = new PDFParser();
+    }
+    // Returns a Document holding the text and metadata of 'in', or null if the type is unsupported or extraction fails.
+    public Document extract(final InputStream in) {
+        try {
+            final Metadata metadata = new Metadata();
+            final MediaType mediaType = detector.detect(in, metadata); // NOTE(review): 'in' is parsed again below, so detect() presumably must not consume it - confirm
+            final ParseContext context = new ParseContext(); 
+            if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) {
+                return null; // undetected or unsupported media type: skipped without logging
+            }
+            // The handler receives the parser output; its toString() becomes the "contents" field below.
+            final ToTextContentHandler handler = new ToTextContentHandler();
+            parser.parse(in, handler, metadata, context);
+            // NOTE(review): there is no in.close() in this method; the caller presumably owns the stream - confirm
+            final Document document = new Document();
+            document.add(new Field("contents", handler.toString(), TextField.TYPE_STORED));
+            // Each metadata name is added as a stored StringField. NOTE(review): get() presumably yields one value even for multi-valued names - confirm
+            for (final String property: metadata.names()) {
+                document.add(new StringField(property, metadata.get(property), Store.YES));
+            }
+            
+            return document;
+        } catch (final IOException ex) { // each failure below is logged at WARNING and falls through to the final 'return null'
+            LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
+        } catch (final SAXException ex) {
+            LOG.log(Level.WARNING, "Unable to parse input stream", ex);
+        } catch (final TikaException ex) {
+            LOG.log(Level.WARNING, "Unable to parse input stream", ex);
+        }
+     // Reached only after one of the logged failures above.
+        return null;
+    }
+}

http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
new file mode 100644
index 0000000..19ab4ce
--- /dev/null
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.IOException;
+
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
+import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
+import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TikaContentExtractorTest extends Assert { // Indexes a document extracted from a PDF into Lucene and queries it with FIQL expressions.
+    private TikaContentExtractor extractor;
+    private Directory directory;
+    private IndexWriter writer;
+    private SearchConditionParser< SearchBean > parser;
+    // Creates a fresh RAMDirectory index, a FIQL parser and an extractor before each test.
+    @Before
+    public void setUp() throws Exception {
+        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
+        directory = new RAMDirectory();
+        
+        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
+        writer = new IndexWriter(directory, config);    
+        writer.commit(); // NOTE(review): presumably commits the empty index so a reader can be opened on it - confirm
+        
+        parser = new FiqlParser<SearchBean>(SearchBean.class);
+        extractor = new TikaContentExtractor();
+    }
+    // Expects one hit each for "tika" and "incubation" and no hit for "toolsuite" after indexing /files/testPDF.pdf.
+    @Test
+    public void testExtractedTextContentMatchesSearchCriteria() throws Exception {
+        final Document document = extractor.extract(getClass().getResourceAsStream("/files/testPDF.pdf"));
+        assertNotNull("Document should not be null", document);
+        
+        writer.addDocument(document);
+        writer.commit();
+        // The commit above comes before the queries; getHits() opens its reader after it.
+        assertEquals(1, getHits("ct==tika").length);
+        assertEquals(1, getHits("ct==incubation").length);
+        assertEquals(0, getHits("ct==toolsuite").length);
+    }
+    // Parses the FIQL expression, turns it into a Lucene query through the visitor and returns the resulting hits.
+    private ScoreDoc[] getHits(final String expression) throws IOException {
+        IndexReader reader = DirectoryReader.open(directory); // a new reader per call, closed in the finally block
+        IndexSearcher searcher = new IndexSearcher(reader);        
+
+        try {
+            LuceneQueryVisitor<SearchBean> visitor = new LuceneQueryVisitor<SearchBean>("ct", "contents"); // NOTE(review): presumably maps FIQL property "ct" to index field "contents" - confirm
+            visitor.visit(parser.parse(expression));
+            // Requests at most 1000 hits; the null argument is presumably "no filter" - confirm
+            ScoreDoc[] hits = searcher.search(visitor.getQuery(), null, 1000).scoreDocs;
+            assertNotNull(hits);
+            
+            return hits;            
+        } finally {
+            reader.close();
+        }
+    }
+    // Closes the writer first, then the directory.
+    @After
+    public void tearDown() throws Exception {
+        writer.close();        
+        directory.close();
+    }
+}

http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf b/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf
new file mode 100644
index 0000000..36ede6a
Binary files /dev/null and b/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf differ

http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf b/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf
new file mode 100644
index 0000000..1f1bcff
Binary files /dev/null and b/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf differ