You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by re...@apache.org on 2014/06/14 20:14:10 UTC
git commit: CXF-5549: Introduce Tika Search Visitor
Repository: cxf
Updated Branches:
refs/heads/master a9264b0a5 -> ca2de0d7e
CXF-5549: Introduce Tika Search Visitor
Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/ca2de0d7
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/ca2de0d7
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/ca2de0d7
Branch: refs/heads/master
Commit: ca2de0d7efa24e380fdb0bbee45d4c383134a207
Parents: a9264b0
Author: reta <dr...@gmail.com>
Authored: Sat Jun 14 14:13:52 2014 -0400
Committer: reta <dr...@gmail.com>
Committed: Sat Jun 14 14:13:52 2014 -0400
----------------------------------------------------------------------
parent/pom.xml | 13 ++-
rt/rs/extensions/search/pom.xml | 23 +++++
.../ext/search/tika/TikaContentExtractor.java | 83 ++++++++++++++++
.../search/tika/TikaContentExtractorTest.java | 98 +++++++++++++++++++
.../test/resources/files/testPDF.Encrypted.pdf | Bin 0 -> 34911 bytes
.../search/src/test/resources/files/testPDF.pdf | Bin 0 -> 34824 bytes
6 files changed, 216 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/parent/pom.xml
----------------------------------------------------------------------
diff --git a/parent/pom.xml b/parent/pom.xml
index 17191ac..9bd5ac9 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -196,6 +196,7 @@
<cxf.dom4j.bundle.version>1.6.1_5</cxf.dom4j.bundle.version>
<cxf.jdom.bundle.version>1.1_4</cxf.jdom.bundle.version>
<cxf.olingo.version>1.2.0</cxf.olingo.version>
+ <cxf.tika.version>1.5</cxf.tika.version>
<cxf.checkstyle.extension />
<cxf.jaxb.context.class />
<cxf.spring.validation.mode>VALIDATION_AUTO</cxf.spring.validation.mode>
@@ -1709,7 +1710,17 @@
<artifactId>swagger-jaxrs_2.10</artifactId>
<version>${cxf.swagger.version}</version>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${cxf.tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${cxf.tika.version}</version>
+ </dependency>
+
</dependencies>
</dependencyManagement>
<profiles>
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/pom.xml
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/pom.xml b/rt/rs/extensions/search/pom.xml
index 1eca985..47e9371 100644
--- a/rt/rs/extensions/search/pom.xml
+++ b/rt/rs/extensions/search/pom.xml
@@ -71,6 +71,22 @@
<optional>true</optional>
</dependency>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <optional>true</optional>
+ <exclusions>
+ <exclusion>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-ooxml</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${cxf.lucene.version}</version>
@@ -92,6 +108,13 @@
<artifactId>hibernate-entitymanager</artifactId>
<version>${hibernate.em.version}</version>
<scope>test</scope>
+ <!-- Conflicts with Apache Tika dependencies -->
+ <exclusions>
+ <exclusion>
+ <groupId>xml-apis</groupId>
+ <artifactId>xml-apis</artifactId>
+ </exclusion>
+ </exclusions>
</dependency>
<dependency>
<groupId>hsqldb</groupId>
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
new file mode 100644
index 0000000..258917e
--- /dev/null
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.xml.sax.SAXException;
+
+import org.apache.cxf.common.logging.LogUtils;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.ToTextContentHandler;
+
+/** Extracts the text and metadata of a PDF stream into a Lucene {@link Document}. */
+public class TikaContentExtractor {
+    private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class);
+
+    private final PDFParser parser = new PDFParser();
+    private final DefaultDetector detector = new DefaultDetector();
+
+    /**
+     * Detects the media type of the stream and parses it if the PDF parser supports that type.
+     * @param in content to read; buffered here when it lacks the mark/reset that detection needs
+     * @return document holding a stored "contents" text field and one stored string field per
+     *         metadata name, or null if the type is unsupported or reading/parsing fails
+     */
+    public Document extract(final InputStream in) {
+        // Tika's Detector contract expects a mark-capable stream so sniffed bytes can be replayed.
+        final InputStream stream = (in == null || in.markSupported())
+            ? in : new java.io.BufferedInputStream(in);
+        try {
+            final Metadata metadata = new Metadata();
+            final MediaType mediaType = detector.detect(stream, metadata);
+            final ParseContext context = new ParseContext();
+            if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) {
+                return null;
+            }
+            final ToTextContentHandler handler = new ToTextContentHandler();
+            parser.parse(stream, handler, metadata, context);
+            final Document document = new Document();
+            document.add(new Field("contents", handler.toString(), TextField.TYPE_STORED));
+            for (final String property: metadata.names()) {
+                document.add(new StringField(property, metadata.get(property), Store.YES));
+            }
+            return document;
+        } catch (final IOException ex) {
+            LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
+        } catch (final SAXException ex) {
+            LOG.log(Level.WARNING, "Unable to parse input stream", ex);
+        } catch (final TikaException ex) {
+            LOG.log(Level.WARNING, "Unable to parse input stream", ex);
+        }
+        return null;
+    }
+}
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
new file mode 100644
index 0000000..19ab4ce
--- /dev/null
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractorTest.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.io.IOException;
+
+import org.apache.cxf.jaxrs.ext.search.SearchBean;
+import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
+import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
+import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/** Checks that text extracted from a PDF can be indexed and then found through FIQL expressions. */
+public class TikaContentExtractorTest extends Assert {
+    private TikaContentExtractor extractor;
+    private Directory directory;
+    private IndexWriter writer;
+    private SearchConditionParser< SearchBean > parser;
+
+    /** Creates the extractor, the FIQL parser and an empty, committed in-memory index. */
+    @Before
+    public void setUp() throws Exception {
+        extractor = new TikaContentExtractor();
+        parser = new FiqlParser<SearchBean>(SearchBean.class);
+        directory = new RAMDirectory();
+        writer = new IndexWriter(directory,
+            new IndexWriterConfig(Version.LUCENE_40, new StandardAnalyzer(Version.LUCENE_40)));
+        // Initial commit, presumably so that a reader can open the still-empty index.
+        writer.commit();
+    }
+
+    /** Closes the index writer first and the directory backing it afterwards. */
+    @After
+    public void tearDown() throws Exception {
+        writer.close();
+        directory.close();
+    }
+
+    /** Indexes the text of testPDF.pdf; expects hits for "tika" and "incubation", none for "toolsuite". */
+    @Test
+    public void testExtractedTextContentMatchesSearchCriteria() throws Exception {
+        final Document doc = extractor.extract(getClass().getResourceAsStream("/files/testPDF.pdf"));
+        assertNotNull("Document should not be null", doc);
+        writer.addDocument(doc);
+        writer.commit();
+
+        assertEquals(1, getHits("ct==tika").length);
+        assertEquals(1, getHits("ct==incubation").length);
+        assertEquals(0, getHits("ct==toolsuite").length);
+    }
+
+    /** Parses the expression, turns it into a Lucene query and returns up to 1000 matching hits. */
+    private ScoreDoc[] getHits(final String expression) throws IOException {
+        final IndexReader indexReader = DirectoryReader.open(directory);
+        final IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+        try {
+            final LuceneQueryVisitor<SearchBean> queryVisitor =
+                new LuceneQueryVisitor<SearchBean>("ct", "contents");
+            queryVisitor.visit(parser.parse(expression));
+            final ScoreDoc[] found = indexSearcher.search(queryVisitor.getQuery(), null, 1000).scoreDocs;
+            assertNotNull(found);
+            return found;
+        } finally {
+            indexReader.close();
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf b/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf
new file mode 100644
index 0000000..36ede6a
Binary files /dev/null and b/rt/rs/extensions/search/src/test/resources/files/testPDF.Encrypted.pdf differ
http://git-wip-us.apache.org/repos/asf/cxf/blob/ca2de0d7/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf b/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf
new file mode 100644
index 0000000..1f1bcff
Binary files /dev/null and b/rt/rs/extensions/search/src/test/resources/files/testPDF.pdf differ