You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cxf.apache.org by se...@apache.org on 2014/06/27 17:48:44 UTC
[1/2] git commit: [CXF-5549] Moving Lucene DocumentMetadata to its
own class and supporting injecting it via the constructor as discussed with
Andriy, updating TikaContentExtractor to accept multiple parsers
Repository: cxf
Updated Branches:
refs/heads/master e0a449ec7 -> 785c0bd70
[CXF-5549] Moving Lucene DocumentMetadata to its own class and supporting injecting it via the constructor as discussed with Andriy, updating TikaContentExtractor to accept multiple parsers
Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/8f99f309
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/8f99f309
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/8f99f309
Branch: refs/heads/master
Commit: 8f99f30970db784a6d741fe76dbca517585d31cd
Parents: e0a449e
Author: Sergey Beryozkin <sb...@talend.com>
Authored: Fri Jun 27 16:47:49 2014 +0100
Committer: Sergey Beryozkin <sb...@talend.com>
Committed: Fri Jun 27 16:47:49 2014 +0100
----------------------------------------------------------------------
.../ext/search/tika/TikaContentExtractor.java | 62 ++++++--
.../search/tika/TikaLuceneContentExtractor.java | 148 ++++++++++---------
.../tika/TikaLuceneContentExtractorTest.java | 4 +-
3 files changed, 128 insertions(+), 86 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
index e7cb623..266457e 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaContentExtractor.java
@@ -20,6 +20,8 @@ package org.apache.cxf.jaxrs.ext.search.tika;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collections;
+import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
@@ -28,6 +30,7 @@ import org.xml.sax.SAXException;
import org.apache.cxf.common.logging.LogUtils;
import org.apache.cxf.jaxrs.ext.search.SearchBean;
import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -38,9 +41,8 @@ import org.apache.tika.sax.ToTextContentHandler;
public class TikaContentExtractor {
private static final Logger LOG = LogUtils.getL7dLogger(TikaContentExtractor.class);
- private final Parser parser;
- private final DefaultDetector detector;
- private final boolean validateMediaType;
+ private final List<Parser> parsers;
+ private final Detector detector;
/**
* Create new Tika-based content extractor using the provided parser instance.
@@ -51,17 +53,32 @@ public class TikaContentExtractor {
}
/**
+ * Create new Tika-based content extractor using the provided parser instances.
+ * @param parsers parser instances
+ */
+ public TikaContentExtractor(final List<Parser> parsers) {
+ this(parsers, new DefaultDetector());
+ }
+
+ /**
+ * Create new Tika-based content extractor using the provided parser instances.
+ * @param parsers parser instances
+ */
+ public TikaContentExtractor(final List<Parser> parsers, Detector detector) {
+ this.parsers = parsers;
+ this.detector = detector;
+ }
+
+ /**
* Create new Tika-based content extractor using the provided parser instance and
- * optional media type validation. If validation is enabled, the implementation
+ * optional media type validation. If validation is enabled, the implementation
* will try to detect the media type of the input and validate it against media types
* supported by the parser.
* @param parser parser instance
- * @param validateMediaType enabled or disable media type validation
+ * @param validateMediaType enable or disable media type validation
*/
public TikaContentExtractor(final Parser parser, final boolean validateMediaType) {
- this.parser = parser;
- this.validateMediaType = validateMediaType;
- this.detector = validateMediaType ? new DefaultDetector() : null;
+ this(Collections.singletonList(parser), validateMediaType ? new DefaultDetector() : null);
}
/**
@@ -111,18 +128,28 @@ public class TikaContentExtractor {
final Metadata metadata = new Metadata();
final ParseContext context = new ParseContext();
- // Try to validate that input stream media type is supported by the parser
- if (validateMediaType) {
- final MediaType mediaType = detector.detect(in, metadata);
- if (mediaType == null || !parser.getSupportedTypes(context).contains(mediaType)) {
- return null;
+ // Try to validate that input stream media type is supported by the parser
+ MediaType mediaType = null;
+ Parser parser = null;
+ for (Parser p : parsers) {
+ if (detector != null) {
+ mediaType = detector.detect(in, metadata);
+ if (mediaType != null && p.getSupportedTypes(context).contains(mediaType)) {
+ parser = p;
+ break;
+ }
+ } else {
+ parser = p;
}
}
+ if (parser == null) {
+ return null;
+ }
final ToTextContentHandler handler = extractContent
? new ToTextContentHandler() : new IgnoreContentHandler();
parser.parse(in, handler, metadata, context);
- return new TikaContent(handler.toString(), metadata);
+ return new TikaContent(handler.toString(), metadata, mediaType);
} catch (final IOException ex) {
LOG.log(Level.WARNING, "Unable to extract media type from input stream", ex);
} catch (final SAXException ex) {
@@ -136,9 +163,11 @@ public class TikaContentExtractor {
public static class TikaContent {
private String content;
private Metadata metadata;
- public TikaContent(String content, Metadata metadata) {
+ private MediaType mediaType;
+ public TikaContent(String content, Metadata metadata, MediaType mediaType) {
this.content = content;
this.metadata = metadata;
+ this.mediaType = mediaType;
}
public String getContent() {
return content;
@@ -146,6 +175,9 @@ public class TikaContentExtractor {
public Metadata getMetadata() {
return metadata;
}
+ public MediaType getMediaType() {
+ return mediaType;
+ }
}
private static class IgnoreContentHandler extends ToTextContentHandler {
http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
index 8911df8..28eaa35 100644
--- a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractor.java
@@ -20,8 +20,6 @@ package org.apache.cxf.jaxrs.ext.search.tika;
import java.io.InputStream;
import java.util.Date;
-import java.util.LinkedHashMap;
-import java.util.Map;
import org.apache.cxf.jaxrs.ext.search.tika.TikaContentExtractor.TikaContent;
import org.apache.lucene.document.Document;
@@ -37,55 +35,9 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
public class TikaLuceneContentExtractor {
- private final DocumentMetadata defaultDocumentMetadata;
+ private final LuceneDocumentMetadata defaultDocumentMetadata;
private final TikaContentExtractor extractor;
- public static class DocumentMetadata {
- private final Map< String, Class< ? > > fieldTypes =
- new LinkedHashMap< String, Class< ? > >();
- private final String contentFieldName;
-
- public DocumentMetadata(final String contentFieldName) {
- this.contentFieldName = contentFieldName;
- }
-
- public DocumentMetadata withField(final String name, final Class< ? > type) {
- fieldTypes.put(name, type);
- return this;
- }
-
- public String getContentFieldName() {
- return contentFieldName;
- }
-
- private Field contentField(final String content) {
- return new TextField(contentFieldName, content, Store.YES);
- }
-
- private Field field(final String name, final String value) {
- final Class< ? > type = fieldTypes.get(name);
-
- if (type != null) {
- if (Number.class.isAssignableFrom(type)) {
- if (Double.class.isAssignableFrom(type)) {
- return new DoubleField(name, Double.valueOf(value), Store.YES);
- } else if (Float.class.isAssignableFrom(type)) {
- return new FloatField(name, Float.valueOf(value), Store.YES);
- } else if (Long.class.isAssignableFrom(type)) {
- return new LongField(name, Long.valueOf(value), Store.YES);
- } else if (Integer.class.isAssignableFrom(type)) {
- return new IntField(name, Integer.valueOf(value), Store.YES);
- }
- } else if (Date.class.isAssignableFrom(type)) {
- return new StringField(name, value, Store.YES);
- }
- }
-
- return new StringField(name, value, Store.YES);
- }
- }
-
-
/**
* Create new Tika-based content extractor using the provided parser instance.
* @param parser parser instance
@@ -97,13 +49,13 @@ public class TikaLuceneContentExtractor {
/**
* Create new Tika-based content extractor using the provided parser instance and
* optional media type validation. If validation is enabled, the implementation
- * will try to detect the media type of the input and validate it against media types
+ * will try to detect the media type of the input and validate it against media types
* supported by the parser.
* @param parser parser instance
* @param validateMediaType enabled or disable media type validation
*/
public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType) {
- this(parser, validateMediaType, "contents");
+ this(parser, validateMediaType, new LuceneDocumentMetadata());
}
/**
@@ -111,14 +63,28 @@ public class TikaLuceneContentExtractor {
* optional media type validation. If validation is enabled, the implementation
* will try to detect the media type of the input and validate it against media types
* supported by the parser.
- * @param parser parser instance
+ * @param parser parser instance
+ * @param documentMetadata documentMetadata
+ */
+ public TikaLuceneContentExtractor(final Parser parser,
+ final LuceneDocumentMetadata documentMetadata) {
+ this(parser, false, new LuceneDocumentMetadata());
+ }
+
+ /**
+ * Create new Tika-based content extractor using the provided parser instance and
+ * optional media type validation. If validation is enabled, the implementation
+ * will try to detect the media type of the input and validate it against media types
+ * supported by the parser.
+ * @param parser parser instance
* @param validateMediaType enabled or disable media type validation
- * @param contentFieldName name of the content field, default is "contents"
+ * @param documentMetadata documentMetadata
*/
- public TikaLuceneContentExtractor(final Parser parser, final boolean validateMediaType,
- final String contentFieldName) {
- extractor = new TikaContentExtractor(parser, validateMediaType);
- defaultDocumentMetadata = new DocumentMetadata(contentFieldName);
+ public TikaLuceneContentExtractor(final Parser parser,
+ final boolean validateMediaType,
+ final LuceneDocumentMetadata documentMetadata) {
+ this.extractor = new TikaContentExtractor(parser, validateMediaType);
+ this.defaultDocumentMetadata = documentMetadata;
}
/**
@@ -129,20 +95,19 @@ public class TikaLuceneContentExtractor {
* @return the extracted document or null if extraction is not possible or was unsuccessful
*/
public Document extract(final InputStream in) {
- return extractAll(in, defaultDocumentMetadata, true, true);
+ return extractAll(in, null, true, true);
}
/**
- * Extract the content and metadata from the input stream using DocumentMetadata descriptor to
- * create a document with strongly typed fields. Depending on media type validation,
+ * Extract the content and metadata from the input stream. Depending on media type validation,
* the detector could be run against input stream in order to ensure that parser supports this
* type of content.
* @param in input stream to extract the content and metadata from
- * @param metadata document descriptor with field names and their types
+ * @param documentMetadata documentMetadata
* @return the extracted document or null if extraction is not possible or was unsuccessful
*/
- public Document extract(final InputStream in, final DocumentMetadata metadata) {
- return extractAll(in, metadata, true, true);
+ public Document extract(final InputStream in, final LuceneDocumentMetadata documentMetadata) {
+ return extractAll(in, documentMetadata, true, true);
}
/**
@@ -153,7 +118,7 @@ public class TikaLuceneContentExtractor {
* @return the extracted document or null if extraction is not possible or was unsuccessful
*/
public Document extractContent(final InputStream in) {
- return extractAll(in, defaultDocumentMetadata, true, false);
+ return extractAll(in, null, true, false);
}
/**
@@ -164,11 +129,25 @@ public class TikaLuceneContentExtractor {
* @return the extracted document or null if extraction is not possible or was unsuccessful
*/
public Document extractMetadata(final InputStream in) {
- return extractAll(in, defaultDocumentMetadata, false, true);
+ return extractAll(in, null, false, true);
}
- private Document extractAll(final InputStream in, final DocumentMetadata documentMetadata,
- boolean extractContent, boolean extractMetadata) {
+ /**
+ * Extract the metadata only from the input stream. Depending on media type validation,
+ * the detector could be run against input stream in order to ensure that parser supports this
+ * type of content.
+ * @param in input stream to extract the metadata from
+ * @param documentMetadata documentMetadata
+ * @return the extracted document or null if extraction is not possible or was unsuccessful
+ */
+ public Document extractMetadata(final InputStream in, final LuceneDocumentMetadata documentMetadata) {
+ return extractAll(in, documentMetadata, false, true);
+ }
+
+ private Document extractAll(final InputStream in,
+ LuceneDocumentMetadata documentMetadata,
+ boolean extractContent,
+ boolean extractMetadata) {
TikaContent content = extractor.extractAll(in, extractContent);
@@ -176,18 +155,49 @@ public class TikaLuceneContentExtractor {
return null;
}
final Document document = new Document();
+
+ if (documentMetadata == null) {
+ documentMetadata = defaultDocumentMetadata;
+ }
if (content.getContent() != null) {
- document.add(documentMetadata.contentField(content.getContent()));
+ document.add(getContentField(documentMetadata, content.getContent()));
}
if (extractMetadata) {
Metadata metadata = content.getMetadata();
for (final String property: metadata.names()) {
- document.add(documentMetadata.field(property, metadata.get(property)));
+ document.add(getField(documentMetadata, property, metadata.get(property)));
}
}
return document;
}
+
+ private static Field getContentField(final LuceneDocumentMetadata documentMetadata, final String content) {
+ return new TextField(documentMetadata.getContentFieldName(), content, Store.YES);
+ }
+
+ private static Field getField(final LuceneDocumentMetadata documentMetadata,
+ final String name, final String value) {
+ final Class< ? > type = documentMetadata.getFieldType(name);
+
+ if (type != null) {
+ if (Number.class.isAssignableFrom(type)) {
+ if (Double.class.isAssignableFrom(type)) {
+ return new DoubleField(name, Double.valueOf(value), Store.YES);
+ } else if (Float.class.isAssignableFrom(type)) {
+ return new FloatField(name, Float.valueOf(value), Store.YES);
+ } else if (Long.class.isAssignableFrom(type)) {
+ return new LongField(name, Long.valueOf(value), Store.YES);
+ } else if (Integer.class.isAssignableFrom(type)) {
+ return new IntField(name, Integer.valueOf(value), Store.YES);
+ }
+ } else if (Date.class.isAssignableFrom(type)) {
+ return new StringField(name, value, Store.YES);
+ }
+ }
+
+ return new StringField(name, value, Store.YES);
+ }
}
http://git-wip-us.apache.org/repos/asf/cxf/blob/8f99f309/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
index 3ebe02d..ef36439 100644
--- a/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
+++ b/rt/rs/extensions/search/src/test/java/org/apache/cxf/jaxrs/ext/search/tika/TikaLuceneContentExtractorTest.java
@@ -25,7 +25,6 @@ import org.apache.cxf.jaxrs.ext.search.SearchBean;
import org.apache.cxf.jaxrs.ext.search.SearchConditionParser;
import org.apache.cxf.jaxrs.ext.search.fiql.FiqlParser;
import org.apache.cxf.jaxrs.ext.search.lucene.LuceneQueryVisitor;
-import org.apache.cxf.jaxrs.ext.search.tika.TikaLuceneContentExtractor.DocumentMetadata;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@@ -39,6 +38,7 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.pdf.PDFParser;
+
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
@@ -80,7 +80,7 @@ public class TikaLuceneContentExtractorTest extends Assert {
@Test
public void testExtractedTextContentMatchesTypesAndSearchCriteria() throws Exception {
- final DocumentMetadata documentMetadata = new DocumentMetadata("contents")
+ final LuceneDocumentMetadata documentMetadata = new LuceneDocumentMetadata("contents")
.withField("modified", Date.class);
final Document document = extractor.extract(
[2/2] git commit: [CXF-5549] Moving Lucene DocumentMetadata to its
own class and supporting injecting it via the constructor as discussed with
Andriy, updating TikaContentExtractor to accept multiple parsers
Posted by se...@apache.org.
[CXF-5549] Moving Lucene DocumentMetadata to its own class and supporting injecting it via the constructor as discussed with Andriy, updating TikaContentExtractor to accept multiple parsers
Project: http://git-wip-us.apache.org/repos/asf/cxf/repo
Commit: http://git-wip-us.apache.org/repos/asf/cxf/commit/785c0bd7
Tree: http://git-wip-us.apache.org/repos/asf/cxf/tree/785c0bd7
Diff: http://git-wip-us.apache.org/repos/asf/cxf/diff/785c0bd7
Branch: refs/heads/master
Commit: 785c0bd70d69a279e73360dc2f3b19ae0101fb93
Parents: 8f99f30
Author: Sergey Beryozkin <sb...@talend.com>
Authored: Fri Jun 27 16:48:17 2014 +0100
Committer: Sergey Beryozkin <sb...@talend.com>
Committed: Fri Jun 27 16:48:17 2014 +0100
----------------------------------------------------------------------
.../ext/search/tika/LuceneDocumentMetadata.java | 55 ++++++++++++++++++++
1 file changed, 55 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cxf/blob/785c0bd7/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
----------------------------------------------------------------------
diff --git a/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
new file mode 100644
index 0000000..d44ab25
--- /dev/null
+++ b/rt/rs/extensions/search/src/main/java/org/apache/cxf/jaxrs/ext/search/tika/LuceneDocumentMetadata.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cxf.jaxrs.ext.search.tika;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+public class LuceneDocumentMetadata {
+ private final Map< String, Class< ? > > fieldTypes;
+ private final String contentFieldName;
+
+ public LuceneDocumentMetadata() {
+ this("contents");
+ }
+ public LuceneDocumentMetadata(final String contentFieldName) {
+ this(contentFieldName, new LinkedHashMap< String, Class< ? > >());
+ }
+ public LuceneDocumentMetadata(final Map< String, Class< ? > > fieldTypes) {
+ this("contents", fieldTypes);
+ }
+ public LuceneDocumentMetadata(final String contentFieldName, final Map< String, Class< ? > > fieldTypes) {
+ this.contentFieldName = contentFieldName;
+ this.fieldTypes = fieldTypes;
+ }
+
+ public LuceneDocumentMetadata withField(final String name, final Class< ? > type) {
+ fieldTypes.put(name, type);
+ return this;
+ }
+
+ public String getContentFieldName() {
+ return contentFieldName;
+ }
+ public Class<?> getFieldType(String name) {
+ return fieldTypes.get(name);
+ }
+
+
+}