You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/11/04 21:17:05 UTC
svn commit: r1712632 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/contrib/
solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/
solr/contrib/extraction/src/test-files/extraction/
solr/contrib/extraction/src/test-files/extraction/s...
Author: uschindler
Date: Wed Nov 4 20:17:04 2015
New Revision: 1712632
URL: http://svn.apache.org/viewvc?rev=1712632&view=rev
Log:
Merged revision(s) 1712629 from lucene/dev/trunk:
SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader
This closes GitHub PR #206
Added:
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
- copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
- copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
- copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
- copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/contrib/ (props changed)
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Wed Nov 4 20:17:04 2015
@@ -83,6 +83,10 @@ New Features
* SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
+* SOLR-8166: Introduce possibility to configure ParseContext in
+ ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
+ via Uwe Schindler)
+
Bug Fixes
----------------------
Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Nov 4 20:17:04 2015
@@ -91,13 +91,16 @@ public class ExtractingDocumentLoader ex
private final AddUpdateCommand templateAdd;
protected TikaConfig config;
+ protected ParseContextConfig parseContextConfig;
protected SolrContentHandlerFactory factory;
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
- TikaConfig config, SolrContentHandlerFactory factory) {
+ TikaConfig config, ParseContextConfig parseContextConfig,
+ SolrContentHandlerFactory factory) {
this.params = req.getParams();
this.core = req.getCore();
this.config = config;
+ this.parseContextConfig = parseContextConfig;
this.processor = processor;
templateAdd = new AddUpdateCommand(req);
@@ -199,7 +202,10 @@ public class ExtractingDocumentLoader ex
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ ParseContext context = parseContextConfig.create();
+
+
+ context.set(Parser.class, parser);
context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
// Password handling
Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Wed Nov 4 20:17:04 2015
@@ -48,10 +48,12 @@ public class ExtractingRequestHandler ex
private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
+ public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
public static final String CONFIG_LOCATION = "tika.config";
public static final String DATE_FORMATS = "date.formats";
protected TikaConfig config;
+ protected ParseContextConfig parseContextConfig;
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
@@ -79,6 +81,16 @@ public class ExtractingRequestHandler ex
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
+
+ String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
+ if (parseContextConfigLoc != null) {
+ try {
+ parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
+ } catch (Exception e) {
+ throw new SolrException(ErrorCode.SERVER_ERROR, e);
+ }
+ }
+
NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
if (configDateFormats != null && configDateFormats.size() > 0) {
dateFormats = new HashSet<>();
@@ -97,6 +109,9 @@ public class ExtractingRequestHandler ex
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
+ if (parseContextConfig == null) {
+ parseContextConfig = new ParseContextConfig();
+ }
factory = createFactory();
}
@@ -111,7 +126,7 @@ public class ExtractingRequestHandler ex
@Override
protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
- return new ExtractingDocumentLoader(req, processor, config, factory);
+ return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
}
// ////////////////////// SolrInfoMBeans methods //////////////////////
Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml Wed Nov 4 20:17:04 2015
@@ -185,7 +185,9 @@
</lst>
</requestHandler>
- <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
+ <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+ <str name="parseContext.config">parseContext.xml</str>
+ </requestHandler>
<requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
<lst name="defaults">
Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Nov 4 20:17:04 2015
@@ -657,6 +657,28 @@ public class ExtractingRequestHandlerTes
}
@Test
+ public void testPdfWithImages() throws Exception {
+ //Tests possibility to configure ParseContext (by example to extract embedded images from pdf)
+ loadLocal("extraction/pdf-with-image.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Creation-Date", "extractedDate",
+ "uprefix", "ignored_",
+ "fmap.Author", "extractedAuthor",
+ "fmap.content", "wdf_nocase",
+ "literal.id", "pdfWithImage",
+ "resource.name", "pdf-with-image.pdf",
+ "resource.password", "solrRules",
+ "fmap.Last-Modified", "extractedDate");
+
+ assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+ assertU(commit());
+ assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+ }
+
+ @Test
public void testPasswordProtected() throws Exception {
// PDF, Passwords from resource.password
loadLocal("extraction/encrypted-password-is-solrRules.pdf",
@@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTes
// DOCX, Passwords from file
loadLocal("extraction/password-is-Word2010.docx",
- "fmap.created", "extractedDate",
+ "fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",