You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by us...@apache.org on 2015/11/04 21:17:05 UTC

svn commit: r1712632 - in /lucene/dev/branches/branch_5x: ./ solr/ solr/contrib/ solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ solr/contrib/extraction/src/test-files/extraction/ solr/contrib/extraction/src/test-files/extraction/s...

Author: uschindler
Date: Wed Nov  4 20:17:04 2015
New Revision: 1712632

URL: http://svn.apache.org/viewvc?rev=1712632&view=rev
Log:
Merged revision(s) 1712629 from lucene/dev/trunk:
SOLR-8166: Introduce possibility to configure ParseContext in ExtractingRequestHandler/ExtractingDocumentLoader
This closes Github PR #206

Added:
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
      - copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ParseContextConfig.java
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
      - copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/pdf-with-image.pdf
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
      - copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/parseContext.xml
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
      - copied unchanged from r1712629, lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ParseContextConfigTest.java
Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/solr/   (props changed)
    lucene/dev/branches/branch_5x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
    lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java

Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Wed Nov  4 20:17:04 2015
@@ -83,6 +83,10 @@ New Features
 
 * SOLR-8139: Create/delete fields/dynamic fields/copy fields via schema tab on Angular UI
 
+* SOLR-8166: Introduce possibility to configure ParseContext in
+  ExtractingRequestHandler/ExtractingDocumentLoader (Andriy Binetsky
+  via Uwe Schindler)
+
 Bug Fixes
 ----------------------
 

Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Nov  4 20:17:04 2015
@@ -91,13 +91,16 @@ public class ExtractingDocumentLoader ex
   private final AddUpdateCommand templateAdd;
 
   protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;
   protected SolrContentHandlerFactory factory;
 
   public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
-                           TikaConfig config, SolrContentHandlerFactory factory) {
+                           TikaConfig config, ParseContextConfig parseContextConfig,
+                                  SolrContentHandlerFactory factory) {
     this.params = req.getParams();
     this.core = req.getCore();
     this.config = config;
+    this.parseContextConfig = parseContextConfig;
     this.processor = processor;
 
     templateAdd = new AddUpdateCommand(req);
@@ -199,7 +202,10 @@ public class ExtractingDocumentLoader ex
 
         try{
           //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
-          ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+          ParseContext context = parseContextConfig.create();
+
+
+          context.set(Parser.class, parser);
           context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
 
           // Password handling

Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java Wed Nov  4 20:17:04 2015
@@ -48,10 +48,12 @@ public class ExtractingRequestHandler ex
 
   private transient static Logger log = LoggerFactory.getLogger(ExtractingRequestHandler.class);
 
+  public static final String PARSE_CONTEXT_CONFIG = "parseContext.config";
   public static final String CONFIG_LOCATION = "tika.config";
   public static final String DATE_FORMATS = "date.formats";
 
   protected TikaConfig config;
+  protected ParseContextConfig parseContextConfig;
 
 
   protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
@@ -79,6 +81,16 @@ public class ExtractingRequestHandler ex
           throw new SolrException(ErrorCode.SERVER_ERROR, e);
         }
       }
+
+      String parseContextConfigLoc = (String) initArgs.get(PARSE_CONTEXT_CONFIG);
+      if (parseContextConfigLoc != null) {
+        try {
+          parseContextConfig = new ParseContextConfig(core.getResourceLoader(), parseContextConfigLoc);
+        } catch (Exception e) {
+          throw new SolrException(ErrorCode.SERVER_ERROR, e);
+        }
+      }
+
       NamedList configDateFormats = (NamedList) initArgs.get(DATE_FORMATS);
       if (configDateFormats != null && configDateFormats.size() > 0) {
         dateFormats = new HashSet<>();
@@ -97,6 +109,9 @@ public class ExtractingRequestHandler ex
         throw new SolrException(ErrorCode.SERVER_ERROR, e);
       }
     }
+    if (parseContextConfig == null) {
+      parseContextConfig = new ParseContextConfig();
+    }
     factory = createFactory();
   }
 
@@ -111,7 +126,7 @@ public class ExtractingRequestHandler ex
 
   @Override
   protected ContentStreamLoader newLoader(SolrQueryRequest req, UpdateRequestProcessor processor) {
-    return new ExtractingDocumentLoader(req, processor, config, factory);
+    return new ExtractingDocumentLoader(req, processor, config, parseContextConfig, factory);
   }
 
   // ////////////////////// SolrInfoMBeans methods //////////////////////

Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/solrconfig.xml Wed Nov  4 20:17:04 2015
@@ -185,7 +185,9 @@
     </lst>
   </requestHandler>
 
-  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
+  <requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
+    <str name="parseContext.config">parseContext.xml</str>
+  </requestHandler>
 
   <requestHandler name="/update/extract/lit-def" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
     <lst name="defaults">

Modified: lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1712632&r1=1712631&r2=1712632&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Nov  4 20:17:04 2015
@@ -657,6 +657,28 @@ public class ExtractingRequestHandlerTes
   }
 
   @Test
+  public void testPdfWithImages() throws Exception {
+    //Tests the possibility to configure ParseContext (for example, to extract embedded images from a PDF)
+    loadLocal("extraction/pdf-with-image.pdf",
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfWithImage",
+        "resource.name", "pdf-with-image.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='0']");
+    assertU(commit());
+    assertQ(req("wdf_nocase:\"embedded:image0.jpg\""), "//*[@numFound='1']");
+  }
+
+  @Test
   public void testPasswordProtected() throws Exception {
     // PDF, Passwords from resource.password
     loadLocal("extraction/encrypted-password-is-solrRules.pdf",
@@ -705,7 +727,7 @@ public class ExtractingRequestHandlerTes
 
     // DOCX, Passwords from file
     loadLocal("extraction/password-is-Word2010.docx", 
-        "fmap.created", "extractedDate", 
+        "fmap.created", "extractedDate",
         "fmap.producer", "extractedProducer",
         "fmap.creator", "extractedCreator", 
         "fmap.Keywords", "extractedKeywords",