You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/06/28 11:46:51 UTC

svn commit: r1354887 - in /lucene/dev/trunk/solr: ./ contrib/extraction/src/java/org/apache/solr/handler/extraction/ contrib/extraction/src/test-files/extraction/ contrib/extraction/src/test-files/extraction/solr/collection1/conf/ contrib/extraction/sr...

Author: janhoy
Date: Thu Jun 28 09:46:49 2012
New Revision: 1354887

URL: http://svn.apache.org/viewvc?rev=1354887&view=rev
Log:
SOLR-1929: Index encrypted files in SolrCell

Added:
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java   (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf   (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx   (with props)
    lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties   (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
    lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Jun 28 09:46:49 2012
@@ -368,6 +368,11 @@ New Features
 
 * SOLR-3351: eDismax: ps2 and ps3 params (janhoy)
 
+* SOLR-1929: Index encrypted documents with ExtractingUpdateRequestHandler.
+  By supplying resource.password=<mypw>, or by specifying an external file of regular
+  expressions matching file names, Solr will decrypt and index PDF and DOCX documents.
+  (janhoy, Yiannis Pericleous)
+
 * SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
   in example solrconfig.xml. (Sebastian Lutze, koji)
 

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Thu Jun 28 09:46:49 2012
@@ -44,6 +44,7 @@ import org.apache.tika.parser.AutoDetect
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.xpath.Matcher;
 import org.apache.tika.sax.xpath.MatchingContentHandler;
@@ -90,7 +91,6 @@ public class ExtractingDocumentLoader ex
 
   protected TikaConfig config;
   protected SolrContentHandlerFactory factory;
-  //protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
 
   public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
                            TikaConfig config, SolrContentHandlerFactory factory) {
@@ -206,6 +206,23 @@ public class ExtractingDocumentLoader ex
         try{
           //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
           ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+
+          // Password handling
+          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
+          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
+          if(pwMapFile != null && pwMapFile.length() > 0) {
+            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
+            if(is != null) {
+              log.debug("Password file supplied: "+pwMapFile);
+              epp.parse(is);
+            }
+          }
+          context.set(PasswordProvider.class, epp);
+          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
+          if(resourcePassword != null) {
+            epp.setExplicitPassword(resourcePassword);
+            log.debug("Literal password supplied for file "+resourceName);
+          }
           parser.parse(inputStream, parsingHandler, metadata, context);
         } catch (TikaException e) {
           if(ignoreTikaException)

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java Thu Jun 28 09:46:49 2012
@@ -136,6 +136,10 @@ public interface ExtractingParams {
    */
   public static final String RESOURCE_NAME = "resource.name";
 
+  /**
+   * Optional. The password for this resource. Will be used instead of the rule-based password lookup mechanisms.
+   */
+  public static final String RESOURCE_PASSWORD = "resource.password";
 
   /**
    * Optional.  If specified, the prefix will be prepended to all Metadata, such that it would be possible
@@ -148,4 +152,14 @@ public interface ExtractingParams {
    * will be used instead.
    */
   public static final String DEFAULT_FIELD = "defaultField";
+
+  /**
+   * Optional. If specified, loads the file as a source for password lookups for Tika encrypted documents.
+   * <p>
+   * The file format resembles Java properties, with one key=value pair per line.
+   * The key is evaluated as a regex against the file name, and the value is the password.
+   * The rules are evaluated top to bottom, i.e. the first match will be used.
+   * If you want a fallback password to always be used, supply a .*=&lt;defaultmypassword&gt; rule at the end.
+   */
+  public static final String PASSWORD_MAP_FILE = "passwordsFile";
 }

Added: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java?rev=1354887&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java Thu Jun 28 09:46:49 2012
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.extraction;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.LinkedHashMap;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.PasswordProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Password provider for the Extracting request handler. It finds the password
+ * for a document by matching the file name against an ordered list of regular
+ * expressions; the first rule that matches wins.
+ * The list of passwords is supplied in an optional Map.
+ * If an explicit password is set, it is used instead of the rules.
+ */
+public class RegexRulesPasswordProvider implements PasswordProvider {
+  private static final Logger log = LoggerFactory.getLogger(RegexRulesPasswordProvider.class);
+
+  // Insertion-ordered, so rules are evaluated in the order in which they were added
+  private LinkedHashMap<Pattern,String> passwordMap = new LinkedHashMap<Pattern,String>();
+  private String explicitPassword;
+
+  /**
+   * Returns the explicit password if one is set, otherwise the password of the
+   * first rule that matches the resource name recorded in the metadata.
+   * @param meta metadata of the document being parsed
+   * @return the password, or null if no explicit password is set and no rule matches
+   */
+  @Override
+  public String getPassword(Metadata meta) {
+    if(getExplicitPassword() != null) {
+      return getExplicitPassword();
+    }
+
+    if(passwordMap != null && passwordMap.size() > 0) {
+      return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+    }
+
+    return null;
+  }
+
+  /**
+   * Finds the password of the first rule whose regex matches the whole file name.
+   * @param fileName the file name to match, may be null
+   * @return the password, or null if the name is null or empty or no rule matches
+   */
+  private String lookupPasswordFromMap(String fileName) {
+    if(fileName != null && fileName.length() > 0) {
+      for(Entry<Pattern,String> e : passwordMap.entrySet()) {
+        if(e.getKey().matcher(fileName).matches()) {
+          return e.getValue();
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Parses a rules file from the stream and returns a Map of all rules found.
+   * Each rule is one line of the form regex=password. Everything from the first
+   * '#' on a line is treated as a comment, so neither the regex nor the password
+   * may contain '#'. Malformed lines and invalid regexes are logged and skipped.
+   * The stream is decoded as UTF-8 and is always closed before this method returns.
+   * @param is input stream for the file
+   * @return the rules, in the order they appear in the file
+   * @throws RuntimeException if reading from the stream fails
+   */
+  public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
+    LinkedHashMap<Pattern,String> rules = new LinkedHashMap<Pattern,String>();
+    try {
+      // Decode explicitly instead of relying on the platform default charset
+      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+      String line;
+      int linenum = 0;
+      while ((line = br.readLine()) != null) {
+        linenum++;
+        // Remove comments. indexOf is used rather than split("#") because split
+        // returns an empty array for a line consisting only of '#' characters,
+        // which made such comment lines be reported as wrongly formatted.
+        int hash = line.indexOf('#');
+        if(hash >= 0) {
+          line = line.substring(0, hash);
+        }
+        line = line.trim();
+        if(line.length() == 0) {
+          continue;
+        }
+        int sep = line.indexOf("=");
+        if(sep <= 0) {
+          log.warn("Wrong format of password line "+linenum);
+          continue;
+        }
+        String pass = line.substring(sep+1).trim();
+        String regex = line.substring(0, sep).trim();
+        try {
+          rules.put(Pattern.compile(regex), pass);
+        } catch(PatternSyntaxException pse) {
+          log.warn("Key of line "+linenum+" was not a valid regex pattern", pse);
+        }
+      }
+    } catch (IOException e) {
+      // Keep the cause so that the reason for the failure is not lost
+      throw new RuntimeException("Failed to read password rules", e);
+    } finally {
+      try {
+        is.close();
+      } catch (IOException e) {
+        log.warn("Failed to close password rules stream", e);
+      }
+    }
+    return rules;
+  }
+
+  /**
+   * Initialize rules through file input stream. This is a convenience for calling
+   * setPasswordMap(parseRulesFile(is)).
+   * @param is the input stream with rules file, one line per rule on format regex=password
+   */
+  public void parse(InputStream is) {
+    setPasswordMap(parseRulesFile(is));
+  }
+
+  /**
+   * Gets the map of rules currently in use
+   * @return the map from file name regex to password
+   */
+  public LinkedHashMap<Pattern,String> getPasswordMap() {
+    return passwordMap;
+  }
+
+  /**
+   * Replaces the rules used for lookups
+   * @param linkedHashMap map from file name regex to password, evaluated in iteration order
+   */
+  public void setPasswordMap(LinkedHashMap<Pattern,String> linkedHashMap) {
+    this.passwordMap = linkedHashMap;
+  }
+
+  /**
+   * Gets the explicit password, if set
+   * @return the password, or null if not set
+   */
+  public String getExplicitPassword() {
+    return explicitPassword;
+  }
+
+  /**
+   * Sets an explicit password which will be used instead of password map
+   * @param explicitPassword the password to use
+   */
+  public void setExplicitPassword(String explicitPassword) {
+    this.explicitPassword = explicitPassword;
+  }
+
+  /**
+   * Resets explicit password, so that map will be used for lookups
+   */
+  public void resetExplicitPassword() {
+    this.explicitPassword = null;
+  }
+
+}

Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf?rev=1354887&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx?rev=1354887&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties?rev=1354887&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties Thu Jun 28 09:46:49 2012
@@ -0,0 +1,7 @@
+# Filename regex -> password map
+# Example: any file ending in .doc should use the password fooBar:
+#  .*\.doc = fooBar
+#
+# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats 
+.*\.pdf$ = solrRules
+.*\.docx$ = Word2010
\ No newline at end of file

Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Thu Jun 28 09:46:49 2012
@@ -394,7 +394,7 @@ public class ExtractingRequestHandlerTes
     try{
       loadLocal("extraction/password-is-solrcell.docx",
           "literal.id", "one");
-      fail("TikaException is expected because of trying to extract text from password protected word file.");
+      fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
     }
     catch(Exception expected){}
     assertU(commit());
@@ -509,6 +509,74 @@ public class ExtractingRequestHandlerTes
     assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
   }
 
+  @Test
+  public void testPasswordProtected() throws Exception {
+    // Each encrypted file is indexed twice, once per password mechanism. First: PDF with an explicit resource.password
+    loadLocal("extraction/enctypted-password-is-solrRules.pdf", 
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwliteral",
+        "resource.name", "enctypted-password-is-solrRules.pdf",
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    // PDF again, this time with the password looked up in the rules file named by passwordsFile
+    loadLocal("extraction/enctypted-password-is-solrRules.pdf", 
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwfile",
+        "resource.name", "enctypted-password-is-solrRules.pdf",
+        "passwordsFile", "passwordRegex.properties", // regex -> password rules file
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX with an explicit resource.password
+    loadLocal("extraction/password-is-Word2010.docx", 
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "uprefix", "ignored_",
+        "literal.id", "docxpwliteral",
+        "resource.name", "password-is-Word2010.docx",
+        "resource.password", "Word2010", // Explicit password
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX with the password looked up in the rules file
+    loadLocal("extraction/password-is-Word2010.docx", 
+        "fmap.created", "extractedDate", 
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", 
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "docxpwfile",
+        "resource.name", "password-is-Word2010.docx",
+        "passwordsFile", "passwordRegex.properties", // regex -> password rules file
+        "fmap.Last-Modified", "extractedDate");
+    // Both loads of each document must be searchable, hence numFound=2 per query
+    assertU(commit());
+    Thread.sleep(100); // NOTE(review): fixed sleep after the commit; presumably a safety margin - confirm it is needed
+    assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
+    assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
+  }
+  
   SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
     LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
     try {