You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/06/28 11:46:51 UTC
svn commit: r1354887 - in /lucene/dev/trunk/solr: ./
contrib/extraction/src/java/org/apache/solr/handler/extraction/
contrib/extraction/src/test-files/extraction/
contrib/extraction/src/test-files/extraction/solr/collection1/conf/
contrib/extraction/sr...
Author: janhoy
Date: Thu Jun 28 09:46:49 2012
New Revision: 1354887
URL: http://svn.apache.org/viewvc?rev=1354887&view=rev
Log:
SOLR-1929: Index encrypted files in SolrCell
Added:
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java (with props)
lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf (with props)
lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx (with props)
lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties (with props)
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Thu Jun 28 09:46:49 2012
@@ -368,6 +368,11 @@ New Features
* SOLR-3351: eDismax: ps2 and ps3 params (janhoy)
+* SOLR-1929: Index encrypted documents with ExtractingUpdateRequestHandler.
+ By supplying resource.password=<mypw> or specifying an external file with regular
+ expressions matching file names, Solr will decrypt and index encrypted PDF and DOCX files.
+ (janhoy, Yiannis Pericleous)
+
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
in example solrconfig.xml. (Sebastian Lutze, koji)
Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Thu Jun 28 09:46:49 2012
@@ -44,6 +44,7 @@ import org.apache.tika.parser.AutoDetect
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
@@ -90,7 +91,6 @@ public class ExtractingDocumentLoader ex
protected TikaConfig config;
protected SolrContentHandlerFactory factory;
- //protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
TikaConfig config, SolrContentHandlerFactory factory) {
@@ -206,6 +206,23 @@ public class ExtractingDocumentLoader ex
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+
+ // Password handling
+ RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
+ String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
+ if(pwMapFile != null && pwMapFile.length() > 0) {
+ InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
+ if(is != null) {
+ log.debug("Password file supplied: "+pwMapFile);
+ epp.parse(is);
+ }
+ }
+ context.set(PasswordProvider.class, epp);
+ String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
+ if(resourcePassword != null) {
+ epp.setExplicitPassword(resourcePassword);
+ log.debug("Literal password supplied for file "+resourceName);
+ }
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
if(ignoreTikaException)
Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java Thu Jun 28 09:46:49 2012
@@ -136,6 +136,10 @@ public interface ExtractingParams {
*/
public static final String RESOURCE_NAME = "resource.name";
+ /**
+ * Optional. The password for this resource. If supplied, it is used instead of the
+ * rule-based password lookup configured through {@link #PASSWORD_MAP_FILE}.
+ */
+ public static final String RESOURCE_PASSWORD = "resource.password";
/**
* Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible
@@ -148,4 +152,14 @@ public interface ExtractingParams {
* will be used instead.
*/
public static final String DEFAULT_FIELD = "defaultField";
+
+ /**
+ * Optional. If specified, names a rules file that is loaded through the core's resource loader
+ * and used as a source for password lookups for encrypted documents parsed by Tika.
+ * <p>
+ * The file holds one {@code regex=password} rule per line; text following a {@code #} is a comment.
+ * The key is evaluated as a regex that must match the whole file name, and the value is the password.
+ * The rules are evaluated top to bottom, i.e. the first match will be used.
+ * If you want a fallback password to always be used, supply a
+ * {@code .*=<defaultmypassword>} rule at the end.
+ */
+ public static final String PASSWORD_MAP_FILE = "passwordsFile";
}
Added: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java?rev=1354887&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/RegexRulesPasswordProvider.java Thu Jun 28 09:46:49 2012
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.extraction;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.LinkedHashMap;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.PasswordProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Password provider for the Extracting request handler. It finds the password
+ * for an encrypted document by matching the document's file name against an
+ * ordered list of regular expressions; the first rule that matches wins.
+ * If an explicit password is set, it is used instead of the rules.
+ */
+public class RegexRulesPasswordProvider implements PasswordProvider {
+  private static final Logger log = LoggerFactory.getLogger(RegexRulesPasswordProvider.class);
+
+  /** Rules in evaluation order: file name pattern -> password. */
+  private LinkedHashMap<Pattern,String> passwordMap = new LinkedHashMap<Pattern,String>();
+  private String explicitPassword;
+
+  /**
+   * Returns the password to use for the document described by the given metadata.
+   * @param meta document metadata; its resource name is matched against the rules
+   * @return the explicit password if one is set, otherwise the password of the first
+   *         rule matching the resource name, or null if there is none
+   */
+  @Override
+  public String getPassword(Metadata meta) {
+    String explicit = getExplicitPassword();
+    if (explicit != null) {
+      return explicit;
+    }
+
+    // A null map (possible through setPasswordMap) is treated like an empty one.
+    if (passwordMap != null && !passwordMap.isEmpty()) {
+      return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+    }
+
+    return null;
+  }
+
+  /**
+   * Walks the rules in insertion order and returns the password of the first
+   * pattern that matches the whole file name, or null if nothing matches.
+   */
+  private String lookupPasswordFromMap(String fileName) {
+    if (fileName != null && fileName.length() > 0) {
+      for (Entry<Pattern,String> e : passwordMap.entrySet()) {
+        if (e.getKey().matcher(fileName).matches()) {
+          return e.getValue();
+        }
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Parses a rule file from the stream and returns a Map of all rules found, in file order.
+   * Each rule is a line of the form regex=password, split at the first '='. Everything
+   * from a '#' to the end of the line is a comment, so neither the regex nor the password
+   * can contain '#'. Malformed lines and invalid regexes are logged and skipped.
+   * The stream is read as UTF-8 and is always closed before this method returns.
+   * @param is input stream for the file
+   * @return the rules found, in the order they appear in the file
+   * @throws RuntimeException wrapping the IOException if the stream cannot be read
+   */
+  public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
+    LinkedHashMap<Pattern,String> rules = new LinkedHashMap<Pattern,String>();
+    try {
+      BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
+      String line;
+      int linenum = 0;
+      while ((line = br.readLine()) != null) {
+        linenum++;
+        // Remove comments. indexOf is used rather than split("#") because split
+        // returns an empty array for a line consisting only of '#' characters.
+        int hash = line.indexOf('#');
+        if (hash >= 0) {
+          line = line.substring(0, hash);
+        }
+        line = line.trim();
+        if (line.length() == 0) {
+          continue;
+        }
+        int sep = line.indexOf('=');
+        if (sep <= 0) {
+          log.warn("Wrong format of password line "+linenum);
+          continue;
+        }
+        String regex = line.substring(0, sep).trim();
+        String pass = line.substring(sep + 1).trim();
+        try {
+          rules.put(Pattern.compile(regex), pass);
+        } catch (PatternSyntaxException pse) {
+          log.warn("Key of line "+linenum+" was not a valid regex pattern", pse);
+        }
+      }
+    } catch (IOException e) {
+      // Keep the cause so the failure can be diagnosed by the caller.
+      throw new RuntimeException("Could not read password rules file", e);
+    } finally {
+      if (is != null) {
+        try {
+          is.close();
+        } catch (IOException e) {
+          // The rules have already been read (or a read error is propagating).
+          log.warn("Could not close password rules stream", e);
+        }
+      }
+    }
+    return rules;
+  }
+
+  /**
+   * Initialize rules through file input stream. This is a convenience method,
+   * equivalent to calling setPasswordMap(parseRulesFile(is)).
+   * @param is the input stream with rules file, one line per rule on format regex=password
+   */
+  public void parse(InputStream is) {
+    setPasswordMap(parseRulesFile(is));
+  }
+
+  /**
+   * Gets the rules currently used for lookups
+   * @return the map of file name patterns to passwords, in evaluation order
+   */
+  public LinkedHashMap<Pattern,String> getPasswordMap() {
+    return passwordMap;
+  }
+
+  /**
+   * Replaces the rules used for lookups
+   * @param linkedHashMap map of file name patterns to passwords, in evaluation order
+   */
+  public void setPasswordMap(LinkedHashMap<Pattern,String> linkedHashMap) {
+    this.passwordMap = linkedHashMap;
+  }
+
+  /**
+   * Gets the explicit password, if set
+   * @return the password, or null if not set
+   */
+  public String getExplicitPassword() {
+    return explicitPassword;
+  }
+
+  /**
+   * Sets an explicit password which will be used instead of password map
+   * @param explicitPassword the password to use
+   */
+  public void setExplicitPassword(String explicitPassword) {
+    this.explicitPassword = explicitPassword;
+  }
+
+  /**
+   * Resets explicit password, so that map will be used for lookups
+   */
+  public void resetExplicitPassword() {
+    this.explicitPassword = null;
+  }
+
+}
Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/enctypted-password-is-solrRules.pdf?rev=1354887&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/password-is-Word2010.docx?rev=1354887&view=auto
==============================================================================
Binary file - no diff available.
Added: lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties?rev=1354887&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties (added)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test-files/extraction/solr/collection1/conf/passwordRegex.properties Thu Jun 28 09:46:49 2012
@@ -0,0 +1,7 @@
+# Filename regex -> password map
+# Example: any file ending in .doc should use password fooBar:
+# .*\.doc = fooBar
+#
+# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats
+.*\.pdf$ = solrRules
+.*\.docx$ = Word2010
\ No newline at end of file
Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1354887&r1=1354886&r2=1354887&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Thu Jun 28 09:46:49 2012
@@ -394,7 +394,7 @@ public class ExtractingRequestHandlerTes
try{
loadLocal("extraction/password-is-solrcell.docx",
"literal.id", "one");
- fail("TikaException is expected because of trying to extract text from password protected word file.");
+ fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
}
catch(Exception expected){}
assertU(commit());
@@ -509,6 +509,74 @@ public class ExtractingRequestHandlerTes
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
}
+  /**
+   * Indexes one encrypted PDF and one encrypted DOCX twice each: once with the password
+   * passed in resource.password and once resolved through the passwordsFile rules.
+   * Each document text must then be found in exactly two indexed documents.
+   */
+  @Test
+  public void testPasswordProtected() throws Exception {
+    final String pdfName = "enctypted-password-is-solrRules.pdf";
+    final String docxName = "password-is-Word2010.docx";
+    final String pdfPath = "extraction/" + pdfName;
+    final String docxPath = "extraction/" + docxName;
+    final String rulesFile = "passwordRegex.properties";
+
+    // PDF, password given explicitly in the request
+    loadLocal(pdfPath,
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwliteral",
+        "resource.name", pdfName,
+        "resource.password", "solrRules",
+        "fmap.Last-Modified", "extractedDate");
+
+    // PDF, password looked up in the rules file
+    loadLocal(pdfPath,
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "pdfpwfile",
+        "resource.name", pdfName,
+        "passwordsFile", rulesFile,
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX, password given explicitly in the request
+    loadLocal(docxPath,
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "uprefix", "ignored_",
+        "literal.id", "docxpwliteral",
+        "resource.name", docxName,
+        "resource.password", "Word2010",
+        "fmap.Last-Modified", "extractedDate");
+
+    // DOCX, password looked up in the rules file
+    loadLocal(docxPath,
+        "fmap.created", "extractedDate",
+        "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator",
+        "fmap.Keywords", "extractedKeywords",
+        "fmap.Creation-Date", "extractedDate",
+        "uprefix", "ignored_",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+        "literal.id", "docxpwfile",
+        "resource.name", docxName,
+        "passwordsFile", rulesFile,
+        "fmap.Last-Modified", "extractedDate");
+
+    assertU(commit());
+    Thread.sleep(100);
+
+    // Two successful loads per document type are expected.
+    assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
+    assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
+  }
+
+
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {