You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2018/01/09 18:52:09 UTC
svn commit: r1820689 - in /manifoldcf/trunk: CHANGES.txt
connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
Author: kwright
Date: Tue Jan 9 18:52:09 2018
New Revision: 1820689
URL: http://svn.apache.org/viewvc?rev=1820689&view=rev
Log:
Fix for CONNECTORS-1482.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jan 9 18:52:09 2018
@@ -3,6 +3,10 @@ $Id$
======================= 2.10-dev =====================
+CONNECTORS-1482: Improve solr connector handling of include/exclude
+for mime types to be case insensitive and to actually reject documents
+if upstream connector ignores or doesn't call checkMimeTypeIndexable().
+
CONNECTORS-1476: TLS/SSL support for MySQL/MariaDB database connections
(Markus Schuch)
Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Tue Jan 9 18:52:09 2018
@@ -113,6 +113,10 @@ public class HttpPoster
// Document max length
private final Long maxDocumentLength;
+ // Included and excluded mime types
+ private final Set<String> includedMimeTypes;
+ private final Set<String>excludedMimeTypes;
+
// Commit-within flag
private final String commitWithin;
@@ -134,7 +138,9 @@ public class HttpPoster
String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
Long maxDocumentLength,
- String commitWithin, boolean useExtractUpdateHandler, boolean allowCompression)
+ String commitWithin, boolean useExtractUpdateHandler,
+ final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes,
+ boolean allowCompression)
throws ManifoldCFException
{
// These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -155,6 +161,8 @@ public class HttpPoster
this.mimeTypeAttributeName = mimeTypeAttributeName;
this.contentAttributeName = contentAttributeName;
this.useExtractUpdateHandler = useExtractUpdateHandler;
+ this.includedMimeTypes = includedMimeTypes;
+ this.excludedMimeTypes = excludedMimeTypes;
this.maxDocumentLength = maxDocumentLength;
@@ -186,7 +194,9 @@ public class HttpPoster
String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
IKeystoreManager keystoreManager, Long maxDocumentLength,
- String commitWithin, boolean useExtractUpdateHandler, boolean allowCompression)
+ String commitWithin, boolean useExtractUpdateHandler,
+ final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes,
+ boolean allowCompression)
throws ManifoldCFException
{
// These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -207,6 +217,8 @@ public class HttpPoster
this.mimeTypeAttributeName = mimeTypeAttributeName;
this.contentAttributeName = contentAttributeName;
this.useExtractUpdateHandler = useExtractUpdateHandler;
+ this.includedMimeTypes = includedMimeTypes;
+ this.excludedMimeTypes = excludedMimeTypes;
this.maxDocumentLength = maxDocumentLength;
@@ -564,6 +576,11 @@ public class HttpPoster
return false;
}
+ // If not the right mime type, reject it.
+ if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes)) {
+ activities.recordActivity(null,SolrConnector.INGEST_ACTIVITY,null,documentURI,activities.EXCLUDED_MIMETYPE,"Solr connector rejected document due to mime type restrictions: ("+document.getMimeType()+")");
+ return false;
+ }
// Convert the incoming acls that we know about to qualified forms, and reject the document if
// we don't know how to deal with its acls
@@ -776,6 +793,30 @@ public class HttpPoster
}
+ private final static Set<String> acceptableMimeTypes = new HashSet<String>();
+ static
+ {
+ acceptableMimeTypes.add("text/plain;charset=utf-8");
+ acceptableMimeTypes.add("text/plain;charset=ascii");
+ acceptableMimeTypes.add("text/plain;charset=us-ascii");
+ acceptableMimeTypes.add("text/plain");
+ }
+
+ public static boolean checkMimeTypeIndexable(final String mimeType, final boolean useExtractUpdateHandler,
+ final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes)
+ {
+ final String lowerMimeType = mimeType.toLowerCase(Locale.ROOT);
+ if (useExtractUpdateHandler)
+ {
+ if (includedMimeTypes != null && !includedMimeTypes.contains(lowerMimeType))
+ return false;
+ if (excludedMimeTypes != null && excludedMimeTypes.contains(lowerMimeType))
+ return false;
+ return true;
+ }
+ return acceptableMimeTypes.contains(lowerMimeType);
+ }
+
/** Convert an unqualified ACL to qualified form.
* @param acl is the initial, unqualified ACL.
* @param authorityNameString is the name of the governing authority for this document's acls, or null if none.
Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Tue Jan 9 18:52:09 2018
@@ -77,11 +77,11 @@ public class SolrConnector extends org.a
/** Included mime types string */
protected String includedMimeTypesString = null;
/** Included mime types */
- protected Map<String,String> includedMimeTypes = null;
+ protected Set<String> includedMimeTypes = null;
/** Excluded mime types string */
protected String excludedMimeTypesString = null;
/** Excluded mime types */
- protected Map<String,String> excludedMimeTypes = null;
+ protected Set<String> excludedMimeTypes = null;
// Attributes going into Solr
protected String idAttributeName = null;
@@ -365,7 +365,9 @@ public class SolrConnector extends org.a
allowAttributeName,denyAttributeName,idAttributeName,
originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
- keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler,allowCompression);
+ keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler,
+ includedMimeTypes,excludedMimeTypes,
+ allowCompression);
}
catch (NumberFormatException e)
@@ -421,7 +423,9 @@ public class SolrConnector extends org.a
allowAttributeName,denyAttributeName,idAttributeName,
originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
- maxDocumentLength,commitWithin,useExtractUpdateHandler,allowCompression);
+ maxDocumentLength,commitWithin,useExtractUpdateHandler,
+ includedMimeTypes,excludedMimeTypes,
+ allowCompression);
}
catch (NumberFormatException e)
@@ -438,10 +442,10 @@ public class SolrConnector extends org.a
}
/** Parse a mime type field into individual mime types in a hash */
- protected static Map<String,String> parseMimeTypes(String mimeTypes)
+ protected static Set<String> parseMimeTypes(final String mimeTypes)
throws ManifoldCFException
{
- Map<String,String> rval = new HashMap<String,String>();
+ Set<String> rval = new HashSet<>();
try
{
java.io.Reader str = new java.io.StringReader(mimeTypes);
@@ -457,7 +461,7 @@ public class SolrConnector extends org.a
break;
if (nextString.length() == 0)
continue;
- rval.put(nextString,nextString);
+ rval.add(nextString.toLowerCase(Locale.ROOT));
}
return rval;
}
@@ -516,15 +520,6 @@ public class SolrConnector extends org.a
return new VersionContext(sp.toPackedString(),params,spec);
}
- private final static Set<String> acceptableMimeTypes = new HashSet<String>();
- static
- {
- acceptableMimeTypes.add("text/plain;charset=utf-8");
- acceptableMimeTypes.add("text/plain;charset=ascii");
- acceptableMimeTypes.add("text/plain;charset=us-ascii");
- acceptableMimeTypes.add("text/plain");
- }
-
/** Detect if a mime type is indexable or not. This method is used by participating repository connectors to pre-filter the number of
* unusable documents that will be passed to this output connector.
*@param outputDescription is the document's output version.
@@ -536,15 +531,7 @@ public class SolrConnector extends org.a
throws ManifoldCFException, ServiceInterruption
{
getSession();
- if (useExtractUpdateHandler)
- {
- if (includedMimeTypes != null && includedMimeTypes.get(mimeType) == null)
- return false;
- if (excludedMimeTypes != null && excludedMimeTypes.get(mimeType) != null)
- return false;
- return true;
- }
- return acceptableMimeTypes.contains(mimeType.toLowerCase(Locale.ROOT));
+ return HttpPoster.checkMimeTypeIndexable(mimeType, useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes);
}
/** Pre-determine whether a document's length is indexable by this connector. This method is used by participating repository connectors