You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2018/01/09 18:52:09 UTC

svn commit: r1820689 - in /manifoldcf/trunk: CHANGES.txt connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java

Author: kwright
Date: Tue Jan  9 18:52:09 2018
New Revision: 1820689

URL: http://svn.apache.org/viewvc?rev=1820689&view=rev
Log:
Fix for CONNECTORS-1482.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jan  9 18:52:09 2018
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 2.10-dev =====================
 
+CONNECTORS-1482: Improve solr connector handling of include/exclude
+for mime types to be case insensitive and to actually reject documents
+if upstream connector ignores or doesn't call checkMimeTypeIndexable().
+
 CONNECTORS-1476: TLS/SSL support for MySQL/MariaDB database connections
 (Markus Schuch)
 

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Tue Jan  9 18:52:09 2018
@@ -113,6 +113,10 @@ public class HttpPoster
   // Document max length
   private final Long maxDocumentLength;
 
+  // Included and excluded mime types
+  private final Set<String> includedMimeTypes;
+  private final Set<String>excludedMimeTypes;
+  
   // Commit-within flag
   private final String commitWithin;
 
@@ -134,7 +138,9 @@ public class HttpPoster
     String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
     String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     Long maxDocumentLength,
-    String commitWithin, boolean useExtractUpdateHandler, boolean allowCompression)
+    String commitWithin, boolean useExtractUpdateHandler,
+    final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes,
+    boolean allowCompression)
     throws ManifoldCFException
   {
     // These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -155,6 +161,8 @@ public class HttpPoster
     this.mimeTypeAttributeName = mimeTypeAttributeName;
     this.contentAttributeName = contentAttributeName;
     this.useExtractUpdateHandler = useExtractUpdateHandler;
+    this.includedMimeTypes = includedMimeTypes;
+    this.excludedMimeTypes = excludedMimeTypes;
     
     this.maxDocumentLength = maxDocumentLength;
     
@@ -186,7 +194,9 @@ public class HttpPoster
     String originalSizeAttributeName, String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
     String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     IKeystoreManager keystoreManager, Long maxDocumentLength,
-    String commitWithin, boolean useExtractUpdateHandler, boolean allowCompression)
+    String commitWithin, boolean useExtractUpdateHandler,
+    final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes,
+    boolean allowCompression)
     throws ManifoldCFException
   {
     // These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -207,6 +217,8 @@ public class HttpPoster
     this.mimeTypeAttributeName = mimeTypeAttributeName;
     this.contentAttributeName = contentAttributeName;
     this.useExtractUpdateHandler = useExtractUpdateHandler;
+    this.includedMimeTypes = includedMimeTypes;
+    this.excludedMimeTypes = excludedMimeTypes;
     
     this.maxDocumentLength = maxDocumentLength;
 
@@ -564,6 +576,11 @@ public class HttpPoster
       return false;
     }
 
+    // If not the right mime type, reject it.
+    if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes)) {
+      activities.recordActivity(null,SolrConnector.INGEST_ACTIVITY,null,documentURI,activities.EXCLUDED_MIMETYPE,"Solr connector rejected document due to mime type restrictions: ("+document.getMimeType()+")");
+      return false;
+    }
     
     // Convert the incoming acls that we know about to qualified forms, and reject the document if
     // we don't know how to deal with its acls
@@ -776,6 +793,30 @@ public class HttpPoster
 
   }
 
+  private final static Set<String> acceptableMimeTypes = new HashSet<String>();
+  static
+  {
+    acceptableMimeTypes.add("text/plain;charset=utf-8");
+    acceptableMimeTypes.add("text/plain;charset=ascii");
+    acceptableMimeTypes.add("text/plain;charset=us-ascii");
+    acceptableMimeTypes.add("text/plain");
+  }
+
+  public static boolean checkMimeTypeIndexable(final String mimeType, final boolean useExtractUpdateHandler,
+    final Set<String> includedMimeTypes, final Set<String> excludedMimeTypes)
+  {
+    final String lowerMimeType = mimeType.toLowerCase(Locale.ROOT);
+    if (useExtractUpdateHandler)
+    {
+      if (includedMimeTypes != null && !includedMimeTypes.contains(lowerMimeType))
+        return false;
+      if (excludedMimeTypes != null && excludedMimeTypes.contains(lowerMimeType))
+        return false;
+      return true;
+    }
+    return acceptableMimeTypes.contains(lowerMimeType);
+  }
+
   /** Convert an unqualified ACL to qualified form.
   * @param acl is the initial, unqualified ACL.
   * @param authorityNameString is the name of the governing authority for this document's acls, or null if none.

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1820689&r1=1820688&r2=1820689&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Tue Jan  9 18:52:09 2018
@@ -77,11 +77,11 @@ public class SolrConnector extends org.a
   /** Included mime types string */
   protected String includedMimeTypesString = null;
   /** Included mime types */
-  protected Map<String,String> includedMimeTypes = null;
+  protected Set<String> includedMimeTypes = null;
   /** Excluded mime types string */
   protected String excludedMimeTypesString = null;
   /** Excluded mime types */
-  protected Map<String,String> excludedMimeTypes = null;
+  protected Set<String> excludedMimeTypes = null;
   
   // Attributes going into Solr
   protected String idAttributeName = null;
@@ -365,7 +365,9 @@ public class SolrConnector extends org.a
             allowAttributeName,denyAttributeName,idAttributeName,
             originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
             fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
-            keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler,allowCompression);
+            keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler,
+            includedMimeTypes,excludedMimeTypes,
+            allowCompression);
           
         }
         catch (NumberFormatException e)
@@ -421,7 +423,9 @@ public class SolrConnector extends org.a
             allowAttributeName,denyAttributeName,idAttributeName,
             originalSizeAttributeName,modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
             fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
-            maxDocumentLength,commitWithin,useExtractUpdateHandler,allowCompression);
+            maxDocumentLength,commitWithin,useExtractUpdateHandler,
+            includedMimeTypes,excludedMimeTypes,
+            allowCompression);
           
         }
         catch (NumberFormatException e)
@@ -438,10 +442,10 @@ public class SolrConnector extends org.a
   }
 
   /** Parse a mime type field into individual mime types in a hash */
-  protected static Map<String,String> parseMimeTypes(String mimeTypes)
+  protected static Set<String> parseMimeTypes(final String mimeTypes)
     throws ManifoldCFException
   {
-    Map<String,String> rval = new HashMap<String,String>();
+    Set<String> rval = new HashSet<>();
     try
     {
       java.io.Reader str = new java.io.StringReader(mimeTypes);
@@ -457,7 +461,7 @@ public class SolrConnector extends org.a
               break;
             if (nextString.length() == 0)
               continue;
-            rval.put(nextString,nextString);
+            rval.add(nextString.toLowerCase(Locale.ROOT));
           }
           return rval;
         }
@@ -516,15 +520,6 @@ public class SolrConnector extends org.a
     return new VersionContext(sp.toPackedString(),params,spec);
   }
 
-  private final static Set<String> acceptableMimeTypes = new HashSet<String>();
-  static
-  {
-    acceptableMimeTypes.add("text/plain;charset=utf-8");
-    acceptableMimeTypes.add("text/plain;charset=ascii");
-    acceptableMimeTypes.add("text/plain;charset=us-ascii");
-    acceptableMimeTypes.add("text/plain");
-  }
-
   /** Detect if a mime type is indexable or not.  This method is used by participating repository connectors to pre-filter the number of
   * unusable documents that will be passed to this output connector.
   *@param outputDescription is the document's output version.
@@ -536,15 +531,7 @@ public class SolrConnector extends org.a
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
-    if (useExtractUpdateHandler)
-    {
-      if (includedMimeTypes != null && includedMimeTypes.get(mimeType) == null)
-        return false;
-      if (excludedMimeTypes != null && excludedMimeTypes.get(mimeType) != null)
-        return false;
-      return true;
-    }
-    return acceptableMimeTypes.contains(mimeType.toLowerCase(Locale.ROOT));
+    return HttpPoster.checkMimeTypeIndexable(mimeType, useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes);
   }
 
   /** Pre-determine whether a document's length is indexable by this connector.  This method is used by participating repository connectors