You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2019/08/23 11:44:44 UTC

svn commit: r1865744 - in /manifoldcf/trunk: CHANGES.txt connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java

Author: kwright
Date: Fri Aug 23 11:44:43 2019
New Revision: 1865744

URL: http://svn.apache.org/viewvc?rev=1865744&view=rev
Log:
Fix for CONNECTORS-1621.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Aug 23 11:44:43 2019
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.14-dev =====================
 
+CONNECTORS-1621: Fix broken ability to index Tika-extracted documents in Solr.
+(Markus Schuch, Karl Wright)
+
 CONNECTORS-1620: Add ability to extract links from application/xml documents.
 (Markus Schuch)
 

Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Fri Aug 23 11:44:43 2019
@@ -578,7 +578,9 @@ public class HttpPoster
     }
 
     // If not the right mime type, reject it.
-    if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes)) {
+    // Note: this code added as part of CONNECTORS-1482 was incorrect!  Document filtering specified in the solr connector is always against the
+    // ORIGINAL mime type (which is what's in the document).  This is why the checkMimeTypeIndexable second argument is always "true".
+    if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), true, includedMimeTypes, excludedMimeTypes)) {
       activities.recordActivity(null,SolrConnector.INGEST_ACTIVITY,null,documentURI,activities.EXCLUDED_MIMETYPE,"Solr connector rejected document due to mime type restrictions: ("+document.getMimeType()+")");
       return false;
     }
@@ -812,9 +814,17 @@ public class HttpPoster
     final String lowerMimeType = (mimeType==null)?null:mimeType.toLowerCase(Locale.ROOT);
     if (useExtractUpdateHandler)
     {
-      if (includedMimeTypes != null && !includedMimeTypes.contains(lowerMimeType))
+      // Strip the charset off for this check
+      int index = lowerMimeType == null ? -1 : lowerMimeType.indexOf(";");
+      final String checkMimeType;
+      if (index != -1) {
+        checkMimeType = lowerMimeType.substring(0,index);
+      } else {
+        checkMimeType = lowerMimeType;
+      }
+      if (includedMimeTypes != null && !includedMimeTypes.contains(checkMimeType))
         return false;
-      if (excludedMimeTypes != null && excludedMimeTypes.contains(lowerMimeType))
+      if (excludedMimeTypes != null && excludedMimeTypes.contains(checkMimeType))
         return false;
       return true;
     }